In [2]:
import numpy as np
import pandas as pd

# Build the sample table used throughout the notebook.
# The city values have inconsistent casing and surrounding spaces,
# and price has two NaN entries.
sample_data = {
    'id': [1001, 1002, 1003, 1004, 1005, 1006],
    'date': pd.date_range('20130102', periods=6),
    'city': ['Beijing', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
    'age': [23, 44, 54, 32, 34, 32],
    'category': ['100-A', '100-B', '110-A', '110-C', '210-A', '130-F'],
    'price': [1200, np.nan, 2133, 5433, np.nan, 4432],
}
# `columns` fixes the display order independently of the dict's key order.
column_order = ['id', 'date', 'city', 'category', 'age', 'price']
df = pd.DataFrame(sample_data, columns=column_order)

In [3]:
# Display the whole table (six rows)
df

Unnamed: 0,id,date,city,category,age,price
0,1001,2013-01-02,Beijing,100-A,23,1200.0
1,1002,2013-01-03,SH,100-B,44,
2,1003,2013-01-04,guangzhou,110-A,54,2133.0
3,1004,2013-01-05,Shenzhen,110-C,32,5433.0
4,1005,2013-01-06,shanghai,210-A,34,
5,1006,2013-01-07,BEIJING,130-F,32,4432.0


In [5]:
# Check the table's dimensions as (rows, columns)
print(df.shape)

(6, 6)


In [6]:
# Summarise the table: index range, per-column non-null counts, dtypes and memory use.
# info() prints the report itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line after the report.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 6 columns):
id          6 non-null int64
date        6 non-null datetime64[ns]
city        6 non-null object
category    6 non-null object
age         6 non-null int64
price       4 non-null float64
dtypes: datetime64[ns](1), float64(1), int64(2), object(2)
memory usage: 368.0+ bytes
None


In [7]:
# Show the dtype of every column
print(df.dtypes)

id                   int64
date        datetime64[ns]
city                object
category            object
age                  int64
price              float64
dtype: object


In [8]:
# Show the dtype of a single column
df['id'].dtype

dtype('int64')

In [10]:
# Boolean mask of the whole table: True wherever a cell is null
df.isnull()

Unnamed: 0,id,date,city,category,age,price
0,False,False,False,False,False,False
1,False,False,False,False,False,True
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,True
5,False,False,False,False,False,False


In [11]:
# Check which entries of the price column are null
df['price'].isnull()

0    False
1     True
2    False
3    False
4     True
5    False
Name: price, dtype: bool

In [12]:
# List the distinct values in the city column
df['city'].unique()

array(['Beijing', 'SH', ' guangzhou ', 'Shenzhen', 'shanghai', 'BEIJING '],
      dtype=object)

In [13]:
# View the table's contents as a NumPy array
df.values

array([[1001, Timestamp('2013-01-02 00:00:00'), 'Beijing', '100-A', 23,
        1200.0],
       [1002, Timestamp('2013-01-03 00:00:00'), 'SH', '100-B', 44, nan],
       [1003, Timestamp('2013-01-04 00:00:00'), ' guangzhou ', '110-A',
        54, 2133.0],
       [1004, Timestamp('2013-01-05 00:00:00'), 'Shenzhen', '110-C', 32,
        5433.0],
       [1005, Timestamp('2013-01-06 00:00:00'), 'shanghai', '210-A', 34,
        nan],
       [1006, Timestamp('2013-01-07 00:00:00'), 'BEIJING ', '130-F', 32,
        4432.0]], dtype=object)

In [15]:
# Show the column labels
df.columns

Index(['id', 'date', 'city', 'category', 'age', 'price'], dtype='object')

In [16]:
# Show the first three rows
df.head(3)

Unnamed: 0,id,date,city,category,age,price
0,1001,2013-01-02,Beijing,100-A,23,1200.0
1,1002,2013-01-03,SH,100-B,44,
2,1003,2013-01-04,guangzhou,110-A,54,2133.0


In [17]:
# Show the last three rows
df.tail(3)

Unnamed: 0,id,date,city,category,age,price
3,1004,2013-01-05,Shenzhen,110-C,32,5433.0
4,1005,2013-01-06,shanghai,210-A,34,
5,1006,2013-01-07,BEIJING,130-F,32,4432.0


In [18]:
# Drop every row that contains at least one null value.
# The result is only displayed; it is not assigned back, so df keeps its null rows.
df.dropna(how='any')

Unnamed: 0,id,date,city,category,age,price
0,1001,2013-01-02,Beijing,100-A,23,1200.0
2,1003,2013-01-04,guangzhou,110-A,54,2133.0
3,1004,2013-01-05,Shenzhen,110-C,32,5433.0
5,1006,2013-01-07,BEIJING,130-F,32,4432.0


In [19]:
# Fill the table's null values with 0 (result displayed only, not assigned back)
df.fillna(value=0)

Unnamed: 0,id,date,city,category,age,price
0,1001,2013-01-02,Beijing,100-A,23,1200.0
1,1002,2013-01-03,SH,100-B,44,0.0
2,1003,2013-01-04,guangzhou,110-A,54,2133.0
3,1004,2013-01-05,Shenzhen,110-C,32,5433.0
4,1005,2013-01-06,shanghai,210-A,34,0.0
5,1006,2013-01-07,BEIJING,130-F,32,4432.0


In [20]:
# Fill the nulls in price with the column mean (result displayed only, not assigned back)
df['price'].fillna(df['price'].mean())

0    1200.0
1    3299.5
2    2133.0
3    5433.0
4    3299.5
5    4432.0
Name: price, dtype: float64

In [45]:
# Show the table with price nulls filled by the price mean.
# The fill is keyed by column name: a scalar fill value would be written
# into the nulls of every column, not just price.
df.fillna(value={'price': df['price'].mean()})

Unnamed: 0,id,date,city,category,age,price
0,1001,2013-01-02,beijing,100-A,23,1200.0
1,1002,2013-01-03,sh,100-B,44,3299.5
2,1003,2013-01-04,guangzhou,110-A,54,2133.0
3,1004,2013-01-05,shenzhen,110-C,32,5433.0
4,1005,2013-01-06,shanghai,210-A,34,3299.5
5,1006,2013-01-07,beijing,130-F,32,4432.0


In [28]:
# Strip leading/trailing whitespace from the city values.
# The .str accessor is used instead of map(str.strip) so that a missing
# (NaN) city passes through as NaN rather than raising a TypeError.
df['city'] = df['city'].str.strip()
df

Unnamed: 0,id,date,city,category,age,price
0,1001,2013-01-02,BEIJING,100-A,23,1200.0
1,1002,2013-01-03,SH,100-B,44,
2,1003,2013-01-04,GUANGZHOU,110-A,54,2133.0
3,1004,2013-01-05,SHENZHEN,110-C,32,5433.0
4,1005,2013-01-06,SHANGHAI,210-A,34,
5,1006,2013-01-07,BEIJING,130-F,32,4432.0


In [23]:
# Strip leading/trailing whitespace from city using the vectorised .str accessor
df['city']=df['city'].str.strip()

In [43]:
# Normalise the city column to lower case.
# This has to execute on a fresh run: later cells match on the lower-case
# literals 'sh' and 'beijing', which never match the original mixed casing.
df['city'] = df['city'].str.lower()
df

Unnamed: 0,id,date,city,category,age,price
0,1001,2013-01-02,beijing,100-A,23,1200.0
1,1002,2013-01-03,sh,100-B,44,
2,1003,2013-01-04,guangzhou,110-A,54,2133.0
3,1004,2013-01-05,shenzhen,110-C,32,5433.0
4,1005,2013-01-06,shanghai,210-A,34,
5,1006,2013-01-07,beijing,130-F,32,4432.0


In [27]:
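# Upper-case alternative, kept disabled: cells further down compare city against lower-case literals.
# df['city']=df['city'].map(str.upper)
# df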

Unnamed: 0,id,date,city,category,age,price
0,1001,2013-01-02,BEIJING,100-A,23,1200.0
1,1002,2013-01-03,SH,100-B,44,
2,1003,2013-01-04,GUANGZHOU,110-A,54,2133.0
3,1004,2013-01-05,SHENZHEN,110-C,32,5433.0
4,1005,2013-01-06,SHANGHAI,210-A,34,
5,1006,2013-01-07,BEIJING,130-F,32,4432.0


In [47]:
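# Change a column's dtype. Kept disabled: price still contains NaN at this point, and NaN cannot be cast to int.
# df['price'].astype('int')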

In [35]:
# Rename a column. The renamed copy is only displayed; it is not assigned back,
# so df itself keeps the column name 'category'.
df.rename(columns={'category':'category-size'})

Unnamed: 0,id,date,city,category-size,age,price
0,1001,2013-01-02,BEIJING,100-A,23,1200.0
1,1002,2013-01-03,SH,100-B,44,
2,1003,2013-01-04,GUANGZHOU,110-A,54,2133.0
3,1004,2013-01-05,SHENZHEN,110-C,32,5433.0
4,1005,2013-01-06,SHANGHAI,210-A,34,
5,1006,2013-01-07,BEIJING,130-F,32,4432.0


In [36]:
# Show the city column
df['city']

0      BEIJING
1           SH
2    GUANGZHOU
3     SHENZHEN
4     SHANGHAI
5      BEIJING
Name: city, dtype: object

In [37]:
# Drop repeated city values, keeping the first occurrence of each
df['city'].drop_duplicates(keep='first')

0      BEIJING
1           SH
2    GUANGZHOU
3     SHENZHEN
4     SHANGHAI
Name: city, dtype: object

In [38]:
# Drop repeated city values, keeping the last occurrence of each
df['city'].drop_duplicates(keep='last')

1           SH
2    GUANGZHOU
3     SHENZHEN
4     SHANGHAI
5      BEIJING
Name: city, dtype: object

In [44]:
# Replace the abbreviation 'sh' with 'shanghai' (result displayed only, not assigned back).
# The match is exact and case-sensitive, so it only hits once city has been lower-cased.
df['city'].replace('sh', 'shanghai')

0      beijing
1     shanghai
2    guangzhou
3     shenzhen
4     shanghai
5      beijing
Name: city, dtype: object

In [50]:
# Second sample table keyed by id: gender, payment flag and member points.
# It covers ids 1001-1008, i.e. two ids (1007, 1008) more than the first table lists.
member_data = {
    'id': list(range(1001, 1009)),
    'gender': ['male', 'female'] * 4,
    'pay': ['Y', 'N', 'Y', 'Y', 'N', 'Y', 'N', 'Y'],
    'm-point': [10, 12, 20, 40, 40, 40, 30, 20],
}
df1 = pd.DataFrame(member_data)
df1

Unnamed: 0,gender,id,m-point,pay
0,male,1001,10,Y
1,female,1002,12,N
2,male,1003,20,Y
3,female,1004,40,Y
4,male,1005,40,N
5,female,1006,40,Y
6,male,1007,30,N
7,female,1008,20,Y


In [52]:
# Merge the two tables, keeping only ids present in both (inner join).
# The key is named explicitly: without `on`, merge joins on every column
# name the two frames share, so any extra shared column would silently
# change the join.
df_inner = pd.merge(df, df1, how='inner', on='id')
df_inner

Unnamed: 0,id,date,city,category,age,price,gender,m-point,pay
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,10,Y
1,1002,2013-01-03,sh,100-B,44,,female,12,N
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,20,Y
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,40,Y
4,1005,2013-01-06,shanghai,210-A,34,,male,40,N
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,40,Y


In [53]:
# Left join: keep every row of df and attach the matching df1 columns.
# The key is named explicitly so the join does not depend on which
# column names the two frames happen to share.
df_left = pd.merge(df, df1, how='left', on='id')
df_left

Unnamed: 0,id,date,city,category,age,price,gender,m-point,pay
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,10,Y
1,1002,2013-01-03,sh,100-B,44,,female,12,N
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,20,Y
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,40,Y
4,1005,2013-01-06,shanghai,210-A,34,,male,40,N
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,40,Y


In [54]:
# Right join: keep every row of df1; ids that df does not contain get
# NaN/NaT in the columns that come from df.
# The key is named explicitly so the join does not depend on which
# column names the two frames happen to share.
df_right = pd.merge(df, df1, how='right', on='id')
df_right

Unnamed: 0,id,date,city,category,age,price,gender,m-point,pay
0,1001,2013-01-02,beijing,100-A,23.0,1200.0,male,10,Y
1,1002,2013-01-03,sh,100-B,44.0,,female,12,N
2,1003,2013-01-04,guangzhou,110-A,54.0,2133.0,male,20,Y
3,1004,2013-01-05,shenzhen,110-C,32.0,5433.0,female,40,Y
4,1005,2013-01-06,shanghai,210-A,34.0,,male,40,N
5,1006,2013-01-07,beijing,130-F,32.0,4432.0,female,40,Y
6,1007,NaT,,,,,male,30,N
7,1008,NaT,,,,,female,20,Y


In [55]:
# Outer join: keep the ids from both tables, filling the side that has
# no match with NaN/NaT.
# The key is named explicitly so the join does not depend on which
# column names the two frames happen to share.
df_outer = pd.merge(df, df1, how='outer', on='id')
df_outer

Unnamed: 0,id,date,city,category,age,price,gender,m-point,pay
0,1001,2013-01-02,beijing,100-A,23.0,1200.0,male,10,Y
1,1002,2013-01-03,sh,100-B,44.0,,female,12,N
2,1003,2013-01-04,guangzhou,110-A,54.0,2133.0,male,20,Y
3,1004,2013-01-05,shenzhen,110-C,32.0,5433.0,female,40,Y
4,1005,2013-01-06,shanghai,210-A,34.0,,male,40,N
5,1006,2013-01-07,beijing,130-F,32.0,4432.0,female,40,Y
6,1007,NaT,,,,,male,30,N
7,1008,NaT,,,,,female,20,Y


In [56]:
# Sort the rows by the age column, ascending (result displayed only)
df_inner.sort_values(by='age')

Unnamed: 0,id,date,city,category,age,price,gender,m-point,pay
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,10,Y
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,40,Y
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,40,Y
4,1005,2013-01-06,shanghai,210-A,34,,male,40,N
1,1002,2013-01-03,sh,100-B,44,,female,12,N
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,20,Y


In [57]:
# Sort the rows by index label (result displayed only)
df_inner.sort_index()

Unnamed: 0,id,date,city,category,age,price,gender,m-point,pay
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,10,Y
1,1002,2013-01-03,sh,100-B,44,,female,12,N
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,20,Y
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,40,Y
4,1005,2013-01-06,shanghai,210-A,34,,male,40,N
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,40,Y


In [61]:
# Bucket the rows by price: 'high' when price is above 3000, otherwise 'low'.
# A NaN price compares as False, so rows with a missing price are labelled 'low'.
df_inner['group'] = np.where(df_inner['price'].gt(3000), 'high', 'low')
df_inner

Unnamed: 0,id,date,city,category,age,price,gender,m-point,pay,group
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,10,Y,low
1,1002,2013-01-03,sh,100-B,44,,female,12,N,low
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,20,Y,low
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,40,Y,high
4,1005,2013-01-06,shanghai,210-A,34,,male,40,N,low
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,40,Y,high


In [63]:
# Flag Beijing rows priced at 4000 or more: sign1 is 1 for a match, NaN otherwise.
# NOTE(review): the saved output for this cell also lists a `sign` column that
# no visible cell creates — confirm where it came from.
in_beijing = df_inner['city'].eq('beijing')
priced_4000_plus = df_inner['price'].ge(4000)
df_inner['sign1'] = np.where(in_beijing & priced_4000_plus, 1, np.nan)
df_inner

Unnamed: 0,id,date,city,category,age,price,gender,m-point,pay,group,sign,sign1
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,10,Y,low,,
1,1002,2013-01-03,sh,100-B,44,,female,12,N,low,,
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,20,Y,low,,
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,40,Y,high,,
4,1005,2013-01-06,shanghai,210-A,34,,male,40,N,low,,
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,40,Y,high,1.0,1.0


In [64]:
# Flag Beijing rows priced at 4000 or more through a .loc assignment:
# matching rows get sign2 = 1, and the other rows of the new column stay NaN.
beijing_4000_plus = df_inner['city'].eq('beijing') & df_inner['price'].ge(4000)
df_inner.loc[beijing_4000_plus, 'sign2'] = 1
df_inner

Unnamed: 0,id,date,city,category,age,price,gender,m-point,pay,group,sign,sign1,sign2
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,10,Y,low,,,
1,1002,2013-01-03,sh,100-B,44,,female,12,N,low,,,
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,20,Y,low,,,
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,40,Y,high,,,
4,1005,2013-01-06,shanghai,210-A,34,,male,40,N,low,,,
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,40,Y,high,1.0,1.0,1.0


In [68]:
# Split the category field on '-' into two columns.
# The resulting table keeps df_inner's index and is labelled 'category' and 'size'.
# .str.split(expand=True) replaces the per-element generator: it is vectorised
# and passes a missing category through as NaN instead of raising.
# n=1 splits on the first '-' only, so any further hyphens stay inside 'size'.
split = df_inner['category'].str.split('-', n=1, expand=True)
split.columns = ['category', 'size']
split

Unnamed: 0,category,size
0,100,A
1,100,B
2,110,A
3,110,C
4,210,A
5,130,F


In [71]:
# Attach the split columns to df_inner, matching rows on the index of both tables.
# NOTE(review): df_inner is reassigned from itself, so every re-run merges again
# and accumulates suffixed duplicates — the size_x / size_y columns in the saved
# output show this cell was executed twice.
df_inner = df_inner.merge(split, left_index=True, right_index=True)
df_inner

Unnamed: 0,id,date,city,category_x,age,price,gender,m-point,pay,group,sign,sign1,sign2,category_y,size_x,category,size_y
0,1001,2013-01-02,beijing,100-A,23,1200.0,male,10,Y,low,,,,100,A,100,A
1,1002,2013-01-03,sh,100-B,44,,female,12,N,low,,,,100,B,100,B
2,1003,2013-01-04,guangzhou,110-A,54,2133.0,male,20,Y,low,,,,110,A,110,A
3,1004,2013-01-05,shenzhen,110-C,32,5433.0,female,40,Y,high,,,,110,C,110,C
4,1005,2013-01-06,shanghai,210-A,34,,male,40,N,low,,,,210,A,210,A
5,1006,2013-01-07,beijing,130-F,32,4432.0,female,40,Y,high,1.0,1.0,1.0,130,F,130,F
