### 7.1 数据处理概述

### 7.2 Pandas数据结构

In [16]:
#通过列表创建Series
import pandas as pd
s=pd.Series([4,5,6,4],index=['a','b','c','d'])
s

a    4
b    5
c    6
d    4
dtype: int64

In [17]:
#省略index参数
s=pd.Series([4,5,6,4])
s

0    4
1    5
2    6
3    4
dtype: int64

In [18]:
#通过字典创建Series
d = {'a' : 1.5, 'b' : 'hello', 'c' :'li','d':2}
s=pd.Series(d)
s

a      1.5
b    hello
c       li
d        2
dtype: object

In [19]:
#data是一个常量
s=pd.Series(2.5,index=['a','b','c','d'])
s

a    2.5
b    2.5
c    2.5
d    2.5
dtype: float64

In [20]:
#像列表一样的切片引用
s=pd.Series([4,5,6,4],index=['a','b','c','d'])
s[0]       #取序号为0的元素
s[0]=9     #对序号为0的元素赋值
s[:3]      #取序号为0到3的元素
s[2:]      #取序号为2的元素到最后一个元素

c    6
d    4
dtype: int64

In [21]:
#像array一样进行向量运算
import numpy as np
s=s+1         #每个元素加1
np.mean(s)  #调用nump的方法求均值
np.std(s)   #调用nump的方法求标准差

1.8708286933869707

In [22]:
#像字典一样引用与操作
s['c']     #取索引为'c'的元素
s['e']=9   #添加一个索引为'e'的元素

In [23]:
#多个Series在进行算术运算时会自动对齐不同索引的数据
s1= pd.Series([1,2,3,4],index=['a','b','f','d'])
s2= pd.Series([3,4,5,6],index=['b','d','a','e'])
s=s1+s2
s

a    6.0
b    5.0
d    8.0
e    NaN
f    NaN
dtype: float64

In [24]:
#Series对象本身具有name属性，索引也具有name属性
s = pd.Series([1.51,1.62,1.67,1.72],index=['Alex','Bob','Jenny','Ad'])   
s.name='Height'    #将Series对象的name属性赋值为'Height'
s.index.name='Person'  #将index的name属性赋值为'Person'
s

Person
Alex     1.51
Bob      1.62
Jenny    1.67
Ad       1.72
Name: Height, dtype: float64

In [25]:
#由Series为元素的字典创建DataFrame
import pandas as pd
import numpy as np
s1= pd.Series([32,41,29,18],index=['a','b','c','f'])
s2=pd.Series([1.71,1.82,1.67,1.86],index=['a','b','c','e'])
d={'height': s1,'age':s2}
df = pd.DataFrame(d)
df

Unnamed: 0,age,height
a,1.71,32.0
b,1.82,41.0
c,1.67,29.0
e,1.86,
f,,18.0


In [26]:
#由列表为元素的字典创建DataFame
d={'height':[32,41,29,18],
   'age':[1.71,1.82,1.67,1.86]}
df=pd.DataFrame(d)
df

Unnamed: 0,age,height
0,1.71,32
1,1.82,41
2,1.67,29
3,1.86,18


In [27]:
#由Series为元素的列表创建DataFame
s1= pd.Series([32,41,29,18])
s2=pd.Series([1.71,1.82,1.67,1.86])
df=pd.DataFrame([s1,s2])
df

Unnamed: 0,0,1,2,3
0,32.0,41.0,29.0,18.0
1,1.71,1.82,1.67,1.86


### 7.3 DataFrame的基本操作

In [41]:
#列的引用与插入
d = {'one' : [1.2, 2.3, 3.6, 4.8],
  'two' : [4.9, 3.9, 2.9, 1.3]}
df=pd.DataFrame(d)
df

Unnamed: 0,one,two
0,1.2,4.9
1,2.3,3.9
2,3.6,2.9
3,4.8,1.3


In [42]:
df['three']=df['one']+df['two']  #引用原有列，并对新的一列赋值
df

Unnamed: 0,one,two,three
0,1.2,4.9,6.1
1,2.3,3.9,6.2
2,3.6,2.9,6.5
3,4.8,1.3,6.1


In [43]:
#同时引用多列
df[['one','three']]

Unnamed: 0,one,three
0,1.2,6.1
1,2.3,6.2
2,3.6,6.5
3,4.8,6.1


In [44]:
df['five']='male'                #插入标量
df['four']=pd.Series([1,3,5,6])  #插入Series，index能够完全对齐
df['six']=pd.Series([2,3])       #插入Series，index不能够完全对齐
df

Unnamed: 0,one,two,three,five,four,six
0,1.2,4.9,6.1,male,1,2.0
1,2.3,3.9,6.2,male,3,3.0
2,3.6,2.9,6.5,male,5,
3,4.8,1.3,6.1,male,6,


In [32]:
#列的筛选
#筛选出two列的值大于2的数据
df[df['two']>2]
#筛选出two列值大于2，并且four列值小于5的数据
df[(df['two']>2) & (df['four']<5)] 
#筛选出two列值大于3，或者five列的值等于male的数据
df[(df['two']>3) | (df['five']=='male')]

Unnamed: 0,one,two,three,five,four,six
0,1.2,4.9,6.1,male,1,2.0
1,2.3,3.9,6.2,male,3,3.0
2,3.6,2.9,6.5,male,5,
3,4.8,1.3,6.1,male,6,


In [34]:
#列的删除
s=df.pop('four') #从df中弹出列，并将弹出的列赋值给s
del df['three']  #从df中直接删除列
df0= df.drop('one',axis=1) #删除one列后赋值给新的df0,df的值不变
df.drop('one',axis=1,inplace=True) #从df中直接删one除列

In [47]:
#行的引用
df.index=['a','b','c','d']   #给df设置索引
s1=df.loc['b']  #引用索引为'b'的行
s2=df.iloc[3]   #引用序号为3的行
s2

one       4.8
two       1.3
three     6.1
five     male
four        6
six       NaN
Name: d, dtype: object

In [36]:
#以iloc引用行数据
df.iloc[1:3]  #引用序号1到序号3但不包含序号3的行
df.iloc[:3]   #引用前3行
df.iloc[-2:]  #引用后2行
df.iloc[2:]   #引用前2行后的数据
df.iloc[:-2]  #引用后2行前的数据

Unnamed: 0,two,five,six
a,4.9,male,2.0
b,3.9,male,3.0


In [None]:
#行的更新与插入
df.iloc[2]=[3.3,2.8,7,'femal',6.0]    #对序号为2的行重新赋值
df.loc['b']= [3.9,2.6,7,'male',6.0]   #对索引为b的行重新赋值
df.loc['e']= [4.9,2.6,5,'femal',3.5]   #插入新的一行，行索引为e

In [None]:
#行的删除
df0=df.drop('a') #删除索引为a的行后赋值给新的df0，df的值不变
df.drop('a',inplace=1)  #直接从df中删除索引为a的行

In [None]:
#行列子集的引用
df.ix[:2,['one','five']] #引用前两行，one和five列的数据
df.ix[['b','c'],-2:]     #引用索引为b,c的行，后两列的数据

In [86]:
# 排序
df.sort_values(by=['one','two'],ascending=[True,False])

Unnamed: 0,one,two,three,five,four,six
d,4.8,1.3,6.1,male,6,
a,1.2,4.9,6.1,male,1,2.0
b,2.3,3.9,6.2,male,3,3.0
c,3.6,2.9,6.5,male,5,


### 7.4 DataFrame数据的连接

#### 1 用于轴向连接的concat方法

In [29]:

import pandas as pd
df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                    'B': ['B0', 'B1', 'B2', 'B3'],
                    'C': ['C0', 'C1', 'C2', 'C3'],
                    'D': ['D0', 'D1', 'D2', 'D3']},
                     index=[0, 1, 2, 3])

df2 = pd.DataFrame({'A': ['A7', 'A8', 'A9', 'A10'],
                    'B': ['B7', 'B8', 'B9', 'B10'],
                    'C': ['C7', 'C8', 'C9', 'C10'],
                    'D': ['D7', 'D8', 'D9', 'D10']},
                    index=[7, 8, 9, 10])
df3 = pd.DataFrame({'E': ['E0', 'A3', 'E4', 'E5'],
                    'F': ['F0', 'F3', 'F4', 'F5'],
                    'G': ['G0', 'G3', 'G4', 'G5'],
                    'H': ['H0', 'H3', 'H4', 'H5']},
                    index=[0, 3, 4, 5])

In [36]:
#纵向拼接
frame1 =pd.concat([df1,df2])        #在列方向上对齐，纵向拼接
frame1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9
10,A10,B10,C10,D10


In [37]:
#横向拼接
frame2=pd.concat([df1,df2],axis=1)  #在行方向上对齐，横向拼接
frame2

Unnamed: 0,A,B,C,D,A.1,B.1,C.1,D.1
0,A0,B0,C0,D0,,,,
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,A3,B3,C3,D3,,,,
7,,,,,A7,B7,C7,D7
8,,,,,A8,B8,C8,D8
9,,,,,A9,B9,C9,D9
10,,,,,A10,B10,C10,D10


In [38]:
#通过keys参数增加层次索引
frame1 =pd.concat([df1,df2],keys=['df1', 'df2']) 
frame1

Unnamed: 0,Unnamed: 1,A,B,C,D
df1,0,A0,B0,C0,D0
df1,1,A1,B1,C1,D1
df1,2,A2,B2,C2,D2
df1,3,A3,B3,C3,D3
df2,7,A7,B7,C7,D7
df2,8,A8,B8,C8,D8
df2,9,A9,B9,C9,D9
df2,10,A10,B10,C10,D10


In [39]:
frame2=pd.concat([df1,df2],keys=['df1', 'df2'],axis=1) #通过keys参数增加层次索引
frame2

Unnamed: 0_level_0,df1,df1,df1,df1,df2,df2,df2,df2
Unnamed: 0_level_1,A,B,C,D,A,B,C,D
0,A0,B0,C0,D0,,,,
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,A3,B3,C3,D3,,,,
7,,,,,A7,B7,C7,D7
8,,,,,A8,B8,C8,D8
9,,,,,A9,B9,C9,D9
10,,,,,A10,B10,C10,D10


In [40]:
#舍去连接方向上的索引
frame1 =pd.concat([df1,df2],ignore_index=True)  
frame1

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A7,B7,C7,D7
5,A8,B8,C8,D8
6,A9,B9,C9,D9
7,A10,B10,C10,D10


In [41]:
frame2=pd.concat([df1,df2],ignore_index=True,axis=1)
frame2

Unnamed: 0,0,1,2,3,4,5,6,7
0,A0,B0,C0,D0,,,,
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,A3,B3,C3,D3,,,,
7,,,,,A7,B7,C7,D7
8,,,,,A8,B8,C8,D8
9,,,,,A9,B9,C9,D9
10,,,,,A10,B10,C10,D10


In [43]:
#join参数指定内连接
frame3 = pd.concat([df1, df3], axis=1, join='inner')  
frame3

Unnamed: 0,A,B,C,D,E,F,G,H
0,A0,B0,C0,D0,E0,F0,G0,H0
3,A3,B3,C3,D3,A3,F3,G3,H3


In [45]:
#df1为主表的左外连接
frame3=pd.concat([df1, df3],axis=1,join_axes=[df1.index]) 
frame3

Unnamed: 0,A,B,C,D,E,F,G,H
0,A0,B0,C0,D0,E0,F0,G0,H0
1,A1,B1,C1,D1,,,,
2,A2,B2,C2,D2,,,,
3,A3,B3,C3,D3,A3,F3,G3,H3


In [47]:
#df3为主表的右外连接
frame4=pd.concat([df1, df3],axis=1,join_axes=[df3.index]) 
frame4

Unnamed: 0,A,B,C,D,E,F,G,H
0,A0,B0,C0,D0,E0,F0,G0,H0
3,A3,B3,C3,D3,A3,F3,G3,H3
4,,,,,E4,F4,G4,H4
5,,,,,E5,F5,G5,H5


#### 2 用于关系型数据库的连接方法merge

In [49]:
import pandas as pd
leftdf = pd.DataFrame({'key1': ['K0', 'K1', 'K2', 'K3'],
                       'key2': ['K0', 'K1', 'K1', 'K0'],
                       'A': ['A0', 'A1', 'A2', 'A3'],
                       'B': ['B0', 'B1', 'B2', 'B3']})
    
rightdf = pd.DataFrame({'key1': ['K0', 'K1', 'K2', 'K3'],
                        'key2': ['K0', 'K0', 'K1', 'K1'],
                        'C': ['C0', 'C1', 'C2', 'C3'],
                        'D': ['D0', 'D1', 'D2', 'D3']})

In [50]:
#默认情况下，merge方法按照两个dataframe具有相同列名的重叠列进行“内连接”
df1=pd.merge(leftdf,rightdf)
df1

Unnamed: 0,A,B,key1,key2,C,D
0,A0,B0,K0,K0,C0,D0
1,A2,B2,K2,K1,C2,D2


In [51]:
##指定对齐的重叠列
df1=pd.merge(leftdf,rightdf, on=['key1', 'key2']) 
df1

Unnamed: 0,A,B,key1,key2,C,D
0,A0,B0,K0,K0,C0,D0
1,A2,B2,K2,K1,C2,D2


In [52]:
#可通过on参数指定某一个重叠列对齐连接
df1=pd.merge(leftdf,rightdf, on='key1')
df1

Unnamed: 0,A,B,key1,key2_x,C,D,key2_y
0,A0,B0,K0,K0,C0,D0,K0
1,A1,B1,K1,K1,C1,D1,K0
2,A2,B2,K2,K1,C2,D2,K1
3,A3,B3,K3,K0,C3,D3,K1


In [53]:
rightdf0=rightdf.copy()     #将rightdf复制给新的rightdf0
rightdf0.columns = ['c', 'd', 'key01', 'key2']   #重命名rightdf0的列名
df1=pd.merge(leftdf,rightdf0,left_on='key1',right_on='key01')
df1

Unnamed: 0,A,B,key1,key2_x,c,d,key01,key2_y
0,A0,B0,K0,K0,C0,D0,K0,K0
1,A1,B1,K1,K1,C1,D1,K1,K0
2,A2,B2,K2,K1,C2,D2,K2,K1
3,A3,B3,K3,K0,C3,D3,K3,K1


In [None]:
#通过how参数指定连接的类型
df1=pd.merge(leftdf, rightdf, how='left', on=['key1', 'key2'])
df2=pd.merge(leftdf, rightdf, how='right', on=['key1', 'key2'])


#### 3 行索引index上的连接方法join

In [60]:
leftdf = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                       'B': ['B0', 'B1', 'B2']},
                       index=['K0', 'K1', 'K2'])
rightdf = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                        'D': ['D0', 'D2', 'D3']},
                        index=['K0', 'K2', 'K3'])

In [61]:
df1=leftdf.join(rightdf)
df1

Unnamed: 0,A,B,C,D
K0,A0,B0,C0,D0
K1,A1,B1,,
K2,A2,B2,C2,D2


In [62]:
leftdf = pd.DataFrame({'A': ['A0', 'A1', 'A2', 'A3'],
                       'B': ['B0', 'B1', 'B2', 'B3'],
                       'key': ['K0', 'K1', 'K0', 'K1']})


rightdf = pd.DataFrame({'C': ['C0', 'C1','C2'],
                        'D': ['D0', 'D1','D2']},
                        index=['K0', 'K1','K3'])
df1=leftdf.join(rightdf,on='key')
df1

Unnamed: 0,A,B,key,C,D
0,A0,B0,K0,C0,D0
1,A1,B1,K1,C1,D1
2,A2,B2,K0,C0,D0
3,A3,B3,K1,C1,D1


### 7.5 Pandas数据输入输出

In [63]:
import pandas as pd
df = pd.DataFrame({'name': ['Bob', 'Alex', 'Anna', 'Lisi'],
                    'Hei': ['1.82', '1.73', '1.61', '1.63'],
                    'Old': ['23', '31', '22', '41'],
                    'Id': ['BS001', 'BS092', 'BS612', 'BS008']}
                     )
#将df写入文本文件dftext
df.to_csv('dftext.txt',sep=',',index=False)  
#读取文件，序号为1的列为索引
df1=pd.read_csv('dftext.txt',index_col=1)
#读取文件，逗号分隔，重新命名列名
df2=pd.read_table('dftext.txt',sep=',',names=['H','O','N']) 

In [64]:
df.to_excel('dfexcel.xlsx',index=0) #写入excel文件，省略索引 
df1=pd.read_excel('dfexcel.xlsx')  #读入excel文件，默认读取sheet1