# pandas——基本函数

练习pandas的主要的基本函数的使用。

列转行方法
- stack函数：pandas.DataFrame.stack(self, level=-1, dropna=True)，对于普通的DataFrame而言，直接将列索引转换到最内层行索引，生成一个Series对象。对于层次化索引的DataFrame而言，可以将指定的索引层转换到行上，默认是将最内层的列索引转换到最内层行索引。
- unstack函数：pandas.DataFrame.unstack(self, level=-1, fill_value=None)，对于普通的DataFrame而言，直接将列索引转换到行索引的最外层索引，生成一个Series对象，对于层次化索引的DataFrame而言，和stack函数类似，似乎把两层索引当作一个整体，当level为列表时报错。
- melt函数：pandas.melt(frame, id_vars=None, value_vars=None, var_name=None, value_name='value', col_level=None),id_vars可以理解为结果需要保留的原始列，value_vars可以理解为需要列转行的列名；var_name把列转行的列变量重新命名，默认为variable；value_name列转行对应变量的值的名称。

行转列方法
+ unstack函数：pandas.DataFrame.unstack(self, level=-1, fill_value=None)

In [1]:
import numpy as np  
import pandas as pd  
df = pd.DataFrame({'A':np.random.randint(1, 100, 4),'B':pd.date_range(start='20130101', periods=4, freq='D'),'C':pd.Series([1, 2, 3, 4],index=['zhang', 'li', 'zhou', 'wang'],dtype='float32'),'D':np.array([3] * 4,dtype='int32'), 'E':pd.Categorical(["test","train","test","train"]),'F':'foo'})  
df  

Unnamed: 0,A,B,C,D,E,F
zhang,96,2013-01-01,1.0,3,test,foo
li,24,2013-01-02,2.0,3,train,foo
zhou,72,2013-01-03,3.0,3,test,foo
wang,1,2013-01-04,4.0,3,train,foo


二维数据查看

In [2]:
df.head()    # 默认显示前5行  

Unnamed: 0,A,B,C,D,E,F
zhang,96,2013-01-01,1.0,3,test,foo
li,24,2013-01-02,2.0,3,train,foo
zhou,72,2013-01-03,3.0,3,test,foo
wang,1,2013-01-04,4.0,3,train,foo


In [3]:
df.head(3)  # 查看前3行  

Unnamed: 0,A,B,C,D,E,F
zhang,96,2013-01-01,1.0,3,test,foo
li,24,2013-01-02,2.0,3,train,foo
zhou,72,2013-01-03,3.0,3,test,foo


In [4]:
df.tail(2)  # 查看最后2行  

Unnamed: 0,A,B,C,D,E,F
zhou,72,2013-01-03,3.0,3,test,foo
wang,1,2013-01-04,4.0,3,train,foo


查看二维数据的索引、列名和数据。

In [5]:
df.index  

Index(['zhang', 'li', 'zhou', 'wang'], dtype='object')

In [6]:
df.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [7]:
df.values

array([[96, Timestamp('2013-01-01 00:00:00'), 1.0, 3, 'test', 'foo'],
       [24, Timestamp('2013-01-02 00:00:00'), 2.0, 3, 'train', 'foo'],
       [72, Timestamp('2013-01-03 00:00:00'), 3.0, 3, 'test', 'foo'],
       [1, Timestamp('2013-01-04 00:00:00'), 4.0, 3, 'train', 'foo']],
      dtype=object)

查看数据的统计信息

In [9]:
df.describe()   # 平均值、标准差、最小值、最大值等信息

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,48.25,2.5,3.0
std,43.453999,1.290994,0.0
min,1.0,1.0,3.0
25%,18.25,1.75,3.0
50%,48.0,2.5,3.0
75%,78.0,3.25,3.0
max,96.0,4.0,3.0


二维数据转置

In [10]:
df.T

Unnamed: 0,zhang,li,zhou,wang
A,96,24,72,1
B,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00
C,1,2,3,4
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


排序

In [11]:
df.sort_index(axis=0, ascending=False)     # 对索引进行降序排序 

Unnamed: 0,A,B,C,D,E,F
zhou,72,2013-01-03,3.0,3,test,foo
zhang,96,2013-01-01,1.0,3,test,foo
wang,1,2013-01-04,4.0,3,train,foo
li,24,2013-01-02,2.0,3,train,foo


In [12]:
df.sort_index(axis=1, ascending=True)     # 对列名进行升序排序 

Unnamed: 0,A,B,C,D,E,F
zhang,96,2013-01-01,1.0,3,test,foo
li,24,2013-01-02,2.0,3,train,foo
zhou,72,2013-01-03,3.0,3,test,foo
wang,1,2013-01-04,4.0,3,train,foo


In [13]:
df.sort_values(by='A')                     # 对A列的值进行升序排序 

Unnamed: 0,A,B,C,D,E,F
wang,1,2013-01-04,4.0,3,train,foo
li,24,2013-01-02,2.0,3,train,foo
zhou,72,2013-01-03,3.0,3,test,foo
zhang,96,2013-01-01,1.0,3,test,foo


重复值处理

In [34]:
data = pd.DataFrame({'k1':['one'] * 3 + ['two'] * 4, 'k2':[1, 1, 2, 3, 3, 4, 4]})  
data  


Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [15]:
data.duplicated() # 检测重复行

0    False
1     True
2    False
3    False
4     True
5    False
6     True
dtype: bool

In [16]:
data.drop_duplicates() # 返回删除重复行后的新DataFrame，原数据不变

Unnamed: 0,k1,k2
0,one,1
2,one,2
3,two,3
5,two,4


In [17]:
data.drop_duplicates(['k1']) # 删除k1列的重复数据，保留首行重复数据。

Unnamed: 0,k1,k2
0,one,1
3,two,3


In [36]:
data.drop_duplicates(['k1'],keep='last') # 删除k1列的重复数据，保留末行重复数据。

Unnamed: 0,k1,k2
2,ONE,2
6,TWO,4


映射

In [35]:
# 使用函数进行映射，将data中k1列的值转换为大写。
data['k1']=data['k1'].map(str.upper)  
data  

Unnamed: 0,k1,k2
0,ONE,1
1,ONE,1
2,ONE,2
3,TWO,3
4,TWO,3
5,TWO,4
6,TWO,4


In [37]:
# 使用字典表示映射关系，将data中k1列的值转换为小写。
data['k1']=data['k1'].map({'ONE':'one','TWO':'two'})  
data

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [38]:
# 使用lambda表达式表示映射关系，将data中k2列的值加5。
data['k2'] = data['k2'].map(lambda x:x+5)  
data   

Unnamed: 0,k1,k2
0,one,6
1,one,6
2,one,7
3,two,8
4,two,8
5,two,9
6,two,9


In [39]:
# 使用lambda表达式表示映射关系，将data中索引的值加5。
data.index = data.index.map(lambda x:x+5)  
data  

Unnamed: 0,k1,k2
5,one,6
6,one,6
7,one,7
8,two,8
9,two,8
10,two,9
11,two,9


In [40]:
# 使用函数（str.upper）表示映射关系，将data中列名转换为大写。
data.columns=data.columns.map(str.upper)  
data  

Unnamed: 0,K1,K2
5,one,6
6,one,6
7,one,7
8,two,8
9,two,8
10,two,9
11,two,9


数据离散化

In [44]:
from random import randrange  
data=[randrange(100) for _ in range(10)]  
data

[28, 96, 81, 61, 68, 70, 99, 63, 77, 36]

In [45]:
category=[0,25,50,100]  
pd.cut(data,category)  

[(25, 50], (50, 100], (50, 100], (50, 100], (50, 100], (50, 100], (50, 100], (50, 100], (50, 100], (25, 50]]
Categories (3, interval[int64]): [(0, 25] < (25, 50] < (50, 100]]

In [46]:
# 按category对data数据进行切分，使得参数right=False形成左闭右开区间。
pd.cut(data,category,right=False)  

[[25, 50), [50, 100), [50, 100), [50, 100), [50, 100), [50, 100), [50, 100), [50, 100), [50, 100), [25, 50)]
Categories (3, interval[int64]): [[0, 25) < [25, 50) < [50, 100)]

In [47]:
# 按category对data数据进行切分，使得参数right=False形成左闭右开区间，并对每个区间打标签。
labels = ['low', 'middle', 'high']  
pd.cut(data,category,right=False,labels=labels)  

[middle, high, high, high, high, high, high, high, high, middle]
Categories (3, object): [low < middle < high]

In [48]:
data  

[28, 96, 81, 61, 68, 70, 99, 63, 77, 36]

In [49]:
# 将data按取值范围等宽切分为4个区间（注意：这不是按四分位数切分，按分位数切分应使用pd.qcut）。
pd.cut(data,4) 

[(27.929, 45.75], (81.25, 99.0], (63.5, 81.25], (45.75, 63.5], (63.5, 81.25], (63.5, 81.25], (81.25, 99.0], (45.75, 63.5], (63.5, 81.25], (27.929, 45.75]]
Categories (4, interval[float64]): [(27.929, 45.75] < (45.75, 63.5] < (63.5, 81.25] < (81.25, 99.0]]

频次统计与移位

In [50]:
# 将df数据通过copy方法赋值为df1，然后对df1数据使用shift方法下移一行（负数表示上移）。
df1=df.copy()  
df1.shift(1)  

Unnamed: 0,A,B,C,D,E,F
zhang,,NaT,,,,
li,96.0,2013-01-01,1.0,3.0,test,foo
zhou,24.0,2013-01-02,2.0,3.0,train,foo
wang,72.0,2013-01-03,3.0,3.0,test,foo


In [52]:
df1.shift(-1)  

Unnamed: 0,A,B,C,D,E,F
zhang,24.0,2013-01-02,2.0,3.0,train,foo
li,72.0,2013-01-03,3.0,3.0,test,foo
zhou,1.0,2013-01-04,4.0,3.0,train,foo
wang,,NaT,,,,


In [53]:
# 对df1中D列数据进行频次统计（统计每个取值出现的次数）。
df1['D'].value_counts()  


3    4
Name: D, dtype: int64

透视转换

In [54]:
df = pd.DataFrame({'a':[1,2,3,4],'b':[2,3,4,5],'c':[3,4,5,6], 'd':[3,3,3,3]})  
df  

Unnamed: 0,a,b,c,d
0,1,2,3,3
1,2,3,4,3
2,3,4,5,3
3,4,5,6,3


In [55]:
# 将df的a列值作为索引，b列值作为列名，c列值作为值，构建透视表。
df.pivot(index='a', columns='b', values='c')  

b,2,3,4,5
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3.0,,,
2,,4.0,,
3,,,5.0,
4,,,,6.0


In [56]:
# 将df的a列值作为索引，b列值作为列名，d列值作为值，构建透视表。
df.pivot(index='a', columns='b', values='d')  

b,2,3,4,5
a,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,3.0,,,
2,,3.0,,
3,,,3.0,
4,,,,3.0


数据差分

In [57]:
# 新建数据帧名为df。
df = pd.DataFrame({'a':np.random.randint(1, 100, 10),'b':np.random.randint(1, 100, 10)},index=map(str, range(10)))  
df  

Unnamed: 0,a,b
0,82,62
1,28,84
2,31,73
3,89,8
4,24,72
5,32,61
6,99,21
7,2,27
8,17,61
9,85,17


In [58]:
# 对df的行进行一阶差分
df.diff()   

Unnamed: 0,a,b
0,,
1,-54.0,22.0
2,3.0,-11.0
3,58.0,-65.0
4,-65.0,64.0
5,8.0,-11.0
6,67.0,-40.0
7,-97.0,6.0
8,15.0,34.0
9,68.0,-44.0


In [59]:
# 对df的列进行一阶差分
df.diff(axis=1)  

Unnamed: 0,a,b
0,,-20.0
1,,56.0
2,,42.0
3,,-81.0
4,,48.0
5,,29.0
6,,-78.0
7,,25.0
8,,44.0
9,,-68.0


In [60]:
# 对df的行进行间隔为2的差分（当前行减去往前第2行）。注意这不是二阶差分，二阶差分应写作df.diff().diff()。
df.diff(periods=2)  

Unnamed: 0,a,b
0,,
1,,
2,-51.0,11.0
3,61.0,-76.0
4,-7.0,-1.0
5,-57.0,53.0
6,75.0,-51.0
7,-30.0,-34.0
8,-82.0,40.0
9,83.0,-10.0


计算相关系数

In [61]:
# 新建一个DataFrame名为df。
df = pd.DataFrame({'A':np.random.randint(1, 100, 10),'B':np.random.randint(1, 100, 10),'C':np.random.randint(1, 100, 10)})  
df  

Unnamed: 0,A,B,C
0,86,98,79
1,91,31,57
2,96,33,55
3,16,31,44
4,88,24,3
5,61,13,91
6,93,35,20
7,21,42,77
8,62,6,29
9,57,95,20


In [62]:
# 计算df的相关系数, pearson相关系数.
df.corr()

Unnamed: 0,A,B,C
A,1.0,0.044248,-0.216153
B,0.044248,1.0,0.069292
C,-0.216153,0.069292,1.0


In [65]:
# 计算df的相关系数, Kendall相关系数.
# df.corr('kendall') # 依赖scipy，需先安装scipy后才能运行

In [66]:
# 计算df的相关系数, spearman秩相关.
# df.corr('spearman')  

重塑Reshaping

In [67]:
# 新建一个DataFrame，名为df，将df的前4行赋值给df2.
tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'],['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']]))  
index = pd.MultiIndex.from_tuples(tuples, names=['A', 'B'])  
df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])  
df2 = df[:4]  
df2  


Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.109453,-1.382815
bar,two,-0.002622,-0.924601
baz,one,-1.146334,1.484481
baz,two,0.68622,-0.733791


In [68]:
# 使用stack方法对df2进行列转行，将结果返回给stacked.
stacked = df2.stack()  
print(stacked)  

A    B     
bar  one  A   -0.109453
          B   -1.382815
     two  A   -0.002622
          B   -0.924601
baz  one  A   -1.146334
          B    1.484481
     two  A    0.686220
          B   -0.733791
dtype: float64


In [69]:
# 使用unstack方法对stacked进行行转列，默认level=-1（即最内层，此处等价于level=2），解压最内层。
stacked.unstack()  

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-0.109453,-1.382815
bar,two,-0.002622,-0.924601
baz,one,-1.146334,1.484481
baz,two,0.68622,-0.733791


In [70]:
# 使用unstack方法对stacked进行行转列，设置level=1，解压中间层。
stacked.unstack(1)  

Unnamed: 0_level_0,B,one,two
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,A,-0.109453,-0.002622
bar,B,-1.382815,-0.924601
baz,A,-1.146334,0.68622
baz,B,1.484481,-0.733791


In [71]:
# 使用unstack方法对stacked进行行转列，设置level=0，解压最外层。
stacked.unstack(0)  

Unnamed: 0_level_0,A,bar,baz
B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.109453,-1.146334
one,B,-1.382815,1.484481
two,A,-0.002622,0.68622
two,B,-0.924601,-0.733791


In [72]:
# 使用unstack方法对stacked进行行转列，设置level='A'（按层名称指定，此处等价于level=0），解压最外层。
stacked.unstack('A')  

Unnamed: 0_level_0,A,bar,baz
B,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
one,A,-0.109453,-1.146334
one,B,-1.382815,1.484481
two,A,-0.002622,0.68622
two,B,-0.924601,-0.733791


melt函数：将DataFrame的列转行。

In [73]:
# 新建一个DataFrame，名为df.
df=pd.DataFrame(np.arange(8).reshape(2,4),index=['AA','BB'],columns=['A','B','C','D'])  
df  

Unnamed: 0,A,B,C,D
AA,0,1,2,3
BB,4,5,6,7


使用melt函数，将df进行列转行操作，保留A,C两个原始列，将B，D两列进行列转行，将列转行的列变量重新命名为B|D，列转行对应变量的值的名称命名为(B|D)_value。

In [74]:
pd.melt(df,id_vars=['A','C'],value_vars=['B','D'],var_name='B|D',value_name='(B|D)_value')  

Unnamed: 0,A,C,B|D,(B|D)_value
0,0,2,B,1
1,4,6,B,5
2,0,2,D,3
3,4,6,D,7


sub函数：DataFrame的减法运算，可按行或按列减去一个Series。

In [75]:
# 新建一个DataFrame，名为df
df=pd.DataFrame({ 'one' : pd.Series(np.random.randn(3), index=['a', 'b', 'c']),'two' : pd.Series(np.random.randn(4), index=['a', 'b', 'c', 'd']),'three' : pd.Series(np.random.randn(3), index=['b', 'c', 'd'])})  
print(df)  

one       two     three
a -0.299276  0.787134       NaN
b  0.135591 -0.605735  0.802243
c -0.741154  0.642397  0.598000
d       NaN -0.034408  1.058796


取df中位置为1的行（即第2行），赋值给row；取列名为two的列，赋值给column；使用sub方法将df的每一行减去row（按列名对齐），axis='columns'或1。

In [76]:
#取df中位置为1的行（即第2行，标签为b），赋值给row
row=df.iloc[1]  
#取列名为two的列赋值给column  
column=df['two']  
#使用sub方法将df的每一行减去row（按列名对齐）,axis='columns'或1。
df.sub(row,axis='columns')  

Unnamed: 0,one,two,three
a,-0.434867,1.39287,
b,0.0,0.0,0.0
c,-0.876745,1.248132,-0.204243
d,,0.571327,0.256553


In [77]:
df.sub(row,axis=1)  

Unnamed: 0,one,two,three
a,-0.434867,1.39287,
b,0.0,0.0,0.0
c,-0.876745,1.248132,-0.204243
d,,0.571327,0.256553


In [78]:
#使用sub方法将df的每一列减去column（按行索引对齐）,axis='index'或0。
df.sub(column,axis='index')  

Unnamed: 0,one,two,three
a,-1.086411,0.0,
b,0.741326,0.0,1.407978
c,-1.383551,0.0,-0.044398
d,,0.0,1.093204


In [79]:
df.sub(column,axis=0)  

Unnamed: 0,one,two,three
a,-1.086411,0.0,
b,0.741326,0.0,1.407978
c,-1.383551,0.0,-0.044398
d,,0.0,1.093204


删除操作

In [80]:
# 新建一个DataFrame，名为data。之后用drop方法删除指定行时，返回一个删除后的DataFrame，对原始DataFrame不做改变。
data = pd.DataFrame({'k1':['one'] * 3 + ['two'] * 4,'k2':[1, 1, 2, 3, 3, 4, 4]})  
data  

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
5,two,4
6,two,4


In [81]:
data.drop(5,axis=0)  

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
3,two,3
4,two,3
6,two,4


In [82]:
# 删除指定行，对data本身进行删除操作。
data.drop(3,inplace=True)  
data  

Unnamed: 0,k1,k2
0,one,1
1,one,1
2,one,2
4,two,3
5,two,4
6,two,4


In [83]:
# 删除指定列，返回一个删除后的DataFrame，对原始DataFrame不做改变。
data.drop('k1',axis=1)  

Unnamed: 0,k2
0,1
1,1
2,2
4,3
5,4
6,4
