In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# 一、 创建对象

1、 可以通过传递一个 list 对象来创建一个 Series，pandas 会默认创建整型索引：

In [3]:
# A Series built from a plain list gets a default integer index; the NaN entry
# makes the whole Series float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [4]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

2、 通过传递一个 numpy array、时间索引以及列标签来创建一个 DataFrame：

In [5]:
# Six consecutive calendar days starting 2013-01-01, used below as a row index.
dates = pd.date_range(start='20130101', periods=6, freq='D')

In [6]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [7]:
 # Draw the demo data from a locally seeded RandomState so that every
 # Restart & Run All yields the same 6x4 frame; the bare np.random.randn call
 # produced different values on each run. Re-run the notebook to refresh the
 # tables shown below.
 df = pd.DataFrame(np.random.RandomState(0).randn(6,4), index=dates, columns=list('ABCD'))

In [8]:
 df

Unnamed: 0,A,B,C,D
2013-01-01,-0.714074,0.777404,-0.985489,0.296632
2013-01-02,0.526748,1.359652,-1.536886,-0.175826
2013-01-03,1.829744,0.619726,0.467371,-0.790373
2013-01-04,0.841779,1.073806,0.801315,0.700767
2013-01-05,-0.539924,-0.359301,0.138966,-2.218633
2013-01-06,0.064927,-0.185003,0.214145,-0.934077


3、通过传递一个能够被转换成类似序列结构的字典对象来创建一个DataFrame：

In [11]:
# Build a frame from a dict of per-column specs. The scalar entries
# ('A', 'B', 'F') are broadcast to the 4 rows implied by the array-like
# columns, and each column keeps its own dtype.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.full(4, 3, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)

In [12]:
df2 

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


4、查看不同列的数据类型：

In [13]:
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# 二、 查看数据

 1、 查看DataFrame中头部和尾部的行：

In [15]:
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.714074,0.777404,-0.985489,0.296632
2013-01-02,0.526748,1.359652,-1.536886,-0.175826
2013-01-03,1.829744,0.619726,0.467371,-0.790373
2013-01-04,0.841779,1.073806,0.801315,0.700767
2013-01-05,-0.539924,-0.359301,0.138966,-2.218633


In [16]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.841779,1.073806,0.801315,0.700767
2013-01-05,-0.539924,-0.359301,0.138966,-2.218633
2013-01-06,0.064927,-0.185003,0.214145,-0.934077


2、 显示索引、列和底层的 numpy 数据：

In [17]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [18]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [19]:
# DataFrame.to_numpy() is the accessor pandas recommends over `.values` for
# getting the underlying data as a NumPy array.
df.to_numpy()

array([[-0.71407382,  0.77740379, -0.98548927,  0.29663188],
       [ 0.52674822,  1.35965185, -1.53688598, -0.17582551],
       [ 1.82974365,  0.61972616,  0.46737108, -0.79037256],
       [ 0.84177874,  1.0738059 ,  0.80131495,  0.70076682],
       [-0.53992423, -0.35930131,  0.13896585, -2.21863343],
       [ 0.06492742, -0.18500295,  0.21414453, -0.93407716]])

3、 describe()函数对于数据的快速统计汇总：

In [20]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.334867,0.547714,-0.150096,-0.520252
std,0.945181,0.686077,0.908157,1.039763
min,-0.714074,-0.359301,-1.536886,-2.218633
25%,-0.388711,0.016179,-0.704375,-0.898151
50%,0.295838,0.698565,0.176555,-0.483099
75%,0.763021,0.999705,0.404064,0.178518
max,1.829744,1.359652,0.801315,0.700767


4、 对数据的转置：

In [21]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.714074,0.526748,1.829744,0.841779,-0.539924,0.064927
B,0.777404,1.359652,0.619726,1.073806,-0.359301,-0.185003
C,-0.985489,-1.536886,0.467371,0.801315,0.138966,0.214145
D,0.296632,-0.175826,-0.790373,0.700767,-2.218633,-0.934077


5、 按轴进行排序

In [22]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,0.296632,-0.985489,0.777404,-0.714074
2013-01-02,-0.175826,-1.536886,1.359652,0.526748
2013-01-03,-0.790373,0.467371,0.619726,1.829744
2013-01-04,0.700767,0.801315,1.073806,0.841779
2013-01-05,-2.218633,0.138966,-0.359301,-0.539924
2013-01-06,-0.934077,0.214145,-0.185003,0.064927


6、 按值进行排序

In [23]:
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-05,-0.539924,-0.359301,0.138966,-2.218633
2013-01-06,0.064927,-0.185003,0.214145,-0.934077
2013-01-03,1.829744,0.619726,0.467371,-0.790373
2013-01-01,-0.714074,0.777404,-0.985489,0.296632
2013-01-04,0.841779,1.073806,0.801315,0.700767
2013-01-02,0.526748,1.359652,-1.536886,-0.175826


# 三、 选择

## 3.1 获取单独列/ 切片

1、 选择一个单独的列，这将会返回一个Series，等同于df.A：

In [25]:
df['A']

2013-01-01   -0.714074
2013-01-02    0.526748
2013-01-03    1.829744
2013-01-04    0.841779
2013-01-05   -0.539924
2013-01-06    0.064927
Freq: D, Name: A, dtype: float64

2、 通过[]进行选择，这将会对行进行切片

In [26]:
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.714074,0.777404,-0.985489,0.296632
2013-01-02,0.526748,1.359652,-1.536886,-0.175826
2013-01-03,1.829744,0.619726,0.467371,-0.790373


In [27]:
# Slicing with date strings selects rows by index label, and both endpoints are
# included: the result below spans 2013-01-02 through 2013-01-04.
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,0.526748,1.359652,-1.536886,-0.175826
2013-01-03,1.829744,0.619726,0.467371,-0.790373
2013-01-04,0.841779,1.073806,0.801315,0.700767


## 3.2 通过标签选择

1、 使用标签来获取一个横截面（即某个标签对应的一整行）

In [28]:
df.loc[dates[0]]

A   -0.714074
B    0.777404
C   -0.985489
D    0.296632
Name: 2013-01-01 00:00:00, dtype: float64

2、 通过标签来在多个轴上进行选择

In [29]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,-0.714074,0.777404
2013-01-02,0.526748,1.359652
2013-01-03,1.829744,0.619726
2013-01-04,0.841779,1.073806
2013-01-05,-0.539924,-0.359301
2013-01-06,0.064927,-0.185003


3、 标签切片

In [30]:
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,0.526748,1.359652
2013-01-03,1.829744,0.619726
2013-01-04,0.841779,1.073806


4、 对于返回的对象进行维度缩减

In [33]:
df.loc['20130102',['A','B']]

A    0.526748
B    1.359652
Name: 2013-01-02 00:00:00, dtype: float64

5、 获取一个标量

In [34]:
df.loc[dates[0],'A']

-0.7140738244446047

快速访问一个标量（与上一个方法等价）

In [35]:
df.at[dates[0],'A']

-0.7140738244446047

## 3.3 通过位置选择

1、 通过传递数值进行位置选择（选择的是行）

In [36]:
df.iloc[3]

A    0.841779
B    1.073806
C    0.801315
D    0.700767
Name: 2013-01-04 00:00:00, dtype: float64

2、 通过数值进行切片，与 numpy/python 中的情况类似

In [37]:
df.iloc[3:5,0:2]

Unnamed: 0,A,B
2013-01-04,0.841779,1.073806
2013-01-05,-0.539924,-0.359301


3、 通过指定一个位置的列表，与 numpy/python 中的情况类似

In [39]:
df.iloc[[1,2,4],[0,2]]

Unnamed: 0,A,C
2013-01-02,0.526748,-1.536886
2013-01-03,1.829744,0.467371
2013-01-05,-0.539924,0.138966


4、 对行进行切片

In [40]:
df.iloc[1:3,:]

Unnamed: 0,A,B,C,D
2013-01-02,0.526748,1.359652,-1.536886,-0.175826
2013-01-03,1.829744,0.619726,0.467371,-0.790373


5、 对列进行切片

In [41]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2013-01-01,0.777404,-0.985489
2013-01-02,1.359652,-1.536886
2013-01-03,0.619726,0.467371
2013-01-04,1.073806,0.801315
2013-01-05,-0.359301,0.138966
2013-01-06,-0.185003,0.214145


6、 获取特定的值

In [42]:
df.iloc[1,1]

1.3596518526153447

## 3.4 布尔索引

1、 使用一个单独列的值来选择数据：

In [43]:
df[df.A > 0]

Unnamed: 0,A,B,C,D
2013-01-02,0.526748,1.359652,-1.536886,-0.175826
2013-01-03,1.829744,0.619726,0.467371,-0.790373
2013-01-04,0.841779,1.073806,0.801315,0.700767
2013-01-06,0.064927,-0.185003,0.214145,-0.934077


2、 使用where操作来选择数据：

In [44]:
df[df > 0]

Unnamed: 0,A,B,C,D
2013-01-01,,0.777404,,0.296632
2013-01-02,0.526748,1.359652,,
2013-01-03,1.829744,0.619726,0.467371,
2013-01-04,0.841779,1.073806,0.801315,0.700767
2013-01-05,,,0.138966,
2013-01-06,0.064927,,0.214145,


3、 使用isin()方法来过滤：

In [45]:
df2 = df.copy()

In [46]:
df2['E'] = ['one', 'one','two','three','four','three']

In [47]:
df2

Unnamed: 0,A,B,C,D,E
2013-01-01,-0.714074,0.777404,-0.985489,0.296632,one
2013-01-02,0.526748,1.359652,-1.536886,-0.175826,one
2013-01-03,1.829744,0.619726,0.467371,-0.790373,two
2013-01-04,0.841779,1.073806,0.801315,0.700767,three
2013-01-05,-0.539924,-0.359301,0.138966,-2.218633,four
2013-01-06,0.064927,-0.185003,0.214145,-0.934077,three


In [48]:
# Keep only the rows whose 'E' value is one of the listed labels.
df2.loc[df2['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,1.829744,0.619726,0.467371,-0.790373,two
2013-01-05,-0.539924,-0.359301,0.138966,-2.218633,four


## 3.5 设置

1、 设置一个新的列：先创建一个带时间索引的 Series（之后可通过 `df['F'] = s1` 将其作为新列加入，pandas 会按索引自动对齐数据）：

In [49]:
# Integer Series indexed by six consecutive days starting 2013-01-02, i.e.
# shifted one day relative to `dates`.
s1 = pd.Series(
    data=[1, 2, 3, 4, 5, 6],
    index=pd.date_range(start='20130102', periods=6),
)

In [50]:
s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

2、 通过标签设置新的值：

In [51]:
 df.at[dates[0],'A'] = 0

3、 通过位置设置新的值：

In [52]:
df.iat[0,1] = 0

4、 通过一个numpy数组设置一组新值：

In [53]:
# Replace column D outright with the integer array. On pandas >= 2.0,
# `df.loc[:, 'D'] = ...` writes into the existing float64 column (giving 5.0),
# whereas plain column assignment keeps the array's integer dtype, matching
# the integer 5 shown in the outputs below.
df['D'] = np.array([5] * len(df))

上述操作结果如下：

In [54]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.985489,5
2013-01-02,0.526748,1.359652,-1.536886,5
2013-01-03,1.829744,0.619726,0.467371,5
2013-01-04,0.841779,1.073806,0.801315,5
2013-01-05,-0.539924,-0.359301,0.138966,5
2013-01-06,0.064927,-0.185003,0.214145,5


5、 通过where操作来设置新的值：

In [55]:
df2 = df.copy()

In [56]:
# Boolean-mask assignment: every strictly positive cell is overwritten with its
# negation, so no positive values remain in df2; cells that were already <= 0
# are left untouched.
df2[df2 > 0] = -df2

In [57]:
df2

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.985489,-5
2013-01-02,-0.526748,-1.359652,-1.536886,-5
2013-01-03,-1.829744,-0.619726,-0.467371,-5
2013-01-04,-0.841779,-1.073806,-0.801315,-5
2013-01-05,-0.539924,-0.359301,-0.138966,-5
2013-01-06,-0.064927,-0.185003,-0.214145,-5


# 四、 缺失值处理

在 pandas 中，使用 np.nan 来表示缺失值，这些值默认不会参与计算

1、 reindex()方法可以对指定轴上的索引进行改变/增加/删除操作，这将返回原始数据的一个拷贝：

In [59]:
# Keep the first four dates and append an extra column 'E'; reindex returns a
# new frame in which the column that did not exist before is filled with NaN.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, 'E'])

In [60]:
# Label slices in .loc include both endpoints, so this fills E for the first
# two dates only; the remaining rows keep the NaN introduced by reindex.
df1.loc[dates[0]:dates[1],'E'] = 1

In [61]:
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.985489,5,1.0
2013-01-02,0.526748,1.359652,-1.536886,5,1.0
2013-01-03,1.829744,0.619726,0.467371,5,
2013-01-04,0.841779,1.073806,0.801315,5,


2、 去掉包含缺失值的行：

In [62]:
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.985489,5,1.0
2013-01-02,0.526748,1.359652,-1.536886,5,1.0


3、 对缺失值进行填充：

In [63]:
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,E
2013-01-01,0.0,0.0,-0.985489,5,1.0
2013-01-02,0.526748,1.359652,-1.536886,5,1.0
2013-01-03,1.829744,0.619726,0.467371,5,5.0
2013-01-04,0.841779,1.073806,0.801315,5,5.0


4、 获取标记缺失值位置的布尔掩码（缺失处为 True）：

In [64]:
# Boolean frame of the same shape as df1: True wherever a value is missing.
df1.isna()

Unnamed: 0,A,B,C,D,E
2013-01-01,False,False,False,False,False
2013-01-02,False,False,False,False,False
2013-01-03,False,False,False,False,True
2013-01-04,False,False,False,False,True


# 五、 相关操作

## 5.1 统计（相关操作通常情况下不包括缺失值）

1、 执行描述性统计：

In [65]:
df.mean()

A    0.453879
B    0.418147
C   -0.150096
D    5.000000
dtype: float64

2、 在其他轴上进行相同的操作：

In [66]:
# Row-wise mean: axis=1 averages across the columns of each row.
df.mean(axis=1)

2013-01-01    1.003628
2013-01-02    1.337379
2013-01-03    1.979210
2013-01-04    1.929225
2013-01-05    1.059935
2013-01-06    1.273517
Freq: D, dtype: float64

3、 对于拥有不同维度、需要对齐的对象进行操作。pandas 会自动地沿着指定的维度进行广播：

In [67]:
s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)

In [68]:
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [69]:
# Subtract s from every column, aligning on the row index (axis='index');
# rows where s is NaN come out as NaN in all columns.
df.sub(s, axis='index')

Unnamed: 0,A,B,C,D
2013-01-01,,,,
2013-01-02,,,,
2013-01-03,0.829744,-0.380274,-0.532629,4.0
2013-01-04,-2.158221,-1.926194,-2.198685,2.0
2013-01-05,-5.539924,-5.359301,-4.861034,0.0
2013-01-06,,,,


## 5.2 Apply

1、 对数据应用函数：

In [71]:
df.apply(np.cumsum)

Unnamed: 0,A,B,C,D
2013-01-01,0.0,0.0,-0.985489,5
2013-01-02,0.526748,1.359652,-2.522375,10
2013-01-03,2.356492,1.979378,-2.055004,15
2013-01-04,3.198271,3.053184,-1.253689,20
2013-01-05,2.658346,2.693883,-1.114723,25
2013-01-06,2.723274,2.50888,-0.900579,30
