In [2]:
import pandas as pd
import numpy as np

## 创建Pandas对象

In [6]:
# A Series built from a plain list gets a default integer index (0, 1, 2, ...).
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
print(s)

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64


In [10]:
# Build a DataFrame from a NumPy ndarray, passing the row index and the
# column labels explicitly.
dates = pd.date_range(start='20130101', periods=6)
print(dates)
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
print(df)

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')
                   A         B         C         D
2013-01-01 -0.821168  0.686918 -0.574372 -0.513816
2013-01-02 -1.305996 -0.425252  1.045873 -1.211460
2013-01-03  0.390111  1.420477 -0.456445 -0.963785
2013-01-04  1.013269  0.339514 -0.258735  0.162547
2013-01-05  0.958733  0.569078 -0.939646 -0.491461
2013-01-06 -0.780485 -1.732532 -0.776505  0.875838


In [15]:
# Build a DataFrame from a dict: every key becomes a column label, and the
# scalar values (A, B, F) are repeated on each of the 4 rows.

df2 = pd.DataFrame(dict(
    A=1.,
    B=pd.Timestamp('20130102'),
    C=pd.Series(1, index=list(range(4)), dtype='float32'),
    D=np.full(4, 3, dtype='int32'),
    E=pd.Categorical(['test', 'train', 'test', 'train']),
    F='foo',
))
print(df2)

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3   test  foo
1  1.0 2013-01-02  1.0  3  train  foo
2  1.0 2013-01-02  1.0  3   test  foo
3  1.0 2013-01-02  1.0  3  train  foo
A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


In [16]:
# Show the dtype of every column of df2; each column keeps its own dtype.
print(df2.dtypes)

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object


## 查看Pandas中的元素

In [17]:
# Show the top rows (with no argument, the first 5 rows are displayed)
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.821168,0.686918,-0.574372,-0.513816
2013-01-02,-1.305996,-0.425252,1.045873,-1.21146
2013-01-03,0.390111,1.420477,-0.456445,-0.963785
2013-01-04,1.013269,0.339514,-0.258735,0.162547
2013-01-05,0.958733,0.569078,-0.939646,-0.491461


In [18]:
# Show the bottom 3 rows
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,1.013269,0.339514,-0.258735,0.162547
2013-01-05,0.958733,0.569078,-0.939646,-0.491461
2013-01-06,-0.780485,-1.732532,-0.776505,0.875838


In [20]:
# Display the row index (here the DatetimeIndex passed in when df was built)
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [21]:
# Display the column labels
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

## DataFrame与Numpy之间的转换

`DataFrame.to_numpy()`能够将DataFrame内部的数据转换为Numpy数组。但是如果DataFrame各列的数据类型不同，这个操作的开销会比较大：Numpy数组中所有元素必须是同一种类型，而DataFrame的不同列一般有各自的类型，因此转换时Numpy需要找到一个能容纳所有列的公共类型；当各列类型差异较大时（例如同时包含数字、时间和字符串），最终只能统一用`object`这种类型来存储这些数据。

In [25]:
# For df the conversion is straightforward; the resulting array does not
# include the index or the column labels.
df.to_numpy()

array([[-0.82116825,  0.68691811, -0.57437224, -0.51381593],
       [-1.30599631, -0.42525199,  1.04587325, -1.21146024],
       [ 0.39011073,  1.42047673, -0.45644481, -0.96378537],
       [ 1.01326861,  0.33951391, -0.2587354 ,  0.16254726],
       [ 0.9587333 ,  0.56907815, -0.93964561, -0.49146054],
       [-0.7804852 , -1.73253211, -0.77650545,  0.87583804]])

In [26]:
# df2 has columns of different dtypes, so every element ends up stored as object
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

## 基本的操作

In [85]:
# describe() computes basic summary statistics for each column.
# NOTE(review): the recorded output below already contains column F and a
# constant D == 5, which are only created by the later "setting" cells
# (In [69] and In [73]); this cell's execution count (In [85]) confirms it
# was run after them. On a fresh top-to-bottom run df would still have only
# columns A-D at this point.
df.describe()

Unnamed: 0,A,B,C,D,F
count,6.0,6.0,6.0,6.0,6.0
mean,0.045939,0.143034,-0.326638,5.0,3.5
std,0.938103,1.094393,0.713411,0.0,1.870829
min,-1.305996,-1.732532,-0.939646,5.0,1.0
25%,-0.585364,-0.234061,-0.725972,5.0,2.25
50%,0.195055,0.454296,-0.515409,5.0,3.5
75%,0.816578,0.657458,-0.308163,5.0,4.75
max,1.013269,1.420477,1.045873,5.0,6.0


In [84]:
# Mean of each column (one value per column label)
df.mean()

A    0.045939
B    0.143034
C   -0.326638
D    5.000000
F    3.500000
dtype: float64

In [86]:
# Compute the statistic along a chosen axis: axis=1 gives one mean per row
df.mean(axis=1)

2013-01-01    1.222509
2013-01-02    1.262925
2013-01-03    1.870829
2013-01-04    2.018809
2013-01-05    2.117633
2013-01-06    1.542095
Freq: D, dtype: float64

In [31]:
# DataFrame.T transposes the data: row labels become columns and vice versa
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.821168,-1.305996,0.390111,1.013269,0.958733,-0.780485
B,0.686918,-0.425252,1.420477,0.339514,0.569078,-1.732532
C,-0.574372,1.045873,-0.456445,-0.258735,-0.939646,-0.776505
D,-0.513816,-1.21146,-0.963785,0.162547,-0.491461,0.875838


In [87]:
# Sort by labels: axis=1 sorts the column labels, ascending=False puts them
# in descending order
df.sort_index(axis=1, ascending=False)

Unnamed: 0,F,D,C,B,A
2013-01-01,1,5,-0.574372,0.686918,0.0
2013-01-02,2,5,1.045873,-0.425252,-1.305996
2013-01-03,3,5,-0.456445,1.420477,0.390111
2013-01-04,4,5,-0.258735,0.339514,1.013269
2013-01-05,5,5,-0.939646,0.569078,0.958733
2013-01-06,6,5,-0.776505,-1.732532,-0.780485


In [88]:
# Apply: call a function on each column; np.cumsum yields the running total
# down every column

df.apply(np.cumsum)

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.686918,-0.574372,5,1
2013-01-02,-1.305996,0.261666,0.471501,10,3
2013-01-03,-0.915886,1.682143,0.015056,15,6
2013-01-04,0.097383,2.021657,-0.243679,20,10
2013-01-05,1.056116,2.590735,-1.183325,25,15
2013-01-06,0.275631,0.858203,-1.95983,30,21


In [89]:
# Range (max - min) of each column; the lambda receives one column at a time
df.apply(lambda x: x.max() - x.min())

A    2.319265
B    3.153009
C    1.985519
D    0.000000
F    5.000000
dtype: float64

In [91]:
# Frequency counts: draw 10 random integers and count how often each occurs.
# Note that this rebinds the name `s` used for the first Series above.

s = pd.Series(np.random.randint(low=0, high=7, size=10))
s.value_counts()

4    3
3    2
6    1
5    1
2    1
1    1
0    1
dtype: int64

## 取存元素

In [36]:
# Select a whole column by name; the result is a Series
df['A'] # equivalent to df.A

2013-01-01   -0.821168
2013-01-02   -1.305996
2013-01-03    0.390111
2013-01-04    1.013269
2013-01-05    0.958733
2013-01-06   -0.780485
Freq: D, Name: A, dtype: float64

In [38]:
# A slice inside [] selects rows; integer bounds are positional (end excluded)
df[0:2]

Unnamed: 0,A,B,C,D
2013-01-01,-0.821168,0.686918,-0.574372,-0.513816
2013-01-02,-1.305996,-0.425252,1.045873,-1.21146


In [39]:
# Slice rows by index label (dates here); both endpoints are included
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-1.305996,-0.425252,1.045873,-1.21146
2013-01-03,0.390111,1.420477,-0.456445,-0.963785
2013-01-04,1.013269,0.339514,-0.258735,0.162547


In [40]:
# First entry of the date index built earlier (a Timestamp)
dates[0]

Timestamp('2013-01-01 00:00:00', freq='D')

In [41]:
# Select by label with .loc: all rows, columns A and B

df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.821168,0.686918
2013-01-02,-1.305996,-0.425252
2013-01-03,0.390111,1.420477
2013-01-04,1.013269,0.339514
2013-01-05,0.958733,0.569078
2013-01-06,-0.780485,-1.732532


In [42]:
# One row label plus a list of column labels returns a Series
df.loc['20130102', ['C', 'D']]

C    1.045873
D   -1.211460
Name: 2013-01-02 00:00:00, dtype: float64

In [44]:
# Get a scalar: a single row label and a single column label
df.loc['20130102', 'C']

1.0458732546941494

In [45]:
# Select by integer position with .iloc (the end of each slice is excluded)
df.iloc[3:4, 0:2]

Unnamed: 0,A,B
2013-01-04,1.013269,0.339514


In [46]:
# Lists of integer positions pick specific rows and columns
df.iloc[[1,2,4], [0,2]]

Unnamed: 0,A,C
2013-01-02,-1.305996,1.045873
2013-01-03,0.390111,-0.456445
2013-01-05,0.958733,-0.939646


In [49]:
# Get a single value by position
df.iloc[1,1] # equivalent to df.iat[1,1]

-0.4252519872340934

In [54]:
# Boolean indexing
df[df['A'] > 0] # keep only the rows whose value in column A is greater than 0

Unnamed: 0,A,B,C,D
2013-01-03,0.390111,1.420477,-0.456445,-0.963785
2013-01-04,1.013269,0.339514,-0.258735,0.162547
2013-01-05,0.958733,0.569078,-0.939646,-0.491461


In [57]:
# Keep the elements of df that are greater than 0; every other element is
# set to NaN
df[df>0]

Unnamed: 0,A,B,C,D
2013-01-01,,0.686918,,
2013-01-02,,,1.045873,
2013-01-03,0.390111,1.420477,,
2013-01-04,1.013269,0.339514,,0.162547
2013-01-05,0.958733,0.569078,,
2013-01-06,,,,0.875838


In [64]:
# Filter rows with isin()

# Assign an ordered list, not a set: a set has no defined order, so which
# value lands on which row would be arbitrary, and recent pandas versions
# reject set values as unordered. The order used here reproduces the
# recorded output. This replaces the categorical column E created earlier.
df2['E'] = ['two', 'three', 'four', 'one']
print(df2)
# isin() builds a boolean mask that is True where E is one of the listed values
df2[df2['E'].isin(['one', 'four'])]

     A          B    C  D      E    F
0  1.0 2013-01-02  1.0  3    two  foo
1  1.0 2013-01-02  1.0  3  three  foo
2  1.0 2013-01-02  1.0  3   four  foo
3  1.0 2013-01-02  1.0  3    one  foo


Unnamed: 0,A,B,C,D,E,F
2,1.0,2013-01-02,1.0,3,four,foo
3,1.0,2013-01-02,1.0,3,one,foo


In [69]:
# Setting: add a new column F from a Series. s1 is indexed by the same six
# dates as df, so each value lands on the row with the matching date.
s1 = pd.Series(range(1, 7), index=pd.date_range(start='20130101', periods=6))
print(s1)
df['F'] = s1
df

2013-01-01    1
2013-01-02    2
2013-01-03    3
2013-01-04    4
2013-01-05    5
2013-01-06    6
Freq: D, dtype: int64


Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.686918,-0.574372,-0.513816,1
2013-01-02,-1.305996,-0.425252,1.045873,-1.21146,2
2013-01-03,0.390111,1.420477,-0.456445,-0.963785,3
2013-01-04,1.013269,0.339514,-0.258735,0.162547,4
2013-01-05,0.958733,0.569078,-0.939646,-0.491461,5
2013-01-06,-0.780485,-1.732532,-0.776505,0.875838,6


In [70]:
# Set a single value by label: row dates[0], column A
df.at[dates[0], 'A'] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.686918,-0.574372,-0.513816,1
2013-01-02,-1.305996,-0.425252,1.045873,-1.21146,2
2013-01-03,0.390111,1.420477,-0.456445,-0.963785,3
2013-01-04,1.013269,0.339514,-0.258735,0.162547,4
2013-01-05,0.958733,0.569078,-0.939646,-0.491461,5
2013-01-06,-0.780485,-1.732532,-0.776505,0.875838,6


In [73]:
# Assign a whole column from a NumPy array (one value of 5 per row)
df.loc[:, 'D'] = np.array([5] * len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.686918,-0.574372,5,1
2013-01-02,-1.305996,-0.425252,1.045873,5,2
2013-01-03,0.390111,1.420477,-0.456445,5,3
2013-01-04,1.013269,0.339514,-0.258735,5,4
2013-01-05,0.958733,0.569078,-0.939646,5,5
2013-01-06,-0.780485,-1.732532,-0.776505,5,6


## Missing data

In [76]:
# reindex lets us add, remove, or change index/column labels and returns a
# new DataFrame. Here: keep the first 4 dates and append a new column E,
# which has no data yet and therefore shows up as missing.
df1 = df.reindex(index=dates[0:4], columns=list(df.columns)+['E'])
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.686918,-0.574372,5,1,
2013-01-02,-1.305996,-0.425252,1.045873,5,2,
2013-01-03,0.390111,1.420477,-0.456445,5,3,
2013-01-04,1.013269,0.339514,-0.258735,5,4,


In [77]:
# Fill E for the first two dates (a label slice includes both endpoints)
df1.loc[dates[0]:dates[1], 'E'] =1
df1

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.686918,-0.574372,5,1,1.0
2013-01-02,-1.305996,-0.425252,1.045873,5,2,1.0
2013-01-03,0.390111,1.420477,-0.456445,5,3,
2013-01-04,1.013269,0.339514,-0.258735,5,4,


In [80]:
# Drop any rows that have missing data; the result is a new frame and df1
# itself is left unchanged
df1.dropna(how='any')

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.686918,-0.574372,5,1,1.0
2013-01-02,-1.305996,-0.425252,1.045873,5,2,1.0


In [81]:
# Fill the missing data with a constant value
df1.fillna(value=5)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,0.0,0.686918,-0.574372,5,1,1.0
2013-01-02,-1.305996,-0.425252,1.045873,5,2,1.0
2013-01-03,0.390111,1.420477,-0.456445,5,3,5.0
2013-01-04,1.013269,0.339514,-0.258735,5,4,5.0


In [83]:
# Get a boolean mask that is True where values are NaN
pd.isna(df1)

Unnamed: 0,A,B,C,D,F,E
2013-01-01,False,False,False,False,False,False
2013-01-02,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,True


## Merge

In [94]:
# concat: cut a frame into row chunks, then stitch the chunks back together.
# Note that this rebinds `df`, discarding the date-indexed frame used above.

df = pd.DataFrame(np.random.randn(10, 4))
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pd.concat(pieces)

Unnamed: 0,0,1,2,3
0,-0.371899,0.223176,-0.188703,0.792553
1,0.27115,1.110388,0.956348,-0.670135
2,0.031099,0.737204,0.235411,0.458317
3,-0.820532,0.675769,-1.038146,-0.567006
4,1.905358,0.494205,0.564847,-0.512699
5,-0.206188,1.06546,-0.360216,0.907558
6,-0.765769,-0.171226,-1.118026,-1.363486
7,1.087905,0.45,-0.462957,-1.002056
8,-0.521609,2.661019,0.947559,-0.692092
9,-1.114314,0.865277,0.499397,-1.045973


## Grouping

In [95]:
# Sample frame for the grouping examples: two string key columns (A, B) and
# two columns of random numbers (C, D).
df = pd.DataFrame(dict(
    A=['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    B=['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    C=np.random.randn(8),
    D=np.random.randn(8),
))
df

Unnamed: 0,A,B,C,D
0,foo,one,-0.011457,0.647138
1,bar,one,-1.465725,-0.505399
2,foo,two,-0.045049,-0.326142
3,bar,three,0.310713,1.100831
4,foo,two,-0.79567,-0.190413
5,bar,two,0.835902,0.918426
6,foo,one,-0.439289,-0.401638
7,foo,three,-0.526225,-0.920646


In [96]:
# Group by column A and sum the numeric columns of each group.
# C and D are selected explicitly: column B holds strings, and with
# pandas >= 2.0 (where numeric_only defaults to False) a bare sum() would
# also aggregate B by concatenating its strings.
df.groupby('A')[['C', 'D']].sum()

Unnamed: 0_level_0,C,D
A,Unnamed: 1_level_1,Unnamed: 2_level_1
bar,-0.319109,1.513858
foo,-1.817691,-1.191701


In [97]:
# Group by two columns: the result is indexed by the (A, B) pairs and holds
# the sums of C and D within each pair
df.groupby(['A', 'B']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,C,D
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,-1.465725,-0.505399
bar,three,0.310713,1.100831
bar,two,0.835902,0.918426
foo,one,-0.450746,0.2455
foo,three,-0.526225,-0.920646
foo,two,-0.84072,-0.516555


## Getting data in/out

### CSV

### HDF5

### Excel