In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Build a Series from a plain list of values; no index is passed, so pandas
# assigns a default integer index.
# Note: np.nan marks a missing value.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# A DataFrame can be built from a NumPy array together with a datetime index
# and labelled columns. Start with the index: six daily timestamps beginning
# on 2013-01-01.

dates = pd.date_range(start='20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
# date_range() is provided by pandas; inside a Jupyter notebook its
# documentation can be displayed with the `?` help syntax shown below:

?pd.date_range
# Plain-Python equivalent: help(pd.date_range)

In [7]:
# The help text for pd.date_range describes the function and its parameters:
# it returns a fixed-frequency DatetimeIndex, and `dates` holds six
# consecutive days.
# NOTE(review): no random seed is set in the cells shown, so the values in
# this frame will differ on every run.

df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2013-01-01,1.314963,-1.330998,-1.043611,1.367086
2013-01-02,0.795748,1.406374,-0.872199,0.426622
2013-01-03,0.516266,-0.65495,0.844677,-1.642708
2013-01-04,0.030852,0.353794,-1.114971,0.305245
2013-01-05,0.846724,0.813074,-0.058637,2.133081
2013-01-06,-1.09516,0.164834,0.006563,-0.041451


In [8]:
?np.random.randn()
# np.random.randn(6,4)是生成6行4列的标准正态分布数据
# 注：这里使用ABCD为列标签，之前生成的日期序列为行标签，6行4列标准正态分布数据为内容的dataframe

In [9]:
# Create a DataFrame from a dict: each key becomes a column. Scalar values
# ('A', 'B', 'F') are repeated on every row, while the Series, array and
# Categorical each supply four values.

df2_columns = {
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
}
df2 = pd.DataFrame(df2_columns)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [10]:
# Inspect the dtype of each column in the DataFrame
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [11]:
# For more, see the Basics section: http://pandas.pydata.org/pandas-docs/version/0.20/basics.html#basics
# View the top and bottom rows of the DataFrame (five rows by default)

df.head()  # first five rows (the default)

Unnamed: 0,A,B,C,D
2013-01-01,1.314963,-1.330998,-1.043611,1.367086
2013-01-02,0.795748,1.406374,-0.872199,0.426622
2013-01-03,0.516266,-0.65495,0.844677,-1.642708
2013-01-04,0.030852,0.353794,-1.114971,0.305245
2013-01-05,0.846724,0.813074,-0.058637,2.133081


In [12]:
df.tail()  # last five rows (the default)

Unnamed: 0,A,B,C,D
2013-01-02,0.795748,1.406374,-0.872199,0.426622
2013-01-03,0.516266,-0.65495,0.844677,-1.642708
2013-01-04,0.030852,0.353794,-1.114971,0.305245
2013-01-05,0.846724,0.813074,-0.058637,2.133081
2013-01-06,-1.09516,0.164834,0.006563,-0.041451


In [13]:
df.head(3)  # first three rows

Unnamed: 0,A,B,C,D
2013-01-01,1.314963,-1.330998,-1.043611,1.367086
2013-01-02,0.795748,1.406374,-0.872199,0.426622
2013-01-03,0.516266,-0.65495,0.844677,-1.642708


In [14]:
# Display the index (row labels):

df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [15]:
# Display the column labels:

df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [16]:
# View the underlying NumPy data:

df.values

array([[ 1.31496336, -1.33099803, -1.04361106,  1.36708557],
       [ 0.79574828,  1.40637369, -0.87219913,  0.42662223],
       [ 0.51626637, -0.65495019,  0.8446769 , -1.6427081 ],
       [ 0.03085214,  0.35379427, -1.11497063,  0.30524519],
       [ 0.84672366,  0.81307397, -0.05863658,  2.13308112],
       [-1.09515965,  0.16483368,  0.00656283, -0.04145067]])

In [18]:
# Descriptive statistics for the data (count, mean, std, min, quartiles, max)
# Note: these statistics apply to numeric data (float, int, etc.)
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.401566,0.125355,-0.37303,0.424646
std,0.846004,0.989702,0.771371,1.289372
min,-1.09516,-1.330998,-1.114971,-1.642708
25%,0.152206,-0.450004,-1.000758,0.045223
50%,0.656007,0.259314,-0.465418,0.365934
75%,0.83398,0.698254,-0.009737,1.13197
max,1.314963,1.406374,0.844677,2.133081


In [19]:
# Transpose the data (rows become columns and vice versa)
# Note: analogous to the matrix transpose in linear algebra
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,1.314963,0.795748,0.516266,0.030852,0.846724,-1.09516
B,-1.330998,1.406374,-0.65495,0.353794,0.813074,0.164834
C,-1.043611,-0.872199,0.844677,-1.114971,-0.058637,0.006563
D,1.367086,0.426622,-1.642708,0.305245,2.133081,-0.041451


In [20]:
# Sort along an axis: axis=1 orders the column labels, here in descending order

df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.367086,-1.043611,-1.330998,1.314963
2013-01-02,0.426622,-0.872199,1.406374,0.795748
2013-01-03,-1.642708,0.844677,-0.65495,0.516266
2013-01-04,0.305245,-1.114971,0.353794,0.030852
2013-01-05,2.133081,-0.058637,0.813074,0.846724
2013-01-06,-0.041451,0.006563,0.164834,-1.09516


In [21]:
?df.sort_index(axis=1, ascending=False)

Signature: df.sort_index(axis=0, level=None, ascending=True, inplace=False, kind='quicksort', na_position='last', sort_remaining=True, by=None)

Docstring:
Sort object by labels (along an axis)

Parameters
----------
axis : index, columns to direct sorting

In [22]:
# The help text indicates that axis may be 0 or 1, so try 0: this sorts by
# the row labels, here in descending date order.

df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,-1.09516,0.164834,0.006563,-0.041451
2013-01-05,0.846724,0.813074,-0.058637,2.133081
2013-01-04,0.030852,0.353794,-1.114971,0.305245
2013-01-03,0.516266,-0.65495,0.844677,-1.642708
2013-01-02,0.795748,1.406374,-0.872199,0.426622
2013-01-01,1.314963,-1.330998,-1.043611,1.367086


In [23]:
# Sort by values

df.sort_values(by='B')
# Rows are ordered by the values in column B; the other columns move along with their rows.

Unnamed: 0,A,B,C,D
2013-01-01,1.314963,-1.330998,-1.043611,1.367086
2013-01-03,0.516266,-0.65495,0.844677,-1.642708
2013-01-06,-1.09516,0.164834,0.006563,-0.041451
2013-01-04,0.030852,0.353794,-1.114971,0.305245
2013-01-05,0.846724,0.813074,-0.058637,2.133081
2013-01-02,0.795748,1.406374,-0.872199,0.426622


## 选择数据
更多内容请参阅索引文档：http://pandas.pydata.org/pandas-docs/version/0.20/indexing.html#indexing

以及多级索引（MultiIndex）/高级索引文档：http://pandas.pydata.org/pandas-docs/version/0.20/advanced.html#advanced



选择一个列，返回一个Series，相当于df.A

In [24]:
df['A']  # select column A; the result is a Series


2013-01-01    1.314963
2013-01-02    0.795748
2013-01-03    0.516266
2013-01-04    0.030852
2013-01-05    0.846724
2013-01-06   -1.095160
Freq: D, Name: A, dtype: float64

In [25]:
# Slicing with [] selects rows

df[0:3]  # positional slice: the first three rows

Unnamed: 0,A,B,C,D
2013-01-01,1.314963,-1.330998,-1.043611,1.367086
2013-01-02,0.795748,1.406374,-0.872199,0.426622
2013-01-03,0.516266,-0.65495,0.844677,-1.642708


In [26]:
df['20130102':'20130104']  # slice rows by index label (dates); both endpoints appear in the result

Unnamed: 0,A,B,C,D
2013-01-02,0.795748,1.406374,-0.872199,0.426622
2013-01-03,0.516266,-0.65495,0.844677,-1.642708
2013-01-04,0.030852,0.353794,-1.114971,0.305245


## 标签选择
更多内容请参阅“按标签选择”一节：http://pandas.pydata.org/pandas-docs/version/0.20/indexing.html#indexing-label



使用标签获取一个横截面（即某一行的数据）：



In [27]:
df.loc[dates[0]]  # select the first row by its label


A    1.314963
B   -1.330998
C   -1.043611
D    1.367086
Name: 2013-01-01 00:00:00, dtype: float64

In [28]:
# Select multiple columns by label:

df.loc[:,['A','B']]  # all rows, columns A and B

Unnamed: 0,A,B
2013-01-01,1.314963,-1.330998
2013-01-02,0.795748,1.406374
2013-01-03,0.516266,-0.65495
2013-01-04,0.030852,0.353794
2013-01-05,0.846724,0.813074
2013-01-06,-1.09516,0.164834


In [29]:
# Label slicing: rows from 2013-01-02 through 2013-01-04 (both endpoints included), columns A and B:

df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,0.795748,1.406374
2013-01-03,0.516266,-0.65495
2013-01-04,0.030852,0.353794


In [30]:
# Reduce the dimensionality of the returned object: a single row label yields a Series

df.loc['20130102',['A','B']]

A    0.795748
B    1.406374
Name: 2013-01-02 00:00:00, dtype: float64

In [31]:
# Get a scalar value (i.e. locate one specific cell):

df.loc[dates[0],'A']

1.3149633573595536

In [32]:
# Fast access to a scalar (equivalent to the previous method):

df.at[dates[0],'A']

1.3149633573595536