In [3]:
!pip install pandas
import pandas as pd

Collecting pandas
  Downloading pandas-1.0.1-cp36-cp36m-macosx_10_9_x86_64.whl (9.9 MB)
[K     |████████████████████████████████| 9.9 MB 3.6 MB/s 
[?25hCollecting pytz>=2017.2
  Downloading pytz-2019.3-py2.py3-none-any.whl (509 kB)
[K     |████████████████████████████████| 509 kB 21.5 MB/s 
Installing collected packages: pytz, pandas
Successfully installed pandas-1.0.1 pytz-2019.3


## 1. Series类型
类似于一维数组的对象  
由一组数据以及一组与之相关的数据标签（即索引）组成  
仅由一组数据即可产生最简单的series

In [5]:
# Simplest possible Series: values only. pandas supplies the default
# 0..n-1 integer index.
obj = pd.Series(list(range(1, 6)))
print(obj)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [6]:
# A Series has two parts: the underlying array of values and the index labels.
print(obj.values, obj.index, sep='\n')

[1 2 3 4 5]
RangeIndex(start=0, stop=5, step=1)


In [7]:
# Custom index: supply the labels explicitly instead of the default 0..n-1.
obj1 = pd.Series(list('abcde'), index=list(range(1, 6)))
print(obj1)

1    a
2    b
3    c
4    d
5    e
dtype: object


In [8]:
# Look up the element whose index label is 2 (obj has an integer index here,
# so this is a label lookup).
obj.loc[2]

3

In [10]:
# A Series can also be built from (and used like) a dict:
# the keys become the index labels.
data = dict(a=10000, b=20000, c=30000)
obj2 = pd.Series(data)
obj2

a    10000
b    20000
c    30000
dtype: int64

In [12]:
# Passing an explicit index alongside a dict selects (and orders) which keys
# end up in the Series.
keys = list('ac')
obj3 = pd.Series(data, index=keys)
obj3

a    10000
c    30000
dtype: int64

In [13]:
# Missing-data handling: a None value is stored as NaN.
# Method form of the check; pd.isnull(obj) is the equivalent function form.
obj.isnull()

0    False
1    False
2    False
3    False
4    False
dtype: bool

## 2. dataframe
表格型的数据结构  
它含有一组有序的列  
每列可以是不同的值类型（数值、字符串等）  
有行索引、也有列索引  
可以理解成是由series组成的一个字典

In [15]:
# Construction: a dict of equal-length lists, where each key becomes a column.
data = {
    '60年代': list('abc'),
    '70年代': list('def'),
    '80年代': list('ghi'),
}
frame_data = pd.DataFrame(data)
print(frame_data)

60年代 70年代 80年代
0    a    d    g
1    b    e    h
2    c    f    i


In [17]:
# Access a single column by label; what comes back is a Series.
frame_data.loc[:, '70年代']

0    d
1    e
2    f
Name: 70年代, dtype: object

In [22]:
import numpy as np

# Six consecutive calendar days starting on 2019-03-01.
dates = pd.date_range(start='20190301', periods=6, freq='D')
dates

DatetimeIndex(['2019-03-01', '2019-03-02', '2019-03-03', '2019-03-04',
               '2019-03-05', '2019-03-06'],
              dtype='datetime64[ns]', freq='D')

In [23]:
# 6x4 frame of random values: one row per date in `dates`, columns A-D.
df = pd.DataFrame(
    np.random.rand(6, 4),
    index=dates,
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2019-03-01,0.43869,0.529751,0.123424,0.974243
2019-03-02,0.714029,0.137904,0.404188,0.6307
2019-03-03,0.515696,0.207398,0.876806,0.715236
2019-03-04,0.987398,0.324618,0.689395,0.254037
2019-03-05,0.490467,0.897999,0.104574,0.014072
2019-03-06,0.74971,0.360298,0.278479,0.630408


In [24]:
# Transpose: rows become columns and vice versa (same as df.T).
df.transpose()

Unnamed: 0,2019-03-01,2019-03-02,2019-03-03,2019-03-04,2019-03-05,2019-03-06
A,0.43869,0.714029,0.515696,0.987398,0.490467,0.74971
B,0.529751,0.137904,0.207398,0.324618,0.897999,0.360298
C,0.123424,0.404188,0.876806,0.689395,0.104574,0.278479
D,0.974243,0.6307,0.715236,0.254037,0.014072,0.630408


In [26]:
# Row slicing by date label. Unlike ordinary Python slices (half-open),
# a label slice includes BOTH endpoints.
df.loc['20190301':'20190303']

Unnamed: 0,A,B,C,D
2019-03-01,0.43869,0.529751,0.123424,0.974243
2019-03-02,0.714029,0.137904,0.404188,0.6307
2019-03-03,0.515696,0.207398,0.876806,0.715236


In [27]:
# Filter rows and columns in one step: an inclusive date-label slice for the
# rows plus an explicit list of column labels.
df.loc['20190301':'20190303', list('AB')]

Unnamed: 0,A,B
2019-03-01,0.43869,0.529751
2019-03-02,0.714029,0.137904
2019-03-03,0.515696,0.207398


In [29]:
# Scalar lookup by (row label, column label); dates[0] is the first row label of df.
df.at[dates[0], 'A']

0.438689844059279

In [30]:
# Peek at the first two rows.
df.head(n=2)

Unnamed: 0,A,B,C,D
2019-03-01,0.43869,0.529751,0.123424,0.974243
2019-03-02,0.714029,0.137904,0.404188,0.6307


In [31]:
# Peek at the last three rows.
df.tail(n=3)

Unnamed: 0,A,B,C,D
2019-03-04,0.987398,0.324618,0.689395,0.254037
2019-03-05,0.490467,0.897999,0.104574,0.014072
2019-03-06,0.74971,0.360298,0.278479,0.630408


#### dataframe能够接受哪些数据类型
1. 二维numpy array
2. 由数组、列表或是元组组成的字典
3. 由series组成的字典
4. 由字典组成的字典
5. 字典或series的列表
6. 由列表或元组组成的列表
7. 另一个dataframe

## 3. pandas重新索引

In [35]:
# Starting point for the reindexing examples: the Series labelled 1..5.
obj1

1    a
2    b
3    c
4    d
5    e
dtype: object

In [42]:
# Reindex onto labels 1..6. Label 6 has no value in obj1, so it comes back as NaN.
job_1 = obj1.reindex(list(range(1, 7)))
job_1

1      a
2      b
3      c
4      d
5      e
6    NaN
dtype: object

In [45]:
# Same reindex, but labels missing from obj1 get fill_value instead of NaN.
job_1 = obj1.reindex(list(range(1, 7)), fill_value='f')
job_1

1    a
2    b
3    c
4    d
5    e
6    f
dtype: object

In [None]:
# Reindex onto labels 0..5, filling labels that are missing from the original
# index. `method` selects the fill strategy:
#   'ffill' - forward fill: carry the last preceding value forward
#   'bfill' - backward fill: take the next following value
# (The keyword was previously misspelled `methid`, which raises TypeError.)
# NOTE(review): method-based filling needs a monotonic index, so this assumes
# `obj` is still the integer-indexed Series from section 1 - confirm cell order.
o = obj.reindex(range(6), method='ffill')
o

## 4.算术运算和数据对齐
pandas可以对不同索引的对象进行算术运算  
如果存在不同的索引对，则结果的索引就是该索引对的并集

In [47]:
# Arithmetic aligns on index labels. 'e' exists only in d2, so the sum at 'e'
# is NaN and the result is upcast to float.
d1 = pd.Series([1, 2, 3, 4], index=list('abcd'))
d2 = pd.Series([2, 3, 4, 5, 6], index=list('abcde'))
d1.add(d2)

a    3.0
b    5.0
c    7.0
d    9.0
e    NaN
dtype: float64

## 5.DataFrame和Series之间的运算

In [49]:
# 4x3 frame holding 0..11, plus its first row (label 1) pulled out as a Series.
frame = pd.DataFrame(
    np.arange(12).reshape(4, 3),
    columns=['b', 'd', 'e'],
    index=[1, 2, 3, 4],
)
series = frame.loc[1]
print(frame)
print(series)

b   d   e
1  0   1   2
2  3   4   5
3  6   7   8
4  9  10  11
b    0
d    1
e    2
Name: 1, dtype: int64


In [51]:
# The Series labels are matched against the frame's columns and the
# subtraction is broadcast down every row. When labels do not match
# (e.g. when adding), the result takes the union of the labels.
frame.sub(series, axis='columns')

Unnamed: 0,b,d,e
1,0,0,0
2,3,3,3
3,6,6,6
4,9,9,9


## 6.排序

In [54]:
# Sorting examples: a Series whose index labels are deliberately out of order.
obj = pd.Series(range(4), index=list('deab'))
print(obj)

d    0
e    1
a    2
b    3
dtype: int64


In [55]:
# Sort by index label, ascending.
obj.sort_index(ascending=True)

a    2
b    3
d    0
e    1
dtype: int64

In [56]:
# Sort by value, ascending.
obj.sort_values(ascending=True)

d    0
e    1
a    2
b    3
dtype: int64

In [57]:
# A DataFrame can be sorted by the labels on either axis.
# Only the last expression of a cell is rendered automatically, so the
# row-wise result is shown explicitly with display(); otherwise it would be
# computed and silently discarded.
display(frame.sort_index())   # sort by row labels (axis=0)
frame.sort_index(axis=1)      # sort by column labels

## 7.层次化索引
在一个轴上，拥有多个索引级别  
能以低维度形式处理高维度数据

In [60]:
# Hierarchical (two-level) index: the first list supplies the outer level,
# the second list the inner level.
data = pd.Series(
    np.random.randn(10),
    index=[
        list('aaabbbccdd'),
        [1, 2, 3, 4, 5, 6, 7, 8, 1, 2],
    ],
)
data

a  1   -1.701284
   2   -1.076439
   3    1.691366
b  4   -0.955106
   5    0.112393
   6   -1.711662
c  7   -0.003432
   8    0.609470
d  1    0.672763
   2   -0.123253
dtype: float64

In [61]:
# Inspect the index built from the two label lists passed when `data` was created.
data.index

MultiIndex([('a', 1),
            ('a', 2),
            ('a', 3),
            ('b', 4),
            ('b', 5),
            ('b', 6),
            ('c', 7),
            ('c', 8),
            ('d', 1),
            ('d', 2)],
           )

In [62]:
# Select by outer-level label; the result is indexed by the inner level only.
data.loc['b']

4   -0.955106
5    0.112393
6   -1.711662
dtype: float64

In [63]:
# Slice on the outer level; both endpoint labels are included.
data.loc['b':'d']

b  4   -0.955106
   5    0.112393
   6   -1.711662
c  7   -0.003432
   8    0.609470
d  1    0.672763
   2   -0.123253
dtype: float64

In [64]:
# Inner-level selection: every outer label, inner label == 2.
data.loc[:, 2]

a   -1.076439
d   -0.123253
dtype: float64

In [65]:
# Pivot the innermost index level into columns, turning the Series into a
# DataFrame; combinations that do not exist become NaN.
data.unstack(level=-1)

Unnamed: 0,1,2,3,4,5,6,7,8
a,-1.701284,-1.076439,1.691366,,,,,
b,,,,-0.955106,0.112393,-1.711662,,
c,,,,,,,-0.003432,0.60947
d,0.672763,-0.123253,,,,,,


In [66]:
# stack() is the inverse of unstack(): the columns are folded back into an
# inner index level, recovering the original Series.
(
    data
    .unstack()
    .stack()
)

a  1   -1.701284
   2   -1.076439
   3    1.691366
b  4   -0.955106
   5    0.112393
   6   -1.711662
c  7   -0.003432
   8    0.609470
d  1    0.672763
   2   -0.123253
dtype: float64

## 8.文本格式处理
read_csv: 从文件中加载带分隔符的数据，默认分隔符为逗号  
read_table: 默认分隔符为制表符'\t'  
read_fwf  
read_clipboard

In [None]:
# Plain read with the default comma delimiter.
pd.read_csv('data.csv')
pd.read_table('data.csv', sep=',')  # specify the delimiter explicitly
pd.read_csv('data.csv', header = None)  # no header row: every line in the file is data, and column names are generated automatically
pd.read_csv('data.csv', index_col='c')  # use column 'c' as the index; a list may be passed to use several columns


In [None]:
# excel

In [None]:
# Two ways to load a workbook. The second assignment rebinds `excel`, so only
# the sheet named by sheet_name is kept for the cells that follow.
excel = pd.read_excel('data.xlsx')  # no sheet_name given
excel = pd.read_excel('data.xlsx', sheet_name = '工作表2')  # pick a sheet by its name

In [None]:
# Scatter plot of 'place' against 'age'; get_figure() returns the Matplotlib
# Figure that owns the resulting Axes.
excel.plot.scatter(x='age', y='place').get_figure()