### pandas 数据结构 

#### Series

In [1]:
# A Series is a one-dimensional array, very similar to a NumPy array.
# In fact, a Series is essentially built on top of the NumPy array object.
# Unlike a NumPy array, a Series lets you attach custom labels to the data, i.e. an index,
# and then access the values in the array through that index.

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Build a Series from two plain lists: one holding the values, the other the
# index labels that go with them.
countries = ['USA', 'China', 'UK', 'Japen']
data = [100, 200, 300, 400]
my_series = pd.Series(data=data, index=countries)

In [4]:
my_series
# `data` supplies the values and `countries` supplies their index labels; when no index is given, pandas adds a default one

USA      100
China    200
UK       300
Japen    400
dtype: int64

In [6]:
# Create from a NumPy array
np_arr = np.array(data)
np_arr

array([100, 200, 300, 400])

In [7]:
pd.Series(data=np_arr)  # with no index given, the labels default to 0, 1, 2, ...

0    100
1    200
2    300
3    400
dtype: int32

In [18]:
# When an index is given, a scalar value is repeated for every label; pass a bare scalar here, not a one-element list such as [5.0]
pd.Series(5., index=['a', 'b', 'c', 'd', 'e']) 

a    5.0
b    5.0
c    5.0
d    5.0
e    5.0
dtype: float64

In [9]:
# Create a Series from a dict
# the keys automatically become the index
# Unlike a typical NumPy array, a pandas Series can hold objects of different types (the dtype becomes object).
my_dict = {'a':50, 'b':3.5, 'c':'hello'}
pd.Series(my_dict)

a       50
b      3.5
c    hello
dtype: object

In [10]:
# Read a value from a Series by its label, the same way as with a dict
my_series['China']

200

In [11]:
# Series operations
# A Series needs exactly one index label per value. The second constructor call
# below is deliberately malformed (4 values, 5 labels), so pd.Series itself
# raises ValueError before any arithmetic between the two Series could happen.
# The error is caught and printed so the notebook still runs from top to bottom.
series1 = pd.Series([1,2,3,4], ["London", "Hongkong", "Logos", "Paries"])
try:
    series2 = pd.Series([1,3,6,4], ["London", "Logos", "Shenzhen", "Guangzhou", "Changsha"])
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Length of passed values is 4, index implies 5

In [14]:
# Subtraction pairs values up by index label, not by position; labels present
# in only one of the two Series produce NaN in the result.
series1 = pd.Series(data=[1, 2, 3, 4],
                    index=["London", "Hongkong", "Logos", "Paries"])
series2 = pd.Series(data=[7, 2, 6, 4],
                    index=["London", "Logos", "Shenzhen", "Guangzhou"])
series1.sub(series2)

Guangzhou    NaN
Hongkong     NaN
Logos        1.0
London      -6.0
Paries       NaN
Shenzhen     NaN
dtype: float64

In [15]:
# When pandas cannot find the same index label in both Series, the corresponding position holds the missing value NaN.

In [21]:
# Five draws from the standard normal distribution, labelled 'a' through 'e'.
s = pd.Series(data=np.random.randn(5), index=list('abcde'))
s

a   -2.035684
b    0.900498
c   -0.650077
d    0.005504
e   -0.301291
dtype: float64

In [22]:
# An integer slice selects by position: the first three elements
s[:3]

a   -2.035684
b    0.900498
c   -0.650077
dtype: float64

In [23]:
# Boolean mask: keep only the values greater than the median
s[s > s.median()]

b    0.900498
d    0.005504
dtype: float64

In [24]:
# Pick elements by integer position, in the order listed. `.iloc` states the
# positional intent explicitly; plain `s[[4, 3, 1]]` on a string-labelled
# Series depends on a deprecated fallback from labels to positions.
s.iloc[[4, 3, 1]]

e   -0.301291
d    0.005504
b    0.900498
dtype: float64

In [29]:
# NumPy functions apply element-wise to a Series and the result keeps its index
np.exp(s)

a    0.130591
b    2.460828
c    0.522006
d    1.005519
e    0.739863
dtype: float64

In [26]:
# dtype of the values stored in the Series
s.dtype

dtype('float64')

In [27]:
# The array object backing the Series (displayed as a PandasArray here)
s.array

<PandasArray>
[  -2.0356842542533133,    0.9004978924584666,   -0.6500767404510837,
 0.0055038845265606585,   -0.3012908583701717]
Length: 5, dtype: float64

In [28]:
# Convert to a numpy.array
s.to_numpy()

array([-2.03568425,  0.90049789, -0.65007674,  0.00550388, -0.30129086])

### DataFrame

In [30]:
# NOTE(review): this bare string literal is an expression, so the cell echoes it back as its output; a markdown cell may suit this text better.
"""
Dataframe是二维数据，分为行和列
"""

'\nDataframe是二维数据，分为行和列\n'

In [35]:
# Create a DataFrame
# 1. From a dict of Series: each key becomes a column, the row index is the
#    union of the Series indexes, and the shorter column is padded with NaN.
d = {
    'one': pd.Series(data=[1., 2., 3.], index=list('abc')),
    'two': pd.Series(data=[1., 2., 3., 4.], index=list('abcd')),
}
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
a,1.0,1.0
b,2.0,2.0
c,3.0,3.0
d,,4.0


In [36]:
# Keep only the listed index labels (rows), in the order given
pd.DataFrame(d, index=['d', 'b', 'a'])

Unnamed: 0,one,two
d,,4.0
b,2.0,2.0
a,1.0,1.0


In [37]:
# Specify the columns too; a requested column that the dict does not contain ('three') comes out empty (NaN)
pd.DataFrame(d, index=['d', 'b', 'a'], columns=['two', 'three'])

Unnamed: 0,two,three
d,4.0,
b,2.0,
a,1.0,


In [38]:
# index can be thought of as the row labels
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [39]:
# columns holds the column labels
df.columns

Index(['one', 'two'], dtype='object')

In [40]:
# With no explicit index, the rows are labelled 0, 1, 2, ... by default.
d = {
    'one': [1., 2., 3., 4.],
    'two': [4., 3., 2., 1.],
}
df = pd.DataFrame(data=d)
df

Unnamed: 0,one,two
0,1.0,4.0
1,2.0,3.0
2,3.0,2.0
3,4.0,1.0


In [42]:
# Create from a list of dicts: each dict is one row and its keys become the column names;
# a key missing from a row (here 'c' in the first dict) is shown as NaN
data2 = [{'a': 1, 'b': 2}, {'a': 5, 'b': 10, 'c': 20}]
df2 = pd.DataFrame(data2)
df2

Unnamed: 0,a,b,c
0,1,2,
1,5,10,20.0


In [43]:
# Dict keyed by tuples: the outer tuple keys form a two-level column header and
# the inner tuple keys form a two-level row index; combinations that are not
# supplied are filled with NaN.
pd.DataFrame({
    ('a', 'b'): {('A', 'B'): 1, ('A', 'C'): 2},
 ('a', 'a'): {('A', 'C'): 3, ('A', 'B'): 4},
('a', 'c'): {('A', 'B'): 5, ('A', 'C'): 6},
('b', 'a'): {('A', 'C'): 7, ('A', 'B'): 8},
('b', 'b'): {('A', 'D'): 9, ('A', 'B'): 10}})

Unnamed: 0_level_0,Unnamed: 1_level_0,a,a,a,b,b
Unnamed: 0_level_1,Unnamed: 1_level_1,b,a,c,a,b
A,B,1.0,4.0,5.0,8.0,10.0
A,C,2.0,3.0,6.0,7.0,
A,D,,,,,9.0


In [45]:
# Get a single column
df['one']

0    1.0
1    2.0
2    3.0
3    4.0
Name: one, dtype: float64

In [46]:
# Derive two new columns from the existing ones: an element-wise product and
# a boolean flag marking the rows where 'one' exceeds 2.
df['three'] = df['one'].mul(df['two'])
df['flag'] = df['one'].gt(2)
df

Unnamed: 0,one,two,three,flag
0,1.0,4.0,4.0,False
1,2.0,3.0,6.0,False
2,3.0,2.0,6.0,True
3,4.0,1.0,4.0,True


In [47]:
# Delete a column
del df['two']

In [48]:
# Display df to confirm that column 'two' is gone
df

Unnamed: 0,one,three,flag
0,1.0,4.0,False
1,2.0,6.0,False
2,3.0,6.0,True
3,4.0,4.0,True


In [49]:
# pop() removes the column from df and returns it
three = df.pop('three')
df

Unnamed: 0,one,flag
0,1.0,False
1,2.0,False
2,3.0,True
3,4.0,True


In [51]:
# Build a new column from the first two rows of an existing column;
# the assignment aligns on the index, so the remaining rows are left as NaN
df['one_trunc'] = df['one'][:2]
df

Unnamed: 0,one,flag,one_trunc
0,1.0,False,1.0
1,2.0,False,2.0
2,3.0,True,
3,4.0,True,


In [53]:
# Six consecutive daily timestamps starting at 2013-01-01
dates = pd.date_range('20130101', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [54]:
# 6x4 frame of random normal draws, with `dates` as the row index and columns A-D
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.556609,-1.084966,0.252478,1.539058
2013-01-02,-2.175071,0.093069,0.130958,-1.227641
2013-01-03,-0.280983,-0.738522,-1.009401,-0.203073
2013-01-04,0.11522,-0.67139,-0.756132,0.665838
2013-01-05,-1.276377,0.441232,-0.050255,0.307119
2013-01-06,-1.577907,0.540626,-1.311558,0.680493


In [58]:
# Build a frame from a dict whose values are of mixed kinds: the scalars
# ('A', 'B', 'F') are repeated on every row, while the array-like values
# ('C', 'D', 'E') must all have the same length (4 here).
df2 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})

In [56]:
# Display df2
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [59]:
# The first 5 rows
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.556609,-1.084966,0.252478,1.539058
2013-01-02,-2.175071,0.093069,0.130958,-1.227641
2013-01-03,-0.280983,-0.738522,-1.009401,-0.203073
2013-01-04,0.11522,-0.67139,-0.756132,0.665838
2013-01-05,-1.276377,0.441232,-0.050255,0.307119


In [60]:
# The last 3 rows
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,0.11522,-0.67139,-0.756132,0.665838
2013-01-05,-1.276377,0.441232,-0.050255,0.307119
2013-01-06,-1.577907,0.540626,-1.311558,0.680493


In [61]:
# Row labels of df (the DatetimeIndex it was built with)
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [62]:
# The data as a plain NumPy array, without the index and column labels
df.to_numpy()

array([[ 0.55660908, -1.08496582,  0.25247758,  1.53905788],
       [-2.17507056,  0.09306904,  0.13095837, -1.22764074],
       [-0.28098254, -0.73852218, -1.00940072, -0.20307299],
       [ 0.11522041, -0.67138984, -0.75613233,  0.66583755],
       [-1.27637747,  0.44123241, -0.05025515,  0.30711926],
       [-1.57790677,  0.5406261 , -1.31155842,  0.68049284]])

In [63]:
# Summary statistics for the numeric columns: count, mean, standard deviation, min, quartiles and max
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.773085,-0.236658,-0.457318,0.293632
std,1.064513,0.683068,0.654124,0.938131
min,-2.175071,-1.084966,-1.311558,-1.227641
25%,-1.502524,-0.721739,-0.946084,-0.075525
50%,-0.77868,-0.28916,-0.403194,0.486478
75%,0.01617,0.354192,0.085655,0.676829
max,0.556609,0.540626,0.252478,1.539058


In [64]:
# Transpose: rows become columns and columns become rows
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.556609,-2.175071,-0.280983,0.11522,-1.276377,-1.577907
B,-1.084966,0.093069,-0.738522,-0.67139,0.441232,0.540626
C,0.252478,0.130958,-1.009401,-0.756132,-0.050255,-1.311558
D,1.539058,-1.227641,-0.203073,0.665838,0.307119,0.680493


In [66]:
# The values without the index and column labels; same content as df.to_numpy()
df.values

array([[ 0.55660908, -1.08496582,  0.25247758,  1.53905788],
       [-2.17507056,  0.09306904,  0.13095837, -1.22764074],
       [-0.28098254, -0.73852218, -1.00940072, -0.20307299],
       [ 0.11522041, -0.67138984, -0.75613233,  0.66583755],
       [-1.27637747,  0.44123241, -0.05025515,  0.30711926],
       [-1.57790677,  0.5406261 , -1.31155842,  0.68049284]])

In [70]:
# Sort by the row index (axis=0) in descending order
df2.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D,E,F
3,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
0,1.0,2013-01-02,1.0,3,test,foo


In [72]:
# Sort the rows by the values in column E, in descending order
df2.sort_values(by='E', ascending=False)

Unnamed: 0,A,B,C,D,E,F
3,1.0,2013-01-02,1.0,3,train,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
0,1.0,2013-01-02,1.0,3,test,foo


#### 选择

In [73]:
df['A']  # select a single column

2013-01-01    0.556609
2013-01-02   -2.175071
2013-01-03   -0.280983
2013-01-04    0.115220
2013-01-05   -1.276377
2013-01-06   -1.577907
Freq: D, Name: A, dtype: float64

In [77]:
# An integer slice selects rows by position: positions 0, 1 and 2 (the end position 3 is excluded)
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.556609,-1.084966,0.252478,1.539058
2013-01-02,-2.175071,0.093069,0.130958,-1.227641
2013-01-03,-0.280983,-0.738522,-1.009401,-0.203073


In [75]:
# Select rows by index label; unlike a positional slice, both end labels are included
df['20130102':'20130104']

Unnamed: 0,A,B,C,D
2013-01-02,-2.175071,0.093069,0.130958,-1.227641
2013-01-03,-0.280983,-0.738522,-1.009401,-0.203073
2013-01-04,0.11522,-0.67139,-0.756132,0.665838


In [76]:
# Display the whole frame for reference
df

Unnamed: 0,A,B,C,D
2013-01-01,0.556609,-1.084966,0.252478,1.539058
2013-01-02,-2.175071,0.093069,0.130958,-1.227641
2013-01-03,-0.280983,-0.738522,-1.009401,-0.203073
2013-01-04,0.11522,-0.67139,-0.756132,0.665838
2013-01-05,-1.276377,0.441232,-0.050255,0.307119
2013-01-06,-1.577907,0.540626,-1.311558,0.680493


In [78]:
# Rows from position 3 to the end
df[3:]

Unnamed: 0,A,B,C,D
2013-01-04,0.11522,-0.67139,-0.756132,0.665838
2013-01-05,-1.276377,0.441232,-0.050255,0.307119
2013-01-06,-1.577907,0.540626,-1.311558,0.680493


In [80]:
# Select by label with .loc: the first key addresses the row index, the second (optional) key the columns
print(dates[0])
df.loc[dates[0]]

2013-01-01 00:00:00


A    0.556609
B   -1.084966
C    0.252478
D    1.539058
Name: 2013-01-01 00:00:00, dtype: float64

In [81]:
# All rows, columns A and B only
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,0.556609,-1.084966
2013-01-02,-2.175071,0.093069
2013-01-03,-0.280983,-0.738522
2013-01-04,0.11522,-0.67139
2013-01-05,-1.276377,0.441232
2013-01-06,-1.577907,0.540626


In [82]:
# A single value: the row labelled dates[0], column 'A'
df.loc[dates[0], 'A']

0.5566090845894024