# 主要内容——5.3 汇总和描述性统计
1. 协方差和相关性
2. Unique Values, Value Counts, and Membership

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Small demo frame with deliberately missing values: row 'c' is entirely NaN
# and row 'a' has no value in column 'two'.
df = pd.DataFrame(
    {
        'one': [1.4, 7.1, np.nan, 0.75],
        'two': [np.nan, -4.5, np.nan, -1.3],
    },
    index=['a', 'b', 'c', 'd'],
)
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [3]:
# Column-wise totals. NaN entries are skipped rather than propagated, so
# column 'one' still sums to 9.25 despite the missing value in row 'c'.
df.sum()

one    9.25
two   -5.80
dtype: float64

In [4]:
# Row-wise totals. NaN entries are skipped, so the all-NaN row 'c' sums to 0.
df.sum(axis='columns')

a    1.40
b    2.60
c    0.00
d   -0.55
dtype: float64

计算时默认会跳过NA值；传入skipna=False可以关闭这一行为，此时只要某行（或某列）含有NA，结果就是NaN：

In [7]:
# Row-wise mean with NaN skipping turned off: any row that contains a missing
# value (rows 'a' and 'c') yields NaN instead of a partial average.
df.mean(axis='columns', skipna=False)

a      NaN
b    1.300
c      NaN
d   -0.275
dtype: float64

一些方法，比如idxmin和idxmax，能返回间接的统计值，比如index value

In [8]:
df

Unnamed: 0,one,two
a,1.4,
b,7.1,-4.5
c,,
d,0.75,-1.3


In [9]:
# Index label at which each column reaches its maximum
# ('b' for column 'one', 'd' for column 'two').
df.idxmax()

one    b
two    d
dtype: object

In [10]:
# Cumulative sum down each column. Positions that are NaN in the input stay
# NaN in the result, while the running total carries on past them
# (column 'one': 1.4, 8.5, NaN, 9.25).
df.cumsum()

Unnamed: 0,one,two
a,1.4,
b,8.5,-4.5
c,,
d,9.25,-5.8


产生汇总数据

In [11]:
# One-call summary per numeric column: count of non-NA values, mean, std,
# min, the 25%/50%/75% quantiles, and max.
df.describe()



Unnamed: 0,one,two
count,3.0,2.0
mean,3.083333,-2.9
std,3.493685,2.262742
min,0.75,-4.5
25%,,
50%,,
75%,,
max,7.1,-1.3


对于非数值型数据，describe能产生总计、频数等汇总数据

In [12]:
# A 16-element string Series: the four-letter pattern a, a, b, c repeated
# four times, indexed 0..15.
obj = pd.Series(4 * ['a', 'a', 'b', 'c'])
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [13]:
# For non-numeric data describe() reports a different summary: the number of
# values (count), the number of distinct values (unique), the most frequent
# value (top) and how often it occurs (freq).
obj.describe()

count     16
unique     3
top        a
freq       8
dtype: object

描述和汇总统计的函数

In [16]:
from IPython.display import Image

# Embed the image file 6.png at 500x300 pixels (presumably the reference table
# of descriptive/summary statistics methods named in the heading above).
Image(url="6.png", width=500, height=300)

# 1. 协方差和相关性

假设我们有几个DataFrame，分别保存了股价和成交量。这些数据取自Yahoo! Finance，可以用pandas-datareader包加载。如果没有安装的话，用conda或pip来安装这个包：

In [20]:
import pandas_datareader.data as web

In [24]:
# Load the pre-downloaded price and volume tables from local pickle files.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# come from a trusted source.
price = pd.read_pickle('data/yahoo_price.pkl')
volume = pd.read_pickle('data/yahoo_volume.pkl')

In [25]:
# Peek at the first five rows: one row per date, one price column per ticker.
price.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,27.990226,313.062468,113.304536,25.884104
2010-01-05,28.038618,311.683844,111.935822,25.892466
2010-01-06,27.592626,303.826685,111.208683,25.733566
2010-01-07,27.541619,296.753749,110.823732,25.465944
2010-01-08,27.724725,300.709808,111.935822,25.641571


In [26]:
# Peek at the first five rows: one row per date, one volume column per ticker.
volume.head()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,123432400,3927000,6155300,38409100
2010-01-05,150476200,6031900,6841400,49749600
2010-01-06,138040000,7987100,5605300,58182400
2010-01-07,119282800,12876600,5840600,50559700
2010-01-08,111902700,9483900,4197200,51197400


pct_change()：用于计算同一列（column）中相邻两个数值之间的变化率

In [27]:
# Fractional change between consecutive rows of each price column, i.e. the
# per-period return of every ticker; show the last five rows.
returns = price.pct_change()
returns.tail()

Unnamed: 0_level_0,AAPL,GOOG,IBM,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-17,-0.00068,0.001837,0.002072,-0.003483
2016-10-18,-0.000681,0.019616,-0.026168,0.00769
2016-10-19,-0.002979,0.007846,0.003583,-0.002255
2016-10-20,-0.000512,-0.005652,0.001719,-0.004867
2016-10-21,-0.00393,0.003011,-0.012474,0.042096


In [28]:
# Correlation between the MSFT and IBM return series (a single scalar).
returns['MSFT'].corr(returns['IBM'])

0.4997636114415116

In [29]:
# Covariance between the MSFT and IBM return series (a single scalar).
returns['MSFT'].cov(returns['IBM'])

8.8706554797035489e-05

因为MSFT是一个合法的Python属性名，我们可以用更简洁的属性访问方式来选中columns：

In [30]:
# Attribute-style column access: returns.MSFT selects the same column as
# returns['MSFT'], so this yields the same correlation value.
returns.MSFT.corr(returns.IBM)

0.4997636114415116

In [31]:
# Full pairwise correlation matrix across all ticker columns
# (symmetric, with 1.0 on the diagonal).
returns.corr()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,1.0,0.407919,0.386817,0.389695
GOOG,0.407919,1.0,0.405099,0.465919
IBM,0.386817,0.405099,1.0,0.499764
MSFT,0.389695,0.465919,0.499764,1.0


In [32]:
# Full pairwise covariance matrix across all ticker columns.
returns.cov()

Unnamed: 0,AAPL,GOOG,IBM,MSFT
AAPL,0.000277,0.000107,7.8e-05,9.5e-05
GOOG,0.000107,0.000251,7.8e-05,0.000108
IBM,7.8e-05,7.8e-05,0.000146,8.9e-05
MSFT,9.5e-05,0.000108,8.9e-05,0.000215


用DataFrame的corrwith方法，可以计算一个DataFrame的各列（或各行）与另一个Series或DataFrame之间的相关系数。

传入一个Series时，返回的是每一列与该Series的相关系数：

In [33]:
# Correlate every column of returns with the IBM return series; the IBM
# column correlated with itself gives 1.0.
returns.corrwith(returns.IBM)

AAPL    0.386817
GOOG    0.405099
IBM     1.000000
MSFT    0.499764
dtype: float64

# 2. Unique Values, Value Counts, and Membership

另一类相关的方法可以从一维的Series中提取值的信息。第一个是unique函数，它返回Series中的唯一值数组：

In [34]:
# Distinct values of obj, returned as a NumPy object array
# (here 'a', 'b', 'c').
uniques = obj.unique()
uniques

array(['a', 'b', 'c'], dtype=object)

In [35]:
# How many times each distinct value occurs; the most frequent value ('a',
# 8 occurrences) is listed first.
obj.value_counts()

a    8
b    4
c    4
dtype: int64

In [36]:
# Count occurrences in a raw array without ordering the result by frequency.
# The top-level pd.value_counts() helper is deprecated (pandas 2.1+); wrapping
# the array in a Series and calling its value_counts() method is the
# supported equivalent.
pd.Series(obj.values).value_counts(sort=False)

c    4
a    8
b    4
dtype: int64

In [37]:
obj

0     a
1     a
2     b
3     c
4     a
5     a
6     b
7     c
8     a
9     a
10    b
11    c
12    a
13    a
14    b
15    c
dtype: object

In [38]:
# Boolean membership mask: True at every position whose value is 'b' or 'c'.
mask = obj.isin(['b', 'c'])
mask

0     False
1     False
2      True
3      True
4     False
5     False
6      True
7      True
8     False
9     False
10     True
11     True
12    False
13    False
14     True
15     True
dtype: bool

In [39]:
# Boolean indexing: keep only the elements where the mask is True,
# preserving their original index labels.
obj[mask]

2     b
3     c
6     b
7     c
10    b
11    c
14    b
15    c
dtype: object

In [40]:
# unique_vals lists each distinct value once; to_match holds the (repeated)
# values whose positions within unique_vals will be looked up.
unique_vals = pd.Series(['c', 'b', 'a'])
to_match = pd.Series(['c', 'a', 'b', 'b', 'c', 'a'])

In [41]:
to_match

0    c
1    a
2    b
3    b
4    c
5    a
dtype: object

In [43]:
unique_vals

0    c
1    b
2    a
dtype: object

In [47]:
# Wrapping the Series in pd.Index produces a pandas Index object, which is
# what provides the get_indexer() lookup.
type(pd.Index(unique_vals))

pandas.indexes.base.Index

In [None]:
# For every element of to_match, the integer position of that value inside
# unique_vals. With unique_vals = c, b, a and to_match = c, a, b, b, c, a
# the expected result is [0, 2, 1, 1, 0, 2].
pd.Index(unique_vals).get_indexer(to_match)

返回to_match中每个值在unique_vals中的index

In [48]:
# Three integer columns whose entries all fall in the range 1..5; used to
# tabulate value frequencies per column.
data = pd.DataFrame(
    {
        'Qu1': [1, 3, 4, 3, 4],
        'Qu2': [2, 3, 1, 2, 3],
        'Qu3': [1, 5, 2, 4, 4],
    }
)
data

Unnamed: 0,Qu1,Qu2,Qu3
0,1,2,1
1,3,3,5
2,4,1,2
3,3,2,4
4,4,3,4


In [49]:
# Tabulate, column by column, how often each value occurs. The row labels of
# the result are the distinct values found across all columns; a value that
# never appears in a given column shows up as NaN there.
# pd.Series.value_counts is used because the top-level pd.value_counts()
# helper is deprecated (pandas 2.1+).
result = data.apply(pd.Series.value_counts)
result

Unnamed: 0,Qu1,Qu2,Qu3
1,1.0,1.0,1.0
2,,2.0,1.0
3,2.0,2.0,
4,2.0,,2.0
5,,,1.0


每一行的labels（即1，2，3，4，5）其实就是整个data里出现过的值，从1到5。而对应的每个方框里的值，则是表示该值在当前列中出现的次数。比如，(2, Qu1)的值是NaN，说明2这个数字没有在Qu1这一列出现过。(2, Qu2)的值是2，说明2这个数字在Qu2这一列出现过2次。(2, Qu3)的值是1，说明2这个数字在Qu3这一列出现过1次。