In [2]:
import numpy as np
import pandas as pd

# 求和
## Series
使用`.sum()`方法可以计算一个Series的和，`NaN`默认会被自动忽略；传入`skipna=False`则不忽略`NaN`（此时只要存在`NaN`，结果就是`NaN`）。注意：Python内置的`sum()`函数不会跳过`NaN`。

In [3]:
# Eight standard-normal draws labelled 0..7; the last label is then overwritten
# with NaN so the following cells can show how sum() treats missing values.
a = pd.Series(data=np.random.randn(8), index=range(8))
a.loc[7] = np.nan
a

0   -0.862705
1    2.028504
2   -0.836748
3    0.444316
4    1.166609
5    0.937819
6    0.374139
7         NaN
dtype: float64

In [4]:
# Series.sum() skips NaN by default (skipna=True), so only the valid values are added.
a.sum()

3.2519339464475174

In [5]:
# Python's built-in sum() simply iterates over the values and adds them up;
# it has no NaN handling, so the single NaN propagates into the result.
sum(a)

nan

In [6]:
# skipna=False keeps NaN in the computation, so the result is NaN.
a.sum(skipna=False)

nan

## DataFrame
DataFrame的`.sum()`方法默认计算每一列的和，传入`axis=1`可计算每一行的和。注意：Python内置的`sum()`作用于DataFrame时遍历的是列标签而不是数据，得到的是列标签之和（本例中为`0+1+2+3=6`），因此与`.sum()`方法的结果不同。

In [7]:
# 4x4 frame of standard-normal draws; rows are labelled "a".."d",
# columns keep the default integer labels 0..3.
a = pd.DataFrame(data=np.random.randn(4, 4), index=list("abcd"))
a

Unnamed: 0,0,1,2,3
a,1.878661,-0.033517,0.832962,-1.091722
b,-0.520814,-1.225899,0.265975,0.495149
c,-0.576698,-1.47234,-0.020534,-0.776787
d,1.102613,-0.155163,-1.046787,-0.614612


In [8]:
# Default axis=0: one total per column, indexed by the column labels.
a.sum()

0    1.883762
1   -2.886919
2    0.031616
3   -1.987972
dtype: float64

In [9]:
# Built-in sum() iterates a DataFrame's column labels (0, 1, 2, 3), not its values,
# so this returns 0 + 1 + 2 + 3 = 6 rather than any total of the data.
sum(a)

6

In [10]:
# axis=1: one total per row, indexed by the row labels.
a.sum(axis=1)

a    1.586384
b   -0.985589
c   -2.846360
d   -0.713949
dtype: float64

## 其他函数
pandas还提供了很多类似的统计函数（例如`mean()`、`median()`、`std()`、`min()`、`max()`、`cumsum()`等），用法与`sum()`相似，详情请查看官方文档。

# describe
`describe()`是一个综合统计方法，可以一次性产生多个汇总统计量（计数、均值、标准差、最小值、分位数、最大值）。

In [11]:
# For numeric columns: count, mean, std, min, quartiles and max, one column each.
a.describe()

Unnamed: 0,0,1,2,3
count,4.0,4.0,4.0,4.0
mean,0.47094,-0.72173,0.007904,-0.496993
std,1.219537,0.733084,0.787507,0.690451
min,-0.576698,-1.47234,-1.046787,-1.091722
25%,-0.534785,-1.287509,-0.277098,-0.855521
50%,0.290899,-0.690531,0.12272,-0.695699
75%,1.296625,-0.124751,0.407722,-0.337172
max,1.878661,-0.033517,0.832962,0.495149


对于非数值类型（例如包含字符串的`object`类型）的列，`describe()`会产生另一组统计值：`count`（计数）、`unique`（不同取值的个数）、`top`（出现最多的值）和`freq`（该值出现的次数）。

In [12]:
# Put a string into row "a" of column 3 so that the column becomes non-numeric.
#
# The original `a[3]["a"] = "hello"` is chained indexing: it assigns into the
# intermediate Series returned by `a[3]`, which pandas does not guarantee to be
# a view of `a` (it warns about it, and with Copy-on-Write the frame is left
# unchanged). Assign through a single `.loc[row, column]` call instead.
#
# Column 3 is float64, so cast it to object first; writing a string into a
# float column relies on implicit upcasting, which newer pandas deprecates.
a[3] = a[3].astype(object)
a.loc["a", 3] = "hello"
a

Unnamed: 0,0,1,2,3
a,1.878661,-0.033517,0.832962,hello
b,-0.520814,-1.225899,0.265975,0.495149
c,-0.576698,-1.47234,-0.020534,-0.776787
d,1.102613,-0.155163,-1.046787,-0.614612


In [13]:
# Column 3 now holds mixed Python objects, so describe() reports
# count/unique/top/freq instead of the numeric summary.
a[3].describe()

count         4
unique        4
top       hello
freq          1
Name: 3, dtype: object

# 相关性
## 变化
使用`.pct_change()`方法可以计算每一列中当前行相对于前一行的变化率，即`(当前值 - 前一个值) / 前一个值`。注意返回的是小数形式的比例（例如`0.5`表示增长50%），并没有乘以100；第一行没有前一个值，因此结果为`NaN`。

In [14]:
# Fresh 4x4 frame of standard-normal draws with default integer row and column
# labels; this rebinds `a`, replacing the frame used in the sections above.
a = pd.DataFrame(np.random.randn(4,4))
a

Unnamed: 0,0,1,2,3
0,0.185204,-1.416195,0.500968,-0.559814
1,0.4341,-0.087881,-0.497894,-0.610854
2,-0.545177,-1.908306,0.615842,0.694264
3,-0.786858,-0.30658,-0.475984,-0.276054


In [15]:
# Fractional change of each row relative to the previous row, column by column;
# the first row has no predecessor and is therefore all NaN.
a.pct_change()

Unnamed: 0,0,1,2,3
0,,,,
1,1.343896,-0.937946,-1.993864,0.091173
2,-2.255879,20.714685,-2.236893,-2.136547
3,0.443309,-0.839345,-1.772899,-1.397621


## 协方差/协方差矩阵
`.cov()`可以用于计算协方差（两个Series）或协方差矩阵（DataFrame）

In [16]:
# Pairwise covariance between the columns of `a`, returned as a symmetric 4x4 frame.
a.cov()

Unnamed: 0,0,1,2,3
0,0.33737,0.10617,-0.019699,-0.221402
1,0.10617,0.763762,-0.520704,-0.364446
2,-0.019699,-0.520704,0.366527,0.203178
3,-0.221402,-0.364446,0.203178,0.367731


In [17]:
# Six standard-normal draws with the default 0..5 integer index.
b = pd.Series(np.random.randn(6))
b

0    0.007146
1   -0.183408
2    1.271062
3   -1.074414
4   -0.862785
5    1.502270
dtype: float64

In [19]:
# Covariance between `b` and a freshly drawn random Series of the same length.
# No seed is set, so the value differs on every run.
b.cov(pd.Series(np.random.randn(6)))

0.36001207482238373

## 相关系数
使用`corr()`方法可以返回相关系数（两个Series）或相关系数矩阵，使用`corrwith()`方法可以计算一个DataFrame所有列和另一个Series之间的相关系数或两个DataFrame之间的相关系数

In [20]:
# Pairwise correlation matrix of the columns of `a`; the diagonal is always 1.0.
a.corr()

Unnamed: 0,0,1,2,3
0,1.0,0.209155,-0.056019,-0.628585
1,0.209155,1.0,-0.984144,-0.687684
2,-0.056019,-0.984144,1.0,0.553424
3,-0.628585,-0.687684,0.553424,1.0


In [21]:
# Correlation coefficient between `b` and a freshly drawn random Series.
# No seed is set, so the value differs on every run.
b.corr(pd.Series(np.random.randn(6)))

0.094478843596795023

In [22]:
# Correlate every column of `a` with `b`. The two objects are aligned on their
# index labels first, so only the shared labels 0..3 take part; b's labels 4 and 5
# have no counterpart in `a` and are ignored.
a.corrwith(b)

0    0.046184
1   -0.817212
2    0.790733
3    0.734038
dtype: float64