In [1]:
import numpy as np
import pandas as pd

# 1.置换和随机抽样：

In [2]:
# Demo frame: the integers 0..23 laid out row by row in 6 rows x 4 columns.
data = pd.DataFrame(np.arange(24).reshape(-1, 4))
data

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
5,20,21,22,23


In [3]:
# Draw a random ordering of the row positions 0..5.
# No seed is set in the cells shown, so the order differs on every run.
sampler = np.random.permutation(6)
sampler

array([3, 2, 4, 1, 0, 5])

#### 可以在基于 iloc 的索引或等价的 take 函数中使用该数组

In [4]:
# Reorder the rows of `data` by position, following the shuffled positions in `sampler`.
data.iloc[sampler]

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
4,16,17,18,19
1,4,5,6,7
0,0,1,2,3
5,20,21,22,23


In [5]:
# Same positional row selection as data.iloc[sampler], expressed through take().
data.take(sampler)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
4,16,17,18,19
1,4,5,6,7
0,0,1,2,3
5,20,21,22,23


#### 通过使用 axis="columns" 调用 take 函数可以选择列的排列

In [6]:
# Show the frame again: the selections above returned new objects and left `data` unchanged.
data

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11
3,12,13,14,15
4,16,17,18,19
5,20,21,22,23


In [7]:
# Draw a random ordering of the column positions 0..3 (rebinds `sampler`).
# No seed is set in the cells shown, so the order differs on every run.
sampler = np.random.permutation(4)
sampler

array([3, 2, 0, 1])

In [8]:
# Reorder the columns (not the rows) of `data` by position using `sampler`.
data.take(sampler,axis="columns")

Unnamed: 0,3,2,0,1
0,3,2,0,1
1,7,6,4,5
2,11,10,8,9
3,15,14,12,13
4,19,18,16,17
5,23,22,20,21


#### 要选择不放回的随机子集（同一行不能出现两次），可以在 Series 和 DataFrame 上使用 sample 方法

In [9]:
# Pick 3 distinct rows at random; sampling is without replacement by default.
data.sample(n=3)

Unnamed: 0,0,1,2,3
2,8,9,10,11
1,4,5,6,7
0,0,1,2,3


#### 要生成有放回的样本（允许重复选择），需要将 replace=True 传入 sample 方法

In [10]:
# Draw 6 rows with replacement, so the same row may be picked more than once.
data.sample(n=6, replace=True)

Unnamed: 0,0,1,2,3
3,12,13,14,15
2,8,9,10,11
5,20,21,22,23
0,0,1,2,3
3,12,13,14,15
2,8,9,10,11


# 2.计算指标/虚拟变量

### 2.1 DataFrame 中一行只属于一个类别：

#### pandas.get_dummies

In [11]:
# Toy frame with exactly one category label per row ("key") plus a numeric column ("value").
df = pd.DataFrame(
    {
        "key": ["b", "b", "a", "c", "a", "b"],
        "value": [11, 22, 33, 44, 55, 66],
    }
)
df

Unnamed: 0,key,value
0,b,11
1,b,22
2,a,33
3,c,44
4,a,55
5,b,66


In [12]:
# One-hot encode "key": one indicator column per distinct label.
# dtype=int keeps the 0/1 display on every pandas version; since pandas 2.0
# the default indicator dtype is bool, which would render as True/False.
pd.get_dummies(df["key"], dtype=int)

Unnamed: 0,a,b,c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


#### 参数prefix

In [13]:
# One-hot encode "key" and prefix every indicator column name with "KEY_".
# dtype=int keeps the indicators as 0/1 integers on every pandas version;
# since pandas 2.0 the default indicator dtype is bool (True/False).
dummies = pd.get_dummies(df["key"], prefix="KEY", dtype=int)
dummies

Unnamed: 0,KEY_a,KEY_b,KEY_c
0,0,1,0
1,0,1,0
2,1,0,0
3,0,0,1
4,1,0,0
5,0,1,0


#### DataFrame.join 

In [14]:
# Keep only the "value" column and attach the indicator columns, aligned on the row index.
df_with_dummy = df[["value"]].join(dummies, how="left")
df_with_dummy

Unnamed: 0,value,KEY_a,KEY_b,KEY_c
0,11,0,1,0
1,22,0,1,0
2,33,1,0,0
3,44,0,0,1
4,55,1,0,0
5,66,0,1,0


### 2.2 DataFrame 中的一行属于多个类别

#### str.get_dummies

In [15]:
# Toy frame where a row may carry several labels at once, packed into "key" with "|" separators.
df = pd.DataFrame(
    {
        "key": ["b|c", "a|b|c", "a", "a|c", "b|d", "c|e"],
        "value": [11, 22, 33, 44, 55, 66],
    }
)
df

Unnamed: 0,key,value
0,b|c,11
1,a|b|c,22
2,a,33
3,a|c,44
4,b|d,55
5,c|e,66


In [16]:
# Split each "key" string on "|" and build one 0/1 indicator column per distinct token.
dummies = df["key"].str.get_dummies(sep="|")
dummies

Unnamed: 0,a,b,c,d,e
0,0,1,1,0,0
1,1,1,1,0,0
2,1,0,0,0,0
3,1,0,1,0,0
4,0,1,0,1,0
5,0,0,1,0,1


#### 合并数据并添加虚拟变量列名前缀

In [17]:
# Prefix the indicator column names with "KEY_" and attach them to the
# original frame, aligned on the row index.
df_with_dummy = df.join(
    dummies.add_prefix("KEY_"),
    how="left",
)
df_with_dummy

Unnamed: 0,key,value,KEY_a,KEY_b,KEY_c,KEY_d,KEY_e
0,b|c,11,0,1,1,0,0
1,a|b|c,22,1,1,1,0,0
2,a,33,1,0,0,0,0
3,a|c,44,1,0,1,0,0
4,b|d,55,0,1,0,1,0
5,c|e,66,0,0,1,0,1


### 2.3 pandas.get_dummies + pandas.cut

In [18]:
# Ten random floats drawn uniformly from [0, 1).
# No seed is set in the cells shown, so the values change on every run.
value = np.random.uniform(low=0.0, high=1.0, size=10)
value

array([0.25484881, 0.42672166, 0.75541075, 0.80483537, 0.73278982,
       0.35195332, 0.68100908, 0.26854481, 0.30220373, 0.06221511])

In [19]:
# Bin edges that split the range 0..1 into five equal-width intervals.
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

In [20]:
# Assign every element of `value` to its right-closed interval, e.g. (0.2, 0.4].
pd.cut(value, bins=bins)

[(0.2, 0.4], (0.4, 0.6], (0.6, 0.8], (0.8, 1.0], (0.6, 0.8], (0.2, 0.4], (0.6, 0.8], (0.2, 0.4], (0.2, 0.4], (0.0, 0.2]]
Categories (5, interval[float64, right]): [(0.0, 0.2] < (0.2, 0.4] < (0.4, 0.6] < (0.6, 0.8] < (0.8, 1.0]]

In [21]:
# Bin `value` with pd.cut, then one-hot encode the resulting interval categories.
# dtype=int keeps the 0/1 display on every pandas version; since pandas 2.0
# the default indicator dtype is bool, which would render as True/False.
pd.get_dummies(pd.cut(value, bins), dtype=int)

Unnamed: 0,"(0.0, 0.2]","(0.2, 0.4]","(0.4, 0.6]","(0.6, 0.8]","(0.8, 1.0]"
0,0,1,0,0,0
1,0,0,1,0,0
2,0,0,0,1,0
3,0,0,0,0,1
4,0,0,0,1,0
5,0,1,0,0,0
6,0,0,0,1,0
7,0,1,0,0,0
8,0,1,0,0,0
9,1,0,0,0,0
