In [1]:
import pandas as pd
import numpy as np
# Demo frame: two columns, each filled with 5 samples from np.random.randn.
# This cell sets no seed, so the values differ on every run.
df = pd.DataFrame(
    {column: np.random.randn(5) for column in ('data1', 'data2')}
)

df

Unnamed: 0,data1,data2
0,-0.552094,2.314007
1,-0.251298,-0.277023
2,-0.011376,1.297017
3,-0.187267,0.191437
4,-0.119498,-0.077449


### 添加一个新特征（新增一列）

In [2]:
# Add a derived column holding the element-wise quotient data1 / data2.
# The keyword passed to assign() becomes the column label, so 'ration' is an arbitrary name.
df2 = df.assign(ration=df['data1'].div(df['data2']))
df2

Unnamed: 0,data1,data2,ration
0,-0.552094,2.314007,-0.238588
1,-0.251298,-0.277023,0.90714
2,-0.011376,1.297017,-0.008771
3,-0.187267,0.191437,-0.978216
4,-0.119498,-0.077449,1.542928


### 删除一个特征（去掉一个列）

In [3]:
# Remove the 'ration' column. Rebind df2 to the returned frame instead of using
# inplace=True, so the object displayed by the previous cell is not mutated behind
# the reader's back and the call stays chainable.
df2 = df2.drop(columns='ration')
df2

Unnamed: 0,data1,data2
0,-0.552094,2.314007
1,-0.251298,-0.277023
2,-0.011376,1.297017
3,-0.187267,0.191437
4,-0.119498,-0.077449


### 列或者行中替换一个值

In [4]:
# Integer Series holding the values 1..9 on the default 0..8 index.
data = pd.Series(list(range(1, 10)))
data

0    1
1    2
2    3
3    4
4    5
5    6
6    7
7    8
8    9
dtype: int64

In [5]:
# replace() matches by VALUE, not by index position: every element equal to 9
# becomes NaN (here that is only the last element, at index 8).
# Introducing NaN upcasts the Series from int64 to float64.
# The result is rebound rather than written with inplace=True.
data = data.replace(9, np.nan)
data

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
5    6.0
6    7.0
7    8.0
8    NaN
dtype: float64

### 连续值离散化（划分到多个区间）

In [6]:
# Discretise the ages using the edges 10 / 40 / 80 / 100, which yields three
# right-closed intervals: (10, 40], (40, 80], (80, 100].
ages = [15, 18, 20, 21, 22, 34, 41, 52, 63, 79, 99, 98]
bins = [10, 40, 80, 100]
bins_res = pd.cut(x=ages, bins=bins)
bins_res

[(10, 40], (10, 40], (10, 40], (10, 40], (10, 40], ..., (40, 80], (40, 80], (40, 80], (80, 100], (80, 100]]
Length: 12
Categories (3, interval[int64]): [(10, 40] < (40, 80] < (80, 100]]

In [7]:
bins_res.codes  # integer code of the interval (category) each value was assigned to

array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2], dtype=int8)

In [8]:
# Count how many values fall into each interval.
# The top-level pd.value_counts() is deprecated (pandas >= 2.1); wrapping the
# Categorical in a Series and calling .value_counts() is the supported equivalent.
pd.Series(bins_res).value_counts()

(10, 40]     6
(40, 80]     4
(80, 100]    2
dtype: int64

In [9]:
# Count per named group: labels replace the interval notation, one label per bin.
# The last edge here is 80, so the ages 99 and 98 fall outside every bin, become
# NaN, and are silently left out of the counts (value_counts drops NaN by default).
# NOTE(review): 'Yonth' / 'Mille' look like typos for 'Youth' / 'Middle'; left
# unchanged because they are the runtime labels shown in the output.
# pd.value_counts() is deprecated (pandas >= 2.1), so Series.value_counts() is used.
group_names = ['Yonth','Mille','Old']
pd.Series(pd.cut(ages, bins=[10, 20, 50, 80], labels=group_names)).value_counts()

Mille    4
Old      3
Yonth    3
dtype: int64

### 处理缺失值（检测、定位与填充）

In [10]:
# 4 x 3 frame for the missing-value examples: rows 1 and 2 each contain one NaN.
# This rebinds the name `df` that the first cell used for the random demo frame.
df = pd.DataFrame(
    [
        range(3),
        [0, np.nan, 0],
        [0, 0, np.nan],
        range(3),
    ]
)
df

Unnamed: 0,0,1,2
0,0,1.0,2.0
1,0,,0.0
2,0,0.0,
3,0,1.0,2.0


In [11]:
# Element-wise mask: True wherever a cell is missing (NaN)
df.isnull()

Unnamed: 0,0,1,2
0,False,False,False
1,False,True,False
2,False,False,True
3,False,False,False


In [12]:
# Locate missing values per column: True if the column contains at least one NaN.
df.isnull().any(axis=0)

0    False
1     True
2     True
dtype: bool

In [13]:
# Locate missing values per row: True if the row contains at least one NaN.
df.isnull().any(axis='columns')

0    False
1     True
2     True
3    False
dtype: bool

In [14]:
# Fill missing values with the number 5 and display the result (df itself is not modified).
# A numeric fill value keeps the columns numeric; the string '5' would mix text
# into float columns and turn them into object dtype.
df.fillna(5)

Unnamed: 0,0,1,2
0,0,1,2
1,0,5,0
2,0,0,5
3,0,1,2


In [15]:
# Show only the rows that contain at least one missing value, to help decide how to fill them.
df.loc[df.isnull().any(axis=1)]

Unnamed: 0,0,1,2
1,0,,0.0
2,0,0.0,
