In [1]:
import numpy as np
import pandas as pd

# 1.离散化和分箱：

#### cut()方法

In [2]:
# 将年龄按照18~25,26~35,36~60,60~100分组
ages = [22,21,25,28,21,23,36,33,65,46,40,39]
bins = [18,25,35,60,100]

In [3]:
ages_cate = pd.cut(ages,bins)
ages_cate

[(18, 25], (18, 25], (18, 25], (25, 35], (18, 25], ..., (25, 35], (60, 100], (35, 60], (35, 60], (35, 60]]
Length: 12
Categories (4, interval[int64, right]): [(18, 25] < (25, 35] < (35, 60] < (60, 100]]

In [4]:
# 类别数组
ages_cate.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [5]:
# 不同年龄在类别数组中的索引值
ages_cate.codes

array([0, 0, 0, 1, 0, 0, 2, 1, 3, 2, 2, 2], dtype=int8)

In [6]:
# pandas.cut 结果中的 箱（bin）数量的 计数
pd.value_counts(ages_cate)

(18, 25]     5
(35, 60]     4
(25, 35]     2
(60, 100]    1
dtype: int64

#### 通过传递right=False改变区间

In [7]:
ages_cate = pd.cut(ages,bins)
ages_cate.categories

IntervalIndex([(18, 25], (25, 35], (35, 60], (60, 100]], dtype='interval[int64, right]')

In [8]:
ages_cate2 = pd.cut(ages,bins,right=False)
ages_cate2.categories

IntervalIndex([[18, 25), [25, 35), [35, 60), [60, 100)], dtype='interval[int64, left]')

#### 通过向labels选项传递一个列表或数组来自定义箱名

In [9]:
ages = [22,21,25,28,21,23,36,33,65,46,40,39]
bins = [18,25,35,60,100]
group_names =['未成年','成年','中年','老年']

In [10]:
ages_cate = pd.cut(ages,bins,labels=group_names)
ages_cate

['未成年', '未成年', '未成年', '成年', '未成年', ..., '成年', '老年', '中年', '中年', '中年']
Length: 12
Categories (4, object): ['未成年' < '成年' < '中年' < '老年']

#### 也可以传入希望切分的数量，系统会根据最大值和最小值自动计算出等长的箱，其中precision选项用于设置精度限制。

In [11]:
ages = [22,21,25,28,21,23,36,33,65,46,40,39]

In [12]:
ages_cate = pd.cut(ages,4,precision=2)
ages_cate

[(20.96, 32.0], (20.96, 32.0], (20.96, 32.0], (20.96, 32.0], (20.96, 32.0], ..., (32.0, 43.0], (54.0, 65.0], (43.0, 54.0], (32.0, 43.0], (32.0, 43.0]]
Length: 12
Categories (4, interval[float64, right]): [(20.96, 32.0] < (32.0, 43.0] < (43.0, 54.0] < (54.0, 65.0]]

### qcut()方法

In [13]:
data = np.random.randn(100)
data_cate = pd.qcut(data,4) # 切分成四份
data_cate

[(-0.817, 0.0586], (0.0586, 0.828], (-0.817, 0.0586], (0.0586, 0.828], (-0.817, 0.0586], ..., (-2.776, -0.817], (0.828, 2.261], (-2.776, -0.817], (-0.817, 0.0586], (-0.817, 0.0586]]
Length: 100
Categories (4, interval[float64, right]): [(-2.776, -0.817] < (-0.817, 0.0586] < (0.0586, 0.828] < (0.828, 2.261]]

In [14]:
pd.value_counts(data_cate)

(-2.776, -0.817]    25
(-0.817, 0.0586]    25
(0.0586, 0.828]     25
(0.828, 2.261]      25
dtype: int64

#### 传入自定义的分位数

In [15]:
data_cate = pd.qcut(data,[0,0.1,0.5,0.9,1])
data_cate

[(-1.275, 0.0586], (0.0586, 1.297], (-1.275, 0.0586], (0.0586, 1.297], (-1.275, 0.0586], ..., (-2.776, -1.275], (0.0586, 1.297], (-1.275, 0.0586], (-1.275, 0.0586], (-1.275, 0.0586]]
Length: 100
Categories (4, interval[float64, right]): [(-2.776, -1.275] < (-1.275, 0.0586] < (0.0586, 1.297] < (1.297, 2.261]]

In [16]:
pd.value_counts(data_cate)

(-1.275, 0.0586]    40
(0.0586, 1.297]     40
(-2.776, -1.275]    10
(1.297, 2.261]      10
dtype: int64

# 2.检测和过滤异常值

In [17]:
data = pd.DataFrame(np.random.randint(-100,100,(6, 4)))
data

Unnamed: 0,0,1,2,3
0,3,-97,31,-16
1,-33,49,27,46
2,-72,-41,15,-96
3,8,-100,-84,4
4,1,76,-53,-15
5,45,-8,-81,-98


In [18]:
# 汇总统计
data.describe()

Unnamed: 0,0,1,2,3
count,6.0,6.0,6.0,6.0
mean,-8.0,-20.166667,-24.166667,-29.166667
std,39.969989,73.335985,54.473541,57.146887
min,-72.0,-100.0,-84.0,-98.0
25%,-24.5,-83.0,-74.0,-76.0
50%,2.0,-24.5,-19.0,-15.5
75%,6.75,34.75,24.0,-0.75
max,45.0,76.0,31.0,46.0


#### 假设你想在其中一列中查找绝对值超过 50 的值

In [19]:
col = data[2]

In [20]:
col[col.abs() > 50]

3   -84
4   -53
5   -81
Name: 2, dtype: int32

#### 如要选择值超过 50 或 –50 的所有行，可以在布尔值DataFrame上使用 any 方法

In [21]:
data[(data.abs() > 50).any(axis="columns")]

Unnamed: 0,0,1,2,3
0,3,-97,31,-16
2,-72,-41,15,-96
3,8,-100,-84,4
4,1,76,-53,-15
5,45,-8,-81,-98


#### 通过np.sign()，限制值超出 –50 到 50的区间

In [22]:
data[(data.abs() > 50)] = np.sign(data)*50

In [23]:
data

Unnamed: 0,0,1,2,3
0,3,-50,31,-16
1,-33,49,27,46
2,-50,-41,15,-50
3,8,-50,-50,4
4,1,50,-50,-15
5,45,-8,-50,-50
