## References
https://www.bilibili.com/video/BV1QP4y1t76A?p=1

In [1]:
import pandas as pd
import numpy as np

In [2]:
data = {
    'size': ['XL', 'L', 'M', np.nan, 'M', 'M'],
    'color': ['red', 'green', 'blue', 'green', 'red', 'green'],
    'gender': ['female', 'male', np.nan, 'female', 'female', 'male'],
    'price': [199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    'weight': [500, 450, 300, np.nan, 410, np.nan],
    'bought': ['yes', 'no', 'yes', 'no', 'yes', 'no']
}

In [5]:
df = pd.DataFrame(data)
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


## 1. 检查每列的缺失值

In [7]:
df.isnull().sum()

size      1
color     0
gender    1
price     1
weight    2
bought    0
dtype: int64

In [9]:
round(df.isnull().sum() / len(df), 2)

size      0.17
color     0.00
gender    0.17
price     0.17
weight    0.33
bought    0.00
dtype: float64

## 2. 填充缺失值

### 2.1 填充缺失值——使用列平均值填充缺失值

In [14]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
df[['weight']] = imputer.fit_transform(df[['weight']])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


### 2.2 填充缺失值——获取SimpleImputer填充缺失值的统计值

In [15]:
imputer.statistics_

array([415.])

In [16]:
imputer.statistics_[0]

415.0

### 2.3 填充缺失值——使用常量填充缺失值

In [17]:
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [19]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value=99.0)  # shift + Tab查看函数说明
df[['price']] = imputer.fit_transform(df[['price']])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [20]:
imputer.statistics_

array([99.])

### 2.4 填充缺失值——使用最频繁的值做缺失值填充

In [21]:
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


In [22]:
imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df[['size']] = imputer.fit_transform(df[['size']])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,99.0,300.0,yes
3,M,green,female,129.0,415.0,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,415.0,no


### 2.5 填充缺失值——过滤掉缺失值的行并统计
* 筛选出非空行
* 筛选出数值列
* 进行均值统计

In [23]:
data = {
    'size': ['XL', 'L', 'M', np.nan, 'M', 'M'],
    'color': ['red', 'green', 'blue', 'green', 'red', 'green'],
    'gender': ['female', 'male', np.nan, 'female', 'female', 'male'],
    'price': [199.0, 89.0, np.nan, 129.0, 79.0, 89.0],
    'weight': [500, 450, 300, np.nan, 410, np.nan],
    'bought': ['yes', 'no', 'yes', 'no', 'yes', 'no']
}
df = pd.DataFrame(data)
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,,,300.0,yes
3,,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


In [27]:
df[~df['weight'].isnull()].select_dtypes(include=['float']).mean()

price     122.333333
weight    415.000000
dtype: float64

### 2.6 填充缺失值——同时对多列字符串数据使用常量填充缺失值

In [29]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='constant', fill_value='empty')
columns = df.select_dtypes(include=['object']).columns
columns

Index(['size', 'color', 'gender', 'bought'], dtype='object')

In [31]:
df.loc[:, columns] = imputer.fit_transform(df[columns])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500.0,yes
1,L,green,male,89.0,450.0,no
2,M,blue,empty,,300.0,yes
3,empty,green,female,129.0,,no
4,M,red,female,79.0,410.0,yes
5,M,green,male,89.0,,no


## 3. 数值离散化
### 3.1 数值离散化——等宽区间

In [37]:
import pandas as pd
df = pd.DataFrame(data={'weight': [75., 78.5, 85., 91., 84.5, 83, 68.]})
df

Unnamed: 0,weight
0,75.0
1,78.5
2,85.0
3,91.0
4,84.5
5,83.0
6,68.0


In [38]:
df['weight_cut'] = pd.cut(df['weight'], bins=3)
df

Unnamed: 0,weight,weight_cut
0,75.0,"(67.977, 75.667]"
1,78.5,"(75.667, 83.333]"
2,85.0,"(83.333, 91.0]"
3,91.0,"(83.333, 91.0]"
4,84.5,"(83.333, 91.0]"
5,83.0,"(75.667, 83.333]"
6,68.0,"(67.977, 75.667]"


In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   weight      7 non-null      float64 
 1   weight_cut  7 non-null      category
dtypes: category(1), float64(1)
memory usage: 319.0 bytes


### 3.2 数值离散化——指定区间

In [40]:
import pandas as pd
df = pd.DataFrame(data={'weight': [75., 78.5, 85., 91., 84.5, 83, 68.]})
df

Unnamed: 0,weight
0,75.0
1,78.5
2,85.0
3,91.0
4,84.5
5,83.0
6,68.0


In [41]:
df['weight_cut'] = pd.cut(df['weight'], bins=[60, 75, 80, 95])
df

Unnamed: 0,weight,weight_cut
0,75.0,"(60, 75]"
1,78.5,"(75, 80]"
2,85.0,"(80, 95]"
3,91.0,"(80, 95]"
4,84.5,"(80, 95]"
5,83.0,"(80, 95]"
6,68.0,"(60, 75]"


### 3.3 数值离散化——指定区间标签

In [44]:
import pandas as pd
df = pd.DataFrame(data={'weight': [75., 78.5, 85., 91., 84.5, 83, 68.]})
df['weight_cut'] = pd.cut(df['weight'], bins=[60, 75, 80, 95], labels=['light', 'normal', 'heavy'])
df

Unnamed: 0,weight,weight_cut
0,75.0,light
1,78.5,normal
2,85.0,heavy
3,91.0,heavy
4,84.5,heavy
5,83.0,heavy
6,68.0,light


### 3.4 数值离散化——one-hot编码

In [48]:
import pandas as pd
df = pd.DataFrame(data={'weight': [75., 78.5, 85., 91., 84.5, 83, 68.]})
df['weight_cut'] = pd.cut(df['weight'], bins=[60, 75, 80, 95], labels=['light', 'normal', 'heavy'])
df

Unnamed: 0,weight,weight_cut
0,75.0,light
1,78.5,normal
2,85.0,heavy
3,91.0,heavy
4,84.5,heavy
5,83.0,heavy
6,68.0,light


In [49]:
df = pd.get_dummies(df)
df

Unnamed: 0,weight,weight_cut_light,weight_cut_normal,weight_cut_heavy
0,75.0,1,0,0
1,78.5,0,1,0
2,85.0,0,0,1
3,91.0,0,0,1
4,84.5,0,0,1
5,83.0,0,0,1
6,68.0,1,0,0


## 4. 特征提取
### 4.1 特征提取——元素的个数

In [51]:
import pandas as pd
data_dict = {
    'currency': [
        ['PLN', 'USD'],
        ['EUR', 'USD', 'PLN', 'CAD'],
        ['GBP'],
        ['JPY'], 
        ['JPY', 'CZK', 'HUF'],
        []
    ]
}
df = pd.DataFrame(data_dict)
df

Unnamed: 0,currency
0,"[PLN, USD]"
1,"[EUR, USD, PLN, CAD]"
2,[GBP]
3,[JPY]
4,"[JPY, CZK, HUF]"
5,[]


In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6 entries, 0 to 5
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   currency  6 non-null      object
dtypes: object(1)
memory usage: 176.0+ bytes


In [53]:
type(df.iloc[0][0])

list

In [55]:
df['number'] = df['currency'].map(len)
df

Unnamed: 0,currency,number
0,"[PLN, USD]",2
1,"[EUR, USD, PLN, CAD]",4
2,[GBP],1
3,[JPY],1
4,"[JPY, CZK, HUF]",3
5,[],0


### 4.2 特征提取——是否包含元素 

In [56]:
import pandas as pd
data_dict = {
    'currency': [
        ['PLN', 'USD'],
        ['EUR', 'USD', 'PLN', 'CAD'],
        ['GBP'],
        ['JPY'], 
        ['JPY', 'CZK', 'HUF'],
        []
    ]
}
df = pd.DataFrame(data_dict)
df

Unnamed: 0,currency
0,"[PLN, USD]"
1,"[EUR, USD, PLN, CAD]"
2,[GBP]
3,[JPY]
4,"[JPY, CZK, HUF]"
5,[]


In [57]:
df['USD_flag'] = df['currency'].map(lambda x: 1 if 'USD' in x else 0)
df

Unnamed: 0,currency,USD_flag
0,"[PLN, USD]",1
1,"[EUR, USD, PLN, CAD]",1
2,[GBP],0
3,[JPY],0
4,"[JPY, CZK, HUF]",0
5,[],0


### 4.3 特征提取——从字符串提取标签

In [58]:
import pandas as pd
df = pd.DataFrame(
    {
        'tags': [
            '#good#vibes',
            '#hot#summer#holiday',
            '#street#food',
            '#workout'
        ]
    }
)
df

Unnamed: 0,tags
0,#good#vibes
1,#hot#summer#holiday
2,#street#food
3,#workout


In [59]:
df = df['tags'].str.split('#', expand=True)
df

Unnamed: 0,0,1,2,3
0,,good,vibes,
1,,hot,summer,holiday
2,,street,food,
3,,workout,,


In [60]:
df = df.drop(columns=0)
df

Unnamed: 0,1,2,3
0,good,vibes,
1,hot,summer,holiday
2,street,food,
3,workout,,


In [62]:
df.columns = ['tag1', 'tag2', 'tag3']
df

Unnamed: 0,tag1,tag2,tag3
0,good,vibes,
1,hot,summer,holiday
2,street,food,
3,workout,,


### 4.4 特征提取——每行缺失值个数

In [63]:
df['missing'] = df.isnull().sum(axis=1)
df

Unnamed: 0,tag1,tag2,tag3,missing
0,good,vibes,,1
1,hot,summer,holiday,0
2,street,food,,1
3,workout,,,2


### 4.5 特征提取——字符串清理转数字类型

In [64]:
import pandas as pd
df = pd.DataFrame(
    {
        'investments': [
            '100_000_000',
            '100_000',
            '30_000_000',
            '100_500_000'
        ]
    }
)
df

Unnamed: 0,investments
0,100_000_000
1,100_000
2,30_000_000
3,100_500_000


In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   investments  4 non-null      object
dtypes: object(1)
memory usage: 160.0+ bytes


In [66]:
df['investments'] = df['investments'].str.replace('_', '').astype(int)
df

Unnamed: 0,investments
0,100000000
1,100000
2,30000000
3,100500000


In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 1 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   investments  4 non-null      int64
dtypes: int64(1)
memory usage: 160.0 bytes


## 5. IRIS数据
### 5.1 IRIS数据——加载认识数据

In [68]:
from sklearn.datasets import load_iris
iris = load_iris()

In [69]:
type(iris)

sklearn.utils.Bunch

In [70]:
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [71]:
iris['filename']

'/Users/apple/opt/anaconda3/lib/python3.7/site-packages/sklearn/datasets/data/iris.csv'

In [80]:
iris['data'][:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

In [83]:
iris['target'][:5]

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

### 5.2 IRIS数据——查看列名和分类名

In [84]:
iris['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [85]:
pd.DataFrame(data=iris['data'], columns=iris['feature_names'])

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


### 5.3 IRIS数据——数据和目标的shape

In [86]:
data = iris['data']

In [87]:
type(data)

numpy.ndarray

In [88]:
data.shape

(150, 4)

In [89]:
target = iris['target']

In [90]:
type(target)

numpy.ndarray

In [91]:
target.shape

(150,)

In [92]:
pd.Series(target).unique()

array([0, 1, 2])

### 5.4 IRIS数据——分割训练集测试集

In [93]:
from sklearn.model_selection import train_test_split

In [94]:
data_train, data_test, target_train, target_test = \
    train_test_split(data, target, test_size=0.3)

In [95]:
data_train.shape

(105, 4)

In [96]:
target_train.shape

(105,)

In [97]:
data_test.shape

(45, 4)

In [98]:
target_test.shape

(45,)

### 5.5 IRIS数据——逻辑回归训练

In [99]:
from sklearn.linear_model import LogisticRegression

In [100]:
model = LogisticRegression(max_iter=1000)

In [101]:
model.fit(data_train, target_train)  

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [102]:
model.score(data_train, target_train)  # 计算模型在训练集的准确率

0.9619047619047619

In [103]:
model.score(data_test, target_test)  # 计算模型在测试集的准确率

0.9333333333333333

### 5.6 IRIS数据——在测试集上实现预估

In [104]:
model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [105]:
data_test.shape

(45, 4)

In [108]:
target_pred = model.predict(data_test)

In [109]:
target_pred.shape

(45,)

In [110]:
target_pred

array([2, 2, 1, 1, 2, 0, 0, 2, 1, 0, 1, 2, 1, 0, 1, 0, 2, 0, 0, 2, 0, 2,
       0, 2, 1, 0, 2, 1, 2, 2, 1, 2, 1, 2, 2, 1, 2, 1, 1, 0, 0, 1, 1, 1,
       0])

### 5.7 IRIS数据——理解混淆矩阵

In [112]:
from sklearn.metrics import confusion_matrix

In [114]:
confusion_matrix(target_test, target_pred)

array([[13,  0,  0],
       [ 0, 13,  0],
       [ 0,  3, 16]])

### 5.8 IRIS数据——理解分类报告

In [115]:
from sklearn.metrics import classification_report

In [118]:
print(classification_report(target_test, target_pred, target_names=iris['target_names']))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        13
  versicolor       0.81      1.00      0.90        13
   virginica       1.00      0.84      0.91        19

    accuracy                           0.93        45
   macro avg       0.94      0.95      0.94        45
weighted avg       0.95      0.93      0.93        45



## 6 分类列编码
### 6.1 分类列编码——预估目标列

In [120]:
import pandas as pd
data = {
    'size': ['XL', 'L', 'M', 'L', 'M'],
    'color': ['red', 'green', 'blue', 'green', 'red'],
    'gender': ['female', 'male', 'male', 'female', 'female'],
    'price': [199.0, 89.0, 99.0, 129.0, 79.0],
    'weight': [500, 450, 300, 380, 410],
    'bought': ['yes', 'no', 'yes', 'no', 'yes']
}
df = pd.DataFrame(data=data)
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,yes
1,L,green,male,89.0,450,no
2,M,blue,male,99.0,300,yes
3,L,green,female,129.0,380,no
4,M,red,female,79.0,410,yes


In [121]:
from sklearn.preprocessing import LabelEncoder

In [123]:
labelEncoder = LabelEncoder()
df['bought'] = labelEncoder.fit_transform(df['bought'])
df

Unnamed: 0,size,color,gender,price,weight,bought
0,XL,red,female,199.0,500,1
1,L,green,male,89.0,450,0
2,M,blue,male,99.0,300,1
3,L,green,female,129.0,380,0
4,M,red,female,79.0,410,1


In [124]:
labelEncoder.classes_

array(['no', 'yes'], dtype=object)

In [125]:
labelEncoder.inverse_transform(df['bought'])

array(['yes', 'no', 'yes', 'no', 'yes'], dtype=object)

### 6.2 分类列编码——OneHot编码

In [126]:
from sklearn.preprocessing import OneHotEncoder

In [127]:
oneHotEncoder = OneHotEncoder(sparse=False)

In [129]:
oneHotEncoder.fit(df[['size']])

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=False)

In [130]:
oneHotEncoder.transform(df[['size']])

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [131]:
oneHotEncoder.categories_

[array(['L', 'M', 'XL'], dtype=object)]

##  7 乳腺癌数据集——加载并认识数据

In [132]:
from sklearn.datasets import load_breast_cancer

In [135]:
breast_cancer = load_breast_cancer()

In [136]:
print(breast_cancer['DESCR'])

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 3 is Mean Radius, f

### 7.1  乳腺癌数据集——查看data和target

In [145]:
data = breast_cancer['data']
target = breast_cancer['target']

In [146]:
data.shape

(569, 30)

In [147]:
target.shape

(569,)

In [148]:
type(data), type(target)

(numpy.ndarray, numpy.ndarray)

In [149]:
data[:5]

array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
        8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
        3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
        1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
        1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, 1.203e+03, 1.096e-01, 1.599e-01,
        1.974e-01, 1.279e-01, 2.069e-01, 5.999e-02, 7.456e-01, 7.869e-01,
        4.585e+00, 9.403e+01, 6.150e-03, 4.006e-02, 3.832e-02, 2.058e-02,
        2.250e-02, 4.571e-03, 2.357e

In [150]:
target[:5]

array([0, 0, 0, 0, 0])

### 7.2 乳腺癌数据集——合并data和target

In [151]:
import numpy as np
all_datas = np.c_[data, target]

In [152]:
all_datas.shape

(569, 31)