In [1]:
%reload_ext autoreload
%autoreload 2

! [ -L /datasets ] && rm -f /datasets
! ln -s /data/datasets/ /datasets

from k12libs.utils.nb_easy import k12ai_set_notebook

k12ai_set_notebook(cellw=95)

## 需掌握知识点

KNN, 决策树，随机森林，集成学习等算法原理介绍

In [2]:
import pandas as pd
import numpy as np
from pyr.app.k12ai import EasyaiClassifier, EasyaiTrainer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix

## 加载数据
df_train = pd.read_csv('/datasets/ml/titanic/train.csv')

## 数据处理
def data_preprocess(df):
    # Age
    def _impute_age(cols):
        Age = cols[0]
        Pclass = cols[1]
        if pd.isnull(Age):
            if Pclass == 1:
                return 37
            elif Pclass == 2:
                return 29
            else:
                return 24
        else:
            return Age
    df['Age'] = df[['Age','Pclass']].apply(_impute_age, axis=1)

    # Sex
    sex_dict = {
        'male': 0,
        'female': 1,
    }
    df["Sex"].replace(sex_dict, inplace=True)

    # Embarked
    df['Embarked'].fillna('S', inplace=True)
    emb_dict = {
        'S': 0,
        'C': 1,
        'Q': 2,
    }
    df["Embarked"].replace(emb_dict, inplace=True)
    df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
    return df

df_train = data_preprocess(df_train)
            
## 数据分割
X = df_train.drop('Survived', axis=1) # 特征
y = df_train['Survived'] # 目标
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=.7)
 
## 构建模型
knn = KNeighborsClassifier(
    n_neighbors = 5,        # [O] 近邻的个数
    weights='uniform',      # [O] 权重 1. uniform: 无论近邻距离多远, 权重一样 2. 'distance': 距离越近权重越大
    algorithm = 'auto',     # [O] 构建算法 1. audo: 自动选择 2. ball_tree(球树) 3. kd_tree(KD树) 4. brute(蛮力实现)
    leaf_size = 30,         # [O] 建子树的叶子节点数量的阈值(algorithm==brute时无效)
    metric = 'minkowski',   # [O] 距离函数 1. euclidean(欧式距离) 2. manhattan(曼哈顿距离) 3. chebyshev(切比雪夫距离) 4. minkowski(闵可夫斯基距离)
    p = 2                   # [O] 当metric='minkowski'时有效, 作为距离函数的参数                  
)
    
## 训练评估
y_pred = knn.fit(X_train, y_train).predict(X_test)

## 模型特性    
print('正确率:')
print(accuracy_score(y_pred, y_test))
print('混淆矩阵:')
print(confusion_matrix(y_pred, y_test))

# 预测
features = np.array([
    [2.0, 1.0, 22, 1.0, 1.0, 29.0, 0.0], # 存活 [1]
    [9.0, 0.0, 22, 0.0, 0.0, 9.0, 0.0],  # 死亡 [0]
])
y_pred = knn.predict(features)
print('预测结果:', y_pred) 

正确率:
0.7014925373134329
混淆矩阵:
[[125  42]
 [ 38  63]]
预测结果: [1 0]
