In [1]:
%reload_ext autoreload
%autoreload 2

! [ ! -L /datasets ] && ln -s /data/datasets/ /datasets

from k12libs.utils.nb_easy import k12ai_set_notebook

k12ai_set_notebook(cellw=95)

## 需掌握知识点

KNN, 决策树，随机森林，集成学习等算法原理介绍

In [13]:
import pandas as pd
import numpy as np

from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Metrics
from sklearn.metrics import accuracy_score, confusion_matrix

## 加载数据
df_train = pd.read_csv('/datasets/ml/sf-crime/train.csv', parse_dates=['Dates'])

## 数据处理
def data_preprocess(df):
    # Date
    df['Year'] = df['Dates'].dt.year
    df['Month'] = df['Dates'].dt.month
    df['Day'] = df['Dates'].dt.day
    df['Hour'] = df['Dates'].dt.hour
    week_dict = {
        "Monday": 1,
        "Tuesday":2,
        "Wednesday":3,
        "Thursday":4,
        "Friday":5,
        "Saturday":6,
        "Sunday":7
    }
    df["DayOfWeek"].replace(week_dict, inplace=True)

    # Category
    categories = df['Category'].unique()
    category_dict = {}
    for i, c in enumerate(categories, 1):
        category_dict[c] = i
    category_dict
    df["Category"].replace(category_dict, inplace=True)

    # District
    district = df["PdDistrict"].unique()
    district_dict = {}
    for i, c in enumerate(district, 1):
        district_dict[c] = i
    df["PdDistrict"].replace(district_dict, inplace=True)

    df.drop(['Dates', 'Descript', 'Resolution', 'Address'] , axis=1, inplace=True)
    return df

df_train = data_preprocess(df_train)

## 数据分割
X = df_train.drop('Category', axis=1) # 特征
y = df_train['Category'] # 目标

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.2)

## 构建模型
xgb = XGBClassifier(
    learning_rate=0.1,         # eta
    booster='gbtree',          # gbtree:树模型做为基分类器（默认） gbliner:线性模型做为基分类器
    n_estimators=100,          # 决策树的个数(迭代次数)
    max_depth=12,              # 树的深度
    gamma=0.1,                 # 惩罚项系数，用于控制是否后剪枝的参数,越大越保守，一般0.1、0.2这样子
    subsample=0.8,             # 随机选择80%样本建立决策树
    objective='multi:softmax', # 目标函数(定损失函数)
    reg_alpha=1,               # L1正则化
    reg_lambda=2,              # L2正则化 参数越大，模型越不容易过拟合
    verbosity=1,               # 调试 0: slient, 1-3
    random_state=27            # 随机数
  )

## 训练预测
y_pred = xgb.fit(X_train, y_train).predict(X_test) # 用时比较长

## 模型特性    
print('正确率:')
print(accuracy_score(y_pred, y_test))
print('混淆矩阵:')
print(confusion_matrix(y_pred, y_test))

正确率:
0.30492568760321165
混淆矩阵:
[[  223   204    80 ...     0     1     0]
 [ 2270  9600  3930 ...    15   251     2]
 [ 2666  7833 25305 ...    36   216     3]
 ...
 [    0     0     0 ...     0     0     0]
 [    1     1     1 ...     0     0     0]
 [    0     0     0 ...     0     0     0]]
