In [1]:
%reload_ext autoreload
%autoreload 2

! [ -L /datasets ] && rm -f /datasets
! ln -s /data/datasets/ /datasets

from k12libs.utils.nb_easy import k12ai_set_notebook

k12ai_set_notebook(cellw=95)

## 需掌握知识点

KNN, 决策树，随机森林，集成学习等算法原理介绍

In [2]:
import pandas as pd
import numpy as np

from sklearn import preprocessing 
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

# Metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

## 加载数据
df_train = pd.read_csv('/datasets/ml/house-prices/train.csv')

## 数据处理
def data_preprocess(df):
    ### 添加新的特征
    df['TotalSA'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF']

    ### 有序类别编码
    bin_map  = {
        'TA':2,'Gd':3, 'Fa':1,'Ex':4,'Po':1,'None':0,'Y':1,'N':0,'Reg':3,'IR1':2,'IR2':1,'IR3':0,"None" : 0,
        "No" : 2, "Mn" : 2, "Av": 3,"Gd" : 4,"Unf" : 1, "LwQ": 2, "Rec" : 3,"BLQ" : 4, "ALQ" : 5, "GLQ" : 6
    }
    df['PavedDrive'] = df['PavedDrive'].map({'N' : 0, 'P' : 1, 'Y' : 2})
    df['ExterQual'] = df['ExterQual'].map(bin_map)
    df['ExterCond'] = df['ExterCond'].map(bin_map)
    df['BsmtCond'] = df['BsmtCond'].map(bin_map)
    df['BsmtQual'] = df['BsmtQual'].map(bin_map)
    df['HeatingQC'] = df['HeatingQC'].map(bin_map)
    df['KitchenQual'] = df['KitchenQual'].map(bin_map)
    df['FireplaceQu'] = df['FireplaceQu'].map(bin_map)
    df['GarageQual'] = df['GarageQual'].map(bin_map)
    df['GarageCond'] = df['GarageCond'].map(bin_map)
    df['CentralAir'] = df['CentralAir'].map(bin_map)
    df['LotShape'] = df['LotShape'].map(bin_map)
    df['BsmtExposure'] = df['BsmtExposure'].map(bin_map)
    df['BsmtFinType1'] = df['BsmtFinType1'].map(bin_map)
    df['BsmtFinType2'] = df['BsmtFinType2'].map(bin_map)

    ### 时间特征编码
    lab = preprocessing.LabelEncoder()
    df['YearBuilt'] = lab.fit_transform(df['YearBuilt'])
    df['YearRemodAdd'] = lab.fit_transform(df['YearRemodAdd'])

    ### One-Hot编码
    rest_object_df = df.select_dtypes(include=['object'])
    df = pd.get_dummies(df, columns=rest_object_df.columns)
    df.head()

    ### 数据对数转换
    skew = df.apply(lambda x: x.skew())
    skew.drop(['SalePrice'], inplace=True)
    skew_features = skew[abs(skew) >= 1].index
    df[skew_features] = np.log1p(df[skew_features])

    # 数据正则化
    def normalize(x):
        if x.name in ['SalePrice']:
            return x
        return round((x - x.mean()) / x.std(), 3)
    return df.apply(normalize)

df_train = data_preprocess(df_train)

## 数据分割
X = df_train.drop('SalePrice', axis=1) # 特征
y = df_train['SalePrice'] # 目标
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=.8)

## 构建模型
gbr = GradientBoostingRegressor(
    loss = 'ls',  # least squares
    learning_rate = 0.1, 
)

## 训练预测
y_pred = gbr.fit(X_train, y_train).predict(X_test)

## 模型特性
print('R^2:', r2_score(y_test, y_pred))
print('MAE:', mean_absolute_error(y_test, y_pred))
print('MSE:', mean_squared_error(y_test, y_pred))
print('MSLE', mean_squared_log_error(y_test, y_pred))

R^2: 0.8979223692688175
MAE: 16141.597900173347
MSE: 612172713.8923123
MSLE 0.018965317343788066
