In [1]:
%reload_ext autoreload
%autoreload 2

In [2]:
from k12libs.utils.nb_easy import k12ai_run_project

In [3]:
algo = 'gradient_boosting'
dataset = 'house-prices'

In [6]:
project = k12ai_run_project(debug=True, tb_port=9002,
    framework='k12ml', task='regressor', network=algo, dataset=dataset)

Box(children=(Box(children=(VBox(children=(Tab(children=(VBox(children=(HBox(children=(Text(value='regressor',…

-------------------

In [46]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn import preprocessing 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

from k12libs.utils.nb_easy import K12AI_DATASETS_ROOT
from k12libs.utils.nb_easy import k12ai_set_notebook

In [17]:
df_train = pd.read_csv(f"{K12AI_DATASETS_ROOT}/ml/house-prices/train.csv")
df_test = pd.read_csv(f"{K12AI_DATASETS_ROOT}/ml/house-prices/test.csv")
df_train.shape, df_test.shape

((1449, 71), (1448, 71))

In [20]:
# 添加新特征
df_train['TotalSA'] = df_train['TotalBsmtSF'] + df_train['1stFlrSF'] + df_train['2ndFlrSF']

In [21]:
# 顺序类别特征编码
bin_map  = {
    'TA':2,'Gd':3, 'Fa':1,'Ex':4,'Po':1,'None':0,'Y':1,'N':0,'Reg':3,'IR1':2,'IR2':1,'IR3':0,"None" : 0,
    "No" : 2, "Mn" : 2, "Av": 3,"Gd" : 4,"Unf" : 1, "LwQ": 2, "Rec" : 3,"BLQ" : 4, "ALQ" : 5, "GLQ" : 6
}
df_train['PavedDrive'] = df_train['PavedDrive'].map({'N' : 0, 'P' : 1, 'Y' : 2})
df_train['ExterQual'] = df_train['ExterQual'].map(bin_map)
df_train['ExterCond'] = df_train['ExterCond'].map(bin_map)
df_train['BsmtCond'] = df_train['BsmtCond'].map(bin_map)
df_train['BsmtQual'] = df_train['BsmtQual'].map(bin_map)
df_train['HeatingQC'] = df_train['HeatingQC'].map(bin_map)
df_train['KitchenQual'] = df_train['KitchenQual'].map(bin_map)
df_train['FireplaceQu'] = df_train['FireplaceQu'].map(bin_map)
df_train['GarageQual'] = df_train['GarageQual'].map(bin_map)
df_train['GarageCond'] = df_train['GarageCond'].map(bin_map)
df_train['CentralAir'] = df_train['CentralAir'].map(bin_map)
df_train['LotShape'] = df_train['LotShape'].map(bin_map)
df_train['BsmtExposure'] = df_train['BsmtExposure'].map(bin_map)
df_train['BsmtFinType1'] = df_train['BsmtFinType1'].map(bin_map)
df_train['BsmtFinType2'] = df_train['BsmtFinType2'].map(bin_map)

In [22]:
# 时间特征编码
lab = preprocessing.LabelEncoder()
df_train['YearBuilt'] = lab.fit_transform(df_train['YearBuilt'])
df_train['YearRemodAdd'] = lab.fit_transform(df_train['YearRemodAdd'])

In [23]:
# One-Hot编码
rest_object_df = df_train.select_dtypes(include=['object'])
df_train = pd.get_dummies(df_train, columns=rest_object_df.columns)
df_train.head()

Unnamed: 0,LotShape,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,HeatingQC,CentralAir,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,3,4,2,4,2,2,6,1,4,1,...,0,0,0,1,0,0,0,0,1,0
1,3,2,2,4,2,4,5,1,4,1,...,0,0,0,1,0,0,0,0,1,0
2,2,4,2,4,2,2,6,1,4,1,...,0,0,0,1,0,0,0,0,1,0
3,2,2,2,2,4,2,5,1,4,1,...,0,0,0,1,1,0,0,0,0,0
4,2,4,2,4,2,3,6,1,4,1,...,0,0,0,1,0,0,0,0,1,0


In [25]:
# 数据对数转换
skew = df_train.apply(lambda x: x.skew())
skew.drop(['SalePrice'], inplace=True)
skew_features = skew[abs(skew) >= 1].index
df_train[skew_features] = np.log1p(df_train[skew_features])

In [26]:
# 数据正则化
def normalize(x):
    if x.name in ['SalePrice']:
        return x
    return round((x - x.mean()) / x.std(), 3)
df_train = df_train.apply(normalize)
df_train.head()

Unnamed: 0,LotShape,ExterQual,ExterCond,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinType2,HeatingQC,CentralAir,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0.634,1.302,-0.266,0.944,-0.009,-0.38,1.166,-0.238,0.688,0.263,...,-0.059,-0.299,-0.046,0.388,-0.274,-0.053,-0.091,-0.118,0.466,-0.303
1,0.634,-0.744,-0.266,0.944,-0.009,2.341,0.691,-0.238,0.688,0.263,...,-0.059,-0.299,-0.046,0.388,-0.274,-0.053,-0.091,-0.118,0.466,-0.303
2,-0.851,1.302,-0.266,0.944,-0.009,-0.38,1.166,-0.238,0.688,0.263,...,-0.059,-0.299,-0.046,0.388,-0.274,-0.053,-0.091,-0.118,0.466,-0.303
3,-0.851,-0.744,-0.266,-0.829,3.575,-0.38,0.691,-0.238,0.688,0.263,...,-0.059,-0.299,-0.046,0.388,3.652,-0.053,-0.091,-0.118,-2.143,-0.303
4,-0.851,1.302,-0.266,0.944,-0.009,0.981,1.166,-0.238,0.688,0.263,...,-0.059,-0.299,-0.046,0.388,-0.274,-0.053,-0.091,-0.118,0.466,-0.303


In [41]:
X = df_train.loc[:, set(df_train.columns) - set(('SalePrice'))]
y = df_train.loc[:, 'SalePrice']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=.8)

In [42]:
gbr = GradientBoostingRegressor(
    loss = 'ls',  # least squares
    learning_rate = 0.1, 
)

In [43]:
gbr.fit(X_train, y_train)

GradientBoostingRegressor()

In [44]:
predict = gbr.predict(X_test)

In [47]:
r2_score(y_test, predict)

0.9998691702136395

In [48]:
mean_absolute_error(y_test, predict)

617.2697786762855

In [49]:
mean_squared_error(y_test, predict)

750603.1397320027

In [50]:
mean_squared_log_error(y_test, predict)

2.825897000274687e-05

In [51]:
median_absolute_error(y_test, predict)

395.6242583932908