In [7]:
!pip install category_encoders xgboost

Collecting xgboost
  Downloading xgboost-1.2.0-py3-none-win_amd64.whl (86.5 MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.2.0


In [23]:
import os

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

import category_encoders as ce
from xgboost import XGBRegressor

import joblib

In [10]:
# `data` is the labelled file (it carries the SalePrice column used below);
# `submit` is the file that predictions are later produced for.
data = pd.read_csv(
    '../data/house-prices-advanced-regression-techniques/train.csv'
)
submit = pd.read_csv(
    '../data/house-prices-advanced-regression-techniques/test.csv'
)

In [11]:
# Hold out 20% of the labelled rows; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='SalePrice'),
    data['SalePrice'].values,
    test_size=0.2,
    random_state=233,
)

In [14]:
def feature_list(df, exclude, value_threshold):
    """Partition the columns of ``df`` into categorical and quantitative names.

    A column counts as categorical when its dtype is object, or when
    ``nunique()`` reports at most ``value_threshold`` distinct values.
    Every other column is quantitative. Columns named in ``exclude`` appear
    in neither list.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    exclude : container of column names
        Columns to skip entirely (membership is tested with ``in``).
    value_threshold : int
        Largest distinct-value count still treated as categorical.

    Returns
    -------
    tuple of (list, list)
        ``(cat, quant)`` column names, each in ``df.columns`` order.
    """
    cat, quant = [], []

    for name in df.columns:
        if name in exclude:
            continue

        column = df[name]
        # The dtype check comes first so nunique() is only computed for
        # non-object columns.
        looks_categorical = column.dtype == 'O' or column.nunique() <= value_threshold
        bucket = cat if looks_categorical else quant
        bucket.append(name)

    return cat, quant

In [15]:
# Classify every column except the target and the row identifier. With this
# threshold, object-dtype columns and columns holding at most 16 distinct
# values end up in `cat`; the rest go to `quant`.
cat, quant = feature_list(
    data,
    exclude=('SalePrice', 'Id'),
    value_threshold=16,
)

In [17]:
# [preprocessing training set]

# Fit the CatBoost encoder on the categorical columns of the training split
# only, using the training targets, and keep the encoded training frame.
cbe = ce.CatBoostEncoder(random_state=1)
X_train_cat_encoded = cbe.fit_transform(
    X_train[cat],
    y_train,
)

In [18]:
# Place the untouched quantitative columns first, followed by the encoded
# categorical columns, side by side.
X_train_encoded = pd.concat(
    [X_train[quant], X_train_cat_encoded],
    axis='columns',
)

In [24]:
# Learn the scaling statistics from the encoded training frame, then apply
# them to that same frame.
ss = StandardScaler().fit(X_train_encoded)
X_train_normed = ss.transform(X_train_encoded)

In [30]:
# Persist the fitted encoder and scaler to disk.
# Create the target directory first: joblib.dump does not create missing
# parent directories, so a fresh checkout without ../model would otherwise
# fail with FileNotFoundError.
os.makedirs('../model', exist_ok=True)
joblib.dump(cbe, '../model/cbe.joblib')
joblib.dump(ss, '../model/ss.joblib')

['../model/ss.joblib']

In [47]:
def preprocessing(X):
    """Run a raw feature frame through the already-fitted encoder and scaler.

    Relies on notebook-level state: the ``cat`` / ``quant`` column lists, the
    fitted ``cbe`` encoder and the fitted ``ss`` scaler. Nothing is refitted
    here; only ``transform`` is called on both objects.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing at least the columns listed in ``cat`` and ``quant``.

    Returns
    -------
    The output of ``ss.transform`` applied to the quantitative columns
    followed by the encoded categorical columns.
    """
    encoded_categoricals = cbe.transform(X[cat])
    # Column order (quantitative first, then encoded categoricals) mirrors
    # how the training frame was assembled before the scaler was fitted.
    combined = pd.concat([X[quant], encoded_categoricals], axis=1)
    return ss.transform(combined)

In [41]:
# [fitting model]

# n_estimators is deliberately huge: the fit call uses early stopping, so
# this acts as an upper bound rather than the number of trees actually built.
xgb_rgr = XGBRegressor(
    n_estimators=100_000,
    max_depth=7,
    random_state=555,
)

In [51]:
# Train on the scaled training matrix and monitor rmsle on the preprocessed
# held-out split, stopping once it has not improved for 10 rounds.
# NOTE(review): the held-out split that drives early stopping is also the
# only evaluation data in this notebook, so the logged rmsle is presumably
# optimistic -- consider a separate validation split.
xgb_rgr.fit(
    X_train_normed,
    y_train,
    eval_set=[(preprocessing(X_test), y_test)],
    eval_metric='rmsle',
    early_stopping_rounds=10,
)

[0]	validation_0-rmsle:1.21681
Will train until validation_0-rmsle hasn't improved in 10 rounds.
[1]	validation_0-rmsle:0.69832
[2]	validation_0-rmsle:0.45330
[3]	validation_0-rmsle:0.31723
[4]	validation_0-rmsle:0.23724
[5]	validation_0-rmsle:0.19195
[6]	validation_0-rmsle:0.16788
[7]	validation_0-rmsle:0.15295
[8]	validation_0-rmsle:0.14500
[9]	validation_0-rmsle:0.14052
[10]	validation_0-rmsle:0.13754
[11]	validation_0-rmsle:0.13600
[12]	validation_0-rmsle:0.13514
[13]	validation_0-rmsle:0.13497
[14]	validation_0-rmsle:0.13415
[15]	validation_0-rmsle:0.13396
[16]	validation_0-rmsle:0.13373
[17]	validation_0-rmsle:0.13374
[18]	validation_0-rmsle:0.13373
[19]	validation_0-rmsle:0.13336
[20]	validation_0-rmsle:0.13330
[21]	validation_0-rmsle:0.13357
[22]	validation_0-rmsle:0.13364
[23]	validation_0-rmsle:0.13346
[24]	validation_0-rmsle:0.13367
[25]	validation_0-rmsle:0.13359
[26]	validation_0-rmsle:0.13366
[27]	validation_0-rmsle:0.13370
[28]	validation_0-rmsle:0.13380
[29]	validation_

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100000, n_jobs=0, num_parallel_tree=1,
             random_state=555, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, tree_method='exact', validate_parameters=1,
             verbosity=None)

In [53]:
# Persist the trained regressor to disk.
# Create the target directory first: joblib.dump does not create missing
# parent directories, so a missing ../model would raise FileNotFoundError.
os.makedirs('../model', exist_ok=True)
joblib.dump(xgb_rgr, '../model/xgb_rgr.joblib')

['../model/xgb_rgr.joblib']

In [56]:
# Build the submission frame: one row per Id in `submit`, paired with the
# model's prediction for that row after the shared preprocessing step.
score = pd.DataFrame(
    {
        'Id': submit['Id'].values,
        'SalePrice': xgb_rgr.predict(preprocessing(submit)),
    }
)

In [58]:
# Write the predictions without the DataFrame index, so the file contains
# only the Id and SalePrice columns.
score.to_csv(
    '../data/submit.csv',
    index=False,
)