In [18]:
# This is to supress the warning messages (if any) generated in our code
import warnings
warnings.filterwarnings('ignore')

# Import of fundamental libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics as sts
%matplotlib inline

# Machine learning

### Processing

In [22]:
import pandas as pd
full_data = pd.read_csv('full_data.csv',index_col=0)

In [23]:
####################################
## spliting train and test datasets 
###################################
import numpy as np

X = full_data.iloc[:1451,]
y = X['SalePrice']
test = full_data.iloc[1451:,]

X.drop('SalePrice',axis=1,inplace=True)
test.drop('SalePrice',axis=1,inplace=True)

In [25]:
##############################################
## Scaling the data
#############################################
from sklearn.preprocessing import RobustScaler

cols = X.select_dtypes(np.number).columns
transformer = RobustScaler().fit(X[cols])
X[cols] = transformer.transform(X[cols])
test[cols] = transformer.transform(test[cols])

In [26]:
##############################################
## Split data in train and valuation datasetes
##############################################
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=21)

In [27]:
##############################################
# XGBoost
#############################################
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV

# Instanciate the model
xgb = XGBRegressor(booster='gbtree', objective='reg:squarederror')

#list of parameters to optimize
param_lst = {
    'learning_rate' : [0.01, 0.1, 0.15, 0.3, 0.5],
    'n_estimators' : [100, 500, 1000, 2000, 3000],
    'max_depth' : [3, 6, 9],
    'min_child_weight' : [1, 5, 10, 20],
    'reg_alpha' : [0.001, 0.01, 0.1],
    'reg_lambda' : [0.001, 0.01, 0.1]
}

# Randomizedd search instance
xgb_reg = RandomizedSearchCV(estimator = xgb, 
                             param_distributions = param_lst,
                             n_iter = 100,
                             scoring = 'neg_root_mean_squared_error',
                             cv = 5)

In [28]:
# Looking for the best parametes and timing the search
import time
start = time.time()
xgb_search = xgb_reg.fit(X_train, y_train)
stop = time.time()
ttime = (stop-start)/60
print(f'Tuning XGBoost hyperparameters:{ttime:.2f} minutes')

best_param = xgb_search.best_params_
xgb = XGBRegressor(**best_param)ui

Tuning XGBoost hyperparameters:32.51 minutes


### Training and Evaluation

In [29]:
#####################################
## function to calculate the mean score of cross_val
#####################################
def mean_cross_val(model, X, y):
    score = cross_val_score(model, X, y, cv=5)
    mean = score.mean()
    return mean

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import cross_val_score

start = time.time()
xgb.fit(X_train, y_train)   
stop = time.time()

preds = xgb.predict(X_val) 
preds_test_xgb = xgb.predict(test)
mae_xgb = mean_absolute_error(y_val, preds)
rmse_xgb = np.sqrt(mean_squared_error(y_val, preds))
score_xgb = xgb.score(X_val, y_val)
cv_xgb = mean_cross_val(xgb, X, y)

print(f'Mean Absolute Error: {mae_xgb:.4f}')
print(f'Root of Mean Squared Error: {rmse_xgb:.4f}')
print(f'Score (R^2): {score_xgb:.4f}')
print(f'Mean of cross_val score: {cv_xgb:.4f}')
print(f'Time to train the model:{stop-start:.3f} seconds')

Mean Absolute Error: 0.0900
Root of Mean Squared Error: 0.1276
Score (R^2): 0.8979
Mean of cross_val score: 0.8915
Time to train the model:4.935 seconds


In [30]:
subm = np.exp(preds_test_xgb)
output = pd.DataFrame({'Id': test.index,
                       'SalePrice': subm})
output.to_csv('submission_v1.csv',index=False)
print('Ready!!')

Ready!!


In [12]:
subm = np.exp(preds_test_xgb)

In [35]:
X.shape

(1451, 189)

In [36]:
df = pd.DataFrame(columns=["Feature", "Importance"])
df["Feature"] = X.columns
df["Importance"] = xgb.feature_importances_
df.sort_values("Importance", axis=0, ascending=False, inplace=True)


In [38]:
df.head(10)

Unnamed: 0,Feature,Importance
3,OverallQual,0.220628
24,KitchenQual,0.107038
37,MSZoning_RM,0.04477
148,CentralAir_N,0.036155
9,BsmtQual,0.035822
26,FireplaceQu,0.035328
109,Exterior1st_BrkComm,0.034103
19,GrLivArea,0.030473
170,GarageFinish_Unf,0.028241
103,RoofStyle_Gambrel,0.026424
