In [None]:
# Generic imports for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
# from sklearn.ensemble import HistGradientBoostingRegressor
import xgboost as xgb

# Render floats in DataFrame output with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

# Interactive notebook mode: "all" makes every expression statement in a cell
# display its value, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

#### Read and pre-process data

In [None]:
# Load the housing data set from its CSV file and preview the first rows.
housing_data = pd.read_csv(filepath_or_buffer='Kaggle_Data/house_price_subset_expanded.csv')

housing_data.head()

In [None]:
# Keep only the rows whose SaleCondition is 'Normal'. The column is constant
# after filtering, so it is dropped. Row counts are shown before and after.

len(housing_data)

is_normal_sale = housing_data['SaleCondition'].eq('Normal')
subset_data = housing_data.loc[is_normal_sale].drop(columns = 'SaleCondition')

subset_data.head()

len(subset_data)

In [None]:
# Number of missing (NaN) values in each column of the filtered data.
subset_data.isna().sum()

In [None]:
# Column labels currently present in the working frame.
subset_data.columns

In [None]:
# Add non-linear transformations of existing columns as extra features.
# The `if True:` guard is a manual switch: set it to False to skip this step.
if True:
    subset_data = subset_data.assign(
        LogLotArea = np.log(subset_data['LotArea']),
        # NOTE(review): despite the "over" in the name this is the PRODUCT of
        # lot area and living area, not a ratio. Confirm which was intended.
        LotoverLiv = subset_data['LotArea'] * subset_data['GrLivArea'],
        OvQualSq = subset_data['OverallQual'] ** 2,
        OvCondsq = subset_data['OverallCond'] ** 2,
        OvCondcu = subset_data['OverallCond'] ** 3,
        BedAbGndsq = subset_data['BedroomAbvGr'] ** 2,
        GarageCarsSq = subset_data['GarageCars'] ** 2,
    )

In [None]:
# Collapse category levels before one-hot encoding.
#
# The results are assigned back to the column instead of calling
# `subset_data[col].replace(..., inplace=True)`: an in-place call on a selected
# column is chained assignment, which raises a FutureWarning on pandas 2.x and
# leaves `subset_data` unchanged once Copy-on-Write is active.

# All irregular lot shapes become a single 'Irr' level.
subset_data['LotShape'] = subset_data['LotShape'].replace(['IR1', 'IR2', 'IR3'], 'Irr')

# The listed garage types and missing values are folded into 'Other'.
subset_data['GarageType'] = (
    subset_data['GarageType']
    .replace(['BuiltIn', 'Basment', '2Types', 'CarPort'], 'Other')
    .fillna('Other')
)

# Show the remaining levels of both columns to confirm the grouping.
print(set(subset_data['LotShape']))
print(set(subset_data['GarageType']))
subset_data.head(20)

In [None]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each category so the indicator columns are not redundant.
subset_data = pd.get_dummies(data = subset_data, drop_first = True)
subset_data.head(n = 20)

In [None]:
# Hold out 25% of the rows as a test set: every column except SalePrice is a
# feature, SalePrice is the target.
# The fixed random_state makes the split repeatable. Keep using the same number
# (35 here) to get the same train and test data on every run.
X_train, X_test, y_train, y_test = train_test_split(
    subset_data.drop(columns = 'SalePrice'),
    subset_data['SalePrice'],
    random_state = 35,
    test_size = 0.25,
)
X_train
X_test
y_train
y_test

In [None]:
# Standardise the features. The `if True:` guard is a manual switch for this step.
if True:
    from sklearn.preprocessing import StandardScaler
    # The scaler is fitted on the training features only and then applied to
    # both splits.
    sc = StandardScaler().fit(X_train)
    # transform() returns a plain array, so rebuild DataFrames with the original
    # row index and column labels.
    X_train = pd.DataFrame(sc.transform(X_train), index = X_train.index, columns = X_train.columns)
    X_test = pd.DataFrame(sc.transform(X_test), index = X_test.index, columns = X_test.columns)
    X_train
    X_test
    y_train
    y_test

In [None]:
# Ridge regression with a small penalty (alpha = 0.1).
# Use LinearRegression(fit_intercept = True) instead for the unpenalised fit.
model = Ridge(fit_intercept = True, alpha = 0.1)
model.fit(X = X_train, y = y_train)

# R-squared on the training data
model.score(X = X_train, y = y_train)

# Fitted slope coefficients, one per feature column
model.coef_

# Fitted intercept (Beta_0)
model.intercept_

In [None]:
# Predictions for the test rows, indexed like X_test so they can be lined up
# with the actual prices afterwards.
test_output = pd.DataFrame({'pred_SalePrice': model.predict(X_test)}, index = X_test.index)
test_output.head()

In [None]:
# Attach the actual sale prices to the predictions by row index.
# Only the prediction column is carried into the merge so this cell can be
# re-run safely: merging onto a frame that already holds 'SalePrice' would
# produce 'SalePrice_x' / 'SalePrice_y' and break the lookups below.
test_output = test_output[['pred_SalePrice']].merge(y_test, left_index = True, right_index = True)
test_output.head()

# Mean absolute error, in the same units as SalePrice.
mean_absolute_error = abs(test_output['pred_SalePrice'] - test_output['SalePrice']).mean()
print('Mean absolute error is ')
print(mean_absolute_error)

In [None]:
# Mean absolute error as a fraction of the average actual sale price.
(test_output['pred_SalePrice'] - test_output['SalePrice']).abs().mean() / test_output['SalePrice'].mean()
# R-squared of the model on the test set.
model.score(X = X_test, y = y_test)

#### Decision Tree

In [None]:
# Fit a single regression tree with default settings; random_state fixes any
# tie-breaking so the result is repeatable. fit() returns the estimator itself.
clf = DecisionTreeRegressor(random_state = 50).fit(X_train, y_train)

In [None]:
# Feature names, shown for reference when reading the importances in the next cell.
X_train.columns

In [None]:
# Importance scores of the fitted decision tree `clf`.
clf.feature_importances_

In [None]:
# Evaluate the decision tree on the test split.
test_output = pd.DataFrame({'pred_SalePrice': clf.predict(X_test)}, index = X_test.index)
# Line up predictions with the actual prices by row index.
test_output = test_output.merge(y_test, right_index = True, left_index = True)
test_output.head()
# Mean absolute error, in the same units as SalePrice.
mean_absolute_error = (test_output['pred_SalePrice'] - test_output['SalePrice']).abs().mean()
print('Mean absolute error is ')
print(mean_absolute_error)
# The same error as a fraction of the average actual sale price.
mean_absolute_error / test_output['SalePrice'].mean()

#### Bagging Regressor ####

In [None]:
# Bagging ensemble: 200 base estimators, each drawn with max_samples = 100.
# fit() returns the estimator itself.
regr = BaggingRegressor(n_estimators = 200, max_samples = 100, random_state = 50).fit(X_train, y_train)

In [None]:
# Evaluate the bagging ensemble on the test split.
test_output = pd.DataFrame({'pred_SalePrice': regr.predict(X_test)}, index = X_test.index)
# Line up predictions with the actual prices by row index.
test_output = test_output.merge(y_test, right_index = True, left_index = True)
test_output.head()
# Mean absolute error, in the same units as SalePrice.
mean_absolute_error = (test_output['pred_SalePrice'] - test_output['SalePrice']).abs().mean()
print('Mean absolute error is ')
print(mean_absolute_error)
# The same error as a fraction of the average actual sale price.
mean_absolute_error / test_output['SalePrice'].mean()

#### Random Forest Regressor ####

In [None]:
# Random forest: leaves must hold at least 3 samples and each split considers
# sqrt(n_features) candidate features. fit() returns the estimator itself.
rf = RandomForestRegressor(
    max_features = "sqrt",
    min_samples_leaf = 3,
    random_state = 50,
).fit(X_train, y_train)


In [None]:
# Feature names, shown for reference when reading the importances in the next cell.
X_train.columns

In [None]:
# Importance scores of the fitted random forest `rf`.
rf.feature_importances_

In [None]:
# Evaluate the random forest on the test split.
test_output = pd.DataFrame({'pred_SalePrice': rf.predict(X_test)}, index = X_test.index)
# Line up predictions with the actual prices by row index.
test_output = test_output.merge(y_test, right_index = True, left_index = True)
test_output.head()
# Mean absolute error, in the same units as SalePrice.
mean_absolute_error = (test_output['pred_SalePrice'] - test_output['SalePrice']).abs().mean()
print('Mean absolute error is ')
print(mean_absolute_error)
# The same error as a fraction of the average actual sale price.
mean_absolute_error / test_output['SalePrice'].mean()

#### Gradient Boosting Regressor ####

In [None]:
# Gradient boosting with trees of depth at most 4 and at least 2 samples per
# leaf. fit() returns the estimator itself.
gb = GradientBoostingRegressor(
    max_depth = 4,
    min_samples_leaf = 2,
    random_state = 50,
).fit(X_train, y_train)


In [None]:
# Feature names, shown for reference when reading the importances in the next cell.
X_train.columns

In [None]:
# Importance scores of the fitted gradient boosting model `gb`.
gb.feature_importances_

In [None]:
# Evaluate the gradient boosting model on the test split.
test_output = pd.DataFrame({'pred_SalePrice': gb.predict(X_test)}, index = X_test.index)
# Line up predictions with the actual prices by row index.
test_output = test_output.merge(y_test, right_index = True, left_index = True)
test_output.head()
# Mean absolute error, in the same units as SalePrice.
mean_absolute_error = (test_output['pred_SalePrice'] - test_output['SalePrice']).abs().mean()
print('Mean absolute error is ')
print(mean_absolute_error)
# The same error as a fraction of the average actual sale price.
mean_absolute_error / test_output['SalePrice'].mean()

#### XGBoost Regressor ####

In [None]:
# The native XGBoost API works on its own dataset container, DMatrix, which is
# optimised for memory use and speed, so the train and test splits are wrapped
# in DMatrix objects (features plus label) before training.

# Regression matrices
dtrain_reg = xgb.DMatrix(X_train, label = y_train, enable_categorical = True)
dtest_reg = xgb.DMatrix(X_test, label = y_test, enable_categorical = True)

In [None]:
# Hyperparameters for the native XGBoost trainer.
# "tree_method": "exact" is used here; switch to "hist" if you need speed, or
# to "gpu_hist" (keeping the same objective) if a GPU is available.
params = {
    "objective": "reg:squarederror",
    "tree_method": "exact",
    "max_depth": 4,
    "learning_rate": 0.1,
}

In [None]:
# Number of boosting rounds.
n = 100

# Train the booster on the training DMatrix with the hyperparameters above.
# NOTE: this rebinds `model`, which earlier cells used for the Ridge estimator.
model = xgb.train(params, dtrain_reg, num_boost_round = n)

In [None]:
# NOTE(review): mean_squared_error is imported here but not used in the cells
# that follow; the error metric is computed by hand instead.
from sklearn.metrics import mean_squared_error

# Booster predictions for the test DMatrix.
preds = model.predict(data = dtest_reg)

In [None]:
# Evaluate the XGBoost booster on the test split.
test_output = pd.DataFrame({'pred_SalePrice': preds}, index = X_test.index)
# Line up predictions with the actual prices by row index.
test_output = test_output.merge(y_test, right_index = True, left_index = True)
test_output.head()
# Mean absolute error, in the same units as SalePrice.
mean_absolute_error = (test_output['pred_SalePrice'] - test_output['SalePrice']).abs().mean()
print('Mean absolute error is ')
print(mean_absolute_error)
# The same error as a fraction of the average actual sale price.
mean_absolute_error / test_output['SalePrice'].mean()

#### Hybrid Model ####

In [None]:
# Stage 1 of the hybrid model: an ordinary least-squares fit on the training data.
# NOTE: this rebinds `model` again (previously the XGBoost booster).
model = LinearRegression(fit_intercept = True)
model.fit(X = X_train, y = y_train)

# R-squared on the training data
model.score(X = X_train, y = y_train)

In [None]:
# What the linear stage fails to explain on the training rows: actual minus fitted.
training_residuals = y_train.sub(model.predict(X_train))

In [None]:
# Stage 2 of the hybrid model: a random forest trained to predict the linear
# stage's training residuals from the same features.
# NOTE: this rebinds `rf`, replacing the forest fitted on SalePrice earlier.
rf = RandomForestRegressor(
    max_features = "sqrt",
    min_samples_leaf = 3,
    random_state = 50,
).fit(X_train, training_residuals)


In [None]:
# Hybrid prediction = linear prediction + the forest's predicted residual.
pred_residuals = rf.predict(X_test)
y_pred = model.predict(X_test) + pred_residuals

In [None]:
# Evaluate the hybrid (linear + residual forest) predictions on the test split.
test_output = pd.DataFrame({'pred_SalePrice': y_pred}, index = X_test.index)
# Line up predictions with the actual prices by row index.
test_output = test_output.merge(y_test, right_index = True, left_index = True)
test_output.head()
# Mean absolute error, in the same units as SalePrice.
mean_absolute_error = (test_output['pred_SalePrice'] - test_output['SalePrice']).abs().mean()
print('Mean absolute error is ')
print(mean_absolute_error)
# The same error as a fraction of the average actual sale price.
mean_absolute_error / test_output['SalePrice'].mean()