In [44]:
# Importing relevant modules
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib notebook

In [45]:
# Load the training dataset from "train.csv" (path is relative to the
# notebook's working directory) into a DataFrame.
data = pd.read_csv("train.csv")

In [46]:
# Boolean flag per column: True when the column holds at least one NaN
has_missing = data.isna().any()

# Keep only the flagged entries; their index labels are the column names
columns_with_nan = has_missing[has_missing].index.tolist()

# Show which columns need imputation
print(columns_with_nan)

['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']


In [47]:
# data.columns

In [48]:
# Forward-fill missing values: each NaN is replaced by the closest preceding
# non-missing value in the same column. NaNs at the very top of a column have
# no predecessor and are left untouched.
#
# `DataFrame.ffill` is used instead of `fillna(method="ffill")`, which is
# deprecated since pandas 2.1; the behaviour is the same.
#
# NOTE(review): forward-filling copies a value from the previous *row*. If the
# rows are independent records with no meaningful order, that choice is
# arbitrary -- consider per-column median/mode imputation instead.
data.ffill(inplace=True)

In [49]:
# Repeat the missing-value check: collect the name of every column whose
# "contains any NaN" flag is still set.
columns_with_nan = [
    column_name
    for column_name, still_missing in data.isna().any().items()
    if still_missing
]

# Report the columns that still contain NaN values
print(columns_with_nan)

['Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature']


In [50]:
# Separate the target from the predictors: "SalePrice" is the value to be
# predicted, every other column is kept as an input feature.
y = data["SalePrice"]
X = data.drop(columns=["SalePrice"])

In [74]:
# The dataset mixes categorical and numerical columns. To give the estimator a
# purely numerical matrix, categorical columns are one-hot encoded and
# numerical columns are standardised.
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Columns to be encoded (string / categorical dtypes) and scaled (numeric dtypes)
cat_cols = X.select_dtypes(include=['category', 'object']).columns.tolist()
num_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# Preprocessing pipeline: each transformer is applied only to its own column list
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), cat_cols),
        ('num', StandardScaler(), num_cols)
    ])

# NOTE(review): the preprocessor is fitted on every row of X, i.e. before the
# train/test split performed later in the notebook, so the scaler statistics
# and encoder categories also reflect the held-out rows. Fitting on the
# training rows only would avoid that leakage.
#
# `fit` returns the fitted transformer itself, so `fit_X` refers to the same
# object as `preprocessor`.
fit_X = preprocessor.fit(X)
transform_X = fit_X.transform(X)

# Replace any NaN that survived preprocessing with zero.
# ColumnTransformer may return either a SciPy sparse matrix or a dense ndarray
# (it depends on the density of the result). On a sparse matrix the stored
# values live in `.data`; on a dense array `.data` is a raw memoryview that
# cannot be mask-assigned, so the array itself is indexed instead.
if sparse.issparse(transform_X):
    transform_X.data[np.isnan(transform_X.data)] = 0
else:
    transform_X[np.isnan(transform_X)] = 0

In [75]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    transform_X,
    y,
    test_size=0.2,
    random_state=0,
)


In [76]:
# Hyperparameter search: try every combination in the grid below and evaluate
# each one with 3-fold cross-validation on the training split.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

clf = GradientBoostingRegressor(random_state=85)

# 3 x 3 x 3 = 27 candidate settings
grid_values = dict(
    n_estimators=[50, 100, 150],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 4, 5],
)

grid_search = GridSearchCV(
    estimator=clf,
    param_grid=grid_values,
    cv=3,
    n_jobs=-1,   # use all available CPU cores
    verbose=1,
)
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [77]:
# Retrieve the hyperparameter combination selected by the grid search and
# print it for inspection.
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150}


In [78]:
# Train the final house-price model using the hyperparameters chosen by the
# grid search. Unpacking `grid_search.best_params_` keeps this model in sync
# with the search result instead of re-typing the values by hand (hand-typed
# values can silently drift from what the search actually selected).
# The same random_state as the searched estimator is used for reproducibility.
hp_model = GradientBoostingRegressor(
    random_state=85,
    **grid_search.best_params_,
).fit(X_train, y_train)

In [95]:
# Overfitting check: score the model on the rows it was trained on and on the
# held-out rows; a training score far above the test score is a warning sign.
train_score = hp_model.score(X_train, y_train)
test_score = hp_model.score(X_test, y_test)

print("Training Score: {:.2f}".format(train_score))
print("Test Score: {:.2f}".format(test_score))

In [83]:
# Predict the target for the held-out test rows; used by the metrics below.
y_predict = hp_model.predict(X_test)

In [94]:
# Error metrics on the held-out test set, comparing true and predicted targets
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_predict)
r2 = r2_score(y_test, y_predict)

print("Mean Squared Error: {:.2f}".format(mse))
print("R2 Score : {:.2f}".format(r2))