In [13]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
from xgboost import XGBRegressor

In [2]:
# EDA
data = pd.read_csv('train.csv')
print(data.shape)
data.head(3)

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [3]:
data.drop('Id', axis = 1, inplace=True)

# drop all columns with >= 50% NaN values
threshold = len(data) * 0.5
cols_to_drop = data.columns[data.isnull().sum() >= threshold]
data.drop(cols_to_drop, axis=1, inplace=True)
data.reset_index(drop=True, inplace=True)

print(cols_to_drop)
print(data.shape)

Index(['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')
(1460, 75)


In [4]:
# Which variables are of type object? These will be transformed using dummy variables. 
# This will increase the column dimension of our dataframe, but we have enough samples that this shouldn't be an issue for modeling
object_columns = data.select_dtypes(include=['object']).columns
print(object_columns)

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')


In [5]:
# Make dummies
data_dummied = pd.get_dummies(data, columns=object_columns, drop_first=True)

data_dummied.reset_index(drop=True, inplace=True)

# Normalize the entire new dataframe column wise
scaler = StandardScaler()
data_dummied_scaled = pd.DataFrame(scaler.fit_transform(data_dummied), columns=data_dummied.columns)

# We will use the train data set as provided by Kaggle are our main data, and use the test set provided by Kaggle as a validation set
X = data_dummied_scaled.drop(columns='SalePrice', axis=1)
y = data_dummied_scaled['SalePrice']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Normalize train and test sets independently
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_test_normalized = scaler.transform(X_test)

X_train_normalized_df = pd.DataFrame(X_train_normalized, columns=X_train.columns)
X_test_normalized_df = pd.DataFrame(X_test_normalized, columns=X_test.columns)

Random Forest

In [8]:
# Parameters grid for GridSearch
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_features': ['sqrt', 'log2'],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 10, 20],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Create and train the random forest regressor using GridSearchCV with 10-fold cross validation
model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

grid_search.fit(X_train_normalized_df, y_train)

# Best parameters from GridSearchCV
best_params = grid_search.best_params_
print(f'Best parameters: {best_params}')

# Evaluate the model using cross-validation
cv_scores = grid_search.cv_results_['mean_test_score']
cv_rmse = (-cv_scores.mean()) ** 0.5
print(f'Cross-Validated RMSE: {cv_rmse}')

# Train the model with best parameters on the training set
best_model = grid_search.best_estimator_
best_model.fit(X_train_normalized_df, y_train)

# Predict on the test set
y_pred = best_model.predict(X_test_normalized_df)

# Calculate regression metrics
test_rmse = mean_squared_error(y_test, y_pred, squared=False)
test_mae = mean_absolute_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)

print(f'Test RMSE: {test_rmse}')
print(f'Test MAE: {test_mae}')
print(f'Test R²: {test_r2}')


# Predict on the normalized test data
test_predictions = best_model.predict(X_test_normalized_df)


Fitting 5 folds for each of 432 candidates, totalling 2160 fits
Best parameters: {'bootstrap': False, 'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Cross-Validated RMSE: 0.4280735321131126
Test RMSE: 0.3794985389400195
Test MAE: 0.21581999380386493
Test R²: 0.8815828361976491




XGBoost

In [14]:
# Prepare the parameter grid for Grid Search
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# Prepare the XGBRegressor
xgb_model = XGBRegressor(objective='reg:squarederror')

# Implement 10-fold cross-validation with grid search
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=10, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train_normalized_df, y_train)

# Get the best parameters and the corresponding score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score (negative MSE): {best_score}")

# Train the model with the best parameters on the full training set
best_model = grid_search.best_estimator_

# Evaluate the model on the test set
test_score = best_model.score(X_test_normalized_df, y_test)
print(f"Test Score (R^2): {test_score}")

Fitting 10 folds for each of 108 candidates, totalling 1080 fits
Best Parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1.0}
Best Score (negative MSE): -0.1178942911851216
Test Score (R^2): 0.9089530612403024
