In [32]:
import pandas as pd
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [25]:
# EDA: read the training CSV, report its (rows, columns), preview the first rows
data = pd.read_csv('train.csv')
n_rows, n_cols = data.shape
print((n_rows, n_cols))
data.iloc[:3]

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


In [26]:
# Drop the 'Id' column before modelling.
# The frame is rebound instead of mutated in place, and errors='ignore' is
# used, so re-running this cell does not raise KeyError once 'Id' is gone.
data = data.drop(columns='Id', errors='ignore')

# drop all columns with >= 50% NaN values
threshold = len(data) * 0.5
cols_to_drop = data.columns[data.isnull().sum() >= threshold]
# Only columns are removed in this cell, so the row index needs no reset.
data = data.drop(columns=cols_to_drop)

print(cols_to_drop)
print(data.shape)

Index(['Alley', 'MasVnrType', 'PoolQC', 'Fence', 'MiscFeature'], dtype='object')
(1460, 75)


In [27]:
# Collect the names of every object-dtype column; these are the ones that get
# expanded into dummy variables in the next step.
# Dummy encoding widens the dataframe; the author's expectation is that the
# sample count is large enough for this not to hurt modeling.
categorical_frame = data.select_dtypes(include='object')
object_columns = categorical_frame.columns
print(object_columns)

Index(['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
       'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
       'PavedDrive', 'SaleType', 'SaleCondition'],
      dtype='object')


In [28]:
# One-hot encode the object-dtype columns (first level of each dropped).
data_dummied = pd.get_dummies(data, columns=object_columns, drop_first=True)

# No StandardScaler is fit at this stage. Fitting it on the full frame would
# (a) leak statistics of the rows later held out for testing into the training
# features and (b) standardize the SalePrice target as well, so every error
# metric would be reported in z-score units. Feature scaling is done after the
# train/test split, where the scaler is fit on the training rows only.
#
# The name `data_dummied_scaled` is kept so downstream cells keep resolving it;
# it now holds the encoded frame cast to float64 (the dtype the scaler used to
# produce), with values left in their original units.
data_dummied_scaled = data_dummied.astype('float64')

In [29]:
# Kaggle's train.csv serves as the main data set here: 20% of it is held out
# below as a local test set. (The author's stated plan is to use Kaggle's own
# test file as a validation set; that step is not part of this cell.)

y = data_dummied_scaled['SalePrice']
X = data_dummied_scaled.drop(columns=['SalePrice'])

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply those same statistics
# to the test rows (the test split is NOT scaled with its own mean/variance).
scaler = StandardScaler().fit(X_train)
X_train_normalized = scaler.transform(X_train)
X_test_normalized = scaler.transform(X_test)

# Wrap the scaled arrays back into DataFrames carrying the feature names.
X_train_normalized_df, X_test_normalized_df = (
    pd.DataFrame(scaled_values, columns=source_frame.columns)
    for scaled_values, source_frame in (
        (X_train_normalized, X_train),
        (X_test_normalized, X_test),
    )
)

In [None]:

# Tune a random-forest regressor with an exhaustive, 5-fold cross-validated
# grid search, then evaluate the refit best model on the held-out test split.

# Define the Random Forest model
rf = RandomForestRegressor(random_state=42)

# Define the parameter grid (3 * 4 * 3 * 3 * 2 = 216 candidates)
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# The target is continuous, so candidates must be ranked with a regression
# metric. 'accuracy' is a classification metric and cannot score continuous
# predictions. scikit-learn maximizes scores, hence the negated RMSE.
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=5, n_jobs=-1, verbose=2,
                           scoring='neg_root_mean_squared_error')

# Fit on the train-fitted, standardized features prepared in the previous cell
# (they were computed there but never used by the model).
grid_search.fit(X_train_normalized_df, y_train)

# Get the best parameters and best estimator (refit on the full training split)
best_params = grid_search.best_params_
best_rf = grid_search.best_estimator_

print(f"Best Parameters: {best_params}")
# best_score_ is the negated RMSE, so flip the sign for reporting.
print(f"Best CV RMSE: {-grid_search.best_score_:.4f}")

# Predict and evaluate the model on the test set with regression metrics.
# RMSE is expressed in whatever units y_test carries.
y_pred = best_rf.predict(X_test_normalized_df)
test_rmse = float(np.sqrt(np.mean((np.asarray(y_test) - y_pred) ** 2)))
# For regressors, .score() returns the coefficient of determination (R^2).
test_r2 = best_rf.score(X_test_normalized_df, y_test)

print(f"Test Set RMSE: {test_rmse:.4f}")
print(f"Test Set R^2: {test_r2:.4f}")

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
