<a href="https://colab.research.google.com/github/remcovanwijk040/AI/blob/main/kaggle/housing_prices_competition/script_competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Housing competition kaggle


## Setup

Link naar competitie: https://www.kaggle.com/competitions/home-data-for-ml-course/overview

In [34]:
# Setup
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Load data
url = 'https://raw.githubusercontent.com/remcovanwijk040/AI/main/kaggle/housing_prices_competition/test.csv'
test_df = pd.read_csv(url)
test_df.columns = test_df.columns.str.lower()

url = 'https://raw.githubusercontent.com/remcovanwijk040/AI/main/kaggle/housing_prices_competition/train.csv'
train_df = pd.read_csv(url)
train_df.columns = train_df.columns.str.lower()

## Feature selection

In [67]:
from numpy.lib import column_stack
# For the ease i start to drop all columns with NA:
train_df_adjust = train_df.dropna(axis='columns')
list(train_df_adjust.columns)
features_all = ['mssubclass', 'mszoning', 'lotarea', 'street', 'lotshape', 'landcontour', 'utilities', 'lotconfig', 
            'landslope', 'neighborhood', 'condition1', 'condition2', 'bldgtype', 'housestyle',
            'overallqual', 'overallcond', 'yearbuilt', 'yearremodadd', 'roofstyle', 'roofmatl', 'exterior1st', 'exterior2nd', 'exterqual',
            'extercond', 'foundation', 'bsmtfinsf1','bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', 'heating', 'heatingqc', 'centralair', '1stflrsf',
             '2ndflrsf', 'lowqualfinsf', 'grlivarea', 'bsmtfullbath', 'bsmthalfbath', 'fullbath', 'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'kitchenqual',
             'totrmsabvgrd', 'functional', 'fireplaces', 'garagecars', 'garagearea', 'paveddrive', 'wooddecksf', 'openporchsf', 'enclosedporch', '3ssnporch', 'screenporch',
             'poolarea', 'miscval', 'mosold', 'yrsold', 'saletype', 'salecondition']

cols=train_df_adjust.select_dtypes(exclude='object').columns.to_list() # make a list of all columns excluding objects

features_num = ['mssubclass', 'lotarea', 'overallqual', 'overallcond', 'yearbuilt', 'yearremodadd', 'bsmtfinsf1',
                 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', '1stflrsf', '2ndflrsf', 'lowqualfinsf', 'grlivarea', 'bsmtfullbath',
                 'bsmthalfbath', 'fullbath', 'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'totrmsabvgrd', 'fireplaces', 'garagecars',
                 'garagearea', 'wooddecksf', 'openporchsf', 'enclosedporch', '3ssnporch', 'screenporch', 'poolarea', 'miscval', 'mosold',
                 'yrsold']
# The variables in the ele list are not included in the test set after cleaning NA's
ele = ['bsmtfinsf1', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', 'bsmtfullbath', 'bsmthalfbath', 'garagecars', 'garagearea']
features_num_filter = set(features_num) - set(ele)

#Excluding categorical features for now
#from sklearn import preprocessing
#le = preprocessing.LabelEncoder()
#for i in cols:
#if train_df_adjust[i].dtypes=='object':
#      train_df_adjust[i] = pd.factorize(train_df_adjust[i])[0]
                             

['mssubclass', 'lotarea', 'overallqual', 'overallcond', 'yearbuilt', 'yearremodadd', 'bsmtfinsf1', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', '1stflrsf', '2ndflrsf', 'lowqualfinsf', 'grlivarea', 'bsmtfullbath', 'bsmthalfbath', 'fullbath', 'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'totrmsabvgrd', 'fireplaces', 'garagecars', 'garagearea', 'wooddecksf', 'openporchsf', 'enclosedporch', '3ssnporch', 'screenporch', 'poolarea', 'miscval', 'mosold', 'yrsold']


## Build the model

In [71]:
# Defining x and y
y = train_df_adjust["saleprice"]
x = train_df_adjust[features_num_filter]

#Splitting the data
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Train the algoritm
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import GridSearchCV
regressor = RandomForestRegressor(random_state = 1)
param_grid = {
   'n_estimators': [10, 110, 200, 500],
   'max_features': ['auto', 'sqrt', 'log2'],
   'max_depth' : [5, 20, 60],
   'criterion' :['squared_error', 'absolute_error']
}
#GSCV = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5)
#GSCV.fit(x_train, y_train)
#GSCV.best_params_ 

#{'criterion': 'squared_error',
# 'max_depth': 60,
# 'max_features': 'sqrt',
# 'n_estimators': 500}


# Evaluate the final model
from sklearn import metrics

random_forest = RandomForestRegressor(random_state = 1, n_estimators = 500, max_features='sqrt', max_depth=60, criterion = 'squared_error')
random_forest.fit(x_train, y_train)
y_pred = random_forest.predict(x_test)
print('MAE: ', metrics.mean_absolute_error(y_test, y_pred))
print('MSE: ', metrics.mean_squared_error(y_test, y_pred)) 


# In sklearn, you can easily perform that using an oob_score = True parameter. If set to True, this parameter makes Random Forest Regressor use “out-of-bag” samples to estimate the R^2 on unseen data. If you get a value of more than 0.75, it means your model does not overfit (the best possible score is equal to 1)
random_forest_out_of_bag = RandomForestRegressor(oob_score=True)
random_forest_out_of_bag.fit(x_train, y_train)
print(random_forest_out_of_bag.oob_score_) 


MAE:  17904.526970319635
MSE:  830082907.3712528
0.8193607470404908


## Apply the model

In [75]:
# Adjust test data
from numpy.lib import column_stack
  # For the ease i start to drop all columns with NA:
test_df_adjust = test_df.dropna(axis='columns')

cols=train_df_adjust.select_dtypes(exclude='object').columns.to_list() # make a list of all columns excluding objects

# Define x
x = test_df_adjust[features_num_filter]

# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

# Predict salespriceh
y_pred_test = random_forest.predict(x_test)

display(y_pred_test)


array([205914.474     , 181380.834     , 120560.884     ,  82463.618     ,
       140832.724     , 300195.37      , 300901.614     , 148207.44      ,
       226662.62      , 207826.848     , 178053.92      ,  84457.74      ,
       199534.92      , 292596.934     , 220106.772     , 108744.332     ,
       112836.816     , 111475.722     , 195270.348     , 142978.604     ,
       142438.7       , 135897.04      , 260942.358     , 314987.206     ,
        87672.432     , 189514.46      , 132333.488     , 191817.49      ,
       462212.074     , 120500.626     , 134862.122     , 111386.404     ,
       121389.82      ,  92126.24      , 140428.428     , 335578.238     ,
       125848.056     , 102892.75      , 263129.506     , 124472.508     ,
       161972.648     , 142185.28      ,  98661.51      , 121192.966     ,
       181924.31      , 175561.686     , 119889.116     , 186234.938     ,
       268859.262     , 252236.19      , 122406.45      , 322594.952     ,
       106493.862     , 2