### Ames Housing Price Prediction


**Setting the Kaggle environment**

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

# Input data files are available in the read-only "../input/" directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


/kaggle/input/home-data-for-ml-course/sample_submission.csv
/kaggle/input/home-data-for-ml-course/sample_submission.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv.gz
/kaggle/input/home-data-for-ml-course/data_description.txt
/kaggle/input/home-data-for-ml-course/test.csv.gz
/kaggle/input/home-data-for-ml-course/train.csv
/kaggle/input/home-data-for-ml-course/test.csv


**Loading modules and data**

In [2]:
# import libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import matplotlib_inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Load the data
train_data = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
# print(type(train_data))
df_train = train_data.copy()

test_data = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")
# print(type(test_data))
df_test = test_data.copy()

print(df_train.shape)
print(df_test.shape)

(1460, 81)
(1459, 80)


Since the X_train and df_test are not the same size, I will drop one X_train row that has the most missing value count. 

In [3]:
# Calculate the number of missing values in each row
missing_values_per_row = df_train.isna().sum(axis=1)
# print(missing_values_per_row)

# Add the missing values count as a new column
df_train['missing_count'] = missing_values_per_row
# X_train['missing_count']

# # Sort the DataFrame by the missing count column
df_train.sort_values(by='missing_count', ascending=False, inplace=True)

# Drop the row with the most missing values
df_train.drop(df_train.index[0], axis=0, inplace=True)
df_train.drop('missing_count', axis=1, inplace=True)

### 1. Preprocessing training and testing datasets

**Looking at non-null values**

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1459 entries, 1011 to 1328
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1459 non-null   object 
 3   LotFrontage    1200 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1459 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  Overa

In [5]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

**Looking at columns with missing values**

In [6]:
missing_value_cols_train = df_train.columns[df_train.isnull().sum() > 0]
missing_value_cols_test = df_test.columns[df_test.isnull().sum() > 0]

common_cols = list(set(missing_value_cols_train) | set(missing_value_cols_test))

df_missing = pd.DataFrame({'df_train': df_train[common_cols].isnull().sum(),
                 'X-test': df_test[common_cols].isnull().sum(),
                 'Data Type': df_train[common_cols].dtypes})

print(df_missing.sort_values("df_train", ascending=False))

              df_train  X-test Data Type
PoolQC            1452    1456    object
MiscFeature       1405    1408    object
Alley             1368    1352    object
Fence             1178    1169    object
FireplaceQu        689     730    object
LotFrontage        259     227   float64
GarageType          80      76    object
GarageYrBlt         80      78   float64
GarageCond          80      78    object
GarageQual          80      78    object
GarageFinish        80      78    object
BsmtFinType2        37      42    object
BsmtExposure        37      44    object
BsmtCond            36      45    object
BsmtFinType1        36      42    object
BsmtQual            36      44    object
MasVnrArea           8      15   float64
MasVnrType           8      16    object
Electrical           1       0    object
Exterior2nd          0       1    object
BsmtUnfSF            0       1     int64
Exterior1st          0       1    object
SaleType             0       1    object
Functional      

**Assesing numerical columns with missing values**
* LotFrontage - We will replace missing values by mean.       259     227   float64
* GarageYrBlt  - We will replace missing values by mode.      81      78   float64
* MasVnrArea   - We will replace missing values by mean.        8      15   float64
* GarageCars   - We will replace missing values by median.        0       1     int64
* BsmtFinSF2   - We will replace missing values by median.        0       1     int64
* BsmtFinSF1   - We will replace missing values by median.         0       1     int64
* BsmtUnfSF    - We will replace missing values by median.        0       1     int64
* BsmtHalfBath - We will replace missing values by median.        0       2     int64
* TotalBsmtSF  - We will replace missing values by median.        0       1     int64
* GarageArea   - We will replace missing values by mean.        0       1     int64
* BsmtFullBath - We will replace missing values by median.        0       2     int64

**Assesing categorical columns with missing values**
* PoolQC      - we will drop this column.     1453    1456    object
* MiscFeature - we will drop this column.      1406    1408    object
* Alley       - we will drop this column.     1369    1352    object
* Fence       - we will drop this column.      1179    1169    object
* FireplaceQu - We will replace missing values by mode.       690     730    object
* GarageQual  - We will replace missing values by mode.        81      78    object
* GarageType  - We will replace missing values by mode.       81      76     object
* GarageCond  - We will replace missing values by mode.       81      78     object
* GarageFinish - We will replace missing values by mode.        81      78   object
* BsmtFinType2 - We will replace missing values by mode.      38      42     object
* BsmtExposure - We will replace missing values by mode.      38      44     object
* BsmtFinType1 - We will replace missing values by mode.      37      42     object
* BsmtCond     - We will replace missing values by mode.       37      45    object
* BsmtQual     - We will replace missing values by mode.       37      44    object
* MasVnrType   - We will replace missing values by mode.        8      16    object
* Electrical   - We will replace missing values by mode.        1       0    object
* Utilities    - We will replace missing values by mode.        0       2    object
* Functional   - We will replace missing values by mode.        0       2    object
* SaleType     - We will replace missing values by mode.        0       1    object
* MSZoning     - We will replace missing values by mode.        0       4    object
* Exterior2nd  - We will replace missing values by mode.        0       1    object
* Exterior1st  - We will replace missing values by mode.        0       1    object
* KitchenQual  - We will replace missing values by mode.        0       1    object




**Dropping Columns**

In [7]:
df_train.drop(['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis=1, inplace=True)
print(len(df_train.columns))

df_test.drop(['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis=1, inplace=True)
print(len(df_test.columns))

76
75


In [8]:
print(df_train.shape)
print(df_test.shape)

(1459, 76)
(1459, 75)


**Preprocessing categorical columns**

In [9]:
# identify categorical columns
cat_cols_df_missing = df_missing[df_missing['Data Type']=='object'].sort_values('df_train', ascending=False)
print(cat_cols_df_missing)

              df_train  X-test Data Type
PoolQC            1452    1456    object
MiscFeature       1405    1408    object
Alley             1368    1352    object
Fence             1178    1169    object
FireplaceQu        689     730    object
GarageType          80      76    object
GarageCond          80      78    object
GarageQual          80      78    object
GarageFinish        80      78    object
BsmtFinType2        37      42    object
BsmtExposure        37      44    object
BsmtCond            36      45    object
BsmtFinType1        36      42    object
BsmtQual            36      44    object
MasVnrType           8      16    object
Electrical           1       0    object
KitchenQual          0       1    object
Functional           0       2    object
SaleType             0       1    object
Exterior1st          0       1    object
Exterior2nd          0       1    object
MSZoning             0       4    object
Utilities            0       2    object


Iterate through catagorical columns and replace missing values with "mode"

In [10]:
for col in list(cat_cols_df_missing.index):
    if col in list(df_train.columns):
        df_train[col].fillna(df_train[col].mode()[0], inplace=True)
        df_test[col].fillna(df_test[col].mode()[0], inplace=True)
    else:
        continue

**Since there are mismatches in the categories for the following columns, we'll just drop these categories.**
- Affected colums: Condition2, Electrical, Exterior1st, Exterior2nd, GarageQual, Heating, HouseStyle, RoofMatl, Utilities

Let's check the affected columns first.

In [11]:
cols_to_check = ["Condition2", "Electrical", "Exterior1st", "Exterior2nd","GarageQual",
                 "Heating", "HouseStyle", "RoofMatl", "Utilities"]

for col in cols_to_check:
    print(col)
#     print(df_train[col].isnull().sum())
    df_train_obj = df_train.select_dtypes(include='object')
    df_test_obj = df_test.select_dtypes(include='object')
    print(sorted(df_train_obj[col].unique()))
    print(sorted(df_test_obj[col].unique()),"\n")

Condition2
['Artery', 'Feedr', 'Norm', 'PosA', 'PosN', 'RRAe', 'RRAn', 'RRNn']
['Artery', 'Feedr', 'Norm', 'PosA', 'PosN'] 

Electrical
['FuseA', 'FuseF', 'FuseP', 'Mix', 'SBrkr']
['FuseA', 'FuseF', 'FuseP', 'SBrkr'] 

Exterior1st
['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Plywood', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing']
['AsbShng', 'AsphShn', 'BrkComm', 'BrkFace', 'CBlock', 'CemntBd', 'HdBoard', 'MetalSd', 'Plywood', 'Stucco', 'VinylSd', 'Wd Sdng', 'WdShing'] 

Exterior2nd
['AsbShng', 'AsphShn', 'Brk Cmn', 'BrkFace', 'CBlock', 'CmentBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Other', 'Plywood', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'Wd Shng']
['AsbShng', 'AsphShn', 'Brk Cmn', 'BrkFace', 'CBlock', 'CmentBd', 'HdBoard', 'ImStucc', 'MetalSd', 'Plywood', 'Stone', 'Stucco', 'VinylSd', 'Wd Sdng', 'Wd Shng'] 

GarageQual
['Ex', 'Fa', 'Gd', 'Po', 'TA']
['Fa', 'Gd', 'Po', 'TA'] 

Heating
['Floor', 'GasA', 'GasW', 'Grav', 'OthW

**Dropping the affected columns (from above step)**

In [12]:
for col in cols_to_check:
    df_train.drop(col, axis=1, inplace=True)
    df_test.drop(col, axis=1, inplace=True)

**Dropping observation from  `df_train` that is not present in `df_test`**

* This code is checking for the categorical columns in the df_train data and comparing their unique values with the unique values in the df_test data.
* For each column, the code checks if the difference between the sorted sets of unique values in df_train and df_test is equal to zero. If so, it continues to the next column.
* If the difference is not equal to zero, it calculates the difference between the two sets and uses it to remove the rows in df_train that have values belonging to the calculated difference set.

In [13]:
result = pd.DataFrame({'Train': df_train.select_dtypes(include='object').apply(lambda x: x.nunique()),
                       'Test': df_test.select_dtypes(include='object').apply(lambda x: x.nunique())}).sort_values("Train", ascending=False)
result

Unnamed: 0,Train,Test
Neighborhood,25,25
SaleType,9,9
Condition1,9,9
Functional,7,7
SaleCondition,6,6
Foundation,6,6
GarageType,6,6
RoofStyle,6,6
BsmtFinType2,6,6
BsmtFinType1,6,6


**Checking if `df_train` and `df_test` have the same subcategories for each categorical column**

In [14]:
for cols in df_train.select_dtypes(include='object').columns:
    print(cols)
    print(sorted(df_train[cols].unique()))
    print(sorted(df_test[cols].unique()), "\n")

MSZoning
['C (all)', 'FV', 'RH', 'RL', 'RM']
['C (all)', 'FV', 'RH', 'RL', 'RM'] 

Street
['Grvl', 'Pave']
['Grvl', 'Pave'] 

LotShape
['IR1', 'IR2', 'IR3', 'Reg']
['IR1', 'IR2', 'IR3', 'Reg'] 

LandContour
['Bnk', 'HLS', 'Low', 'Lvl']
['Bnk', 'HLS', 'Low', 'Lvl'] 

LotConfig
['Corner', 'CulDSac', 'FR2', 'FR3', 'Inside']
['Corner', 'CulDSac', 'FR2', 'FR3', 'Inside'] 

LandSlope
['Gtl', 'Mod', 'Sev']
['Gtl', 'Mod', 'Sev'] 

Neighborhood
['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker']
['Blmngtn', 'Blueste', 'BrDale', 'BrkSide', 'ClearCr', 'CollgCr', 'Crawfor', 'Edwards', 'Gilbert', 'IDOTRR', 'MeadowV', 'Mitchel', 'NAmes', 'NPkVill', 'NWAmes', 'NoRidge', 'NridgHt', 'OldTown', 'SWISU', 'Sawyer', 'SawyerW', 'Somerst', 'StoneBr', 'Timber', 'Veenker'] 

Condition1
['Artery', 'F

**Asserting there is no differnce in subcatagories per catagorical column**

In [15]:
train_cat_cols_missing_vals = list(df_train.select_dtypes(include='object').columns[df_train.select_dtypes(include='object').isnull().sum() > 0])
print(len(train_cat_cols_missing_vals))

test_cat_cols_missing_vals = list(df_test.select_dtypes(include='object').columns[df_test.select_dtypes(include='object').isnull().sum() > 0])
print(len(test_cat_cols_missing_vals))

0
0


In [16]:
print(df_train.select_dtypes(include='object').columns[df_train.select_dtypes(include='object').isnull().sum()>0])

print(df_test.select_dtypes(include='object').columns[df_test.select_dtypes(include='object').isnull().sum()>0])

Index([], dtype='object')
Index([], dtype='object')


**Handling Missing Values for numeric columns**

Let's count the number of missing values per column. 

In [17]:
train_num_cols = df_train.select_dtypes(exclude='object').columns[df_train.select_dtypes(exclude='object').isnull().sum()>0]

df_train[train_num_cols].isnull().sum().sort_values(ascending=False)

LotFrontage    259
GarageYrBlt     80
MasVnrArea       8
dtype: int64

**Replacing Missing Values**

Affected columns from trainig data
* LotFrontage - replace with average lot frontage
* GarageYrBlt - replace with most frequent year
* MasVnrArea  - replace with average area

In [18]:
df_train["LotFrontage"].fillna(df_train["LotFrontage"].median(), inplace=True)
df_train["GarageYrBlt"].fillna(df_train["GarageYrBlt"].median(), inplace=True)
df_train["MasVnrArea"].fillna(df_train["MasVnrArea"].median(), inplace=True)

**Handling df_test missing values**

Let's check for columns with missing values.

In [19]:
test_num_cols = df_test.select_dtypes(exclude='object').columns[df_test.select_dtypes(exclude='object').isnull().sum()>0]

df_test[test_num_cols].isnull().sum().sort_values(ascending=False)

LotFrontage     227
GarageYrBlt      78
MasVnrArea       15
BsmtFullBath      2
BsmtHalfBath      2
BsmtFinSF1        1
BsmtFinSF2        1
BsmtUnfSF         1
TotalBsmtSF       1
GarageCars        1
GarageArea        1
dtype: int64

**Replacing Missing Values**

Testing data
* LotFrontage - replace with median
* GarageYrBlt - replace with median
* MasVnrArea - replace with median
* BsmtFullBath - replace with median
* BsmtHalfBath - replace with median
* BsmtFinSF1 - replace with median
* BsmtFinSF2 - replace with median
* BsmtUnfSF - replace with median
* TotalBsmtSF - replace with median
* GarageCars - replace with median
* GarageArea - replace with median

In [20]:
num_cols_missing_vals = list(df_test[test_num_cols].isnull().sum().sort_values(ascending=False).index)

for col in num_cols_missing_vals:
    df_test[col].fillna(df_test[col].median(), inplace=True)

**Separating training data features from lables**

In [21]:
# Split the data into features and target
X_train = df_train.drop("SalePrice", axis=1)
print(f"X-train dimension: {X_train.shape}")

y_train_final = df_train["SalePrice"]
print(f"y-train dimension: {y_train_final.shape}")

# test set
print(f"X-test dimension: {df_test.shape}")

X-train dimension: (1459, 66)
y-train dimension: (1459,)
X-test dimension: (1459, 66)


### OneHotEncoding
Let's convert categories into 0's and 1's using OneHotEncoding

In [22]:
# Train 
from sklearn.preprocessing import OneHotEncoder

# select the categorical columns
cat_cols_train = X_train.select_dtypes(include='object')
print(len(list(cat_cols_train.columns)))

cat_cols_test = df_test.select_dtypes(include='object')
print(len(list(cat_cols_test.columns)))

one_hot_encoded_train_data = pd.get_dummies(X_train) #.select_dtypes(include='object')) 
final_train = one_hot_encoded_train_data.copy()

one_hot_encoded_test_data = pd.get_dummies(df_test) #.select_dtypes(include='object')) 
final_test = one_hot_encoded_test_data.copy()

30
30


In [23]:
final_train.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
1011,90,75.0,9825,5,5,1965,1965,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
1218,50,52.0,6240,4,5,1947,1950,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
533,20,50.0,5000,1,3,1946,1950,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
1179,20,77.0,8335,5,5,1954,1954,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0
705,190,70.0,5600,4,5,1930,1950,0.0,0,0,...,0,0,0,1,0,0,0,0,1,0


In [24]:
final_test.head()

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,20,80.0,11622,5,6,1961,1961,0.0,468.0,144.0,...,0,0,0,1,0,0,0,0,1,0
1,20,81.0,14267,6,6,1958,1958,108.0,923.0,0.0,...,0,0,0,1,0,0,0,0,1,0
2,60,74.0,13830,5,5,1997,1998,0.0,791.0,0.0,...,0,0,0,1,0,0,0,0,1,0
3,60,78.0,9978,6,6,1998,1998,20.0,602.0,0.0,...,0,0,0,1,0,0,0,0,1,0
4,120,43.0,5005,8,5,1992,1992,0.0,263.0,0.0,...,0,0,0,1,0,0,0,0,1,0


checking for mismatch between train and test datasets

In [25]:
set(final_train.columns) - set(final_test.columns)

set()

In [26]:
print(final_train.isnull().sum().sum(), final_test.isnull().sum().sum())

0 0


Checking dimension of our datasets before building ML model.

In [27]:
print(final_train.shape, final_test.shape, y_train_final.shape)

(1459, 202) (1459, 202) (1459,)


**Let's split our train dataframe into trainig and validation sets before building final model**

In [28]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(final_train, 
                                                  y_train_final,
                                                  test_size=0.25,
                                                  random_state=123)

print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(1094, 202) (365, 202) (1094,) (365,)


### 2. Building Base Regression Model 
- datasets `X_train`, `X_val`, `y_train`, `y_val` from above will be used for training model.
- `LinearRegression`, `RandomForest` and `XGBoost` regression models are chosen.
- We will train the models, evaluate them and choose the best model based on lowest RMSE value.

In [29]:
# load required modules
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import cross_val_score

# Create the XGBoost and RandomForest Regressor
lr_reg = LinearRegression()
rf_reg = RandomForestRegressor()
xgb_reg = xgb.XGBRegressor()


# Fit the regressor to the training data
lr_reg.fit(X_train, y_train)
rf_reg.fit(X_train, y_train)
xgb_reg.fit(X_train, y_train)


# Make predictions on the validation data
y_val_pred_lr = lr_reg.predict(X_val)
y_val_pred_rf = rf_reg.predict(X_val)
y_val_pred_xgb = xgb_reg.predict(X_val)


# Compute RMSE
lr_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_lr))
rf_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))
xgb_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))

# Cross-validate the model using 10-fold CV
lr_cv_scores = cross_val_score(lr_reg, X_train, y_train, cv=10)
rf_cv_scores = cross_val_score(rf_reg, X_train, y_train, cv=10)
xgb_cv_scores = cross_val_score(xgb_reg, X_train, y_train, cv=10)

# Print RMSE scores
print(f"RMSE Linear Regression: {lr_rmse: .2f}")
print(f"RMSE RandomForest: {rf_rmse: .2f}")
print(f"RMSE XGBoost: {xgb_rmse}")
print("")
print(f"Linear Regression 10-Fold CV Mean Score: {np.mean(lr_cv_scores): .2f}")
print(f"RandomForest 10-Fold CV Mean Score: {np.mean(rf_cv_scores): .2f}")
print(f"XGBoost 10-Fold CV Mean Score: {np.mean(xgb_cv_scores): .2f}")


RMSE Linear Regression:  1807774716.99
RMSE RandomForest:  33221.79
RMSE XGBoost: 30558.475606003376

Linear Regression 10-Fold CV Mean Score: -47734701.33
RandomForest 10-Fold CV Mean Score:  0.83
XGBoost 10-Fold CV Mean Score:  0.80


**Inspecting Feature Importances**

**1. Linear Regressor**

In [30]:
# Get feature importance scores
lr_importance = lr_reg.coef_
lr_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': lr_importance})
lr_importance_df.sort_values(by='Importance', ascending=False, inplace=True)

# Display top 10 most important features
print(lr_importance_df.head(10))

          Feature    Importance
11    TotalBsmtSF  2.175225e+13
12       1stFlrSF  9.544299e+11
13       2ndFlrSF  9.544299e+11
14   LowQualFinSF  9.544299e+11
45   LotShape_IR3  1.994796e+11
44   LotShape_IR2  1.994796e+11
46   LotShape_Reg  1.994796e+11
43   LotShape_IR1  1.994796e+11
152  CentralAir_N  1.214982e+11
153  CentralAir_Y  1.214982e+11


**2. Random Forest Regressor**

In [31]:
# Get feature importance scores
rf_importance = rf_reg.feature_importances_
rf_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf_importance})
rf_importance_df.sort_values(by='Importance', ascending=False, inplace=True)

# Display top 10 most important features
print(rf_importance_df.head(10))

        Feature  Importance
3   OverallQual    0.539520
15    GrLivArea    0.116619
11  TotalBsmtSF    0.053714
8    BsmtFinSF1    0.049661
13     2ndFlrSF    0.023650
12     1stFlrSF    0.022121
18     FullBath    0.019099
26   GarageArea    0.017978
5     YearBuilt    0.011751
2       LotArea    0.010788


**3. XGBoost Regressor**

In [32]:
# Get feature importance scores
xgb_importance = xgb_reg.get_booster().get_score(importance_type='weight')
xgb_importance_df = pd.DataFrame({'Feature': list(xgb_importance.keys()), 
                                  'Importance': list(xgb_importance.values())})
xgb_importance_df.sort_values(by='Importance', ascending=False, inplace=True)

# Display top 10 most important features
print(xgb_importance_df.head(10))

        Feature  Importance
1   LotFrontage       297.0
2       LotArea       284.0
0    MSSubClass       197.0
10    BsmtUnfSF       152.0
15    GrLivArea       148.0
11  TotalBsmtSF       134.0
8    BsmtFinSF1       122.0
5     YearBuilt       118.0
26   GarageArea       110.0
12     1stFlrSF       100.0


### 3. Performing Hyperparameter Tuning

Since `XGBoost Regressor` performed best, we will use it as the **final submission model**. 

Here are XGBoost's best_paramaters from GridSearch.
- learning_rate=0.1
- max_depth=2
- n_estimators=300
- reg_alpha=0.5

**Using GridSearch results to tune hyperparameters**
- models to use `RandomForestRegressor` and `XGBRegressor`
- so far `XGBoost` has the lowest RMSE value (best model so far).

Let's see if hyperparameter tunning improves the two models. 

In [33]:
# load required modules
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import cross_val_score

# Create the XGBoost and RandomForest Regressor
xgb_reg = xgb.XGBRegressor(learning_rate=0.1,
                           max_depth=3,
                           n_estimators=300,
                           reg_alpha= 0.5)

rf_reg = RandomForestRegressor(max_depth=7,
                               min_samples_split=5,
                               n_estimators=100)

# Fit the regressor to the training data
xgb_reg.fit(X_train, y_train)
rf_reg.fit(X_train, y_train)

# Make predictions on the validation data
y_val_pred_xgb = xgb_reg.predict(X_val)
y_val_pred_rf = rf_reg.predict(X_val)

# Compute RMSE
xgb_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))
rf_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))

# Cross-validate the model using 10-fold CV
xgb_cv_scores = cross_val_score(xgb_reg, X_train, y_train, cv=10)
rf_cv_scores = cross_val_score(rf_reg, X_train, y_train, cv=10)

# Print RMSE scores
print(f"RMSE RandomForest: {rf_rmse: .2f}")
print(f"RMSE XGBoost: {xgb_rmse}")
print("")
print(f"RandomForest 10-Fold CV Mean Score: {np.mean(rf_cv_scores): .2f}")
print(f"XGBoost 10-Fold CV Mean Score: {np.mean(xgb_cv_scores): .2f}")


RMSE RandomForest:  34818.02
RMSE XGBoost: 27525.935790238887

RandomForest 10-Fold CV Mean Score:  0.83
XGBoost 10-Fold CV Mean Score:  0.83


Both `RMSE` and `model scores` improved after tunning the hyperparameters.

****Extracting Important Features****

In [34]:
# Get feature importance scores
rf_importance = rf_reg.feature_importances_
rf_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': rf_importance})
rf_importance_df.sort_values(by='Importance', ascending=False, inplace=True)

# Display top 20 most important features
print(rf_importance_df.head(20))

              Feature  Importance
3         OverallQual    0.575125
15          GrLivArea    0.109009
8          BsmtFinSF1    0.058491
11        TotalBsmtSF    0.042662
13           2ndFlrSF    0.022801
12           1stFlrSF    0.022376
18           FullBath    0.019425
26         GarageArea    0.015811
5           YearBuilt    0.010229
2             LotArea    0.010151
178  GarageFinish_Unf    0.008291
25         GarageCars    0.007833
123       BsmtQual_Ex    0.007521
6        YearRemodAdd    0.006520
1         LotFrontage    0.005785
22       TotRmsAbvGrd    0.004374
10          BsmtUnfSF    0.004060
7          MasVnrArea    0.003973
23         Fireplaces    0.003807
28        OpenPorchSF    0.003208


In [35]:
# Get feature importance scores
xgb_importance = xgb_reg.get_booster().get_score(importance_type='weight')
xgb_importance_df = pd.DataFrame({'Feature': list(xgb_importance.keys()), 
                                  'Importance': list(xgb_importance.values())})
xgb_importance_df.sort_values(by='Importance', ascending=False, inplace=True)

# Display top 20 most important features
print(xgb_importance_df.head(20))

         Feature  Importance
15     GrLivArea       122.0
2        LotArea       112.0
8     BsmtFinSF1       107.0
26    GarageArea        93.0
3    OverallQual        91.0
1    LotFrontage        88.0
0     MSSubClass        71.0
10     BsmtUnfSF        66.0
12      1stFlrSF        66.0
11   TotalBsmtSF        58.0
5      YearBuilt        58.0
4    OverallCond        54.0
24   GarageYrBlt        52.0
28   OpenPorchSF        44.0
27    WoodDeckSF        44.0
9     BsmtFinSF2        40.0
7     MasVnrArea        36.0
13      2ndFlrSF        34.0
6   YearRemodAdd        32.0
23    Fireplaces        25.0


### 4. Perfoming PCA and Final Model Training 
- PCA will reduce the number of features

In [36]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(final_train)
X_test_scaled = scaler.transform(final_test)

# Perform PCA
pca = PCA(n_components=20) # set n_components to 20
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

X_test_submit = pd.DataFrame(X_test_pca,
                          columns=['PC_'+str(i+1) for i in range(20)],
                          index=final_test.index)
X_train_submit = pd.DataFrame(X_train_pca,
                          columns=['PC_'+str(i+1) for i in range(20)],
                          index=final_train.index)

# train XGBoost model
model = xgb.XGBRegressor(learning_rate=0.1,
                         max_depth=2,
                         n_estimators=300,
                         reg_alpha= 0.5)

model.fit(X_train_pca, y_train_final)

# Predict using the test set
y_pred = model.predict(X_test_submit)

# save the predictions to a CSV file
output = pd.DataFrame({'Id': X_test_submit.index+1461, 'SalePrice': y_pred})
output.to_csv('submission.csv', index=False)

print('Successfully created predictions and is saved to "submission.csv" file')


Successfully created predictions and is saved to "submission.csv" file


In [37]:
print(X_train_submit.shape, X_test_submit.shape)

(1459, 20) (1459, 20)


In [38]:
submission_df = pd.read_csv("/kaggle/working/submission.csv")
print(submission_df.shape)

(1459, 2)


In [39]:
submission_df.head()

Unnamed: 0,Id,SalePrice
0,1461,119158.39
1,1462,152505.4
2,1463,176377.72
3,1464,222163.78
4,1465,180495.11
