### Ames Housing Price Prediction


**Setting the Kaggle environment**

In [None]:
# Kaggle notebooks run inside the kaggle/python Docker image
# (https://github.com/kaggle/docker-python), which ships the usual analytics libraries.
# Competition inputs are mounted read-only under "../input/".

import os

# Walk the input mount and print the full path of every file found below it.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))


**Loading modules and data**

In [None]:
# import libraries
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import numpy as np
import matplotlib.pyplot as plt
import matplotlib_inline
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Read the raw competition CSVs once; all later preprocessing happens on copies
# so the untouched originals (train_data / test_data) stay available.
train_data = pd.read_csv("/kaggle/input/home-data-for-ml-course/train.csv")
test_data = pd.read_csv("/kaggle/input/home-data-for-ml-course/test.csv")

df_train = train_data.copy()
df_test = test_data.copy()

# (rows, columns) of the training frame, then of the test frame.
for frame in (df_train, df_test):
    print(frame.shape)

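Since `df_train` and `df_test` do not have the same number of rows, I will drop the one `df_train` row that has the highest missing-value count. (Equal row counts are not required for modelling, so this step is optional.)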

In [None]:
# Tag every row with its number of missing cells (temporary helper column).
df_train['missing_count'] = df_train.isna().sum(axis=1)

# Order rows from most to fewest missing cells so the sparsest row comes first.
# Note: this also leaves df_train in that sorted order for every later cell.
df_train.sort_values(by='missing_count', ascending=False, inplace=True)

# Remove that first (sparsest) row and the helper column in a single call.
df_train.drop(index=df_train.index[0], columns='missing_count', inplace=True)

### 1. Preprocessing training and testing datasets

**Looking at non-null values**

In [None]:
# Per-column dtype and non-null count for the training frame.
df_train.info()

In [None]:
# Per-column dtype and non-null count for the test frame.
df_test.info()

**Looking at columns with missing values**

In [None]:
# Names of the columns that contain at least one NaN, per dataset.
missing_value_cols_train = df_train.columns[df_train.isnull().any()]
missing_value_cols_test = df_test.columns[df_test.isnull().any()]

# Despite the name, this is the UNION: every column with NaNs in either dataset.
common_cols = list(set(missing_value_cols_train).union(missing_value_cols_test))

# One row per affected column: NaN count in train, NaN count in test, dtype in train.
train_subset = df_train[common_cols]
test_subset = df_test[common_cols]
df_missing = pd.DataFrame({
    'df_train': train_subset.isnull().sum(),
    'X-test': test_subset.isnull().sum(),
    'Data Type': train_subset.dtypes,
})

print(df_missing.sort_values("df_train", ascending=False))

**Assessing numerical columns with missing values**
* LotFrontage - We will replace missing values by median.       259     227   float64
* GarageYrBlt  - We will replace missing values by median.      81      78   float64
* MasVnrArea   - We will replace missing values by median.        8      15   float64
* GarageCars   - We will replace missing values by median.        0       1     int64
* BsmtFinSF2   - We will replace missing values by median.        0       1     int64
* BsmtFinSF1   - We will replace missing values by median.         0       1     int64
* BsmtUnfSF    - We will replace missing values by median.        0       1     int64
* BsmtHalfBath - We will replace missing values by median.        0       2     int64
* TotalBsmtSF  - We will replace missing values by median.        0       1     int64
* GarageArea   - We will replace missing values by median.        0       1     int64
* BsmtFullBath - We will replace missing values by median.        0       2     int64

**Assessing categorical columns with missing values**
* PoolQC      - we will drop this column.     1453    1456    object
* MiscFeature - we will drop this column.      1406    1408    object
* Alley       - we will drop this column.     1369    1352    object
* Fence       - we will drop this column.      1179    1169    object
* FireplaceQu - We will replace missing values by mode.       690     730    object
* GarageQual  - We will replace missing values by mode.        81      78    object
* GarageType  - We will replace missing values by mode.       81      76     object
* GarageCond  - We will replace missing values by mode.       81      78     object
* GarageFinish - We will replace missing values by mode.        81      78   object
* BsmtFinType2 - We will replace missing values by mode.      38      42     object
* BsmtExposure - We will replace missing values by mode.      38      44     object
* BsmtFinType1 - We will replace missing values by mode.      37      42     object
* BsmtCond     - We will replace missing values by mode.       37      45    object
* BsmtQual     - We will replace missing values by mode.       37      44    object
* MasVnrType   - We will replace missing values by mode.        8      16    object
* Electrical   - We will replace missing values by mode.        1       0    object
* Utilities    - We will replace missing values by mode.        0       2    object
* Functional   - We will replace missing values by mode.        0       2    object
* SaleType     - We will replace missing values by mode.        0       1    object
* MSZoning     - We will replace missing values by mode.        0       4    object
* Exterior2nd  - We will replace missing values by mode.        0       1    object
* Exterior1st  - We will replace missing values by mode.        0       1    object
* KitchenQual  - We will replace missing values by mode.        0       1    object




**Dropping Columns**

In [None]:
# The row identifier plus the four features that are missing in almost every row.
unwanted_cols = ['Id', 'PoolQC', 'MiscFeature', 'Alley', 'Fence']

# Drop them from the training frame and report how many columns remain.
df_train.drop(columns=unwanted_cols, inplace=True)
print(len(df_train.columns))

# Same for the test frame.
df_test.drop(columns=unwanted_cols, inplace=True)
print(len(df_test.columns))

In [None]:
# (rows, columns) of train and test after the column drop, one per line.
print(df_train.shape, df_test.shape, sep="\n")

**Preprocessing categorical columns**

In [None]:
# Keep only the object-dtype rows of the missing-value summary,
# ordered by how many values are missing in the training data (largest first).
is_object_col = df_missing['Data Type'] == 'object'
cat_cols_df_missing = df_missing.loc[is_object_col].sort_values('df_train', ascending=False)
print(cat_cols_df_missing)

Iterate through the categorical columns and replace missing values with the mode (most frequent value).

In [None]:
# Impute categorical NaNs with each dataset's own most frequent value.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on the df[col] selection: that chained in-place form is
# deprecated and does not update the parent frame once pandas Copy-on-Write is active.
for col in cat_cols_df_missing.index:
    # The summary was built before PoolQC/MiscFeature/Alley/Fence were dropped,
    # so it still lists columns that no longer exist; skip those.
    if col not in df_train.columns:
        continue
    df_train[col] = df_train[col].fillna(df_train[col].mode()[0])
    df_test[col] = df_test[col].fillna(df_test[col].mode()[0])

**Since the categories of the following columns do not match between the training and test data, we'll simply drop these columns.**
- Affected columns: Condition2, Electrical, Exterior1st, Exterior2nd, GarageQual, Heating, HouseStyle, RoofMatl, Utilities

Let's check the affected columns first.

In [None]:
cols_to_check = ["Condition2", "Electrical", "Exterior1st", "Exterior2nd", "GarageQual",
                 "Heating", "HouseStyle", "RoofMatl", "Utilities"]

# The object-dtype views do not change between iterations, so build them once.
df_train_obj = df_train.select_dtypes(include='object')
df_test_obj = df_test.select_dtypes(include='object')

# For each suspect column: its name, the sorted train categories, the sorted test categories.
for col in cols_to_check:
    print(col)
    print(sorted(df_train_obj[col].unique()))
    print(sorted(df_test_obj[col].unique()), "\n")

**Dropping the affected columns (from above step)**

In [None]:
# Remove every column listed in cols_to_check from both frames, one call per frame.
df_train.drop(columns=cols_to_check, inplace=True)
df_test.drop(columns=cols_to_check, inplace=True)

**Dropping observations from `df_train` that are not present in `df_test`**

* This code is checking for the categorical columns in the df_train data and comparing their unique values with the unique values in the df_test data.
* For each column, the code checks if the difference between the sorted sets of unique values in df_train and df_test is equal to zero. If so, it continues to the next column.
* If the difference is not equal to zero, it calculates the difference between the two sets and uses it to remove the rows in df_train that have values belonging to the calculated difference set.
* Note: the next cell only tabulates the number of unique categories per categorical column for both datasets; it does not remove any rows from `df_train`.

In [None]:
# Number of distinct categories in every object column, for train and for test.
train_level_counts = df_train.select_dtypes(include='object').nunique()
test_level_counts = df_test.select_dtypes(include='object').nunique()

# Side-by-side table, columns with the most training categories first.
result = pd.DataFrame({'Train': train_level_counts, 'Test': test_level_counts})
result = result.sort_values("Train", ascending=False)
result

**Checking if `df_train` and `df_test` have the same subcategories for each categorical column**

In [None]:
# Categorical (object-dtype) column names, taken from the training frame.
object_columns = df_train.select_dtypes(include='object').columns

# Print each column name followed by its sorted train and test categories
# so the two lists can be compared by eye.
for cols in object_columns:
    print(cols)
    train_levels = sorted(df_train[cols].unique())
    print(train_levels)
    test_levels = sorted(df_test[cols].unique())
    print(test_levels, "\n")

**Asserting there is no difference in subcategories per categorical column**

In [None]:
# Categorical training columns that still contain NaNs, and how many there are.
train_objects = df_train.select_dtypes(include='object')
train_cat_cols_missing_vals = list(train_objects.columns[train_objects.isnull().sum() > 0])
print(len(train_cat_cols_missing_vals))

# Same for the test frame.
test_objects = df_test.select_dtypes(include='object')
test_cat_cols_missing_vals = list(test_objects.columns[test_objects.isnull().sum() > 0])
print(len(test_cat_cols_missing_vals))

In [None]:
# Print, for train and then test, the Index of object columns that still hold NaNs.
for frame in (df_train, df_test):
    object_part = frame.select_dtypes(include='object')
    print(object_part.columns[object_part.isnull().sum() > 0])

**Handling Missing Values for numeric columns**

Let's count the number of missing values per column. 

In [None]:
# Non-object (numeric) training columns that contain at least one NaN.
train_numeric = df_train.select_dtypes(exclude='object')
train_num_cols = train_numeric.columns[train_numeric.isnull().sum() > 0]

# Missing-value count for each of them, largest first.
df_train[train_num_cols].isnull().sum().sort_values(ascending=False)

**Replacing Missing Values**

Affected columns from training data
* LotFrontage - replace with median lot frontage
* GarageYrBlt - replace with median year
* MasVnrArea  - replace with median area

In [None]:
# Median-impute the three numeric training columns that contain NaNs.
# The filled Series is assigned back to the frame: calling fillna(inplace=True) on a
# df[col] selection is chained assignment, which is deprecated and leaves the parent
# frame unchanged once pandas Copy-on-Write is active.
for col in ("LotFrontage", "GarageYrBlt", "MasVnrArea"):
    df_train[col] = df_train[col].fillna(df_train[col].median())

**Handling df_test missing values**

Let's check for columns with missing values.

In [None]:
# Non-object (numeric) test columns that contain at least one NaN.
test_numeric = df_test.select_dtypes(exclude='object')
test_num_cols = test_numeric.columns[test_numeric.isnull().sum() > 0]

# Missing-value count for each of them, largest first.
df_test[test_num_cols].isnull().sum().sort_values(ascending=False)

**Replacing Missing Values**

Testing data
* LotFrontage - replace with median
* GarageYrBlt - replace with median
* MasVnrArea - replace with median
* BsmtFullBath - replace with median
* BsmtHalfBath - replace with median
* BsmtFinSF1 - replace with median
* BsmtFinSF2 - replace with median
* BsmtUnfSF - replace with median
* TotalBsmtSF - replace with median
* GarageCars - replace with median
* GarageArea - replace with median

In [None]:
# Numeric test columns with NaNs, ordered by missing count (largest first).
num_cols_missing_vals = list(df_test[test_num_cols].isnull().sum().sort_values(ascending=False).index)

# Fill each column with its own test-set median. Assign the result back instead of
# using fillna(inplace=True) on the df_test[col] selection: that chained in-place form
# is deprecated and does not modify df_test once pandas Copy-on-Write is active.
# NOTE(review): medians are taken from the test data itself; consider reusing the
# training medians so train and test are imputed with the same statistics.
for col in num_cols_missing_vals:
    df_test[col] = df_test[col].fillna(df_test[col].median())

**Separating training data features from labels**

In [None]:
# Target vector: the sale price of every training house.
y_train_final = df_train["SalePrice"]

# Feature matrix: every remaining training column.
X_train = df_train.drop(columns="SalePrice")

# Report the dimensions of the features, the target and the (label-free) test set.
print(f"X-train dimension: {X_train.shape}")
print(f"y-train dimension: {y_train_final.shape}")
print(f"X-test dimension: {df_test.shape}")

### OneHotEncoding
Let's convert categories into 0's and 1's using OneHotEncoding

In [None]:
# Train 
from sklearn.preprocessing import OneHotEncoder

# Object-dtype (categorical) subsets of both feature sets; print how many
# categorical columns each one has (train first, then test).
cat_cols_train = X_train.select_dtypes(include='object')
cat_cols_test = df_test.select_dtypes(include='object')
print(cat_cols_train.shape[1])
print(cat_cols_test.shape[1])

# get_dummies expands every object column into indicator columns and
# passes the numeric columns through unchanged.
one_hot_encoded_train_data = pd.get_dummies(X_train)
one_hot_encoded_test_data = pd.get_dummies(df_test)

# Independent copies used by all modelling cells below.
final_train = one_hot_encoded_train_data.copy()
final_test = one_hot_encoded_test_data.copy()

In [None]:
# Preview the first rows of the one-hot encoded training features.
final_train.head()

In [None]:
# Preview the first rows of the one-hot encoded test features.
final_test.head()

checking for mismatch between train and test datasets

In [None]:
# Columns that exist in only one of the two encoded frames (symmetric difference).
# A one-sided `train - test` difference would not reveal dummy columns that appear
# only in the test set; an empty set here means both frames have the same feature names.
set(final_train.columns) ^ set(final_test.columns)

In [None]:
# Total number of NaN cells left in the encoded train and test frames (both should be 0).
print(final_train.isnull().sum().sum(), final_test.isnull().sum().sum())

Checking dimension of our datasets before building ML model.

In [None]:
# Shapes of the encoded train features, encoded test features and the training target.
print(final_train.shape, final_test.shape, y_train_final.shape)

**Let's split our train dataframe into training and validation sets before building the final model**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the encoded training rows for validation; the fixed seed keeps
# the split identical between runs. Note that X_train is rebound here: from this
# point on it is the 75% training portion of final_train.
split_parts = train_test_split(final_train, y_train_final, test_size=0.25, random_state=123)
X_train, X_val, y_train, y_val = split_parts

print(*(part.shape for part in split_parts))

### 2. Building Base Regression Model 
- datasets `X_train`, `X_val`, `y_train`, `y_val` from above will be used for training model.
- `LinearRegression`, `RandomForest` and `XGBoost` regression models are chosen.
- We will train the models, evaluate them and choose the best model based on lowest RMSE value.

In [None]:
# load required modules
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import cross_val_score

# Seed for the stochastic learners so the RMSE / CV numbers are repeatable across runs;
# 123 is the same value passed as random_state to train_test_split.
SEED = 123

# Create the LinearRegression, RandomForest and XGBoost regressors (default hyperparameters)
lr_reg = LinearRegression()
rf_reg = RandomForestRegressor(random_state=SEED)
xgb_reg = xgb.XGBRegressor(random_state=SEED)

# Fit each regressor to the training portion
lr_reg.fit(X_train, y_train)
rf_reg.fit(X_train, y_train)
xgb_reg.fit(X_train, y_train)

# Make predictions on the held-out validation data
y_val_pred_lr = lr_reg.predict(X_val)
y_val_pred_rf = rf_reg.predict(X_val)
y_val_pred_xgb = xgb_reg.predict(X_val)

# Compute RMSE on the validation set (square root of the mean squared error)
lr_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_lr))
rf_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))
xgb_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))

# Cross-validate each model using 10-fold CV on the training portion.
# No `scoring` argument is given, so the estimator's default scorer is used,
# which for regressors is R^2 (higher is better) -- not RMSE.
lr_cv_scores = cross_val_score(lr_reg, X_train, y_train, cv=10)
rf_cv_scores = cross_val_score(rf_reg, X_train, y_train, cv=10)
xgb_cv_scores = cross_val_score(xgb_reg, X_train, y_train, cv=10)

# Print RMSE scores (all with the same two-decimal format)
print(f"RMSE Linear Regression: {lr_rmse: .2f}")
print(f"RMSE RandomForest: {rf_rmse: .2f}")
print(f"RMSE XGBoost: {xgb_rmse: .2f}")
print("")
print(f"Linear Regression 10-Fold CV Mean Score: {np.mean(lr_cv_scores): .2f}")
print(f"RandomForest 10-Fold CV Mean Score: {np.mean(rf_cv_scores): .2f}")
print(f"XGBoost 10-Fold CV Mean Score: {np.mean(xgb_cv_scores): .2f}")


**Inspecting Feature Importances**

**1. Linear Regressor**

In [None]:
# Use the fitted linear coefficients as importance scores (one per feature column).
lr_importance = lr_reg.coef_
lr_importance_df = pd.DataFrame({'Feature': X_train.columns, 'Importance': lr_importance})

# Rank by absolute magnitude: a large negative coefficient influences the prediction
# as strongly as a large positive one, and sorting by the signed value would push
# those features to the bottom. The signed coefficient itself is kept in the table.
# NOTE(review): the features are not standardised before fitting, so coefficient sizes
# also reflect each feature's units; treat this ranking as indicative only.
lr_importance_df = lr_importance_df.sort_values(
    by='Importance', key=lambda coefs: coefs.abs(), ascending=False
)

# Display top 10 most important features
print(lr_importance_df.head(10))

**2. Random Forest Regressor**

In [None]:
# Importance scores of the fitted forest, one per training column.
rf_importance = rf_reg.feature_importances_

# Pair every score with its feature name and order from most to least important.
rf_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': rf_importance})
    .sort_values(by='Importance', ascending=False)
)

# Show the ten highest-ranked features.
print(rf_importance_df.head(10))

**3. XGBoost Regressor**

In [None]:
# 'weight' importance from the underlying booster: a {feature name: score} mapping.
xgb_importance = xgb_reg.get_booster().get_score(importance_type='weight')

# Turn the mapping into a two-column table, highest score first.
feature_names = list(xgb_importance.keys())
feature_scores = list(xgb_importance.values())
xgb_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_scores})
xgb_importance_df = xgb_importance_df.sort_values(by='Importance', ascending=False)

# Show the ten highest-ranked features.
print(xgb_importance_df.head(10))

### 3. Performing Hyperparameter Tuning

Since `XGBoost Regressor` performed best, we will use it as the **final submission model**. 

Here are XGBoost's best parameters from GridSearch.
- learning_rate=0.1
- max_depth=2
- n_estimators=300
- reg_alpha=0.5

**Using GridSearch results to tune hyperparameters**
- models to use `RandomForestRegressor` and `XGBRegressor`
- so far `XGBoost` has the lowest RMSE value (best model so far).

Let's see if hyperparameter tuning improves the two models. 

In [None]:
# load required modules
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import cross_val_score

# Seed for the stochastic learners so the reported numbers are repeatable across runs;
# 123 is the same value passed as random_state to train_test_split.
SEED = 123

# Create the tuned XGBoost and RandomForest regressors.
# max_depth=2 is the GridSearch result quoted in the markdown above and the value the
# final submission model uses; this cell previously evaluated max_depth=3, i.e. a
# different model from the one that is documented and submitted.
xgb_reg = xgb.XGBRegressor(learning_rate=0.1,
                           max_depth=2,
                           n_estimators=300,
                           reg_alpha=0.5,
                           random_state=SEED)

rf_reg = RandomForestRegressor(max_depth=7,
                               min_samples_split=5,
                               n_estimators=100,
                               random_state=SEED)

# Fit the regressors to the training portion
xgb_reg.fit(X_train, y_train)
rf_reg.fit(X_train, y_train)

# Make predictions on the held-out validation data
y_val_pred_xgb = xgb_reg.predict(X_val)
y_val_pred_rf = rf_reg.predict(X_val)

# Compute RMSE on the validation set
xgb_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_xgb))
rf_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred_rf))

# Cross-validate each model using 10-fold CV; with no `scoring` argument the
# estimator's default scorer is used, which for regressors is R^2 (not RMSE).
xgb_cv_scores = cross_val_score(xgb_reg, X_train, y_train, cv=10)
rf_cv_scores = cross_val_score(rf_reg, X_train, y_train, cv=10)

# Print RMSE scores (both with the same two-decimal format)
print(f"RMSE RandomForest: {rf_rmse: .2f}")
print(f"RMSE XGBoost: {xgb_rmse: .2f}")
print("")
print(f"RandomForest 10-Fold CV Mean Score: {np.mean(rf_cv_scores): .2f}")
print(f"XGBoost 10-Fold CV Mean Score: {np.mean(xgb_cv_scores): .2f}")


Both `RMSE` and `model scores` improved after tuning the hyperparameters.

**Extracting Important Features**

In [None]:
# Importance scores of the tuned forest, one per training column.
rf_importance = rf_reg.feature_importances_

# Pair every score with its feature name and order from most to least important.
rf_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': rf_importance})
    .sort_values(by='Importance', ascending=False)
)

# Show the twenty highest-ranked features.
print(rf_importance_df.head(20))

In [None]:
# 'weight' importance from the tuned model's booster: a {feature name: score} mapping.
xgb_importance = xgb_reg.get_booster().get_score(importance_type='weight')

# Turn the mapping into a two-column table, highest score first.
feature_names = list(xgb_importance.keys())
feature_scores = list(xgb_importance.values())
xgb_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': feature_scores})
xgb_importance_df = xgb_importance_df.sort_values(by='Importance', ascending=False)

# Show the twenty highest-ranked features.
print(xgb_importance_df.head(20))

### 4. Performing PCA and Final Model Training 
- PCA will reduce the number of features

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import xgboost as xgb

# Number of principal components to keep, defined once instead of repeating the literal.
N_COMPONENTS = 20
# Seed for PCA and XGBoost so the submission is reproducible;
# 123 is the same value passed as random_state to train_test_split.
SEED = 123

# The scaler and PCA are fitted on the training matrix and then applied to the test
# matrix, which is only meaningful if both have the same columns in the same order.
assert list(final_train.columns) == list(final_test.columns), \
    "final_train and final_test must have identical columns in identical order"

# Scale the data (statistics are learned from the training data only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(final_train)
X_test_scaled = scaler.transform(final_test)

# Perform PCA (components are learned from the training data only)
pca = PCA(n_components=N_COMPONENTS, random_state=SEED)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Wrap the component arrays in DataFrames that keep the original row indexes
pc_names = ['PC_' + str(i + 1) for i in range(N_COMPONENTS)]
X_test_submit = pd.DataFrame(X_test_pca, columns=pc_names, index=final_test.index)
X_train_submit = pd.DataFrame(X_train_pca, columns=pc_names, index=final_train.index)

# train XGBoost model
model = xgb.XGBRegressor(learning_rate=0.1,
                         max_depth=2,
                         n_estimators=300,
                         reg_alpha=0.5,
                         random_state=SEED)

# Fit on the named DataFrame so training and prediction see the same feature names
# (fitting on the bare array and predicting on a DataFrame mixed the two forms).
model.fit(X_train_submit, y_train_final)

# Predict using the test set
y_pred = model.predict(X_test_submit)

# save the predictions to a CSV file
# The Id column was dropped from df_test during preprocessing, so take the real Ids
# from the untouched test_data frame (same row index) rather than assuming that
# they start at 1461.
submission_ids = test_data.loc[X_test_submit.index, 'Id'].to_numpy()
output = pd.DataFrame({'Id': submission_ids, 'SalePrice': y_pred})
output.to_csv('submission.csv', index=False)

print('Successfully created predictions and is saved to "submission.csv" file')


In [None]:
# Shapes of the PCA-reduced training and test feature frames.
print(X_train_submit.shape, X_test_submit.shape)

In [None]:
# Read the submission back from the same relative path it was written to
# ('submission.csv' in the current working directory). The absolute
# /kaggle/working/ path only matched when the notebook ran with that directory as cwd.
submission_df = pd.read_csv("submission.csv")
print(submission_df.shape)

In [None]:
# Preview the first rows of the saved submission (Id, SalePrice).
submission_df.head()