# House Prices - Advanced Regression Techniques

## James Morgan (jhmmorgan)
*2021-11-04*

### 1. Set Up
##### 1.1 Import Libraries

In [None]:
import pandas as pd
import numpy as np
import math

# Use this cell to begin, and add as many cells as you need to complete your analysis!
# Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Statistics
from scipy import stats
from scipy.special import boxcox, inv_boxcox
#from scipy.stats import norm

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, ExtraTreesRegressor, GradientBoostingRegressor
from sklearn.linear_model import LassoCV, LinearRegression, RidgeCV

from sklearn.metrics import mean_absolute_error as MAE
from sklearn.metrics import mean_squared_error as MSE
from sklearn.metrics import mean_squared_log_error as MSLE



##### 1.2 Default classes
Used for better printing outputs.

In [None]:
class color:
    """ANSI escape sequences used to decorate text printed by the notebook.

    Concatenate one or more attributes in front of a string and append
    ``END`` afterwards to switch the formatting off again.
    """

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Bold and underlined together, used for headings in printed output
    HEADER = BOLD + UNDERLINE

    # Resets every attribute set above
    END = '\033[0m'

##### 1.3 Load Data
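Let's read in the data...

However, before we do, note that the data description lists NA as a valid value for many of the categories,
normally meaning "none": e.g. Alley == NA means no alley access, not missing data. The idea is therefore to read the files with a custom NA list that leaves out "NA". (In the cell below that override is currently commented out and pandas' default NA handling is used; the affected columns are repaired afterwards by replacing their nulls with the string "NA".)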



In [None]:
# Candidate list of NA markers that deliberately leaves out the literal string "NA",
# because "NA" is a valid category for many features in this data set.
# NOTE: this list is currently NOT applied -- the `na_values` argument is commented
# out in the read_csv calls below, which use `keep_default_na = True` instead.
_na_values = ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'n/a', '<NA>', '#NA', 'NULL', 'null', 'NaN', '-NaN', 'nan', '-nan', '']

# Read the test set, the training set and the sample submission from the Kaggle input folder
src_path      = "/kaggle/input/house-prices-advanced-regression-techniques/"
test_df       = pd.read_csv(src_path + "test.csv", keep_default_na = True)#, na_values = _na_values)
train_df      = pd.read_csv(src_path + "train.csv", keep_default_na = True)#, na_values = _na_values)
sample_sub_df = pd.read_csv(src_path + "sample_submission.csv")

# Optional styled preview, left disabled:
#train_df.head().style.set_properties(**{"background-color": "#98FB98","color": "black", "border-color": "black"})
# Last expression of the cell: shows the first rows of the training data
train_df.head()

In [None]:
# Promote the "Id" column to the row index of both data sets (modified in place)
for frame in (train_df, test_df):
    frame.set_index("Id", inplace=True)
train_df.head()

### 2. EDA (Exploratory Data Analysis)
###### 2.1 Data Check

In [None]:
# Overview table: one row per feature with its dtype and the values of three
# sample records (looked up by their Id labels)
sample_ids = {"Sample1": 1, "Sample2": 50, "Sample3": 500}

overview = {
    "Column": train_df.columns,
    "Dtype": train_df.dtypes.astype("str").tolist(),
}
for label, row_id in sample_ids.items():
    overview[label] = train_df.loc[row_id].tolist()

df = pd.DataFrame(overview)
print(color.BOLD + color.UNDERLINE + "Data Types for all features in the training data frame" + color.END)
print(df.to_string())

###### Observations:
* **MSSubClass**, **MoSold**, **YrSold** are categorical, but stored as numbers
* **OverallQual**, **OverallCond** are also categorical, however they use a scale of 1 to 10 so are ok to remain numbers
* **CentralAir** is a Y/N and so should be a boolean
* Various features have missing values, however **NA** was one of their options.

We need to change these in both the training and test datasets.

We'll first replace any null value in the specified columns to the string "NA":

In [None]:
def fix_category_NA(df):
    """Replace missing values with the literal category "NA" in selected columns.

    For the features listed below the data description treats "NA" as a real
    level (for example "no alley access"), so nulls in those columns are
    turned into the string "NA" instead of being handled as missing data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to repair. It is modified in place. Listed features that are
        not columns of ``df`` are skipped.

    Returns
    -------
    None
    """
    # Features with NA as a valid option
    features_with_NA = ["Alley",
                        "BsmtQual",
                        "BsmtCond",
                        "BsmtExposure",
                        "BsmtFinType1",
                        "BsmtFinType2",
                        "FireplaceQu",
                        "GarageType",
                        "GarageFinish",
                        "GarageQual",
                        "GarageCond",
                        "PoolQC",
                        "Fence",
                        "MiscFeature"]

    # Only touch the listed columns that actually exist, and fill them in a
    # single pass instead of scanning the whole frame once per feature.
    present = [feature for feature in features_with_NA if feature in df.columns]
    if present:
        df[present] = df[present].fillna("NA")

In [None]:
# Apply the "NA" repair to both data sets, then preview a few repaired columns
for frame in (train_df, test_df):
    fix_category_NA(frame)

print(color.HEADER + "Head of some of the columns we amended" + color.END)
preview_columns = ["Alley", "BsmtQual", "GarageType", "PoolQC", "MiscFeature"]
train_df[preview_columns].head()

I now want to check each categorical feature's possible value.  I'm not looking to fix this to match the list provided (although this would be a good idea in a real life scenario), however I'd like to ensure there are no mistakes that could lead to incorrect categorisations, such as multiple spellings for one value e.g. **Exm One** and **ExmOne**.

In [None]:
# Names of every column that is neither numeric nor boolean
categorical_features = train_df.select_dtypes(exclude = [np.number, bool]).columns

# Stack train and test once; the combined frame does not change between iterations
all_rows = pd.concat([train_df, test_df])

print(color.HEADER + "Unique values for each categorical feature" + color.END)
for categories in categorical_features:
    # Feature name in bold, followed by its sorted distinct values across both data sets
    print(color.BOLD + categories + color.END)
    print(all_rows[categories].sort_values().unique())

There are certainly differences to the data description, but no multiple spellings or errors that will impact our model.

We'll now replace various integers to appropriate categories:

In [None]:
def fix_categories_integers(df):
    """Recode features whose stored type does not match their meaning.

    Works in place on ``df``:

    * ``MSSubClass`` codes become labels of the form ``"SC<code>"``.
    * ``MoSold`` month numbers become three-letter month names.
    * ``CentralAir`` "Y"/"N" become ``True``/``False``.
    * ``YrSold`` is converted to a pandas categorical.

    Values that are not listed in a mapping are left untouched.
    """
    subclass_codes = (20, 30, 40, 45, 50, 60, 70, 75,
                      80, 85, 90, 120, 150, 160, 180, 190)
    month_names = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                   "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

    # One mapping per column, applied with a single column-wise replace
    recoding = {
        "MSSubClass": {code: f"SC{code}" for code in subclass_codes},
        "MoSold": dict(enumerate(month_names, start=1)),
        "CentralAir": {"Y": True, "N": False},
    }
    df.replace(recoding, inplace=True)

    df["YrSold"] = pd.Categorical(df["YrSold"])

In [None]:
# Recode both data sets, then preview the recoded columns
for frame in (train_df, test_df):
    fix_categories_integers(frame)

print(color.HEADER + "Head of the features we've fixed" + color.END)
recoded_columns = ["MSSubClass", "YrSold", "MoSold", "CentralAir"]
train_df[recoded_columns].head()

##### 2.2 Missing Values
We now need to review and clean any missing value.

In [None]:
# Stack the training features (without the target) on top of the test set
_train_df   = train_df.drop(columns = "SalePrice")
combined_df = pd.concat([_train_df, test_df])

# Null count per column, reduced to the columns that have at least one null
null_counts = combined_df.isnull().sum()

print(color.BOLD + "Which features contain null values?" + color.END)
print(null_counts[null_counts > 0])

We can fill most of these in by looking at the data description and using defaults such as "Other" or "None".
The numeric values can have 0, such as those with square feet metrics.

**GarageYrBlt** contains a year, or is blank if there is no garage.  We can change this to a categorical feature.

In [None]:
def fix_missing_values(df):
    """Fill the remaining missing values of ``df`` in place.

    * Most features receive a fixed default (see the dictionary below).
    * ``MSZoning`` receives the most frequent zoning of the row's
      ``Neighborhood``; ``LotFrontage`` receives the median frontage of the
      row's ``Neighborhood``. Both statistics are computed from ``df`` itself.
      When a neighbourhood has no observed value at all, the most frequent
      zoning / median frontage of the whole frame is used instead.
    * ``GarageYrBlt`` is filled with "None" and converted to a categorical.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Neighborhood``, ``MSZoning``, ``LotFrontage`` and
        ``GarageYrBlt``. Other listed features are filled only if present.

    Returns
    -------
    None
    """
    def _most_common(values):
        # value_counts() ignores nulls, so it is empty for an all-null group
        counts = values.value_counts()
        return counts.index[0] if len(counts) else np.nan

    by_neighborhood    = df.groupby("Neighborhood")
    MSZoning_series    = by_neighborhood["MSZoning"].agg(_most_common)
    LotFrontage_series = by_neighborhood["LotFrontage"].median()

    # Per-row fill values, aligned with df's index
    zoning_fill   = df["Neighborhood"].map(MSZoning_series)
    frontage_fill = df["Neighborhood"].map(LotFrontage_series)

    # Fallbacks for neighbourhoods without any observed value
    overall_zoning = df["MSZoning"].mode()
    if len(overall_zoning):
        zoning_fill = zoning_fill.fillna(overall_zoning.iloc[0])
    frontage_fill = frontage_fill.fillna(df["LotFrontage"].median())

    df.fillna({"Utilities": "AllPub",
               "Exterior1st": "Other",
               "Exterior2nd": "Other",
               "MasVnrType": "None",
               "MasVnrArea": 0,
               "BsmtFinSF1": 0,
               "BsmtFinSF2": 0,
               "BsmtUnfSF": 0,
               "TotalBsmtSF": 0,
               "Electrical": "SBrkr",
               "BsmtFullBath": 0,
               "BsmtHalfBath": 0,
               "KitchenQual": "TA",
               "Functional": "Typ",
               "GarageYrBlt": "None",
               "GarageCars": 0,
               "GarageArea": 0,
               "SaleType": "Oth",
               "MSZoning": zoning_fill,
               "LotFrontage": frontage_fill}, inplace = True)

    # Year built, or "None" when missing, treated as a category rather than a number
    df["GarageYrBlt"] = pd.Categorical(df.GarageYrBlt)

In [None]:
# Fill the missing values of both data sets
for frame in (train_df, test_df):
    fix_missing_values(frame)

# Total number of nulls left across all features (target column excluded)
features_only   = pd.concat([train_df.drop(['SalePrice'], axis = 1), test_df])
remaining_nulls = features_only.isnull().sum().sum()

print(color.BOLD + color.RED + "Number of null values across both train and test data frames?")
print(f"{remaining_nulls}"+color.END)

Fantastic! There are no null values.

##### 2.3 Visualisation of the data
###### 2.3.1. SalePrice
We now need to visualise the data.

**How evenly distributed is the sale price (target) in our training data?**

In [None]:
# Figure with two side-by-side panels describing the raw SalePrice target
plt.figure(figsize=(12, 4))
plt.suptitle("Visualising the skewness of the SalePrice target variable")

# Left panel: density histogram of SalePrice with a KDE curve on top
plt.subplot(1, 2, 1)
sns.histplot(train_df["SalePrice"], stat = "density", kde = True)
plt.title('Distribution Plot')

# Right panel: probability plot of SalePrice, drawn through the pyplot interface
# so that it lands on the subplot selected just above
plt.subplot(1, 2, 2)
stats.probplot(train_df['SalePrice'], plot=plt)

# Tidy the layout, render the figure, then clear it
plt.tight_layout()
plt.show()
plt.clf()

The SalePrice appears skewed. A log1p transformation smooths this better than the square root.  The boxcox is only marginally better and personally less preferred.

In [None]:
# Replace SalePrice by log1p(SalePrice).
# NOTE: the column is overwritten in place, so running this cell a second time
# applies the transformation again on top of the already transformed values.
train_df['SalePrice'] = np.log1p(train_df.SalePrice)

# Figure with the same two panels as before, now for the transformed target
plt.figure(figsize=(12, 4))
plt.suptitle("Visualisaing the skewnewss of the SalePrice target variable following a log1p transformation")
# Left panel: density histogram of the transformed SalePrice with a KDE curve
plt.subplot(1, 2, 1)
sns.histplot(train_df["SalePrice"], stat = "density", kde = True)
plt.title('Distribution Plot')

# Right panel: probability plot of the transformed SalePrice
plt.subplot(1, 2, 2)
stats.probplot(train_df['SalePrice'], plot=plt)

# Tidy the layout, render the figure, then clear it
plt.tight_layout()
plt.show()
plt.clf()

##### 2.3 Visualisation of the data
###### 2.3.2 Numerical and Categorical Features

We now need to visualise the numerical and categorical features.

* We want to extract the names of the numerical and categorical features
* We want to visualise the density and relationship of the numerical values against the SalePrice target variable.
    * We'll achieve this by producing a 4x4 grid of visualisations.
    * This gives us 16 subplots per plot.
    * We'll show 8 features per plot (each feature has two subplots)

In [None]:
# Column names split by kind: numeric columns versus everything that is
# neither numeric nor boolean
numerical_features   = train_df.select_dtypes(include = [np.number]).columns
categorical_features = train_df.select_dtypes(exclude = [np.number, bool]).columns

# Print each list with a bold heading that includes the number of features
for label, names in (("Numerical", numerical_features),
                     ("Categorical", categorical_features)):
    print(color.BOLD + f'{label} Features ({len(names)}):' + color.END + f'\n{names}')

In [None]:
# Split the numerical and categorical feature names into fixed-size groups so
# that each group fits on one figure:
#   * every figure is a 4x4 grid, i.e. 16 subplots
#   * every feature uses two subplots (distribution + relationship)
#   * hence 8 features per figure
numerical_step     = 8
categorical_step   = 8

# Number of figures needed for each kind of feature (derived from the step sizes)
numerical_groups   = math.ceil(len(numerical_features) / numerical_step)
categorical_groups = math.ceil(len(categorical_features) / categorical_step)

total_groups       = numerical_groups + categorical_groups

# Consecutive slices of the feature index, one slice per figure
group_num = [numerical_features[start:start + numerical_step]
             for start in range(0, len(numerical_features), numerical_step)]

group_cat = [categorical_features[start:start + categorical_step]
             for start in range(0, len(categorical_features), categorical_step)]


# EDA of the numerical groups: one figure per group, two subplots per feature
print(color.HEADER + "Visualisation of distribution and relationship of numerical features vs SalePrice" + color.END)
groups = group_num
width, height = 4, 4
for grp in groups:
    plt.figure(figsize=(12, 12))
    for position, feature in enumerate(grp):
        # Odd slot: density histogram of the feature with a KDE curve
        _ = plt.subplot(height, width, 2 * position + 1)
        _ = sns.histplot(train_df[feature], kde=True, stat="density", linewidth=0)
        _ = plt.title("Distribution")

        # Even slot: scatter of the feature against SalePrice
        _ = plt.subplot(height, width, 2 * position + 2)
        _ = sns.scatterplot(data=train_df, x=feature, y="SalePrice", alpha=0.5)
        _ = plt.title("Relationship")
    plt.tight_layout()
    plt.show()
    plt.clf()



##### 2.3 Visualisation of the data
###### 2.3.2.1 Numerical Outliers
* **LotFrontage** > 200
* **LotArea** > 100000
* **BsmtFinSF1** > 4000
* **BsmtFinSF2** > 1200
* **TotalBsmtSF** > 5000
* **GrLivArea** > 4000
* **KitchenAbvGr** = 0
* **WoodDeckSF** > 750
* **OpenPorchSF** > 500
* **EnclosedPorch** > 500
* **MiscVal** > 5000

###### 2.3.2.2 Numerical Feature Engineering
* **LowQualFin**   | if LowQualFinSF == 0 then False, =>1 then True
* **BsmtFullBath** | 0 then False, =>1 then True
* **BsmtHalfBath** | 0 then False, =>1 then True
* **HalfBath**     | 0 then False, =>1 then True
* **BedroomAbvGr** | >= 5 then 5
* **KitchenAbvGr** | >=2 then 2
* **Fireplaces**   | >= 2 then 2
* **GarageCars**   | >= 3 then 3
* **HasPool**      | if PoolArea == 0 then False, >0 then True
* **LivAreaRatio** | Living Area Ratio (GrLivArea / LotArea)
* **SpaceRatio**   | Space ( (FirstFlrSF + SecondFlrSF) / df.TotRmsAbvGrd)
* **TotalBath**    | BsmtFullBath + BsmtHalfBath
* **TotalRoom**    | TotRmsAbvGrd + FullBath + HalfBath
* **NhbdRank**     | Neighbour Rank - The median GrLivArea for the Neighbourhood
* **GrLivAreaPlusBsmtSF**  | Total living area (df.GrLivArea + df.TotalBsmtSF)
* **RecentRemodLargeBsmt** | df.YearRemodAdd * df.TotalBsmtSF

Let's remove these outliers so as not to skew our data / predictions, then move on to the feature engineering:

In [None]:
old_length = len(train_df)

# A row is an outlier when it exceeds any of these upper limits ...
upper_limits = {
    "LotFrontage": 200,
    "LotArea": 100000,
    "BsmtFinSF1": 4000,
    "BsmtFinSF2": 1200,
    "TotalBsmtSF": 5000,
    "GrLivArea": 4000,
    "WoodDeckSF": 750,
    "OpenPorchSF": 500,
    "EnclosedPorch": 500,
    "MiscVal": 5000,
}
# ... or when it has no kitchen above ground
is_outlier = train_df.KitchenAbvGr == 0
for column, limit in upper_limits.items():
    is_outlier = is_outlier | (train_df[column] > limit)

train_df   = train_df.drop(train_df[is_outlier].index)
new_length = len(train_df)
print(color.HEADER + color.RED + f'Reduction in training data from removing outliers is {np.round(100*(old_length-new_length)/old_length, 2)}%' + color.END)

Now let's implement our numerical feature engineering.

In [None]:
def numerical_feature_engineering(df):
    """Derive new numerical features and simplify existing ones, in place.

    Adds ``Has_LowQualFinSF``, ``Has_Pool``, ``LivAreaRatio``, ``SpaceRatio``,
    ``TotalBath``, ``TotalRoom``, ``NhbdRank``, ``GrLivAreaPlusBsmtSF`` and
    ``RecentRemodLargeBsmt``; turns three bath counts into flags; caps four
    count features; and finally drops ``LowQualFinSF`` and ``PoolArea``.

    ``NhbdRank`` is the median ``GrLivArea`` of the row's ``Neighborhood``,
    computed from the frame that is passed in.
    """
    def _capped(column, ceiling):
        # Keep values below the ceiling, replace everything else by the ceiling
        values = df[column]
        return values.where(values < ceiling, ceiling)

    # New features. These are computed first because some of them read the
    # raw bath counts that are converted to flags further down.
    df["Has_LowQualFinSF"] = df["LowQualFinSF"].ne(0)
    df["Has_Pool"]         = df["PoolArea"].ne(0)
    df["LivAreaRatio"]     = df["GrLivArea"] / df["LotArea"]
    df["SpaceRatio"]       = (df["1stFlrSF"] + df["2ndFlrSF"]) / df["TotRmsAbvGrd"]
    df["TotalBath"]        = df["BsmtFullBath"] + df["BsmtHalfBath"]
    df["TotalRoom"]        = df["TotRmsAbvGrd"] + df["FullBath"] + df["HalfBath"]

    # Counts reduced to "has at least one" flags
    for flag in ("BsmtFullBath", "BsmtHalfBath", "HalfBath"):
        df[flag] = df[flag].ne(0)

    # Counts capped at a maximum value
    for column, ceiling in (("BedroomAbvGr", 5),
                            ("KitchenAbvGr", 2),
                            ("Fireplaces", 2),
                            ("GarageCars", 3)):
        df[column] = _capped(column, ceiling)

    neighborhood_area = df.groupby('Neighborhood')['GrLivArea']
    df["NhbdRank"]             = neighborhood_area.transform('median')
    df["GrLivAreaPlusBsmtSF"]  = df["GrLivArea"] + df["TotalBsmtSF"]
    df["RecentRemodLargeBsmt"] = df["YearRemodAdd"] * df["TotalBsmtSF"]

    # Drop the features that were replaced by flags
    df.drop(columns = ["LowQualFinSF", "PoolArea"], inplace = True)

In [None]:
# Engineer the numerical features of both data sets (each modified in place)
for frame in (train_df, test_df):
    numerical_feature_engineering(frame)

In [None]:
# Heading for the cell output below
print(color.HEADER + "Shape of our training dataframe, after numerical feature engineering" + color.END)
# Last expression of the cell: (rows, columns) of the training frame
train_df.shape

##### 2.3 Visualisation of the data
###### 2.3.2.3 Categorical dimension reduction

In [None]:
# EDA of the categorical groups: one figure per group, two subplots per feature
print(color.HEADER + "Visualisation of distribution and relationship of categorical features vs SalePrice" + color.END)
groups = group_cat
width, height = 4, 4
for grp in groups:
    plt.figure(figsize=(12, 12))
    for position, feature in enumerate(grp):
        # Odd slot: number of rows per category
        _ = plt.subplot(height, width, 2 * position + 1)
        _ = sns.countplot(x = train_df[feature])
        _ = plt.xticks(rotation=90)
        _ = plt.title("Distribution")

        # Even slot: SalePrice of every row, grouped by category
        _ = plt.subplot(height, width, 2 * position + 2)
        _ = sns.stripplot(data=train_df, x=feature, y="SalePrice", alpha=0.5)
        _ = plt.xticks(rotation=90)
        _ = plt.title("Relationship")
    plt.tight_layout()
    plt.show()
    plt.clf()

##### 2.3 Visualisation of the data
###### 2.3.2.4 Categorical Feature Engineering

* **Drop** Street, Utilities, Condition2
* **RoofMatl** = CompShg or Other
* **ExterQual** = Gd/Ex = Good, TA/FA = Average
* **Heating** = GasA or Other
* **Electrical** = SBrkr or Other
* **KitchenQual** = Gd/Ex = Good, TA/FA = Average
* **Functional** = Typ, Other
* **SaleType** = WD, New, Other

* **FrontageType** = Count of (how many of)
    * 'WoodDeckSF', 
    * 'OpenPorchSF',
    * 'EnclosedPorch',
    * 'Threeseasonporch',
    * 'ScreenPorch'

Let's make these amendments.

In [None]:
def categorical_feature_engineering(df):
    """Collapse rare categories, add ``FrontageType`` and drop three features.

    Works in place on ``df``:

    * ``RoofMatl``, ``Heating``, ``Electrical``, ``Functional`` and
      ``SaleType`` keep their listed value(s); everything else becomes "Other".
    * ``ExterQual`` and ``KitchenQual`` become "Good" for "Gd"/"Ex" and
      "Average" for anything else.
    * ``FrontageType`` counts how many of the five deck/porch areas are
      greater than zero.
    * ``Street``, ``Utilities`` and ``Condition2`` are dropped.
    """
    def _keep_or_other(column, kept):
        # Values outside `kept` are replaced by "Other"
        values = df[column]
        return values.where(values.isin(kept), "Other")

    def _good_or_average(column):
        # "Gd"/"Ex" -> "Good", anything else -> "Average"
        is_good = df[column].isin(["Gd", "Ex"])
        return is_good.map({True: "Good", False: "Average"})

    # Update existing features
    df["RoofMatl"]    = _keep_or_other("RoofMatl", ["CompShg"])
    df["ExterQual"]   = _good_or_average("ExterQual")
    df["Heating"]     = _keep_or_other("Heating", ["GasA"])
    df["Electrical"]  = _keep_or_other("Electrical", ["SBrkr"])
    df["KitchenQual"] = _good_or_average("KitchenQual")
    df["Functional"]  = _keep_or_other("Functional", ["Typ"])
    df["SaleType"]    = _keep_or_other("SaleType", ["WD", "New"])

    # Add new features
    outdoor_areas = ["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"]
    df["FrontageType"] = (df[outdoor_areas] > 0.0).sum(axis=1)

    # Drop replaced features
    df.drop(columns = ["Street", "Utilities", "Condition2"], inplace = True)

In [None]:
# Engineer the categorical features of both data sets (each modified in place)
for frame in (train_df, test_df):
    categorical_feature_engineering(frame)

# Heading only; the preview of the amended columns is left disabled
print(color.HEADER + "Head of the categorical features we've amended / added" + color.END)
#train_df[["RoofMatl", "ExterQual", "Heating", "Electrical", "KitchenQual", "Functional", "SaleType", "FrontageType"]].head()

Finally, let's have a look at the correlation between features to see if there are any strongly correlated features that we can remove.

In [None]:
# Are any numeric features highly correlated with each other? If so, can we drop them?
# The categorical features have not been encoded yet at this point, so only the
# numeric and boolean columns are correlated. They are selected explicitly because
# recent pandas versions no longer skip text columns in DataFrame.corr() by default.
features_only = train_df.drop(columns=["SalePrice"])
corr = features_only.select_dtypes(include=[np.number, bool]).corr()

# Hide the upper triangle (including the diagonal); it mirrors the lower one
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

f, ax = plt.subplots(figsize=(22, 12))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
_ = sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,square=True, linewidths=.5, annot = False).set(title="Correlation of features")
_ = plt.xlabel("Feature")
_ = plt.ylabel("Feature")
_ = plt.show()
_ = plt.clf()

There is certainly some correlation, but no more than +- 0.5, which isn't enough for me to remove features.

### 3. Pre-Processing
###### 3.1 Categorical encoding

We'll now encode the categorical data to numbers, which is needed for the prediction models to work.



In [None]:
# Encode the categorical features as integers.
# Working copies are used so that train_df / test_df keep their original values.
test_df_pre          = test_df.copy()
train_df_pre         = train_df.copy()
_train_df            = train_df.drop(columns = "SalePrice")

# Categorical = neither numeric nor boolean, judged on train + test together so
# that both data sets share one set of codes per feature
combined_features    = pd.concat([_train_df, test_df_pre])
categorical_features = combined_features.select_dtypes(exclude = [np.number, bool]).columns
combined_df_cat      = combined_features[categorical_features].reset_index(drop=True)

# One fitted encoder and one {class: code} dictionary per categorical feature
encoders = []
mappings = []
for feature in categorical_features:
    # NOTE(review): for a column that mixes numbers and strings the encoder may see
    # every value as text, in which case the numeric values would not match the
    # mapping keys used by replace() below -- verify on GarageYrBlt.
    le = LabelEncoder().fit(list(combined_df_cat[feature]))
    encoders.append(le)
    mappings.append(dict(zip(le.classes_, range(len(le.classes_)))))

# Lookup table indexed by feature name (ordered columns, filled in one go)
encoder_mapping      = pd.DataFrame({"encoder": encoders, "mapping": mappings},
                                    index = categorical_features)

# Apply each feature's mapping to that feature only, in both working copies
for feature, mapping in zip(categorical_features, mappings):
    train_df_pre.replace({feature: mapping}, inplace=True)
    test_df_pre.replace({feature: mapping}, inplace=True)

### 4. Machine Learning - Prediction
##### 4.1 Configuration

We're now ready to build a prediction model.  First, we'll need to set up our configuration, where we'll set our seed and test size.

We'll also get training data ready:
* X = Independent columns (features)
* Y = Target Variable (SalePrice)

We'll then need to scale the data before proceeding.  This is completed after the train test split, with the scaler fitted on the training split only, as we don't want any information from the held-out test data leaking into the training data.

In [None]:
# Experiment configuration
SEED      = 42    # random seed
test_size = 0.3   # 30% test, 70% train
cv        = 5     # 5-fold cross validation

# Split the encoded training frame into the target and the predictors
target_column = "SalePrice"
y = train_df_pre[target_column]                      # Target variable
X = train_df_pre.drop(columns = [target_column])     # Independent columns (all the features used for prediction)

##### 4.2 Train Test Split
We now need to split the training data into a further train test split, using the configuration above.

In [None]:
# Hold out `test_size` of the labelled rows for evaluation (reproducible via SEED)
split_parts = train_test_split(X, y, test_size = test_size, random_state=SEED)
X_train_unscaled, X_test_unscaled, y_train, y_test = split_parts

###### 3.2 Scaling the data

Now that each feature is encoded to a number, we should scale the data.

In [None]:
# Set up the scaler
scaler = RobustScaler()


def _as_frame(values, like):
    """Wrap a scaled array in a DataFrame that carries `like`'s row and column labels."""
    return pd.DataFrame(values, index = like.index.values, columns = like.columns.values)


# Fit the scaler on the training split only, then apply it to all three data sets
X_train_scaled = _as_frame(scaler.fit_transform(X_train_unscaled), X_train_unscaled)
X_test_scaled  = _as_frame(scaler.transform(X_test_unscaled), X_test_unscaled)
test_df_scaled = _as_frame(scaler.transform(test_df_pre), test_df_pre)

# Output the final data frames
X_train           = X_train_scaled
X_test            = X_test_scaled
test_df_processed = test_df_scaled

# Scaled features plus target for every labelled row, back in index order
train_part         = pd.concat([X_train, y_train], axis = 1)
test_part          = pd.concat([X_test, y_test], axis = 1)
train_df_processed = pd.concat([train_part, test_part]).sort_index()

print(color.BOLD + color.UNDERLINE + "Head of the scaled, encoded and clened training data frame" + color.END)
print(X_train.head(n=3))
print()
print(color.BOLD + color.UNDERLINE + "Head of the scaled, encoded and clened test data frame" + color.END)
print(test_df_processed.head(n=3))

##### 4.3 First Model
###### Linear and Ensemble Mix

To start, we'll try several Linear and Ensemble models.
* Ridge Regression
* Lasso
* Random Forest Regression
* Ada
* Extra Trees Regression
* Gradient Boosting Regression

For the ensembles, I'll use `n_estimators` of 20.
We'll use negative mean squared log error as the scoring mechanism and apply 5-fold cross-validation.

In [None]:
# Ridge Regression (L2 Regularization)
# NOTE(review): the `normalize=` argument is deprecated/removed in newer
# scikit-learn releases -- confirm against the installed version.
def _fit_ridge(candidate_alphas):
    """Fit a RidgeCV on the training split for the given alpha candidates."""
    model = RidgeCV(candidate_alphas, normalize=True)
    model.fit(X_train, y_train)
    return model


# Coarse search over alpha = 1 .. 9
alphas = np.arange(1, 10, 1)
ridge = _fit_ridge(alphas)
best_alpha = ridge.alpha_

# Refinement: repeatedly search between 0.1x and 1.9x of the current best alpha
iterations = 5
for _ in range(iterations):
    alphas = [best_alpha * factor for factor in np.arange(0.1, 2, 0.1)]
    ridge = _fit_ridge(alphas)
    best_alpha = ridge.alpha_

# Cross-validated score of the final ridge model: square root of the mean squared log error per fold
ridge_neg_msle = cross_val_score(ridge, X_train, y_train, cv=cv, scoring='neg_mean_squared_log_error')
ridge_score = np.sqrt(-ridge_neg_msle)

In [None]:
# Lasso Regression (L1 Regularization)
# NOTE(review): the `normalize=` argument is deprecated/removed in newer
# scikit-learn releases -- confirm against the installed version.
lasso = LassoCV(alphas=None, max_iter=100000, normalize=True).fit(X_train, y_train)
best_alpha = lasso.alpha_

# Cross-validated score: square root of the mean squared log error per fold
lasso_neg_msle = cross_val_score(lasso, X_train, y_train, cv=cv, scoring='neg_mean_squared_log_error')
lasso_score = np.sqrt(-lasso_neg_msle)

In [None]:
def _cv_rmsle(model):
    """Square root of the per-fold mean squared log error of `model` on the training split."""
    neg_msle = cross_val_score(model, X_train, y_train, cv=cv, scoring='neg_mean_squared_log_error')
    return np.sqrt(-neg_msle)


# Five ensemble regressors, each with 20 estimators and the shared seed,
# fitted on the training split
forest  = RandomForestRegressor(n_estimators = 20, random_state = SEED).fit(X_train, y_train)
ada     = AdaBoostRegressor(n_estimators = 20, random_state = SEED).fit(X_train, y_train)
bagging = BaggingRegressor(n_estimators = 20, random_state = SEED).fit(X_train, y_train)
ETR     = ExtraTreesRegressor(n_estimators = 20, random_state = SEED).fit(X_train, y_train)
GBR     = GradientBoostingRegressor(n_estimators = 20, random_state = SEED).fit(X_train, y_train)

# Cross-validated scores, one array of fold scores per model
forest_score  = _cv_rmsle(forest)
ada_score     = _cv_rmsle(ada)
bagging_score = _cv_rmsle(bagging)
ETR_score     = _cv_rmsle(ETR)
GBR_score     = _cv_rmsle(GBR)

In [None]:
# Mean cross-validated score per model, rounded to 5 decimals, as a one-row table
first_round_scores = {
    "Ridge": ridge_score,
    "Lasso": lasso_score,
    "Forest": forest_score,
    "Ada": ada_score,
    "Bagging": bagging_score,
    "ETR": ETR_score,
    "GBR": GBR_score,
}
results = pd.DataFrame(
    {name: [round(np.mean(fold_scores), 5)] for name, fold_scores in first_round_scores.items()},
    index = ["RMSLE"],
)
results

Our best score is from a **Lasso Regression**, which gives us an RMSLE of 0.0092.
The best ensemble method is **Bagging**, closely followed by **Random Forest**.

I'll take these three models forward and see if we can improve them using hyper parameters.

##### 4.4 Hyper Parameter Tuning
###### Tuning Lasso Regression

In [None]:
# Candidate values for the Lasso grid search; the seed is fixed to a single value
lasso_param_grid = dict(
    n_alphas=[50, 100, 200, 500, 1000],
    max_iter=[500, 1000, 10000, 25000, 50000],
    selection=["cyclic", "random"],
    random_state=[SEED],
)
print(color.UNDERLINE + "Lasso Hyper Parameter Grid" + color.END)
lasso_param_grid

In [None]:
# Exhaustive search over the Lasso grid with `cv`-fold cross validation on all cores.
# No `scoring` is passed, so candidates are ranked by the estimator's own score method.
grid_search_lasso = GridSearchCV(lasso, lasso_param_grid, cv = cv, n_jobs = -1, verbose = 0)
_ = grid_search_lasso.fit(X_train, y_train)

print(color.HEADER + "Best parameters are:" + color.END)
print(color.BOLD + f"{grid_search_lasso.best_params_}" + color.END)

In [None]:
# Best Lasso found by the grid search, re-scored with the same metric as before tuning
lasso_best_grid = grid_search_lasso.best_estimator_
lasso_neg_msle_2 = cross_val_score(lasso_best_grid, X_train, y_train, scoring = "neg_mean_squared_log_error", cv = cv)
lasso_score_2 = np.sqrt(-lasso_neg_msle_2)

In [None]:
# Candidate values for the Bagging grid search; the seed is fixed to a single value.
# NOTE(review): the integer `max_samples` / `max_features` values are presumably
# interpreted as absolute counts (e.g. 1 or 4 training rows per estimator) --
# confirm that fractions were not intended.
bagging_param_grid = dict(
    bootstrap=[True, False],
    max_features=[1, 4, 10],
    max_samples=[1, 4],
    n_estimators=[10, 75, 250],
    random_state=[SEED],
)

print(color.UNDERLINE + "Bagging Regressor Hyper Parameter Grid" + color.END)
bagging_param_grid

In [None]:
# Exhaustive search over the Bagging grid with `cv`-fold cross validation on all cores.
# No `scoring` is passed, so candidates are ranked by the estimator's own score method.
grid_search_bagging = GridSearchCV(bagging, bagging_param_grid, cv = cv, n_jobs = -1, verbose = 0)
_ = grid_search_bagging.fit(X_train, y_train)

print(color.HEADER + "Best parameters are:" + color.END)
print(color.BOLD + f"{grid_search_bagging.best_params_}" + color.END)

In [None]:
# Best Bagging model found by the grid search, re-scored with the same metric as before tuning
bagging_best_grid = grid_search_bagging.best_estimator_
bagging_neg_msle_2 = cross_val_score(bagging_best_grid, X_train, y_train, scoring = "neg_mean_squared_log_error", cv = cv)
bagging_score_2 = np.sqrt(-bagging_neg_msle_2)

In [None]:
# Candidate values for the Random Forest grid search; the seed is fixed to a single value.
# `max_features` 1.0 means "use every feature at each split". It replaces the former
# "auto" setting, which selected the same behaviour for a regressor but is rejected
# by newer scikit-learn releases.
forest_param_grid = {
    "bootstrap":[True, False],
    "max_depth":[80, 100, None],
    "max_features":[4, 10, 1.0],
    "min_samples_leaf":[1, 4],
    "min_samples_split":[2, 6, 10],
    "n_estimators":[75, 250],
    "random_state":[SEED]
}

print(color.UNDERLINE + "Random Forest Regressor Hyper Parameter Grid" + color.END)
forest_param_grid

In [None]:
# Exhaustive search over the Random Forest grid with `cv`-fold cross validation on all cores.
# No `scoring` is passed, so candidates are ranked by the estimator's own score method.
grid_search_forest = GridSearchCV(forest, forest_param_grid, cv = cv, n_jobs = -1, verbose = 0)
_ = grid_search_forest.fit(X_train, y_train)

print(color.HEADER + "Best parameters are:" + color.END)
print(color.BOLD + f"{grid_search_forest.best_params_}" + color.END)

In [None]:
# Best Random Forest found by the grid search, re-scored with the same metric as before tuning
forest_best_grid = grid_search_forest.best_estimator_
forest_neg_msle_2 = cross_val_score(forest_best_grid, X_train, y_train, scoring = "neg_mean_squared_log_error", cv = cv)
forest_score_2 = np.sqrt(-forest_neg_msle_2)

##### 4.5 Results of the Hyper Parameter Tuning

In [None]:
# Mean cross-validated score before and after tuning, rounded to 7 decimals
tuning_scores = {
    "Lasso": (lasso_score, lasso_score_2),
    "Forest": (forest_score, forest_score_2),
    "Bagging": (bagging_score, bagging_score_2),
}
results = pd.DataFrame(
    {name: [round(np.mean(before), 7), round(np.mean(after), 7)]
     for name, (before, after) in tuning_scores.items()},
    index = ["Before", "After"],
)
print(color.HEADER + "RMSLE score" + color.END)
results

Great! The Lasso and Forest models have marginally improved. The bagging has got slightly worse!

How do these look across the test data?

In [None]:
def _root_msle(actual, predicted):
    """Square root of the mean squared log error between two series of values."""
    return np.sqrt(MSLE(actual, predicted))


# Predictions of the tuned models on the held-out split (still on the log1p scale)
y_pred_lasso_log1p   = lasso_best_grid.predict(X_test)
y_pred_bagging_log1p = bagging_best_grid.predict(X_test)
y_pred_forest_log1p  = forest_best_grid.predict(X_test)

# Score computed directly on the log1p-scaled values
lasso_test_RMSLE   = _root_msle(y_test, y_pred_lasso_log1p)
bagging_test_RMSLE = _root_msle(y_test, y_pred_bagging_log1p)
forest_test_RMSLE  = _root_msle(y_test, y_pred_forest_log1p)

# Undo the log1p transformation for predictions and targets
y_pred_lasso   = np.expm1(y_pred_lasso_log1p)
y_pred_bagging = np.expm1(y_pred_bagging_log1p)
y_pred_forest  = np.expm1(y_pred_forest_log1p)
y_test_expm1   = np.expm1(y_test)

# Score computed on the back-transformed values
lasso_test_RMSLE_expm1   = _root_msle(y_test_expm1, y_pred_lasso)
bagging_test_RMSLE_expm1 = _root_msle(y_test_expm1, y_pred_bagging)
forest_test_RMSLE_expm1  = _root_msle(y_test_expm1, y_pred_forest)

# One column per model: CV score before tuning, CV score after tuning, and the two test scores
score_summary = {
    "Lasso": (lasso_score, lasso_score_2, lasso_test_RMSLE, lasso_test_RMSLE_expm1),
    "Forest": (forest_score, forest_score_2, forest_test_RMSLE, forest_test_RMSLE_expm1),
    "Bagging": (bagging_score, bagging_score_2, bagging_test_RMSLE, bagging_test_RMSLE_expm1),
}
results = pd.DataFrame(
    {name: [round(np.mean(untuned), 7), round(np.mean(tuned), 7), test_log, test_raw]
     for name, (untuned, tuned, test_log, test_raw) in score_summary.items()},
    index = ["No Tuning", "Hyper Tuning", "Test Data", "Test Data (expm1)"],
)
print(color.HEADER + "RMSLE score" + color.END)
results

Pretty good.  The test data has performed similar to the hyper tuned training data.
We applied a log1p transformation to the SalePrice, due to skewness.  When we reverse this (expm1), the score is slightly worse but still good, except for Bagging.

We'll proceed with Lasso and Forest only.

How does this look when visualised?

##### 4.6 Visualisation of results

In [None]:
# x positions spanning the range of the actual prices, and a flat zero reference line
t = np.linspace(min(y_test_expm1), max(y_test_expm1), len(y_test_expm1))
zero_line = np.zeros(len(t))

# Simple ensemble: element-wise mean of the two models' predictions
y_pred_mean = np.mean([y_pred_forest, y_pred_lasso], axis=0)


def _residual_panel(title, predicted):
    """Scatter (predicted - actual) against the actual price on the current axes."""
    plt.title(title)
    plt.plot(t, zero_line, c = "red")
    plt.scatter(y_test_expm1, predicted - y_test_expm1, alpha = 0.3)


# First figure: one residual panel per model
plt.figure(figsize=(20, 8))

plt.subplot(1, 2, 1)
_residual_panel("Lasso | Difference in predicted value vs expected value", y_pred_lasso)

plt.subplot(1, 2, 2)
_residual_panel("Random Forest | Difference in predicted value vs expected value", y_pred_forest)

plt.show()

# Second figure: residuals of the averaged prediction
plt.figure(figsize=(20, 8))
_residual_panel("Mean of Lasso and Random Forest | Difference in predicted value vs expected value", y_pred_mean)
plt.show()


print(color.BOLD + color.RED + "RMSLE of the mean of the lasso and random forest predictions" + color.END)
print(
    round(np.sqrt(MSLE(y_test_expm1, y_pred_mean)), 4)
)

We can see that both Lasso and Random Forest are poor at predicting the more expensive properties.  This isn't surprising considering we have much less data.

However, by taking the mean of both results, we get a better result and an improved RMSLE.

**Summary**

Whilst a good result, it's clear that properties with a low sale price, or (more significantly) properties with a higher sale price are being predicted incorrectly.

Would training a model on only high-end properties make a difference?

For now, I'll submit my results and delve deeper another day.

In [None]:
# Predict the competition test set with both tuned models (log1p scale)
final_y_pred_lasso_log1p  = lasso_best_grid.predict(test_df_scaled)
final_y_pred_forest_log1p = forest_best_grid.predict(test_df_scaled)

# Back-transform the predictions to prices
final_y_pred_lasso  = np.expm1(final_y_pred_lasso_log1p)
final_y_pred_forest = np.expm1(final_y_pred_forest_log1p)

# Final prediction: element-wise mean of the two models
both_models  = [final_y_pred_lasso, final_y_pred_forest]
final_y_pred = np.mean(both_models, axis = 0)

# Submitting Prediction: one row per test Id with its predicted SalePrice
submission = pd.DataFrame({'Id': test_df_scaled.index, 'SalePrice': final_y_pred})
submission.head()

In [None]:
# Write the submission table to submission.csv in the working directory,
# without the DataFrame's row index
submission.to_csv('submission.csv', index=False)
print('Submission saved.')