In [None]:
# Essentials
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read in the data
# !ls ../input/house-prices-advanced-regression-techniques
train_data = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
test_data = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

# Check the shapes
train_data.shape, test_data.shape

# EDA

We can clearly see below that there is a good mix of Numerical and Categorical Features which could pose difficulties in exploring the data comprehensively.

In [None]:
train_data.head()

In [None]:
# Get a rough overview of the training data
train_data.describe()

In [None]:
# Get a look at the column names and object types
train_data.info()

We don't need the `Id` column for the Training Set so we drop it (it'd act as an unnecessary feature otherwise).

In [None]:
# Drop the ID column.
# Reassign instead of using `inplace=True`, and ignore a missing column, so that
# re-running this cell is a no-op rather than a KeyError.
train_data = train_data.drop(columns='Id', errors='ignore')

Our Pipelines will be easier to construct if we can gather the names of the Numerical and Categorical/Object Features.

In [None]:
# Collect the names of the Categorical and Numeric Variables seperately
num_columns = train_data.select_dtypes(include=np.number).columns.tolist()
num_columns.remove("SalePrice") # Capturing feature names exclusively
cat_columns = train_data.select_dtypes(exclude=np.number).columns.tolist()

# Check if the number makes sense (+1 for the target variable that was dropped)
len(num_columns) + len(cat_columns) + 1 == len(train_data.columns)

Other than collecting the names of features based on what kind of data they store, we can also find a number of features with similar names. Specifically there are four keywords that are repeated a few times so they may require similar Engineering or special treatment.

In [None]:
# Explore Categorical Columns
# cat_columns <- Find a number of variables related to each other by name

# Explore overlapping variable names
repetitive = ["Bsmt", "Garage", "Sale", "Kitchen"]
similar_cols = []
print("Looking for highly similar variable names")
print('--'*30)
for col in (num_columns + cat_columns):
    if any(x in col for x in repetitive):
        print(col)
        similar_cols.append(col)

The Cardinality of a Feature refers to the number of Unique Values in that set. We should be on the lookout for:
* High Cardinality Categorical Variables: for example, if Zipcodes were not stored as numbers, it could cause a Transformer like `OneHotEncoder` to explode
* Low Cardinality Numeric Variables: for instance, if a feature had values `[1,2,3,4,5]`, we could create new useful features out of it

In [None]:
# Check the cardinality of each of these variables
print("Looking at Categorical Variable Cardinalities")
print('--'*30)
for col in cat_columns:
    uniques = train_data[col].unique()
    if len(uniques) > 10:
        print(f"{len(uniques)} values in {col}")
    else:
        print(f"{len(uniques)} values in {col}: {uniques}")

In [None]:
# Are there any low cardinality numeric variables?
print("Checking for Low Cardinality Numeric Variables")
print("--"*30)
for col in num_columns:
    uniques = train_data[col].unique()
    if len(uniques) < 20:
        print(f"{len(uniques)} unique values in {col}: {sorted(uniques)}")

Even though some features like `YrSold` are discrete enough to become Categorical Variables, we can leave them this way since their hierarchy being preserved makes sense.

Exploring the Linear Correlation between Features is helpful since it can 
* Highlight which features *look to be the* most useful for predicting the target variable 
* Shed some light on which features are highly correlated together in which case they could be *mutually redundant* (this helps in Feature Selection)

In [None]:
# Explore which numeric columns have high linear correlation.
# Select the numeric columns explicitly: `train_data` still contains object
# (string) columns here, and recent pandas versions raise on DataFrame.corr()
# instead of silently skipping them.
corr_matrix = train_data.select_dtypes(include=np.number).corr()
plt.figure(figsize=(12,8))
sns.heatmap(corr_matrix, cmap='Blues')

In [None]:
# Sort the highest linear correlations with target variable
target_var = "SalePrice"
corr_matrix[target_var].apply(lambda x: abs(x)).sort_values(ascending=False)

We find that the two features `OverallQual` and `GrLivArea` are the most *linearly* correlated with our target so we can choose to explore them a bit further.

In [None]:
# Explore the second variable (notice the significant Linear Correlation)
sns.scatterplot(x="OverallQual", y="SalePrice", data=train_data)

The following cells explore the distribution of the Target Variable and it can be seen that it is **right-skewed** (positively skewed, with a long tail of expensive houses) and has Outliers (as seen from the Percentiles and the critical values). Some ways to deal with such distributions are to:
* Perform a logarithmic transformation on the values (Normalization and Standardization do not alter the skewness, just the scale)
* Explore the Percentiles to come up with some upper/lower thresholds beyond which the values are set to something more common (Boxplots also help here)

We use a Log Transform here: it is simple and the resulting distribution is visually very similar to a Gaussian.

In [None]:
# Explore the Distribution of the Target Variable
sns.distplot(train_data[target_var])

In [None]:
# Where do most values lie under? Explore the Percentiles.
for i in range(95,100):
    print(f"{i}% of the target values lie under: {int(np.percentile(train_data[target_var], i))}")
print(f"Critical Values:\n\tMax:{train_data[target_var].max()}\n\tMin:{train_data[target_var].min()}")

In [None]:
# We can get rid of Outliers by setting some thresholds
upper_thresh = 38500
# train_data[train_data[target_var] > upper_thresh][target_var] = upper_thresh

In [None]:
# Log Transform to reduce skewness of the Target Distribution
print(f"Before Log Transform: Skewness {stats.skew(train_data.SalePrice)}")
train_data["SalePrice"] = np.log1p(train_data["SalePrice"])
print(f"After Log Transform: Skewness {stats.skew(train_data.SalePrice)}")
# y = np.expm1(y)
print(f"Applying Inverse Transformation: Skewness {stats.skew(np.expm1(train_data.SalePrice))}") # This is to demonstrate retaining our original targets
print(f"Final Skewness: {stats.skew(train_data.SalePrice)}")
sns.distplot(train_data["SalePrice"])

In [None]:
# Split the data before moving on
X = train_data.drop(target_var, axis=1)
y = train_data[target_var]
X.shape, y.shape

# Data Cleaning: Dealing with Missing Values

The only two necessary steps for preparing Data for Modeling are:
1. Dealing with Missing values (`nan`)
2. Finding some numerical representation for Categorical/Non-numeric Variables

In Data Cleaning, the focus is on the first step.

In [None]:
# Get a visual of how many values are missing
missing_count = X.isnull().sum()
missing_count = missing_count[missing_count > 0]
missing_cols = pd.DataFrame(missing_count).index.tolist()
plt.figure(figsize=(12,8))
sns.heatmap(X[missing_cols].isnull(), cmap='viridis', cbar=False)

In [None]:
# Get actual numbers 
missing_count.sort_values(ascending=False) / len(X) * 100

The variable `PoolQC` has enough missing values that we could be lazy and drop the entire column, but it may be of benefit to us if we fill the missing values with a placeholder category (`'NONE'`) since the feature may simply *not exist* for that house. 

We can apply similar thinking to the features related to the Garage and Basements of these houses.

In [None]:
# Which missing value columns are numeric and which are categorical
print(X[missing_cols].dtypes)
X[missing_cols].head(10)

In [None]:
# Define a function for Data Cleaning
def handle_missing(df):
    """Fill the missing values of a house-prices feature frame.

    The frame is modified in place and also returned. Columns named below that
    are not present in `df` are skipped, so the function can be applied to
    frames that only carry a subset of the features.

    Strategy, in order:
      1. `LotFrontage`, `MasVnrArea`  -> column median.
      2. `Electrical` -> "SBrkr"; `MasVnrType` -> its most frequent value;
         `GarageYrBlt` -> 0.
      3. Categorical features where a missing value means "the house does not
         have one" -> the placeholder 'NONE'.
      4. Anything still missing: numeric columns -> column median,
         all other columns -> 'NONE'.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame to clean.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with missing values filled.
    """
    # LotFrontage, MasVnrArea are generic numeric features so we can fill with the median
    for col in ('LotFrontage', 'MasVnrArea'):
        if col in df.columns:
            df[col] = df[col].fillna(df[col].median())

    # Electrical gets a hard-coded category (presumably the most frequent one in
    # the training data). It must be filled BEFORE the generic 'NONE' fill and is
    # therefore not part of `none_fill_cols`, otherwise this line never applies.
    if 'Electrical' in df.columns:
        df['Electrical'] = df['Electrical'].fillna("SBrkr")

    # MasVnrType: fill with the mode. `Series.mode()` returns a Series, and passing
    # a Series to `fillna` aligns on the index (only label 0 would be filled), so
    # take the first mode as a scalar instead.
    if 'MasVnrType' in df.columns:
        modes = df['MasVnrType'].mode()
        if not modes.empty:
            df['MasVnrType'] = df['MasVnrType'].fillna(modes.iloc[0])

    # A missing garage year means this house instance has no garage
    if 'GarageYrBlt' in df.columns:
        df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)

    # Some features have missing values because one does not exist for that instance
    none_fill_cols = "Alley BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2 FireplaceQu GarageType GarageFinish GarageQual GarageCond PoolQC Fence MiscFeature".split()
    present_none_cols = [col for col in none_fill_cols if col in df.columns]
    if present_none_cols:
        df[present_none_cols] = df[present_none_cols].fillna('NONE')

    # If the testing data has any surprises, we can apply a generic strategy
    num_cols = df.select_dtypes(include=np.number).columns
    cat_cols = df.select_dtypes(exclude=np.number).columns
    for col in num_cols:
        df[col] = df[col].fillna(df[col].median())
    for col in cat_cols:
        df[col] = df[col].fillna('NONE')

    return df
    
# Apply this to a copy of the DataFrame and check
tmp = X.copy()
tmp = handle_missing(tmp)
tmp.isnull().sum()[tmp.isnull().sum() > 0]

In [None]:
# Apply this to the actual data
X = handle_missing(X)
X.isnull().sum().max()

# Feature Engineering and Feature Selection

Some aspects of Feature Engineering include:
* Create Aggregated Features (can result in more robust models, as will be seen later)
* Dealing with heavily Skewed features (either dropping them, transforming them or doing nothing)
* Scaling/Normalizing numeric variables (really only required for **non-Tree-based** algorithms like SVMs, Linear Regression, MLP etc.)
* Encodings for Categorical Variables (the only thing that's necessary to carry out here)

In [None]:
# Some interesting features we can create
def new_features(X):
    """Add aggregated and indicator features to a house-prices feature frame.

    The new columns are written into `X` in place (in the order listed) and the
    same frame is returned:

      * `HasWoodDeck`, `HasOpenPorch`, `HasEnclosedPorch`, `Has3SsnPorch`,
        `HasScreenPorch`: 1 when the corresponding area column is > 0, else 0.
      * `Total_Home_Quality`: `OverallQual + OverallCond`.
      * `TotalSF`: `TotalBsmtSF + 1stFlrSF + 2ndFlrSF`.
      * `TotalSquareFootage`: `BsmtFinSF1 + BsmtFinSF2 + 1stFlrSF + 2ndFlrSF`.
      * `HasPool`, `Has2ndFloor`, `HasGarage`, `HasBsmt`, `HasFireplace`:
        1 when the corresponding area/count column is > 0, else 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing the source columns named above.

    Returns
    -------
    pandas.DataFrame
        `X` with the additional columns.
    """
    def _has(column):
        # 1 when the house actually has the amenity (strictly positive value), else 0.
        # The previous `== 0` test produced the opposite of what the `Has*` names say.
        return (X[column] > 0).astype(int)

    X['HasWoodDeck'] = _has('WoodDeckSF')

    X['HasOpenPorch'] = _has('OpenPorchSF')
    X['HasEnclosedPorch'] = _has('EnclosedPorch')
    X['Has3SsnPorch'] = _has('3SsnPorch')
    X['HasScreenPorch'] = _has('ScreenPorch')

    X['Total_Home_Quality'] = X['OverallQual'] + X['OverallCond']
    X['TotalSF'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']
    X['TotalSquareFootage'] = (X['BsmtFinSF1'] + X['BsmtFinSF2'] + X['1stFlrSF'] + X['2ndFlrSF'])

    X['HasPool'] = _has('PoolArea')
    X['Has2ndFloor'] = _has('2ndFlrSF')
    X['HasGarage'] = _has('GarageArea')
    X['HasBsmt'] = _has('TotalBsmtSF')
    X['HasFireplace'] = _has('Fireplaces')

    return X

X  = new_features(X)
len(X.columns)

In [None]:
# Find the heavily skewed features
num_columns = X.select_dtypes(include=np.number).columns
skewed_features = X[num_columns].apply(lambda x: abs(stats.skew(x))).sort_values(ascending=False)
high_skewed = skewed_features[skewed_features > 0.5]
high_skewed 

We can set some threshold above which the filtered features will undergo a Logarithmic Transformation but a couple of the features justify their high skewness; e.g. only a few luxury homes have pools so there would naturally be outliers and a pulled distribution.

In [None]:
X.shape, y.shape

As a final jab at EDA, we shall explore Feature Selection. There are many classes made specifically for this making use of tools like Chi-square tests and whatnot, but a simple and effective approach is to simply **fit a `RandomForest` on the data**.

The forest is made of many Decision Trees so by iterating through each Tree and seeing which splits contribute to better drops in impurity/entropy, the Ensemble can get an idea of which features are *more important* for predicting the target. Since the Ensemble is based on Trees, we only have to 
* Make sure there are no missing values
* Numerically Encode the Categorical Variables (Label vs One Hot doesn't matter for tree-based models since the whole space is explored value-by-value)

In [None]:
# Label Encode a copy of the data
from sklearn import preprocessing
cat_columns = X.select_dtypes(exclude=np.number).columns
fi_data = X.copy()
for feat in cat_columns:
    # A fresh encoder per column: each categorical feature gets its own integer codes
    fi_data[feat] = preprocessing.LabelEncoder().fit_transform(fi_data[feat])
# Use a RandomForest model to look at the Feature Importances.
# The forest is seeded so that the importance ranking shown in the next cells
# is the same on every run of the notebook.
from sklearn.ensemble import RandomForestRegressor
forest_fi = RandomForestRegressor(n_estimators=100,
                                 min_samples_leaf=5,
                                 min_samples_split=5,
                                 n_jobs=-1,
                                 random_state=42).fit(fi_data, y)

In [None]:
# Extract and visualize the importances
importances = forest_fi.feature_importances_
feat_imps = pd.Series(importances, index=fi_data.columns)
feat_imps

In [None]:
# Check the top 10 most relevant features to the target variable
feat_imps = feat_imps.sort_values(ascending=False)
feat_imps[:10]

In [None]:
# Visualize the importances
plt.figure(figsize=(11,9))
plt.title("Feature Importances after Engineering")
feat_imps[:15].plot.bar()

Incredible. Three of our aggregated features are among the best that can be used for predicting the Target Variable. 

The `OverallQual` feature still reigns the highest and by a significant margin at that. Other than that we can also see the `YearBuilt` feature among the top predictors, which suggests that how recently a house was built has a strong influence on its price (most likely, newer houses tend to sell for more).

Another interesting note is that there are no Basement related features in this top-15 list, but there are a number of *Garage* related features. This could imply that in practice, Garages are more valuable than Basements which makes sense.

# Full Pipeline and Modeling

Now that we have gone through the whole process step-by-step, it would help to functionalize everything (create a Pipeline) for efficiency and code-reproducibility in case we want to run more experiments later.

The following cell applies every transformation/piece of engineering we did thus far.

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn import preprocessing
train_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
test_ids = test_data.Id # TEST IDS STORED HERE!

# Scale and seperate Target Variable
target_var = 'SalePrice'
train_data[target_var] = np.log1p(train_data[target_var]) # TARGET SCALED HERE!
X = train_data.drop(['Id', target_var], axis=1)
y = train_data[target_var]
X_test = test_data.drop('Id', axis=1)

# Data Cleaning
X = handle_missing(X)
# Clean the Id-free test features built above. Passing `test_data` here would
# discard that frame, bring the `Id` column back into `X_test`, and mutate the
# raw `test_data` frame.
X_test = handle_missing(X_test)

# Feature Engineering
X = new_features(X)
X_test = new_features(X_test)

# Pipelines
num_columns = X.select_dtypes(include=np.number).columns
cat_columns = X.select_dtypes(exclude=np.number).columns
pipeline = ColumnTransformer([
    ("one_hot_encoder", preprocessing.OneHotEncoder(sparse=False, handle_unknown='ignore'), cat_columns),
    ("standard_scaler", preprocessing.StandardScaler(), num_columns)
])
X = pipeline.fit_transform(X)
X_test = pipeline.transform(X_test) # Making sure to not fit to the testing set

In [None]:
X.shape, y.shape, X_test.shape

Now to finally begin modeling. 

In keeping up with proper Cross Validation schemes, we define a function that returns the RMSE of a model when validated over 5 folds of the data. This is better than using `train_test_split` since that validates the model on only one fold and is vulnerable to randomness influencing the model's performance.

In [None]:
# Import dependencies and models
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Instantiate a dict (+function) for storing model scores
scores = {}
def get_cv_score(estimator):
    """Return the per-fold RMSE of `estimator` under 5-fold cross-validation.

    The estimator is evaluated on the notebook-level `X` and `y`. The result is
    an array with one RMSE value per fold.
    """
    neg_mse_per_fold = cross_val_score(
        estimator,
        X=X,
        y=y,
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    # The scorer reports the negated MSE, so flip the sign before taking the root
    return np.sqrt(-neg_mse_per_fold)

We start with a Linear Regression model that performs *much much worse* than the other strong learners.

In [None]:
# Start with a simple Linear Model
lin_reg = LinearRegression()
scores['linear_regression'] = get_cv_score(lin_reg)
scores['linear_regression']

The cell below takes ~5mins to finish evaluating the models

In [None]:
%%time
import time

# Change the boolean to Cross Validate the Base Models
check_cv = False

if check_cv:
    start = time.time()
    # SVM with Linear Kernel
    linear_svr = SVR(kernel='linear', 
                     C=10, 
                     epsilon=0.01, 
                     gamma=0.0005)
    scores['linear_svr'] = get_cv_score(linear_svr)
    print(f"Finished Linear SVR: {time.time()-start:0.2f}sec")
    start = time.time()
    # SVM with RBF kernel
    svr = SVR(kernel='rbf', 
              C=10, 
              epsilon=0.01, 
              gamma=0.0005)
    scores['svr'] = get_cv_score(svr)
    print(f"Finished Kernel SVR: {time.time()-start:0.2f}sec")
    start = time.time()
    # Random Forest
    rfr = RandomForestRegressor(n_estimators=250, 
                                max_depth=15, 
                                min_samples_leaf=5, 
                                min_samples_split=5, 
                                n_jobs=-1,
                               random_state=42)
    scores['rfr'] = get_cv_score(rfr)
    print(f"Finished Random Forest: {time.time()-start:0.2f}sec")
    start = time.time()
    # Gradient Boosting
    gbr = GradientBoostingRegressor(n_estimators=350, 
                                    learning_rate=0.1, 
                                    loss='huber',
                                   random_state=42)
    scores['gbr'] = get_cv_score(gbr)
    print(f"Finished Gradient Boosting: {time.time()-start:0.2f}sec")
    start = time.time()
    # LGBM
    lgbr = LGBMRegressor(objective='regression',
                        n_estimators=300,
                        learning_rate=0.1,
                        random_state=42)
    scores['lgbr'] = get_cv_score(lgbr)
    print(f"Finished LGBM: {time.time()-start:0.2f}sec")
    start = time.time()
    # AdaBoost with DT Base Estimator
    ada = AdaBoostRegressor(n_estimators=150, 
                            random_state=42)
    scores['ada'] = get_cv_score(ada)
    print(f"Finished AdaBoost: {time.time()-start:0.2f}sec")
    start = time.time()
    # Ending with XGBoost
    xgb = XGBRegressor(n_estimators=300,
                      max_depth=5, 
                      learning_rate=0.1,
                      random_state=42)
    scores['xgb'] = get_cv_score(xgb)
    print(f"Finished XGBoost: {time.time()-start:0.2f}sec")

    # Evaluate models before any serious Hyperparameter tuning
    print(f"AdaBoost: {scores['ada'].mean()}")
    print(f"LGBM: {scores['lgbr'].mean()}")
    print(f"GradientBoosting: {scores['gbr'].mean()}")
    print(f"RandomForest: {scores['rfr'].mean()}")
    print(f"Linear SVR: {scores['linear_svr'].mean()}")
    print(f"Kernel SVR: {scores['svr'].mean()}")
    print(f"XGBoost: {scores['xgb'].mean()}")

```
AdaBoost: 34566.742546702626
LGBM: 29158.163160149958
GradientBoosting: 26674.092765014524
RandomForest: 30125.682357409016
Kernel SVR: 43880.25669866572
Linear SVR: 81085.44385548346
XGBoost: 26470.758028378285
```
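What we can observe (note that the RMSE values above are on a dollar scale, so they appear to have been recorded from a run on the untransformed `SalePrice`; with the log-transformed target used in this notebook the scores will be on a much smaller scale):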
* The SVMs performed the worst (though the RBF Kernel performed much better than the Linear Kernel)
* AdaBoost got 5th place
* RandomForests got 4th place
* LGBM got 3rd
* Gradient Boosting got 2nd
* XGBoost performed the best, unsurprisingly, although the training time was nearly 19 times higher

The following cell is inspired by this notebook (the hyperparameter values are taken from it):
https://www.kaggle.com/lavanyashukla01/how-i-made-top-0-3-on-a-kaggle-competition#Train-a-model

In [None]:
# Light Gradient Boosting Regressor
lightgbm = LGBMRegressor(objective='regression', 
                       num_leaves=6,
                       learning_rate=0.01, 
                       n_estimators=7000,
                       max_bin=200, 
                       bagging_fraction=0.8,
                       bagging_freq=4, 
                       bagging_seed=8,
                       feature_fraction=0.2,
                       feature_fraction_seed=8,
                       min_sum_hessian_in_leaf = 11,
                       verbose=-1,
                       random_state=42)

# XGBoost Regressor
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=6000,
                       max_depth=4,
                       min_child_weight=0,
                       gamma=0.6,
                       subsample=0.7,
                       colsample_bytree=0.7,
                       nthread=-1,
                       scale_pos_weight=1,
                       seed=27,
                       reg_alpha=0.00006,
                       random_state=42)

# Support Vector Regressor
svr = SVR(C= 20, epsilon= 0.008, gamma=0.0003)

# Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=6000,
                                learning_rate=0.01,
                                max_depth=4,
                                max_features='sqrt',
                                min_samples_leaf=15,
                                min_samples_split=10,
                                loss='huber',
                                random_state=42)  

# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=2200,
                          max_depth=15,
                          min_samples_split=5,
                          min_samples_leaf=5,
                          max_features=None,
                          oob_score=True,
                          random_state=42)

# Stack up all the models above, optimized using xgboost
stack_gen = StackingRegressor(estimators=[
                                ('xgboost',xgboost), 
                                ('lightgbm',lightgbm), 
                                ('svr',svr),  
                                ('gbr',gbr), 
                                ('rf',rf)],
                                final_estimator=xgboost,
                                n_jobs=-1)

We make use of a `StackingRegressor` here, which takes a collection of models and aggregates their predictions by having a *meta-learner* treat them as the independent variables and the true values as the targets. This is another form of Ensembling.

## Fit the Models

In [None]:
%%time
# Stacking Regressor
stack_gen.fit(X, y)

In [None]:
%%time
# Random Forest
rf.fit(X, y)

In [None]:
%%time
# XGBoost
xgboost.fit(X, y)

In [None]:
%%time
# Gradient Boosting
gbr.fit(X, y)

In [None]:
%%time
# SVR
svr.fit(X, y)

In [None]:
%%time
# LGBM
lightgbm.fit(X, y)

With all the models trained on the data, we can get their predictions and *blend* them (the final predictions will be a linear combination/weighted sum of the individual models' predictions so that the stronger models aren't totally neglected). The weights should obviously add up to 1.

In [None]:
0.1 + 0.2 + 0.2 + 0.1 + 0.05 + 0.35

In [None]:
# Blend the predictions
def blended_predictions(X):
    """Return the weighted sum of the fitted models' predictions for `X`.

    Uses the notebook-level models `svr`, `gbr`, `xgboost`, `lightgbm`, `rf`
    and `stack_gen`, which must already be fitted. The weights add up to 1.
    """
    weighted_models = (
        (0.1, svr),
        (0.2, gbr),
        (0.2, xgboost),
        (0.1, lightgbm),
        (0.05, rf),
        (0.35, stack_gen),
    )
    blend = None
    for weight, model in weighted_models:
        contribution = weight * model.predict(X)
        # Accumulate left to right, in the order the models are listed
        blend = contribution if blend is None else blend + contribution
    return blend

# Get the submission file ready, REMEMBERING to invert the log transform we applied earlier
submission = pd.read_csv("../input/house-prices-advanced-regression-techniques/sample_submission.csv")
submission.iloc[:,1] = np.floor(np.expm1(blended_predictions(X_test)))

submission.to_csv("submission_regression.csv", index=False)

In [None]:
print("FIN")