# House Prices

Final model: an equal-weight voting ensemble, i.e. the plain average of the individual regressors' predictions. <br>

# Data

In [None]:
import time
from datetime import datetime
#measure notebook running time
start_time = time.time()

%matplotlib inline

# backbone
import os, warnings
import numpy as np 
from numpy.random import seed
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor,RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, Lasso, Ridge,LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.svm import SVR

import xgboost as xgb
import lightgbm as lgb

from category_encoders import MEstimateEncoder

sns.set(style='darkgrid', context='notebook', palette='deep', rc={'figure.figsize':(10,8)})
print("loaded ...")

In [None]:
# Reproducibility
def set_seed(sd=13):
    seed(sd)
    np.random.seed(sd)
    os.environ['PYTHONHASHSEED'] = str(sd)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
set_seed(13)

In [None]:
def make_mi_scores(X, y):
    """Return mutual-information scores of every column of `X` against `y`.

    Object and category columns are integer-encoded with `factorize` on a
    copy of `X`; every integer-typed column is then flagged as discrete.
    The result is a Series named "MI Scores", indexed by column name and
    sorted from highest to lowest score.
    """
    features = X.copy()
    label_columns = features.select_dtypes(["object", "category"]).columns
    for column in label_columns:
        features[column] = features[column].factorize()[0]
    # After encoding, integer dtype is what marks a feature as discrete
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]
    raw_scores = mutual_info_regression(features, y, discrete_features=is_discrete, random_state=0)
    ranked = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return ranked.sort_values(ascending=False)

In [None]:
# Scaler instance reused by every scaling step further down.
# NOTE: despite the name `mm`, the active choice is StandardScaler; the two
# commented lines are alternatives kept for quick experiments.
mm = StandardScaler()
#mm = MinMaxScaler()
#mm = RobustScaler()

In [None]:
# Load both competition files and stack them so cleaning and feature
# engineering are applied to train and test rows alike.
TRAIN = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
TEST = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')
TRAIN['Set'] = "Train"
TEST['Set'] = "Test"
TEST['SalePrice'] = -1  # placeholder target so both frames share the same columns
# DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
DATA = pd.concat([TRAIN, TEST])
# The per-file row labels are kept as an 'index' column (dropped again before modelling).
DATA.reset_index(inplace=True)
print("DATA set ...")

## Drop Outliers

In [None]:
# Remove training rows with more than 4000 sq ft of living area; test rows are never dropped.
is_train_outlier = (DATA.Set == "Train") & (DATA['GrLivArea'] > 4000)
DATA = DATA.drop(DATA[is_train_outlier].index).reset_index(drop=True)

## Missing data

In [None]:
# Bar chart of missing-value counts, restricted to columns that have any
na_counts = DATA.isna().sum()
na_counts[na_counts > 0].sort_values().plot.bar();

## Fill NA

In [None]:
# Per-column defaults for missing values.
# The fill is applied with one DataFrame-level call and assigned back:
# the previous `DATA.col.fillna(..., inplace=True)` form is chained
# assignment, which does not update DATA under pandas copy-on-write.
fill_values = {
    'MSZoning': "RL",
    'Alley': "NO",
    'Utilities': 'AllPub',
    'Exterior1st': "VinylSd",
    'Exterior2nd': "VinylSd",
    'MasVnrArea': 0.,
    'BsmtCond': "No",
    'BsmtQual': "No",
    'BsmtExposure': "NB",
    'BsmtFinType1': "NB",
    'BsmtFinType2': "NB",
    'BsmtFinSF1': 0.,
    'BsmtFinSF2': 0.,
    'BsmtUnfSF': 0.,
    'TotalBsmtSF': 0.,
    'Electrical': "SBrkr",
    'BsmtFullBath': 0.,
    'BsmtHalfBath': 0.,
    'KitchenQual': "TA",
    'Functional': 'Typ',
    'FireplaceQu': "No",
    'GarageType': "No",
    'GarageYrBlt': 0,
    'GarageFinish': "No",
    'GarageCars': 0,
    'GarageArea': 0,
    'GarageQual': "No",
    'GarageCond': "No",
    'PoolQC': "No",
    'Fence': "No",
    'MiscFeature': "No",
    'SaleType': "Con",
    'SaleCondition': "Normal",
}
DATA = DATA.fillna(value=fill_values)

## Feature Eng

In [None]:
# Impute LotFrontage with the median of rows sharing Neighborhood and Street.
# A group with no observed frontage has a NaN median, so fall back to the
# overall median to guarantee no missing values remain.
group_median = DATA.groupby(['Neighborhood', 'Street'])['LotFrontage'].transform('median')
DATA['LotFrontage'] = DATA['LotFrontage'].fillna(group_median).fillna(DATA['LotFrontage'].median())

In [None]:
# Merge the value 3 into 2 for basement full baths (author's note: better score).
# Assigned back rather than replaced in place on an attribute access, which
# is chained assignment and does not update DATA under pandas copy-on-write.
DATA['BsmtFullBath'] = DATA['BsmtFullBath'].replace(3.0, 2.0).astype('int')
DATA['BsmtHalfBath'] = DATA['BsmtHalfBath'].astype('int')
# Two equal-width bins over the observed range, relabelled 0 / 1
DATA['KitchenAbvGr'] = pd.cut(DATA['KitchenAbvGr'], 2).astype('category').cat.rename_categories([0, 1])
# Cap the fireplace count at 2
DATA['Fireplaces'] = DATA['Fireplaces'].clip(upper=2).astype('int')
# Garages built in 2000 or later are 'recent'; everything else is 'old'
DATA['GarageAgeCat'] = np.where(DATA['GarageYrBlt'] >= 2000, 'recent', 'old')
DATA['GarageCars'] = DATA['GarageCars'].astype('int')

In [None]:
# Ordinal scale for the quality/condition grades; "No" marks an absent feature.
marks = {"No":0, "Po": 1, 'Fa': 2, "TA": 3, 'Gd': 4, 'Ex': 5}

def mark_to_num(mark):
    """Translate one grade label into its ordinal value (KeyError on unknown labels)."""
    return marks[mark]

graded_columns = [
    'ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu',
    'GarageQual', 'GarageCond', 'PoolQC', 'BsmtCond', 'BsmtQual',
]
for graded in graded_columns:
    DATA[graded] = DATA[graded].apply(mark_to_num)

In [None]:
# Engineered features. Order matters: several lines use columns created a few
# lines earlier (e.g. WOW needs Overall, TotalWow needs WOW/GarageWow/BsmtWow).
# Features tagged "dropped later" are listed in `drop_cols` further down.
DATA['BsmtFinSF'] = DATA.BsmtFinSF1 + DATA.BsmtFinSF2
DATA['Porch'] = DATA.ScreenPorch + DATA.EnclosedPorch + DATA.OpenPorchSF + DATA.WoodDeckSF + DATA['3SsnPorch']
DATA['Total_surface'] = DATA.TotalBsmtSF + DATA['1stFlrSF'] + DATA['2ndFlrSF']
DATA['Age'] = DATA.YrSold - DATA.YearBuilt
DATA['RemodAge'] = DATA.YrSold - DATA.YearRemodAdd
# NOTE(review): missing GarageYrBlt was filled with 0 above, so rows without a garage get an age close to YrSold — confirm this is intended
DATA['GarageAge'] = DATA.YrSold - DATA.GarageYrBlt
DATA['Overall'] = (DATA['OverallCond'] * DATA.OverallQual)
DATA['External_Overall'] = DATA['ExterCond'] * DATA['ExterQual']
DATA['LotArea_log'] = np.log(DATA['LotArea'])
DATA["Spaciousness"] = (DATA['1stFlrSF'] + DATA['2ndFlrSF']) / DATA.TotRmsAbvGrd
# Number of porch/deck columns with a non-zero area
DATA['Porch_types'] = DATA[['ScreenPorch', 'EnclosedPorch', 'OpenPorchSF', 'WoodDeckSF', '3SsnPorch']].gt(0.0).sum(axis=1)
DATA['WOW'] = np.sqrt(DATA['Overall'] * DATA['GrLivArea']) # "WOW factor": overall grade combined with living area
DATA["MedNhbdArea"] = DATA.groupby("Neighborhood")["GrLivArea"].transform("median")
DATA['GarageOverall'] = DATA.GarageQual * DATA.GarageCond
DATA['GarageWow'] = DATA.GarageOverall * DATA.GarageArea
DATA['BsmtWow'] = DATA.BsmtCond * DATA.BsmtFinSF # dropped later, but feeds TotalWow below
DATA['Freshness'] = DATA.Age * DATA.RemodAge # dropped later
DATA['Newness'] = np.sqrt(DATA.YearRemodAdd * DATA.GrLivArea)
DATA['TotalOverall'] = DATA['Overall'] + DATA['GarageOverall'] + DATA['External_Overall'] # dropped later
DATA['TotalWow'] = DATA['WOW'] + DATA['GarageWow'] + DATA['BsmtWow']
DATA['NewWOW'] = np.sqrt(DATA['Overall'] * DATA['GrLivArea'] * DATA.YearRemodAdd) # dropped later
# 1 when the house was sold in the year it was built
DATA['New'] = DATA.Age.apply(lambda row: 1 if row == 0 else 0)
DATA['Fresh'] = DATA.RemodAge.apply(lambda row: 1 if row == 0 else 0) # dropped later
DATA['MSZ_Age'] =(DATA.groupby(['MSZoning'])['Age'].transform(lambda x: x.median()) + DATA.Age)/2 # dropped later

# Weighted area total; the weights are hand-picked ("PCA inspired" per the author)
DATA['Grand_Total'] = DATA.GrLivArea * 0.55 + DATA.GarageArea * 0.55 + DATA.BsmtFinSF * 0.4 + DATA.Porch * 0.5 # dropped later

### 'Existential' (presence flags)

In [None]:
# 0/1 presence flags: 1 when the corresponding area/count is strictly positive
DATA['hasBsmt'] = DATA['TotalBsmtSF'].gt(0).astype('int64')
DATA['hasGarage'] = DATA['GarageArea'].gt(0).astype('int64')
DATA['hasFireplace'] = DATA['Fireplaces'].gt(0).astype('int64') #excluded, worsens score
DATA['hasPool'] = DATA['PoolArea'].gt(0).astype('int64') #excluded, worsens score

In [None]:
# Quick visual check of one engineered column against the target (training rows only);
# swap in the commented line to look at TotalWow instead.
sns.jointplot(data = DATA[DATA.Set == "Train"], x="hasPool",y="SalePrice", kind='reg');
# sns.jointplot(data = DATA[DATA.Set == "Train"], x="TotalWow",y="SalePrice", kind='reg');

In [None]:
# Engineered columns to inspect in the facet plots of the next two cells
check = sorted(["Grand_Total","Newness", "MedNhbdArea","WOW","LotArea_log",'Total_surface','BsmtFinSF','Porch',"Spaciousness", "Age", 'Freshness','GarageOverall'])

In [None]:
%%time
def plot(x,y,hue, **kwargs):
    sns.scatterplot(x=x,y=y, hue=hue);
    _=plt.xticks(rotation=90)

f = pd.melt(DATA[DATA.Set == "Train"], id_vars=['SalePrice','SaleCondition'], value_vars=check)
g = sns.FacetGrid(f, col="variable",  col_wrap=6, sharex=False, sharey=True, height=4);
g = g.map(plot, "value", "SalePrice", 'SaleCondition');
g.add_legend();

In [None]:
%%time
def plot(x,y,hue, **kwargs):
    sns.scatterplot(x=x,y=y, hue=hue);
    _=plt.xticks(rotation=90)

f = pd.melt(DATA[DATA.Set == "Train"], id_vars=['SalePrice','SaleType'], value_vars=check)
g = sns.FacetGrid(f, col="variable",  col_wrap=6, sharex=False, sharey=True, height=4);
g = g.map(plot, "value", "SalePrice", 'SaleType');
g.add_legend();

## Mutual information

In [None]:
# Mutual information of every remaining column with SalePrice, training rows only
_X = DATA[DATA.Set == "Train"].drop(['index','Id'], axis=1)
_target = _X.pop('SalePrice')  # removes the target from _X before scoring
mi_scores = make_mi_scores(_X, _target)
mi_scores.head(20)

In [None]:
mi_scores.tail(20)
# utilities not dropped, it actually helps the score (a little ...)

## Interactions from MI (Feature encoding)

In [None]:
# SalePrice vs GrLivArea, one regression panel per BldgType (training rows)
sns.lmplot(
    data=DATA[DATA.Set == "Train"],
    x='GrLivArea', y="SalePrice",
    col="BldgType", hue="BldgType",
    col_wrap=5, height=4,
    scatter_kws={"edgecolor": 'w'},
);

In [None]:
# Interaction: GrLivArea split into one column per BldgType, then scaled with `mm`
X1 = pd.get_dummies(DATA['BldgType'], prefix='Bldg').mul(DATA['GrLivArea'], axis=0)
X1 = pd.DataFrame(mm.fit_transform(X1), columns=X1.columns, index=X1.index)

In [None]:
# SalePrice vs GrLivArea, one regression panel per Neighborhood (training rows)
sns.lmplot(
    data=DATA[DATA.Set == "Train"],
    x='GrLivArea', y="SalePrice",
    col="Neighborhood", hue="Neighborhood",
    col_wrap=6, height=3,
    scatter_kws={"edgecolor": 'w'},
);

In [None]:
# Interaction: GrLivArea split into one column per Neighborhood, then scaled with `mm`
X2 = pd.get_dummies(DATA['Neighborhood'], prefix='NB').mul(DATA['GrLivArea'], axis=0)
X2 = pd.DataFrame(mm.fit_transform(X2), columns=X2.columns, index=X2.index)

In [None]:
# SalePrice vs TotalBsmtSF, one regression panel per BsmtQual grade (training rows)
sns.lmplot(
    data=DATA[DATA.Set == "Train"],
    x='TotalBsmtSF', y="SalePrice",
    col="BsmtQual", hue="BsmtQual",
    col_wrap=5, height=4,
    scatter_kws={"edgecolor": 'w'},
);

In [None]:
# Interaction: TotalBsmtSF split into one column per BsmtQual grade, then scaled with `mm`
X3 = pd.get_dummies(DATA['BsmtQual'], prefix='BSQ').mul(DATA['TotalBsmtSF'], axis=0)
X3 = pd.DataFrame(mm.fit_transform(X3), columns=X3.columns, index=X3.index)

In [None]:
# SalePrice vs GrLivArea, one regression panel per MSZoning (training rows)
sns.lmplot(
    data=DATA[DATA.Set == "Train"],
    x='GrLivArea', y="SalePrice",
    col='MSZoning', hue='MSZoning',
    col_wrap=5, height=4,
    scatter_kws={"edgecolor": 'w'},
);

In [None]:
# Excluded: built for inspection but not joined into DATA later on.
# Interaction: GrLivArea split into one column per MSZoning, then scaled with `mm`
X4 = pd.get_dummies(DATA['MSZoning'], prefix='MSZ').mul(DATA['GrLivArea'], axis=0)
X4 = pd.DataFrame(mm.fit_transform(X4), columns=X4.columns, index=X4.index)

In [None]:
# SalePrice vs GrLivArea, one regression panel per SaleCondition (training rows)
sns.lmplot(
    data=DATA[DATA.Set == "Train"],
    x='GrLivArea', y="SalePrice",
    col='SaleCondition', hue='SaleCondition',
    col_wrap=6, height=4,
    scatter_kws={"edgecolor": 'w'},
);

In [None]:
# Excluded: built for inspection but not joined into DATA later on.
# Interaction: GrLivArea split into one column per SaleCondition, then scaled with `mm`
X5 = pd.get_dummies(DATA['SaleCondition'], prefix='SaleCond').mul(DATA['GrLivArea'], axis=0)
X5 = pd.DataFrame(mm.fit_transform(X5), columns=X5.columns, index=X5.index)

In [None]:
# SalePrice vs Age, one regression panel per MSZoning (training rows)
sns.lmplot(
    data=DATA[DATA.Set == "Train"],
    x='Age', y="SalePrice",
    col='MSZoning', hue='MSZoning',
    col_wrap=6, height=4,
    scatter_kws={"edgecolor": 'w'},
);

In [None]:
# Excluded: built for inspection but not joined into DATA later on.
# Interaction: Age split into one column per MSZoning, then scaled with `mm`
X6 = pd.get_dummies(DATA['MSZoning'], prefix='MSZ').mul(DATA['Age'], axis=0)
X6 = pd.DataFrame(mm.fit_transform(X6), columns=X6.columns, index=X6.index)

In [None]:
# SalePrice vs WOW, one regression panel per SaleCondition (training rows)
sns.lmplot(
    data=DATA[DATA.Set == "Train"],
    x='WOW', y="SalePrice",
    col='SaleCondition', hue='SaleCondition',
    col_wrap=6, height=4,
    scatter_kws={"edgecolor": 'w'},
);

In [None]:
# Excluded: built for inspection but not joined into DATA later on.
# Interaction: WOW split into one column per SaleCondition, then scaled with `mm`
X7 = pd.get_dummies(DATA['SaleCondition'], prefix='SC2').mul(DATA['WOW'], axis=0)
X7 = pd.DataFrame(mm.fit_transform(X7), columns=X7.columns, index=X7.index)

In [None]:
# SalePrice vs Age, one regression panel per MSSubClass (training rows)
sns.lmplot(
    data=DATA[DATA.Set == "Train"],
    x='Age', y="SalePrice",
    col='MSSubClass', hue='MSSubClass',
    col_wrap=8, height=3,
    scatter_kws={"edgecolor": 'w'},
);

In [None]:
# Interaction: Age split into one column per MSSubClass, then scaled with `mm`
X8 = pd.get_dummies(DATA['MSSubClass'], prefix='MSSC_AGE').mul(DATA['Age'], axis=0)
X8 = pd.DataFrame(mm.fit_transform(X8), columns=X8.columns, index=X8.index)

In [None]:
# SalePrice vs WOW, one regression panel per SaleType (training rows)
sns.lmplot(
    data=DATA[DATA.Set == "Train"],
    x='WOW', y="SalePrice",
    col='SaleType', hue='SaleType',
    col_wrap=6, height=4,
    scatter_kws={"edgecolor": 'w'},
);

In [None]:
# Excluded: built for inspection but not joined into DATA later on.
# Interaction: WOW split into one column per SaleType, then scaled with `mm`
X9 = pd.get_dummies(DATA['SaleType'], prefix='ST').mul(DATA['WOW'], axis=0)
X9 = pd.DataFrame(mm.fit_transform(X9), columns=X9.columns, index=X9.index)

In [None]:
# SalePrice vs GarageWow, one regression panel per GarageFinish (training rows)
sns.lmplot(
    data=DATA[DATA.Set == "Train"],
    x='GarageWow', y="SalePrice",
    col='GarageFinish', hue='GarageFinish',
    col_wrap=6, height=4,
    scatter_kws={"edgecolor": 'w'},
);

In [None]:
# Interaction: GarageWow split into one column per GarageFinish, then scaled with `mm`
X10 = pd.get_dummies(DATA['GarageFinish'], prefix='GF').mul(DATA['GarageWow'], axis=0)
X10 = pd.DataFrame(mm.fit_transform(X10), columns=X10.columns, index=X10.index)

## Target encoding

In [None]:
# Keep the raw labels under new names: SaleType and SaleCondition themselves
# are overwritten by their target-encoded values in the next cells.
DATA = DATA.assign(SaleTypeCat=DATA['SaleType'], SaleConditionCat=DATA['SaleCondition'])

In [None]:
# Columns to target-encode, and the 20% sample of training rows used to fit the encoder.
# NOTE(review): the sampled rows are not removed from DATA, so they are also used
# to train the models later — confirm this leakage is acceptable.
encode_features = ['MSSubClass','SaleType','OverallCond','HouseStyle','GarageType', 'SaleCondition', "FullBath"]
#encode_features = ['MSSubClass','SaleType','OverallCond','HouseStyle','GarageType', 'SaleCondition']
X_encode = DATA[DATA.Set == 'Train'].sample(frac=0.2, random_state=13)
# pop() removes the target from X_encode and returns it
y_encode = X_encode.pop("SalePrice")

In [None]:
# Fit the m-estimate encoder on the sample, then overwrite the listed
# columns of DATA (train and test rows) with their encoded values.
encoder = MEstimateEncoder(cols=encode_features, m=1).fit(X_encode, y_encode)
ENC = encoder.transform(DATA.drop(columns="SalePrice"))
DATA[encode_features] = ENC[encode_features]
DATA[encode_features].head()

## Drop & Classify

In [None]:
# Columns removed from DATA in the next cell (raw inputs already folded into
# engineered features, plus engineered features the author decided not to use).
drop_cols = ['LowQualFinSF','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea',
             'MiscVal','MoSold','YrSold','1stFlrSF','2ndFlrSF' ,'BsmtUnfSF', 'YearBuilt','YearRemodAdd', 'BldgType','Neighborhood','BsmtQual','MiscFeature','Street', 'PoolQC',
             'LandSlope','RoofMatl','LotConfig','RoofStyle','BsmtHalfBath','Functional','Heating','Grand_Total', "Fresh",'MSZ_Age','BsmtWow', 'Freshness','GarageFinish','TotalOverall',
            'NewWOW','hasFireplace','hasPool']

# Columns cast to float and later scaled with `mm` (includes the target-encoded ones).
# NOTE(review): columns that appear in none of the three lists (e.g. GarageYrBlt,
# GarageAge) stay in DATA unscaled — confirm that is intended.
numeric = sorted(['LotFrontage','MasVnrArea','BsmtFinSF','GrLivArea','GarageArea','Porch','Total_surface', 'Age','RemodAge','OverallQual', 'GarageCars','LotArea',
           'ExterQual','HeatingQC','KitchenQual','FireplaceQu','GarageQual','GarageCond','Overall','External_Overall', 'LotArea_log',
                  'TotRmsAbvGrd', "Spaciousness", 'BedroomAbvGr', 'Porch_types','WOW',"MedNhbdArea", 'TotalBsmtSF', 'Newness', 'SaleType','OverallCond','HouseStyle',
                  'GarageType', 'SaleCondition','MSSubClass','GarageOverall','GarageWow','BsmtCond','BsmtFinSF1','BsmtFinSF2','TotalWow',"FullBath",
                 'hasGarage','hasBsmt'])

# Columns cast to category and later expanded into dummy variables
categorical = sorted(['Alley','LotShape','LandContour','MasVnrType','Foundation','BsmtExposure','Electrical', 'BsmtFullBath','HalfBath',
                'Fireplaces','KitchenAbvGr','PavedDrive','GarageAgeCat','Utilities','ExterCond', 'CentralAir', 'MSZoning','Fence',
                     'SaleTypeCat','SaleConditionCat',"New"])

In [None]:
# Remove the unused columns, then fix dtypes in a single cast
DATA = DATA.drop(columns=drop_cols)
dtype_by_column = {**dict.fromkeys(categorical, 'category'), **dict.fromkeys(numeric, 'float')}
DATA = DATA.astype(dtype_by_column)

In [None]:
# remaining potential candidates for target encoding
DATA.select_dtypes(["category"]).nunique().sort_values(ascending=False).head(10)

## Numeric

### Worst fit

In [None]:
# %%time
# def scatterplot(x,y,**kwargs):
#     sns.scatterplot(x=x,y=y)
#     _=plt.xticks(rotation=90)

# f = pd.melt(DATA[(DATA.Set == "Train")].loc[worst], id_vars=['SalePrice'], value_vars=numeric)
# g = sns.FacetGrid(f, col="variable",  col_wrap=6, sharex=False, sharey=True, height=4)
# g = g.map(scatterplot, "value", "SalePrice")

### All

In [None]:
%%time
def scatterplot(x,y,**kwargs):
    sns.regplot(x=x,y=y)
    _=plt.xticks(rotation=90)

f = pd.melt(DATA[DATA.Set == "Train"], id_vars=['SalePrice'], value_vars=numeric)
g = sns.FacetGrid(f, col="variable",  col_wrap=5, sharex=False, sharey=True, height=5)
g = g.map(scatterplot, "value", "SalePrice")

In [None]:
# Scale every numeric column in place. The scaler is fitted on all rows of DATA,
# i.e. on training and test rows together.
DATA[numeric] = mm.fit_transform(DATA[numeric])

## Categorical

### Worst fit

In [None]:
# %%time
# def boxplot(x,y,**kwargs):
#     sns.boxplot(x=x,y=y)
#     _=plt.xticks(rotation=90)

# f = pd.melt(DATA[DATA.Set == "Train"].loc[worst], id_vars=['SalePrice'], value_vars=categorical)
# g = sns.FacetGrid(f, col="variable",  col_wrap=7, sharex=False, sharey=True, height=3)
# g = g.map(boxplot, "value", "SalePrice")

### All

In [None]:
%%time
def boxplot(x,y,**kwargs):
    sns.boxplot(x=x,y=y)
    _=plt.xticks(rotation=90)

f = pd.melt(DATA[DATA.Set == "Train"], id_vars=['SalePrice'], value_vars=categorical)
g = sns.FacetGrid(f, col="variable",  col_wrap=6, sharex=False, sharey=True, height=4)
g = g.map(boxplot, "value", "SalePrice")

## Correlations

In [None]:
# Pairwise correlations of the numeric features and the target, training rows only
fig, ax = plt.subplots(figsize=(30,30))
train_corr = DATA.loc[DATA.Set == 'Train', [*numeric, 'SalePrice']].corr()
g = sns.heatmap(train_corr, annot=True, fmt=".2f", cmap="coolwarm")

## PCA

In [None]:
%%time
#corr_features = sorted(['GrLivArea','GarageArea','RemodAge','TotalBsmtSF','Total_surface',"Porch","OverallQual"])
corr_features = sorted(['GrLivArea','GarageArea','BsmtFinSF',"Porch"])
#pca = PCA(0.90)
pca = PCA(3)
X_PCA = pca.fit_transform(DATA.loc[:, corr_features])
component_names = [f"PC{i+1}" for i in range(X_PCA.shape[1])]
X_PCA = pd.DataFrame(X_PCA, columns=component_names)
X_PCA.head()

In [None]:
pca.explained_variance_ratio_

In [None]:
# Attach the target to the components and keep training rows only
# (test rows carry the placeholder SalePrice of -1).
DATA_PCA = X_PCA.assign(SalePrice=DATA['SalePrice'])
DATA_PCA = DATA_PCA[DATA_PCA['SalePrice'] != -1]

In [None]:
%%time
def scatterplot(x,y,**kwargs):
    sns.regplot(x=x,y=y)
    _=plt.xticks(rotation=90)

f = pd.melt(DATA_PCA, id_vars=['SalePrice'], value_vars=component_names)
g = sns.FacetGrid(f, col="variable",  col_wrap=4, sharex=False, sharey=True, height=5)
g = g.map(scatterplot, "value", "SalePrice")

In [None]:
# Explained variance per component (left) and cumulative (right)
fig, axs = plt.subplots(1, 2)
ax_single, ax_cumulative = axs
n = pca.n_components_
grid = np.arange(1, n + 1)
evr = pca.explained_variance_ratio_
cv = np.cumsum(evr)
ax_single.bar(grid, evr)
ax_single.set(xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0))
# Prepend the origin so the cumulative curve starts at (0, 0)
ax_cumulative.plot(np.r_[0, grid], np.r_[0, cv], "o-")
ax_cumulative.set(xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0))
fig.set(figwidth=8, dpi=100);

In [None]:
# Loadings: rows are the original features, columns are the principal components
loadings = pd.DataFrame(pca.components_.T, index=corr_features, columns=component_names)
loadings

## Joining 'double categories' and converting to dummies

### Condition1,2

In [None]:
# One 0/1 column per condition value, set when the value appears in either
# Condition1 or Condition2.
# Built with get_dummies instead of a per-row `FRAME.loc[i][col] = 1` loop:
# that chained assignment writes to a temporary under pandas copy-on-write,
# and the set-derived column order changed from run to run. Columns are now
# in sorted order.
cond_cols = ['Condition1', 'Condition2']
cond_first = pd.get_dummies(DATA['Condition1'], prefix='Condition', dtype=int)
cond_second = pd.get_dummies(DATA['Condition2'], prefix='Condition', dtype=int)
# add() aligns on the union of columns; clip keeps the flag at 1 when both columns agree
COND_FRAME = cond_first.add(cond_second, fill_value=0).clip(upper=1).astype(int)

DATA = DATA.join(COND_FRAME)
DATA.drop(cond_cols, axis=1, inplace = True)

In [None]:
%%time
def scatterplot(x,y,**kwargs):
    sns.boxplot(x=x,y=y)
    _=plt.xticks(rotation=90)

f = pd.melt(DATA[DATA.Set == "Train"], id_vars=['SalePrice'], value_vars=DATA[DATA.Set == 'Train'][DATA.filter(like='Condition_').columns])
g = sns.FacetGrid(f, col="variable",  col_wrap=5, sharex=False, sharey=True, height=4)
g = g.map(scatterplot, "value", "SalePrice")

### Exterior1st, Exterior2nd

In [None]:
# One column per exterior material, flagged when the material appears in either
# Exterior1st or Exterior2nd, then multiplied by GrLivArea.
# Built with get_dummies instead of a per-row `FRAME.loc[i][col] = 1` loop:
# that chained assignment writes to a temporary under pandas copy-on-write,
# and the set-derived column order changed from run to run. Columns are now
# in sorted order.
ext_cols = ['Exterior1st', 'Exterior2nd']
ext_first = pd.get_dummies(DATA['Exterior1st'], prefix='Ext', dtype=int)
ext_second = pd.get_dummies(DATA['Exterior2nd'], prefix='Ext', dtype=int)
# add() aligns on the union of columns; clip keeps the flag at 1 when both columns agree
EXT_FRAME = ext_first.add(ext_second, fill_value=0).clip(upper=1)

EXT_FRAME = EXT_FRAME.mul(DATA.GrLivArea, axis=0)
DATA = DATA.join(EXT_FRAME)
DATA.drop(ext_cols, axis=1, inplace = True)

In [None]:
%%time
def scatterplot(x,y,**kwargs):
    sns.regplot(x=x,y=y)
    _=plt.xticks(rotation=90)

f = pd.melt(DATA[DATA.Set == "Train"], id_vars=['SalePrice'], value_vars=DATA[DATA.Set == 'Train'][DATA.filter(like='Ext_').columns])
g = sns.FacetGrid(f, col="variable",  col_wrap=7, sharex=False, sharey=True, height=3)
g = g.map(scatterplot, "value", "SalePrice")

### BsmtFinType1,BsmtFinType2

In [None]:
# One column per basement finish type holding the finished area of that type:
# BsmtFinSF1 goes to the BsmtFinType1 column, BsmtFinSF2 to the BsmtFinType2 column.
# The previous loop paired areas with `enumerate(set(types))`; set order is
# arbitrary, so an area could land in the other type's column. It also wrote
# through `FRAME.loc[i][col] = ...`, which is lost under pandas copy-on-write.
# Type 2 is written first so that, when both types are equal, the type-1 area
# wins — the same outcome the original produced for that case.
type_area_pairs = [('BsmtFinType2', 'BsmtFinSF2'), ('BsmtFinType1', 'BsmtFinSF1')]
bf_cats = sorted({"BF_" + fin_type for type_col, _ in type_area_pairs for fin_type in DATA[type_col].unique()})
BF_FRAME = pd.DataFrame(0.0, index=DATA.index, columns=bf_cats)

for type_col, area_col in type_area_pairs:
    # groups maps each finish type to the row labels that carry it
    for fin_type, rows in DATA.groupby(type_col).groups.items():
        BF_FRAME.loc[rows, "BF_" + fin_type] = DATA.loc[rows, area_col]

DATA = DATA.join(BF_FRAME)
DATA.drop(['BsmtFinType1','BsmtFinType2','BsmtFinSF1','BsmtFinSF2'], axis=1, inplace = True)

In [None]:
%%time
def scatterplot(x,y,**kwargs):
    #sns.boxplot(x=x,y=y)
    sns.regplot(x=x,y=y)
    _=plt.xticks(rotation=90)

f = pd.melt(DATA[DATA.Set == "Train"], id_vars=['SalePrice'], value_vars=DATA[DATA.Set == 'Train'][DATA.filter(like="BF_").columns])
g = sns.FacetGrid(f, col="variable",  col_wrap=7, sharex=False, sharey=True, height=3)
g = g.map(scatterplot, "value", "SalePrice")

## Categorical to dummies

In [None]:
# Expand every column in `categorical` into dummy columns; drop_first removes
# the first level of each to avoid a redundant column.
DATA = pd.get_dummies(DATA,columns=categorical, drop_first=True)

## Clustering

In [None]:
# Cluster the training rows on a handful of size / quality / age features
clustering_features = ['GrLivArea','Total_surface', 'LotArea_log', 'Overall','WOW',"Spaciousness",'Porch','BsmtFinSF','Age', 'RemodAge']
kmeans = KMeans(n_clusters = 13, random_state=13)
is_train_row = DATA.Set == 'Train'
clust_data = DATA.loc[is_train_row, clustering_features].copy()
cluster_labels = kmeans.fit_predict(clust_data)
clust_data['cluster'] = pd.Series(cluster_labels, index=clust_data.index).astype('category')
clust_data['SalePrice'] = DATA.loc[is_train_row, 'SalePrice']

In [None]:
# SalePrice against each clustering feature, coloured by cluster
clust_long = clust_data.melt(value_vars=clustering_features, id_vars=["SalePrice", "cluster"])
sns.relplot(data=clust_long, x="value", y="SalePrice", hue='cluster', col="variable", col_wrap=5, height=4);

In [None]:
# Refit k-means on all rows (train and test) and use the scaled distance to
# each centroid as features.
clust_data = DATA[clustering_features]
centroid_distances = mm.fit_transform(kmeans.fit_transform(clust_data))
centroid_names = [f"Centroid_{k}" for k in range(centroid_distances.shape[1])]
X_CD = pd.DataFrame(centroid_distances, columns=centroid_names)

## Join interaction features

In [None]:
# Join the selected interaction frames and the centroid distances onto DATA by
# row label. X4, X5, X6, X7 and X9 are deliberately left out.
DATA = DATA.join([X1, X2, X3, X_CD, X8, X10])
DATA.head()

## Split

In [None]:
# Separate train and test rows again, strip the bookkeeping columns, and hold
# out 25% of the training rows for internal scoring.
bookkeeping_cols = ['Id', 'Set', 'SalePrice', 'index']
TRAIN = DATA[DATA.Set == 'Train']
TEST = DATA[DATA.Set == 'Test']
HouseIds = TEST.Id.to_list()  # kept for the submission file
TEST = TEST.drop(columns=bookkeeping_cols)
y = TRAIN.SalePrice
X = TRAIN.drop(columns=bookkeeping_cols)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=13)

In [None]:
# Count the model inputs only: TRAIN still contains Id, Set, index and SalePrice,
# which are not features, so the count comes from X.
print("Currently using {} feature columns".format(len(X.columns)))

# Models


## Random Forest

In [None]:
# %%time
# param_grid = {'n_estimators': [100, 200, 500],'max_depth': [4, 10, None], 'max_features':['auto','sqrt',0.9,0.75]}
# rf_grid = GridSearchCV(RandomForestRegressor(random_state=13, n_jobs=-1), param_grid, cv=4)
# rf_grid.fit(X, y)
# print(rf_grid.best_estimator_)
# print(rf_grid.best_params_)
# rf_score = rf_grid.best_score_
# print(rf_score) 

In [None]:
%%time
#best submission score
RF_model = RandomForestRegressor(n_estimators=500, n_jobs=-1, random_state=13).fit(X_train,y_train)

#best internal score
#RF_model = RandomForestRegressor(max_features='sqrt', n_estimators=1000, n_jobs=-1,random_state=13).fit(X_train,y_train)

In [None]:
# Top 15 random-forest feature importances
features = dict(zip(X_train.columns, RF_model.feature_importances_))
importances = pd.DataFrame({"RF": features}).sort_values("RF", ascending=False)
importances[:15].plot.bar();

## Ridge

In [None]:
# %%time
# param_grid = {'alpha': [0.01,0.1, 1, 10, 100, 1000]}
# ridge_grid = GridSearchCV(Ridge(random_state=13), param_grid, cv=4)
# ridge_grid.fit(X, y)
# print(ridge_grid.best_estimator_)
# print(ridge_grid.best_params_)
# ridge_score = ridge_grid.best_score_
# print(ridge_score) 

In [None]:
%%time
ridge_model = Ridge(alpha=10, random_state=13).fit(X_train,y_train)

## Lasso

In [None]:
# %%time
# param_grid = {'alpha': [1, 10, 100, 200, 1000]}
# lasso_grid = GridSearchCV(Lasso(random_state=13, max_iter = 10000, fit_intercept = False), param_grid, cv=4)
# lasso_grid.fit(X, y)
# print(lasso_grid.best_estimator_)
# print(lasso_grid.best_params_)
# lasso_score = lasso_grid.best_score_
# print(lasso_score) 

In [None]:
%%time
lasso_model = Lasso(alpha=100, max_iter=10000, random_state=13).fit(X_train,y_train)

## ElasticNet

In [None]:
# %%time
# param_grid = {'alpha': [0.05,0.1,0.5, 1],'l1_ratio':[0.1,0.25, 0.5, 0.75,0.9]}
# elastic_grid = GridSearchCV(ElasticNet(random_state=13, max_iter = 10000), param_grid, cv=4)
# elastic_grid.fit(X, y)
# print(elastic_grid.best_estimator_)
# print(elastic_grid.best_params_)
# elastic_score = elastic_grid.best_score_
# print(elastic_score) 

In [None]:
%%time
elastic_model= ElasticNet(alpha=0.05, l1_ratio=0.75, max_iter=10000, random_state=13).fit(X_train,y_train)

## XGBoost

In [None]:
# %%time
# #gbm_param_grid = {'learning_rate': [0.01,0.1,0.5],'n_estimators': [500, 750],'subsample': [0.75, 0.8, 0.9],'reg_alpha':[0.001, 0.01], 'reg_lambda':[0.1,1,10]}
# gbm_param_grid = {'learning_rate': [0.1],'n_estimators': [750, 1000],'subsample': [0.8, 0.9],'reg_alpha':[0.001, 0.01], 'reg_lambda':[0.1,1,10], 'max_depth': [4,6,8]}
# grid_xgb = GridSearchCV(xgb.XGBRegressor(seed = 13), gbm_param_grid,cv=4, scoring='neg_mean_squared_error', verbose = 1)
# grid_xgb.fit(X, y)
# print(grid_xgb.best_estimator_)
# print(grid_xgb.best_params_)
# xgb_score = grid_xgb.best_score_
# print(xgb_score) 

In [None]:
%%time
xg_model = xgb.XGBRegressor(objective = 'reg:squarederror', n_estimators = 500, seed = 13, subsample = 0.8, learning_rate = 0.1, reg_alpha=0.01, reg_lambda = 10).fit(X_train,y_train)

In [None]:
# Top 15 XGBoost features by 'weight' (importance_type passed to get_score)
feature_imporances = xg_model.get_booster().get_score(importance_type='weight')
importances = pd.DataFrame({"XGB": feature_imporances.values()}, index=feature_imporances.keys())
importances = importances.sort_values("XGB", ascending=False)
importances[:15].plot.bar();

## ADA Boost

In [None]:
# %%time
# ada_param_grid = {'learning_rate': [0.5, 1],'n_estimators': [500, 750], 'base_estimator__max_depth':[4, 8,  None], 'base_estimator__max_features':['auto','sqrt',0.8] }
# grid_ada = GridSearchCV(AdaBoostRegressor(random_state = 13, base_estimator=DecisionTreeRegressor()), ada_param_grid, cv=4)
# grid_ada.fit(X, y)
# print(grid_ada.best_estimator_)
# print(grid_ada.best_params_)
# ada_score = grid_ada.best_score_
# print(ada_score) 

In [None]:
%%time

#original
#ada_model = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_depth=20),learning_rate=1, n_estimators=500, random_state=13).fit(X_train,y_train)

#best
ada_model = AdaBoostRegressor(base_estimator=DecisionTreeRegressor(max_features='sqrt'),learning_rate=1, n_estimators=500, random_state=13).fit(X_train,y_train)

In [None]:
# Top 15 AdaBoost feature importances
features = dict(zip(X_train.columns, ada_model.feature_importances_))
importances = pd.DataFrame({"ADA": features}).sort_values("ADA", ascending=False)
importances[:15].plot.bar();

## Extra Trees

In [None]:
# %%time
# param_grid = {'n_estimators': [500, 750], 'max_depth': [10,20, None], 'bootstrap': [False, True], 'max_samples': [0.8, 0.9, 1], 'max_features':['auto',0.9, 0.8, 0.75]}
# grid_ET = GridSearchCV(ExtraTreesRegressor(random_state = 13, n_jobs=-1), param_grid, cv=4)
# grid_ET.fit(X, y)
# print(grid_ET.best_estimator_)
# print(grid_ET.best_params_)
# ET_score = grid_ET.best_score_
# print(ET_score) 

In [None]:
%%time
ET_model = ExtraTreesRegressor(max_samples=0.8, n_estimators=500, random_state=13, n_jobs=-1).fit(X_train,y_train)

In [None]:
# Top 20 Extra Trees feature importances
features = dict(zip(X_train.columns, ET_model.feature_importances_))
importances = pd.DataFrame({"ET": features}).sort_values("ET", ascending=False)
importances[:20].plot.bar();

## GradientBoostingRegressor

In [None]:
# %%time
# #param_grid = {'n_estimators': [2000, 3000], 'max_depth': [4, 10, None], 'min_samples_leaf': [5,15], 'min_samples_split': [5, 10],'learning_rate': [0.05, 0.1]}
# #param_grid = {'n_estimators': [3000], 'max_depth': [4,10],'learning_rate': [0.05], 'max_features':['auto','sqrt',0.9, 0.8, 0.75]}
# param_grid = {'n_estimators': [3000], 'max_depth': [4,10],'learning_rate': [0.05], 'max_features':['auto','sqrt', 0.8]}
# #grid_GB = GridSearchCV(GradientBoostingRegressor(random_state = 13, loss='huber',  max_features='sqrt'), param_grid, cv=3)
# grid_GB = GridSearchCV(GradientBoostingRegressor(random_state = 13, loss='huber'), param_grid, cv=3)
# grid_GB.fit(X, y)
# print(grid_GB.best_estimator_)
# print(grid_GB.best_params_)
# GB_score = grid_GB.best_score_
# print(GB_score) 

In [None]:
%%time
GBoost_model = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,max_depth=4, max_features='sqrt', min_samples_leaf=15, min_samples_split=10, loss='huber', 
                                         random_state =13).fit(X_train,y_train)

In [None]:
# Top 20 gradient-boosting feature importances
features = dict(zip(X_train.columns, GBoost_model.feature_importances_))
importances = pd.DataFrame({"GBoost": features}).sort_values("GBoost", ascending=False)
importances[:20].plot.bar();

## LGBM

In [None]:
%%time
np.random.seed(13)
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5, learning_rate=0.05, n_estimators=720, max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319, feature_fraction_seed=9, bagging_seed=9,min_data_in_leaf =6, 
                              min_sum_hessian_in_leaf = 11).fit(X_train,y_train)

In [None]:
# Top 20 LightGBM feature importances
features = dict(zip(X_train.columns, model_lgb.feature_importances_))
importances = pd.DataFrame({"LGBM": features}).sort_values("LGBM", ascending=False)
importances[:20].plot.bar();

## KRR


In [None]:
# %%time
# param_grid = {'kernel': ['linear', 'polynomial'], 'degree': [1,2,3], 'alpha':[0.1,0.5, 0.9, 1],'coef0': [1,2,2.5]}
#param_grid = {'alpha': np.linspace(0, 1, 20), 'kernel': ['polynomial'], 'degree': [1, 2], 'coef0':np.linspace(0, 3.5,21)}
# grid_KRR = GridSearchCV(KernelRidge(coef0=2.5), param_grid, cv=4)
# grid_KRR.fit(X, y)
# print(grid_KRR.best_estimator_)
# print(grid_KRR.best_params_)
# KRR_score = grid_KRR.best_score_
# print(KRR_score) 

In [None]:
%%time
#original
KRR_model = KernelRidge(alpha=0.1, degree=1, kernel='polynomial').fit(X_train,y_train)

#retune
#KRR_model = KernelRidge(alpha=0.10526315789473684, coef0=0.0, degree=1, kernel='polynomial').fit(X_train,y_train)

## SVR

In [None]:
# %%time
# param_grid = {'kernel': ['linear'],  'C':[1000, 10**4, 10**5], 'gamma': ['scale']}
# grid_SVR = GridSearchCV(SVR(), param_grid, cv=4)
# grid_SVR.fit(X, y)
# print(grid_SVR.best_estimator_)
# print(grid_SVR.best_params_)
# SVR_score = grid_SVR.best_score_
# print(SVR_score) 

In [None]:
# %%time
# SVR_model = SVR(C=1000, kernel='linear').fit(X_train,y_train)

## Scores

In [None]:
# Ensemble members and their display names; the two lists must stay in the same order.
# Active set: 10 models (SVR left out)
models = [RF_model, ridge_model, elastic_model, xg_model, ada_model,ET_model, GBoost_model, model_lgb, KRR_model, lasso_model]
model_names = ["RF", 'Ridge', "Elastic", "XGB", "ADA", "ET", "GBoost", "LGBM", "KRR", "Lasso"]

# Alternative set: 9 models (SVR and Lasso left out)
# models = [RF_model, ridge_model, elastic_model, xg_model, ada_model,ET_model, GBoost_model, model_lgb, KRR_model]
# model_names = ["RF", 'Ridge', "Elastic", "XGB", "ADA", "ET", "GBoost", "LGBM", "KRR"]

In [None]:
%%time
scoreList = []
for i, m in enumerate(models):
    score = [model_names[i]]
    score.append(m.score(X_train,y_train))
    score.append(m.score(X_test,y_test))
    score.append(np.sqrt(mean_squared_error(np.log(y_train),np.log(m.predict(X_train)))))
    score.append(np.sqrt(mean_squared_error(np.log(y_test),np.log(m.predict(X_test)))))
    scoreList.append(score)

SCORES = pd.DataFrame(scoreList, columns = ['model', 'train_score', 'test_score', 'train_RMSE', 'test_RMSE'])
SCORES['overfit'] = SCORES.test_RMSE - SCORES.train_RMSE
SCORES.sort_values(['test_RMSE'], ascending = True, inplace = True)
SCORES.reset_index(drop=True, inplace=True)
SCORES

In [None]:
# Hold-out error per model (SCORES is already sorted by test_RMSE, best first)
sns.barplot(data = SCORES, x="model", y="test_RMSE");

## Observe TRAIN predictions 

In [None]:
%%time

N_cols = 3
col_width = 8
N_rows = round(len(models) / N_cols + 0.49)
fig, axs = plt.subplots(nrows = N_rows, ncols=N_cols, figsize=(col_width * N_cols, N_rows * col_width))

for i in range(len(models)):
    axs[i//N_cols, i%N_cols].scatter(models[i].predict(X), y, alpha = 0.8, color="b", label = "X")
    axs[i//N_cols, i%N_cols].scatter(models[i].predict(X_test), y_test, alpha = 0.3, color="g", label = "X_test")
    axs[i//N_cols, i%N_cols].set_title(model_names[i])
    axs[i//N_cols, i%N_cols].legend();

In [None]:
# Every model's prediction for every training row, plus the equal-weight average
train_dict = {"Id": TRAIN.Id}
train_dict.update({name: model.predict(X) for name, model in zip(model_names, models)})

ALL_TRAIN = pd.DataFrame(train_dict)
ALL_TRAIN['Voting'] = ALL_TRAIN[model_names].mean(axis=1)
ALL_TRAIN['True'] = y
ALL_TRAIN['DIFF'] = ALL_TRAIN['True'] - ALL_TRAIN['Voting']

ALL_TRAIN.head(10)

In [None]:
# The checkN training rows with the largest absolute voting error
checkN = 20
largest_errors = ALL_TRAIN['DIFF'].abs().nlargest(checkN)
ALL_TRAIN.loc[largest_errors.index]

In [None]:
#print(np.abs(ALL_TRAIN.DIFF).nlargest(checkN).index)
# X.loc[np.abs(ALL_TRAIN.DIFF).nlargest(checkN).index]

In [None]:
# RMSE of log prices for the averaged prediction, computed over all training rows
# (this includes the rows the models were fitted on, so it is optimistic).
print("Voting RMSE:", np.sqrt(mean_squared_error(np.log(y),np.log(ALL_TRAIN['Voting']))))

In [None]:
# Averaged prediction against the true price, all training rows
fig, ax = plt.subplots()
ax.scatter(ALL_TRAIN['Voting'], y, alpha=0.8, color='b', label='Voting')
ax.set(title="Voting", xlabel='Predicted', ylabel='True')
ax.legend()
plt.show();

In [None]:
# One colour per model, followed by fixed colours for the 'Voting' and 'True' series
model_palette = ["red", 'blue', "yellow", "aquamarine", "coral", "magenta", "gold", "hotpink", "teal", "aqua", "peru"]
colors = model_palette[:len(models)] + ['lawngreen', 'black']

In [None]:
# Predictions of every model, the averaged prediction and the true price for
# the first `disp` training rows.
disp = 80
fig, ax = plt.subplots(figsize=(30,10))
shown = ALL_TRAIN[:disp]
# Skip 'Id' (first column) and 'DIFF' (last column)
series_names = ALL_TRAIN.columns[1:-1].tolist()
for i, col in enumerate(series_names):
    ax.scatter(x = shown.Id, y = shown[col], alpha=0.8, c = colors[i], edgecolors= "white", s=80, label=col)
# Build the legend once from the plotted series; it was previously rebuilt on
# every iteration with a label list that also contained the unplotted 'DIFF'.
ax.legend()

plt.show()

## Predict

### Observe TEST predictions

In [None]:
# Every model's prediction for every test row, plus the equal-weight average
test_dict = {"Id": HouseIds}
test_dict.update({name: model.predict(TEST) for name, model in zip(model_names, models)})

ALL = pd.DataFrame(test_dict)
ALL['Voting'] = ALL[model_names].mean(axis=1)
ALL.head(10)

In [None]:
# Predictions of every model and the averaged prediction for the first `disp` test rows
disp = 150
fig, ax = plt.subplots(figsize=(30,10))
shown = ALL[:disp]
series_names = ALL.columns[1:].tolist()  # everything except 'Id'
for i, col in enumerate(series_names):
    ax.scatter(x = shown.Id, y = shown[col], alpha=0.8, c = colors[i], edgecolors= "white", s=80, label=col)
# Build the legend once, after all series are drawn (it was rebuilt on every iteration)
ax.legend()

plt.show()

In [None]:
# Submission frame: house Id and the averaged prediction
output = ALL[['Id', 'Voting']].rename(columns={'Voting': 'SalePrice'})
output.head(10)

In [None]:
# Write the submission file to the working directory, without the row index
output.to_csv('submission.csv', index=False)
print("Submission was successfully saved!")

In [None]:
# Total wall-clock time since `start_time` was recorded in the first code cell
end_time = time.time()
print("Notebook run time: {:.1f} seconds. Finished at {}".format(end_time - start_time, datetime.now()) )