## Imports and Configuration ##

In [None]:
import numpy as np
import pandas as pd

import os
import copy
from pathlib import Path

import warnings

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display

from pandas.api.types import CategoricalDtype
from category_encoders import MEstimateEncoder
from category_encoders.cat_boost import CatBoostEncoder

from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer

from sklearn.decomposition import PCA

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.linear_model import LarsCV
from sklearn.linear_model import BayesianRidge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVR
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor

from sklearn.ensemble import VotingRegressor

import eli5
from eli5.sklearn import PermutationImportance
from sklearn.model_selection import train_test_split

from sklearn.neighbors import LocalOutlierFactor

# for Box-Cox Transformation
from scipy import stats
from scipy.stats import norm, skew

# for min_max scaling
from mlxtend.preprocessing import minmax_scaling

# Silence library warnings and let pandas display every row and column.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Set Matplotlib defaults
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8-*"
# and later releases dropped the old names, so use whichever name this
# installation actually provides (falling back to the default style).
_whitegrid_style = next(
    (
        style_name
        for style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid")
        if style_name in plt.style.available
    ),
    "default",
)
plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=18,
    titlepad=10,
)

# Name of the column every model in this notebook predicts.
target = 'SalePrice'

In [None]:
def simple_impute(df):
    """Fill missing values and ordinal-encode text columns, in place.

    float64/int64 columns get 0 for missing values; object columns get the
    string "None" and are then replaced by ordinal codes.

    Parameters
    ----------
    df : DataFrame to modify. The same object is returned.

    Returns
    -------
    The modified ``df``.
    """
    for name in df.select_dtypes(['float64', 'int64']):
        df[name] = df[name].fillna(0)
    categorical = list(df.select_dtypes("object"))
    for name in categorical:
        df[name] = df[name].fillna("None")
    # Fitting an encoder on zero columns raises, so skip it for all-numeric frames.
    if categorical:
        df[categorical] = OrdinalEncoder().fit_transform(df[categorical])
    return df

# Load dataset

In [None]:
# Raw competition splits, indexed by their "Id" column.
df_train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv', index_col="Id")
df_test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv', index_col="Id")
# Feature-description table; its 'a' and 'b' columns are parsed into a dictionary further down.
df_description = pd.read_csv('../input/description/housing.csv')

# Explore dataset

#### Make feature descriptions dictionary

In [None]:
description = dict()
curr = ''
for index, row in df_description.iterrows():
    txt = str(row['a'])
    if ':' in txt:
        if curr != '':
            description[curr] = item
        column = txt.split(':')[0]
        desc = txt.split(':')[1]
        curr = column
        if curr in ['Bedroom', 'Kitchen']: curr += 'AbvGr' # correct error in data_description.txt
        item = dict()
        item['description'] = desc
        item['values'] = dict()
    else:
        n = txt
        v = row['b']
        item['values'][n] = v
item = dict()
item['description'] = 'Condition of sale'
item['values'] = dict()
description['SaleCondition'] = item 


# MI scores

In [None]:
def make_mi_scores(X, y):
    """Score every column of X by its mutual information with y.

    Object and category columns are replaced by integer factor codes on a
    copy of X; every integer-typed column is then passed to
    ``mutual_info_regression`` as a discrete feature.

    Returns a Series named "MI Scores", indexed by column name and sorted
    from highest to lowest score.
    """
    features = X.copy()
    for col in features.select_dtypes(["object", "category"]).columns:
        features[col] = features[col].factorize()[0]
    # All discrete features should now have integer dtypes
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]
    raw_scores = mutual_info_regression(
        features, y, discrete_features=is_discrete, random_state=0
    )
    ranked = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return ranked.sort_values(ascending=False)

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of the scores, labelled by the Series index."""
    ordered = scores.sort_values(ascending=True)
    positions = np.arange(len(ordered))
    plt.barh(positions, ordered)
    plt.yticks(positions, list(ordered.index))
    plt.title("Mutual Information Scores")


In [None]:
def scorer(estimator, X, y):
    """Return the 5-fold cross-validated RMSLE of ``estimator``.

    Category columns are replaced by their integer codes on a copy of X, so
    the caller's frame keeps its categorical dtypes.

    Parameters
    ----------
    estimator : estimator accepted by ``cross_val_score``.
    X : feature DataFrame.
    y : target values; must be positive because their log is taken.

    Returns
    -------
    Square root of the mean squared error of log(y) averaged over the folds.
    """
    X = X.copy()
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    scores = cross_val_score(
        estimator, X, log_y, cv=5, scoring="neg_mean_squared_error",
    )
    # scores are negated MSE values; flip the sign before taking the root
    return np.sqrt(-1 * scores.mean())

In [None]:
def dataset_info(train, test):
    """Print and plot a per-feature report for a train/test pair.

    For every column of ``train`` except the target this prints the feature
    description (when the module-level ``description`` dict has one), the
    dtype, the unique values of low-cardinality columns, missing-value
    counts, summary statistics, the mutual-information score and the
    permutation importance, then draws a correlation strip and two
    distribution plots.

    Parameters
    ----------
    train : DataFrame holding the feature columns and the ``target`` column.
    test : DataFrame holding the same feature columns.

    Returns
    -------
    None. The report goes to stdout and to matplotlib figures.
    """
    X = train.copy()
    X['df'] = 'train'
    X_test = test.copy()
    X_test['df'] = 'test'

    X_all = pd.concat([X, X_test], ignore_index=True)
    types = dict(X_all.dtypes)

    # Quick imputed/encoded copy used only for the MI and importance scores
    Z = simple_impute(X.copy())
    yz = Z.pop(target)
    mi_scores = pd.DataFrame(make_mi_scores(Z, yz))

    train_Z, val_Z, train_yz, val_yz = train_test_split(Z, yz, random_state=1)
    first_model = XGBRegressor(n_estimators=100, random_state=1).fit(train_Z, train_yz)
    perm = PermutationImportance(first_model, random_state=1).fit(val_Z, val_yz)
    feature_importances = pd.DataFrame(perm.feature_importances_, index=Z.columns)

    # Restrict to numeric columns explicitly: recent pandas versions raise
    # instead of silently skipping text columns in corr().
    correlation = X.select_dtypes(include=["number", "bool"]).corr()

    for feature in X.columns.values.tolist():
        if feature in ['df', target]:
            continue

        print(feature)
        print('_'*20)
        if feature in description:
            print(description[feature]['description'])
        dtype = types[feature]
        nunique = len(X_all[feature].unique())
        small_int = dtype == 'int64' and nunique < 20
        # Text columns and low-cardinality integers are reported as categories
        as_category = dtype == 'object' or small_int
        print('dtype:', dtype, end=' ')
        if small_int:
            print('AS OBJECT', end='')
        print()
        if as_category:
            if feature in description:
                print('values meaning:', end=' ')
                for v in description[feature]['values']:
                    print(v.strip(), ':', description[feature]['values'][v], end=', ')
                print()
            print('unique values:', nunique, '[', end='')
            for v in X_all[feature].unique():
                print(v, end=', ')
            print(']')
        null = X[feature].isnull().sum()
        test_null = X_test[feature].isnull().sum()
        if null > 0 or test_null > 0:
            print(f"Missing values, train: {null} ({100*null/X.shape[0]:.1f}%), test: {test_null} ({100*test_null/X_test.shape[0]:.1f}%)")
        if dtype != 'object':
            print(f"Train min: {X[feature].min()}, mean: {X[feature].mean():.3f}, max: {X[feature].max()}, std: {X[feature].std():.3f}")

        print(f"MI score: {mi_scores.loc[feature, 'MI Scores']}")
        print(f"Feature importance: {feature_importances.loc[feature, 0]}")
        if feature in correlation.columns:
            print('\nCorrelation with target and top most correlated features:')
            # Five features with the largest absolute correlation to this one
            tmcf = np.abs(correlation.loc[feature]).drop(labels=[feature, target]).sort_values(ascending=False).index.tolist()[:5]
            top_most_correlated_features = correlation.loc[feature][[target]+tmcf]
            fig, ax = plt.subplots(1, 1, figsize=(12, 1.2))
            sns.heatmap(data=pd.DataFrame(top_most_correlated_features).T, annot=True, cbar=False)

        # Left: train vs test distribution; right: relation to the target
        fig, ax = plt.subplots(1, 2, figsize=(12, 3))
        if as_category:
            sns.countplot(data=X_all, x=feature, hue='df', ax=ax[0])
            sns.violinplot(x=feature,y=target,data=X, ax=ax[1])
        else:
            sns.kdeplot(data=X_all, x=feature, hue='df', ax=ax[0])
            sns.regplot(x=X[feature], y=X[target], line_kws={'color': 'r'}, ax=ax[1])
        plt.show()
        print()

# Features

In [None]:
# Print the per-feature report for the raw train and test frames.
dataset_info(df_train, df_test)

# Target distribution

In [None]:
# Left: target histogram with a fitted normal curve; right: probability plot of the target.
fig, ax = plt.subplots(1, 2, figsize=(12, 4)) 
sns.distplot(a=df_train[target], fit=norm, ax=ax[0])
stats.probplot(df_train[target], plot=ax[1])

### After log transform

In [None]:
def log_transform(X, column):
    """Return a copy of X in which ``column`` is replaced by log(1 + value)."""
    logged = np.log1p(X[column])
    result = X.copy()
    result[column] = logged
    return result

# Same two plots as for the raw target, now on log1p(target).
# `df` stays defined at module level and is read again by later cells.
df = log_transform(df_train, target)
fig, ax = plt.subplots(1, 2, figsize=(12, 4)) 
sns.distplot(a=df[target], fit=norm, ax=ax[0])
stats.probplot(df[target], plot=ax[1])

# Preprocessing

In [None]:
def ht(df):
    """Display the first two rows, the last two rows and the shape of df."""
    for view in (df.head(2), df.tail(2), df.shape):
        display(view)

In [None]:
def load_data():
    """Read the train/test CSVs and return them preprocessed.

    Both splits are concatenated, passed through clean, handleNA, encode and
    impute (in that order), then separated again by their original indexes.

    Returns
    -------
    (df_train, df_test) tuple of preprocessed DataFrames.
    """
    data_dir = Path("../input/house-prices-advanced-regression-techniques/")
    raw_train = pd.read_csv(data_dir / "train.csv", index_col="Id")
    raw_test = pd.read_csv(data_dir / "test.csv", index_col="Id")
    # Process both splits together so they share the same encodings
    merged = pd.concat([raw_train, raw_test])
    for step in (clean, handleNA, encode, impute):
        merged = step(merged)
    # Recover the two splits through the indexes they were loaded with
    return merged.loc[raw_train.index, :], merged.loc[raw_test.index, :]


# Clean Data 

In [None]:
def clean(df):
    """Repair known data problems in place and return the same frame.

    * normalizes two spellings in ``Exterior2nd``;
    * replaces ``GarageYrBlt`` by ``YearBuilt`` wherever the garage year is
      not <= 2010 (missing garage years fail that comparison too);
    * renames the three columns whose names start with a digit.
    """
    exterior_fixes = {"Brk Cmn": "BrkComm", "Wd Shng": "WdShing"}
    df["Exterior2nd"] = df["Exterior2nd"].replace(exterior_fixes)

    # Some values of GarageYrBlt are corrupt, so fall back to the build year
    plausible_year = df["GarageYrBlt"] <= 2010
    df["GarageYrBlt"] = df["GarageYrBlt"].where(plausible_year, df["YearBuilt"])

    # Names beginning with numbers are awkward to work with
    awkward_names = {
        "1stFlrSF": "FirstFlrSF",
        "2ndFlrSF": "SecondFlrSF",
        "3SsnPorch": "Threeseasonporch",
    }
    df.rename(columns=awkward_names, inplace=True)
    return df

# Encode the Statistical Data Type ###

In [None]:
# The nominal (unordered) categorical features
features_nom = ["MSSubClass", "MSZoning", "Street", "Alley", "LandContour", "LotConfig", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating", "CentralAir", "GarageType", "MiscFeature", "SaleType", "SaleCondition"]

# The ordinal (ordered) categorical features

# Pandas calls the categories "levels"
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
# OverallQual / OverallCond are rated 1..10; range(10) would turn the top
# rating 10 into a missing value during encoding.
ten_levels = list(range(1, 11))

ordered_levels = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

# Add a None level for missing values
ordered_levels = {key: ["None"] + value for key, value in ordered_levels.items()}

def encode(df):
    """Cast the known categorical columns of df to pandas categoricals.

    Columns in ``features_nom`` become unordered categoricals that always
    include a "None" category. Columns in ``ordered_levels`` become ordered
    categoricals with exactly the listed levels; any value outside those
    levels becomes missing. The frame is modified in place and returned.
    """
    # Nominal categories
    for name in features_nom:
        df[name] = df[name].astype("category")
        # Add a None category for missing values. add_categories returns a
        # new Series, so assign it back instead of relying on inplace=True.
        if "None" not in df[name].cat.categories:
            df[name] = df[name].cat.add_categories("None")
    # Ordinal categories
    for name, levels in ordered_levels.items():
        df[name] = df[name].astype(CategoricalDtype(levels, ordered=True))
    return df


# Handle Missing Values ###

In [None]:
def handleNA(X):
    """Return a copy of X with feature-specific fill values for missing data.

    Each listed column gets its own placeholder (for example
    'No Basement'); the two numeric columns get 0. Columns not listed are
    left untouched, and X itself is not modified.
    """
    # Thanks YatishDua
    fill_values = {
        'Alley': 'No alley access',
        'BsmtQual': 'No Basement',
        'BsmtCond': 'No Basement',
        'BsmtExposure': 'No Basement',
        'BsmtFinType1': 'No Basement',
        'BsmtFinType2': 'No Basement',
        'FireplaceQu': 'No Fireplace',
        'GarageType': 'No Garage',
        'GarageYrBlt': 0,
        'GarageFinish': 'No Garage',
        'GarageQual': 'No Garage',
        'GarageCond': 'No Garage',
        'MasVnrType': 'None',
        'MasVnrArea': 0.0,
        'PoolQC': 'No Pool',
        'Fence': 'No Fence',
        'MiscFeature': 'None',
    }
    df = X.copy()
    # Assign the filled column back: an in-place fillna on df[col] acts on a
    # temporary object and is not guaranteed to reach df.
    for column, fill in fill_values.items():
        df[column] = df[column].fillna(fill)
    return df

def impute(df):
    """Fill remaining gaps in place: 0 for numeric columns, "None" for categoricals.

    Categorical columns must already contain a "None" category. Returns the
    same frame it was given.
    """
    fill_by_kind = (("number", 0), ("category", "None"))
    for kind, filler in fill_by_kind:
        for column in df.select_dtypes(kind).columns:
            df[column] = df[column].fillna(filler)
    return df

# Baseline

### Scoring function

In [None]:
def score_dataset(X, y, model=None, cv=5):
    """Cross-validate ``model`` on (X, log y) and report RMSLE.

    Parameters
    ----------
    X : feature DataFrame; category columns are label-encoded on a copy, so
        the caller's frame is left unchanged.
    y : target values; must be positive because their log is taken.
    model : estimator to evaluate. A fresh ``XGBRegressor()`` is created when
        omitted, instead of sharing one instance built at definition time.
    cv : fold count or splitter, forwarded to ``cross_val_score``.

    Returns
    -------
    (score, scores_sqrt): the root of the mean fold MSE, and the list of
    per-fold RMSE values.
    """
    if model is None:
        model = XGBRegressor()
    # Label encoding for categoricals
    X = X.copy()
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    scores = cross_val_score(
        model, X, log_y, cv=cv, scoring="neg_mean_squared_error",
    )
    # cross_val_score returns negated MSE values, hence the sign flips
    score = np.sqrt(-1 * scores.mean())
    scores_sqrt = [np.sqrt(-1 * fold_score) for fold_score in scores]
    return score, scores_sqrt


#### Baseline

In [None]:
# Preprocessed splits; the target is separated from the training features.
X, X_test = load_data()
y = X.pop(target)

# Reference score with the default model on the unmodified feature set.
baseline_score, scores = score_dataset(X, y)
print(f"Baseline score: {baseline_score:.5f}", scores)

## Drop uninformative features

In [None]:
# Mutual information of every training feature with the target.
mi_scores = make_mi_scores(X, y)

def drop_uninformative(df, mi_scores):
    """Keep only the columns of df whose score in ``mi_scores`` is above zero."""
    informative = mi_scores.gt(0.0)
    return df.loc[:, informative]

# Remove zero-information columns and score the reduced feature set.
X = drop_uninformative(X, mi_scores)
score_dataset(X, y)

# Create Features

In [None]:
def label_encode(df):
    """Return a copy of df with every category column replaced by its integer codes."""
    encoded = df.copy()
    category_columns = encoded.select_dtypes(["category"]).columns
    for column in category_columns:
        encoded[column] = encoded[column].cat.codes
    return encoded

In [None]:
def mathematical_transforms(df):
    """Build ratio features: living area per lot area, and floor area per room.

    Returns a new DataFrame with columns "LivLotRatio" and "Spaciousness".
    """
    living_per_lot = df["GrLivArea"] / df["LotArea"]
    area_per_room = (df["FirstFlrSF"] + df["SecondFlrSF"]) / df["TotRmsAbvGrd"]
    return pd.DataFrame({
        "LivLotRatio": living_per_lot,
        "Spaciousness": area_per_room,
    })

def interactions(df):
    """Return one "Bldg_<type>" column per building type, holding GrLivArea
    on the rows of that type and 0 elsewhere."""
    indicators = pd.get_dummies(df["BldgType"], prefix="Bldg")
    return indicators.multiply(df["GrLivArea"], axis=0)

def counts(df):
    """Count, per row, how many of the four porch/deck areas are greater than zero.

    Returns a new DataFrame with the single column "PorchTypes".
    """
    porch_columns = ["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "ScreenPorch"]
    has_area = df[porch_columns] > 0.0
    return pd.DataFrame({"PorchTypes": has_area.sum(axis=1)})

def group_transforms(df):
    """Return a "MedNhbdArea" column: the median GrLivArea of each row's Neighborhood."""
    neighborhood_median = df.groupby("Neighborhood")["GrLivArea"].transform("median")
    return pd.DataFrame({"MedNhbdArea": neighborhood_median})

In [None]:
def add_features(all_data):
    """Build aggregate area, bathroom and has-<amenity> features.

    Returns a new DataFrame with five summed columns (areas, combined years,
    weighted bathroom count) followed by five 0/1 flags that are 1 where the
    source column is greater than zero.
    """
    def positive_flag(value):
        return 1 if value > 0 else 0

    columns = {
        'TotalSF': (
            all_data['TotalBsmtSF']
            + all_data['FirstFlrSF']
            + all_data['SecondFlrSF']
        ),
        'YrBltAndRemod': all_data['YearBuilt'] + all_data['YearRemodAdd'],
        'Total_sqr_footage': (
            all_data['BsmtFinSF1']
            + all_data['BsmtFinSF2']
            + all_data['FirstFlrSF']
            + all_data['SecondFlrSF']
        ),
        # half baths count as 0.5
        'Total_Bathrooms': (
            all_data['FullBath']
            + (0.5 * all_data['HalfBath'])
            + all_data['BsmtFullBath']
            + (0.5 * all_data['BsmtHalfBath'])
        ),
        'Total_porch_sf': (
            all_data['OpenPorchSF']
            + all_data['Threeseasonporch']
            + all_data['EnclosedPorch']
            + all_data['ScreenPorch']
            + all_data['WoodDeckSF']
        ),
    }

    flag_sources = (
        ('haspool', 'PoolArea'),
        ('has2ndfloor', 'SecondFlrSF'),
        ('hasgarage', 'GarageArea'),
        ('hasbsmt', 'TotalBsmtSF'),
        ('hasfireplace', 'Fireplaces'),
    )
    for flag_name, source in flag_sources:
        columns[flag_name] = all_data[source].apply(positive_flag)

    return pd.DataFrame(columns)

# Latitude & Longitude
*Thanks YatishDua*

In [None]:
# Coordinates used for each Neighborhood code: {code: {'lat': ..., 'lon': ...}}
dict_neighbor = {
    'NAmes': {'lat': 42.045830, 'lon': -93.620767},
    'CollgCr': {'lat': 42.018773, 'lon': -93.685543},
    'OldTown': {'lat': 42.030152, 'lon': -93.614628},
    'Edwards': {'lat': 42.021756, 'lon': -93.670324},
    'Somerst': {'lat': 42.050913, 'lon': -93.644629},
    'Gilbert': {'lat': 42.060214, 'lon': -93.643179},
    'NridgHt': {'lat': 42.060357, 'lon': -93.655263},
    'Sawyer': {'lat': 42.034446, 'lon': -93.666330},
    'NWAmes': {'lat': 42.049381, 'lon': -93.634993},
    'SawyerW': {'lat': 42.033494, 'lon': -93.684085},
    'BrkSide': {'lat': 42.032422, 'lon': -93.626037},
    'Crawfor': {'lat': 42.015189, 'lon': -93.644250},
    'Mitchel': {'lat': 41.990123, 'lon': -93.600964},
    'NoRidge': {'lat': 42.051748, 'lon': -93.653524},
    'Timber': {'lat': 41.998656, 'lon': -93.652534},
    'IDOTRR': {'lat': 42.022012, 'lon': -93.622183},
    'ClearCr': {'lat': 42.060021, 'lon': -93.629193},
    'StoneBr': {'lat': 42.060227, 'lon': -93.633546},
    'SWISU': {'lat': 42.022646, 'lon': -93.644853},
    'MeadowV': {'lat': 41.991846, 'lon': -93.603460},
    'Blmngtn': {'lat': 42.059811, 'lon': -93.638990},
    'BrDale': {'lat': 42.052792, 'lon': -93.628820},
    'Veenker': {'lat': 42.040898, 'lon': -93.651502},
    'NPkVill': {'lat': 42.049912, 'lon': -93.626546},
    'Blueste': {'lat': 42.010098, 'lon': -93.647269},
}


def lat_lon(df):
    """Return a copy of df with 'Lat' and 'Lon' columns looked up from Neighborhood.

    Raises KeyError for a neighborhood that is not in ``dict_neighbor``.
    """
    X = df.copy()
    for out_column, coord_key in (('Lat', 'lat'), ('Lon', 'lon')):
        X[out_column] = [dict_neighbor[name][coord_key] for name in df['Neighborhood']]
    return X

# k-Means Clustering

In [None]:
# Area columns intended as input for the k-means helpers below.
cluster_features = [
    "LotArea",
    "TotalBsmtSF",
    "FirstFlrSF",
    "SecondFlrSF",
    "GrLivArea",
]

def cluster_labels(df, features, n_clusters=20):
    """Assign each row of df to a k-means cluster.

    Parameters
    ----------
    df : DataFrame containing the ``features`` columns.
    features : columns to cluster on; they are standardized first.
    n_clusters : number of clusters to fit.

    Returns
    -------
    DataFrame with one column, "Cluster", indexed like ``df`` so it can be
    joined back onto it.
    """
    # KMeans is not among the imports in the notebook's import cell
    from sklearn.cluster import KMeans

    X_scaled = df.loc[:, features]
    X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=0)
    # fit_predict returns a bare array; reattach df's index so a later join
    # lines up row by row
    return pd.DataFrame({"Cluster": kmeans.fit_predict(X_scaled)}, index=df.index)

def cluster_distance(df, features, n_clusters=20):
    """Return each row's distance to every k-means centroid.

    Parameters
    ----------
    df : DataFrame containing the ``features`` columns.
    features : columns to cluster on; they are standardized first.
    n_clusters : number of clusters to fit (previously ignored in favour of
        a hard-coded 20).

    Returns
    -------
    DataFrame with columns "Centroid_0" ... "Centroid_<k-1>", indexed like
    ``df`` so it can be joined back onto it.
    """
    # KMeans is not among the imports in the notebook's import cell
    from sklearn.cluster import KMeans

    X_scaled = df.loc[:, features]
    X_scaled = (X_scaled - X_scaled.mean(axis=0)) / X_scaled.std(axis=0)
    kmeans = KMeans(n_clusters=n_clusters, n_init=50, random_state=0)
    distances = kmeans.fit_transform(X_scaled)
    # Label features and keep df's index for joining
    return pd.DataFrame(
        distances,
        columns=[f"Centroid_{i}" for i in range(distances.shape[1])],
        index=df.index,
    )

# Principal Component Analysis

In [None]:
def apply_pca(X, standardize=True):
    """Fit a PCA on X and return the model, the components and the loadings.

    Parameters
    ----------
    X : DataFrame of numeric features.
    standardize : when True, columns are centred and scaled to unit standard
        deviation before fitting.

    Returns
    -------
    (pca, X_pca, loadings): the fitted PCA object; a DataFrame of components
    "PC1", "PC2", ... indexed like X; and a DataFrame of loadings with the
    original features as rows and the components as columns.
    """
    # Standardize
    if standardize:
        X = (X - X.mean(axis=0)) / X.std(axis=0)

    # Create principal components
    pca = PCA()
    X_pca = pca.fit_transform(X)
    # Convert to dataframe, keeping X's index so the components can be
    # joined back onto the source rows
    component_names = [f"PC{i+1}" for i in range(X_pca.shape[1])]
    X_pca = pd.DataFrame(X_pca, columns=component_names, index=X.index)
    # Create loadings
    loadings = pd.DataFrame(
        pca.components_.T,  # transpose the matrix of loadings
        columns=component_names,  # so the columns are the principal components
        index=X.columns,  # and the rows are the original features
    )
    return pca, X_pca, loadings

def plot_variance(pca, width=8, dpi=100):
    """Plot explained and cumulative explained variance of a fitted PCA.

    Parameters
    ----------
    pca : fitted PCA object.
    width : figure width passed to the figure's ``figwidth`` property.
    dpi : figure resolution.

    Returns
    -------
    The array of the two Axes (bar chart, cumulative line chart).
    """
    # Create figure
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)
    # Explained variance
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )
    # Cumulative Variance
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )
    # Set up figure using the caller's size arguments
    fig.set(figwidth=width, dpi=dpi)
    return axs

In [None]:
# Columns explored with PCA in the following cells.
features = [
    "GarageArea",
    "YearRemodAdd",
    "TotalBsmtSF",
    "GrLivArea",
]

# `df` is whatever frame the name is bound to at this point in the notebook;
# NOTE(review): it appears to be the log-transformed training frame from the
# target-distribution cell — confirm before relying on the printed values.
print("Correlation with SalePrice:\n")
print(df[features].corrwith(df.SalePrice))

In [None]:
# Work on a copy of the current feature matrix restricted to the PCA columns.
Z = X.copy()
# y = Z.pop("SalePrice")
Z = Z.loc[:, features]

In [None]:
# Fit PCA on the selected columns (standardized by default).
pca, Z_pca, loadings = apply_pca(Z)

In [None]:
# Show the loadings table as the cell output.
loadings

In [None]:
# Annotated heatmap of the loadings (features as rows, components as columns).
fig, ax = plt.subplots(1, 1, figsize=(8, 3))
sns.heatmap(loadings, annot=True)

In [None]:
# Explained and cumulative variance of the fitted components.
plot_variance(pca)

In [None]:
def pca_inspired(df):
    """Return two hand-built features: living + basement area, and
    remodel year times basement area."""
    combined_area = df["GrLivArea"] + df["TotalBsmtSF"]
    remodel_by_basement = df["YearRemodAdd"] * df["TotalBsmtSF"]
    return pd.DataFrame({
        "Feature1": combined_area,
        "Feature2": remodel_by_basement,
    })

def pca_components(df, features):
    """Return the principal components of the ``features`` columns of df."""
    selected = df.loc[:, features]
    # apply_pca returns (model, components, loadings); only the components are needed
    return apply_pca(selected)[1]

# Column list intended for pca_components; same four columns as `features` above.
pca_features = [
    "GarageArea",
    "YearRemodAdd",
    "TotalBsmtSF",
    "GrLivArea",
]

# Correlation

In [None]:
def corrplot(df, method="pearson", annot=True, **kwargs):
    """Draw a clustered heatmap of the correlations between df's numeric columns.

    Parameters
    ----------
    df : DataFrame; non-numeric columns are left out of the correlation.
    method : correlation method passed to ``DataFrame.corr``.
    annot : passed to ``sns.clustermap`` to control cell annotations.
    **kwargs : forwarded to ``sns.clustermap``.
    """
    # Select numeric columns explicitly: recent pandas versions raise on
    # text columns instead of silently skipping them.
    numeric = df.select_dtypes(include=["number", "bool"])
    sns.clustermap(
        numeric.corr(method),
        vmin=-1.0,
        vmax=1.0,
        cmap="icefire",
        method="complete",
        annot=annot,
        **kwargs,
    )

# Clustered correlation map of the training frame, without cell annotations.
corrplot(df_train, annot=None)

In [None]:
# def indicate_outliers(df):
#     X_new = pd.DataFrame()
#     X_new["Outlier"] = (df.Neighborhood == "Edwards") & (df.SaleCondition == "Partial")
#     return X_new


# Target Encoding

In [None]:
class CrossFoldEncoder:
    """Out-of-fold wrapper around a target-encoder class.

    ``encoder`` is an encoder class (not an instance) that accepts ``cols``
    plus the extra keyword arguments given here, and exposes ``fit`` and
    ``transform``. Rows are split into 5 folds.
    """

    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=5)

    def fit_transform(self, X, y, cols):
        """Encode ``cols`` of X fold by fold.

        For every split, a new encoder is fitted on the first index set and
        used to transform the rows of the second one, so no row is encoded
        by an encoder that saw its own target. The fitted encoders are kept
        for ``transform``.

        Returns a DataFrame with one "<col>_encoded" column per entry of
        ``cols``, rows concatenated in fold order.
        """
        self.fitted_encoders_ = []
        self.cols_ = cols
        pieces = []
        for fit_rows, apply_rows in self.cv_.split(X):
            fold_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fold_encoder.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
            transformed = fold_encoder.transform(X.iloc[apply_rows, :])
            pieces.append(transformed[cols])
            self.fitted_encoders_.append(fold_encoder)
        encoded = pd.concat(pieces)
        encoded.columns = [name + "_encoded" for name in encoded.columns]
        return encoded

    def transform(self, X):
        """Encode new data as the average of the encodings of all fold encoders.

        Returns a DataFrame with the "<col>_encoded" columns.
        """
        from functools import reduce

        def add_frames(left, right):
            return left.add(right, fill_value=0)

        per_encoder = [
            fold_encoder.transform(X)[self.cols_]
            for fold_encoder in self.fitted_encoders_
        ]
        averaged = reduce(add_frames, per_encoder) / len(per_encoder)
        averaged.columns = [name + "_encoded" for name in averaged.columns]
        return averaged

# Create Final Feature Set

In [None]:
def create_features(df, df_test=None):
    """Build the final feature matrix for training (and optionally test) data.

    Parameters
    ----------
    df : preprocessed training frame including the ``target`` column.
    df_test : optional preprocessed test frame. When given, features are
        engineered on train and test together and both are returned.

    Returns
    -------
    X when ``df_test`` is None, otherwise the tuple (X, X_test).
    """
    has_test = df_test is not None

    X = df.copy()
    y = X.pop(target)
    # MI scores come from the training rows only
    mi_scores = make_mi_scores(X, y)

    if has_test:
        X_test = df_test.copy()
        if target in X_test.columns:
            X_test.pop(target)
        X = pd.concat([X, X_test])

    X = drop_uninformative(X, mi_scores)

    # Each builder sees the frame including the features joined before it
    for build in (mathematical_transforms, interactions, counts, group_transforms):
        X = X.join(build(X))
    # add_features(X) is available but deliberately not joined here

    X = lat_lon(X)
    X = X.join(pca_inspired(X))
    X = label_encode(X)

    # Reform splits
    if has_test:
        X_test = X.loc[df_test.index, :]
        X.drop(df_test.index, inplace=True)

    # Target Encoder: fitted on training rows, averaged over folds for test rows
    encoder = CrossFoldEncoder(CatBoostEncoder, a=1)
    X = X.join(encoder.fit_transform(X, y, cols=["Neighborhood", "MSSubClass"]))
    if not has_test:
        return X

    X_test = X_test.join(encoder.transform(X_test))
    return X, X_test

## Load, preprocess and add features

In [None]:
# Reload the preprocessed splits and build the engineered matrices from them.
df_train, df_test = load_data()
X_train, X_test = create_features(df_train, df_test)
# The target is read from the preprocessed training frame.
y_train = df_train.loc[:, target]

### Score once more

In [None]:
# Compare the earlier baseline with the score on the engineered features.
print(f"Baseline score: {baseline_score:.5f}")
new_score, scores = score_dataset(X_train, y_train)
print(f"New score: {new_score:.5f}")

## New features

In [None]:
# Take the last 16 columns of the engineered matrix.
# NOTE(review): presumably these are exactly the columns create_features
# appended — confirm the count if the feature builders change.
new_features = X_train.columns.tolist()[-16:]
print(new_features)
# Report on those columns, with the target re-attached to the training part.
dataset_info(pd.concat([X_train[new_features], y_train], axis=1), X_test[new_features])

# Let's see the map

In [None]:
import plotly.express as px
import plotly.graph_objs as go

# One row per neighborhood with its 'lat'/'lon', plus the mean sale price ('msp').
neighborhoods = pd.DataFrame(dict_neighbor).T
msp = df_train.groupby('Neighborhood').SalePrice.agg([np.mean])
neighborhoods['msp'] = msp

# One marker per neighborhood, coloured by mean sale price; hovering shows the name.
fig = go.Figure(go.Scattermapbox(lat=neighborhoods['lat'],
                                 lon=neighborhoods['lon'],
                                 text=neighborhoods.index,
                                 hoverinfo="text",
                                 marker=dict(colorbar=dict(title="Mean SalePrice"),
                                             color=neighborhoods['msp'],
                                             size=15)
                                ), layout= {'height': 800, 'width': 800}
               )
# Centre the map on the midpoint of the coordinate bounding box.
map_center = go.layout.mapbox.Center(lat=(neighborhoods['lat'].max()+neighborhoods['lat'].min())/2,
                                     lon=(neighborhoods['lon'].max()+neighborhoods['lon'].min())/2)
fig.update_layout(mapbox_style="open-street-map", mapbox=dict(center=map_center, zoom=12))

# Permutation Importance

In [None]:
# Engineered features for the training data only (no test frame passed).
X = create_features(df_train)
y = df_train.loc[:, "SalePrice"]

# Fit on one part of the data and measure permutation importance on the held-out part.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
first_model = XGBRegressor(n_estimators=50, random_state=1).fit(train_X, train_y)

perm = PermutationImportance(first_model, random_state=1).fit(val_X, val_y)

# top=None is passed so the table is not cut to a fixed number of rows.
eli5.show_weights(perm, feature_names = X.columns.tolist(), top=None)

In [None]:
# Keep the names of the features whose permutation importance is positive.
fi = pd.DataFrame(perm.feature_importances_, index=X.columns)
fi.columns = ['score']
important_features = list(fi[fi.score > 0].index)
print('Important features', len(important_features), 'of', len(X.columns))


# Compare regressors

In [None]:
# (label, estimator) pairs compared in the scoring loop; all use library
# defaults except CatBoost, which is silenced.
regressors = [
    ('Linear', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('Elastic', ElasticNet()),
    ('LARS', LarsCV()),
    ('Bayes', BayesianRidge()),
    ('KNeighbor', KNeighborsRegressor()),
    ('DTree', DecisionTreeRegressor()),
    ('LSVR', LinearSVR()),
    ('SVR', SVR()),
    ('AdaBoost', AdaBoostRegressor()),
    ('Bagging', BaggingRegressor()),
    ('ExtraTree', ExtraTreesRegressor()),
    ('GBoost', GradientBoostingRegressor()),
    ('RForest', RandomForestRegressor()),
    ('LGBM', LGBMRegressor()),
    ('XGB', XGBRegressor()),
    ('CatBoost', CatBoostRegressor(silent=True)),
]

In [None]:
# Rebuild the engineered matrices and keep only the features with positive
# permutation importance.
df_train, df_test = load_data()
X, X_test = create_features(df_train, df_test)
X = X[important_features]
X_test = X_test[important_features]
y = df_train.loc[:, target]

# NOTE(review): drop() raises KeyError if one of these columns was already
# filtered out by important_features — confirm they are always present.
strange = ['BsmtFinSF2', 'BsmtUnfSF', 'Condition2']
X = X.drop(strange, axis=1)
X_test = X_test.drop(strange, axis=1)

In [None]:
# Sanity check: train and test should have the same number of columns.
print(X.shape, X_test.shape)

In [None]:
# NOTE(review): this initial `scores` list is overwritten inside the loop below.
scores = []
results = []

# NOTE(review): the hold-out split is not used in this cell; scoring below is
# cross-validated on the full X, y.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
kfold = KFold(n_splits=5, random_state=7, shuffle=True)

for name, regressor in regressors:
    print('scoring', name, end='.'*(10-len(name)))

    mean_score, scores = score_dataset(X, y, regressor, cv=kfold)
    # Only fold scores below 0.25 are collected for the box plot.
    for score in scores:
        if score < 0.25:
            results.append({'model': name, 'score': score})
    print(f'{mean_score:.5f}')

# Box plot of the collected per-fold scores, one box per model.
results_df = pd.DataFrame(results)
fig = plt.figure(figsize=(25,8))
sns.boxplot(x=results_df.model, y=results_df['score'])
plt.show()    

# SHAP

In [None]:
import shap  # package used to calculate Shap values

# Fixed XGBoost hyper-parameters for the model explained below.
pars = {'max_depth': 6,
 'learning_rate': 0.002483868626800697,
 'n_estimators': 7989,
 'min_child_weight': 1,
 'colsample_bytree': 0.5030185650975951,
 'subsample': 0.44705436353703526,
 'reg_alpha': 0.007053643198184307,
 'reg_lambda': 0.03677328533642659}

# Rebuild the engineered matrices; the target here is the raw SalePrice (no log).
df_train, df_test = load_data()
X, X_test = create_features(df_train, df_test)
y = df_train.loc[:, "SalePrice"]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)

# Fit on the training part and predict the validation part.
regressor = XGBRegressor(**pars)
regressor.fit(X_train, y_train)
prediction = regressor.predict(X_valid)

In [None]:
# Force plots for the first two validation rows.
# The JS runtime and the explainer do not depend on the row, so set them up once.
shap.initjs()
explainer = shap.TreeExplainer(regressor)
for i in range(2):
    data_for_prediction = X_valid.iloc[[i]]
    shap_values = explainer.shap_values(data_for_prediction)
    # Named force_plot rather than `plt`, which would shadow matplotlib.pyplot
    # for every later cell.
    force_plot = shap.force_plot(explainer.expected_value, shap_values[0], data_for_prediction)
    display(force_plot)

In [None]:
# SHAP summary over the whole validation set, showing every feature.
explainer = shap.TreeExplainer(regressor)
shap_values = explainer.shap_values(X_valid)
print(X_valid.shape, shap_values.shape)
# Named summary rather than `plt`, which would shadow matplotlib.pyplot
# for every later cell.
summary = shap.summary_plot(shap_values, X_valid, max_display=X_valid.shape[1])
display(summary)

`BsmtFinSF2`, `BsmtUnfSF` and `Condition2` have strange outliers, so I'm going to drop them.

In [None]:
# Rebuild the full engineered matrices, then remove the three columns with
# strange outliers from both splits.
df_train, df_test = load_data()
X, X_test = create_features(df_train, df_test)
y = df_train.loc[:, target]
strange = ['BsmtFinSF2', 'BsmtUnfSF', 'Condition2']
X = X.drop(strange, axis=1)
X_test = X_test.drop(strange, axis=1)

# OPTUNA

In [None]:
import optuna

## Tune XGB regressor

In [None]:
# XGB = {
#     'tree_method': 'hist',
#     'booster': 'gbtree',
#     'random_state': 228,
#     'use_label_encoder': False,
#     'eval_metric': 'rmse'
# }

# def objective(trial):
#     params = dict(
#         max_depth=trial.suggest_int("max_depth", 2, 20),
#         learning_rate=trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
#         n_estimators=trial.suggest_int("n_estimators", 1000, 25000),
#         min_child_weight=trial.suggest_int("min_child_weight", 1, 300),
#         colsample_bytree=trial.suggest_float("colsample_bytree", 0.2, 1.0),
#         subsample=trial.suggest_float("subsample", 0.2, 1.0),
#         reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e2, log=True),
#         reg_lambda=trial.suggest_float("reg_lambda", 1e-4, 1e2, log=True),
#         gamma=trial.suggest_float("gamma", 1e-4, 1e2, log=True),
#     )
#     regressor = XGBRegressor(**params, **XGB)
#     return score_dataset(X, y, regressor, cv=5)[0]

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=30)
# params = study.best_params

# display(params)

## Tune CAT regressor

In [None]:
# CB = {
#     'grow_policy': 'Depthwise', 
#     'leaf_estimation_method': 'Newton',
#     'random_seed': 228,
#     'loss_function': 'RMSE',
#     'eval_metric': 'RMSE',
#     'bootstrap_type': 'Bernoulli',
#     'silent': True,
# }

# def objective(trial):
#     params = dict(
#         depth=trial.suggest_int("depth", 2, 10),
#         learning_rate=trial.suggest_float("learning_rate", 1e-4, 1e-1, log=True),
#         iterations=trial.suggest_int("iterations", 300, 25000),
#         max_bin=trial.suggest_int("max_bin", 10, 300),
#         min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 10, 500),
#         l2_leaf_reg=trial.suggest_float("l2_leaf_reg", 0.2, 1.0),
#         subsample=trial.suggest_float("subsample", 0.2, 1.0),
#     )
#     regressor = CatBoostRegressor(**params, **CB)
#     return score_dataset(X, y, regressor, cv=5)[0]

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=30)
# params = study.best_params

# display(params)

## Tune LGBM regressor

In [None]:
# lgbm_def = LGBMRegressor(
#     boosting_type='gbdt', 
#     num_leaves=31,
#     max_depth=- 1, 
#     learning_rate=0.1, 
#     n_estimators=100, 
#     subsample_for_bin=200000, 
#     objective=None, 
#     class_weight=None, 
#     min_split_gain=0.0, 
#     min_child_weight=0.001, 
#     min_child_samples=20, 
#     subsample=1.0, 
#     subsample_freq=0, 
#     colsample_bytree=1.0, 
#     reg_alpha=0.0, 
#     reg_lambda=0.0, 
#     random_state=None, 
#     n_jobs=- 1, 
#     silent=True, 
#     importance_type='split'
# )
# # {   
# #  'learning_rate': 0.09039083345568485,
# #  'reg_alpha': 0.0001190115814442082,
# #  'reg_lambda': 1.5174576281379848,
# #  'cat_l2': 13.530516606143303
# # }
# LGBM = {
#     'n_estimators': 14522,
#     'max_depth': 8,
#     'min_data_in_leaf': 31,
#     'random_state': 228,
#     'metric': 'rmse'
# }

# def objective(trial):
#     params = dict(
# #         max_depth=trial.suggest_int("max_depth", 2, 10),
#         learning_rate=trial.suggest_float("learning_rate", 1e-3, 1e-1, log=True),
# #         n_estimators=trial.suggest_int("n_estimators", 300, 25000),
#         reg_alpha=trial.suggest_float("reg_alpha", 1e-4, 1e-2, log=True),
#         reg_lambda=trial.suggest_float("reg_lambda", 1e-1, 1e2, log=True),
#         num_leaves=trial.suggest_int("num_leaves", 10, 300),
# #         min_data_in_leaf=trial.suggest_int("min_data_in_leaf", 10, 500),
#         cat_l2=trial.suggest_float("cat_l2", 1, 20),
# #         min_child_samples=trial.suggest_float("min_child_samples", 2, 50),
#     )
#     regressor = LGBMRegressor(**params, **LGBM)
#     return score_dataset(X, y, regressor, cv=5)[0]

# study = optuna.create_study(direction="minimize")
# study.optimize(objective, n_trials=50)
# params = study.best_params

# display(params)

# Voting Regressor

In [None]:
# Hyper-parameters for the XGBoost member of the ensemble.
xgb_pars = dict(
    max_depth=6,
    learning_rate=0.002483868626800697,
    n_estimators=7989,
    min_child_weight=1,
    colsample_bytree=0.5030185650975951,
    subsample=0.44705436353703526,
    reg_alpha=0.007053643198184307,
    reg_lambda=0.03677328533642659,
)

xgb = XGBRegressor(**xgb_pars)

# Fixed CatBoost settings, combined with the numeric parameters below.
CB = dict(
    grow_policy='Depthwise',
    leaf_estimation_method='Newton',
    random_seed=228,
    loss_function='RMSE',
    eval_metric='RMSE',
    bootstrap_type='Bernoulli',
    silent=True,
)

# Numeric hyper-parameters for the CatBoost member of the ensemble.
cat_pars = dict(
    depth=5,
    learning_rate=0.002058251099330786,
    iterations=10294,
    max_bin=207,
    min_data_in_leaf=14,
    l2_leaf_reg=0.36181081978772367,
    subsample=0.5798334683744184,
)

cat = CatBoostRegressor(**cat_pars, **CB)

# Hyper-parameters for the first LightGBM model; merged with LGBM below.
lgbm_pars = dict(
    learning_rate=0.001816775252684838,
    reg_alpha=0.0015734456737639636,
    reg_lambda=0.10747950720829248,
    num_leaves=67,
    cat_l2=8.38470951659679,
)

# Fixed LightGBM settings passed alongside lgbm_pars.
LGBM = dict(
    n_estimators=14522,
    max_depth=8,
    min_data_in_leaf=31,
    random_state=228,
    metric='rmse',
)

lgbm = LGBMRegressor(**lgbm_pars, **LGBM)

# Second LightGBM model, configured inline with its own parameter set.
lgbm2 = LGBMRegressor(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=5000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=-1,
)

# Inner cross-validation splitter used by the *CV estimators to pick their
# regularisation strength.
kfolds = KFold(n_splits=10, shuffle=True, random_state=42)

# Candidate regularisation grids for RidgeCV, LassoCV and ElasticNetCV.
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]

# Every model below is wrapped in a pipeline that applies RobustScaler first.
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kfolds))

# max_iter is written as an int literal (same value as the former 1e7):
# scikit-learn's parameter validation expects an integer here and rejects
# a float in recent releases.
lasso = make_pipeline(
    RobustScaler(),
    LassoCV(max_iter=10_000_000, alphas=alphas2, random_state=42, cv=kfolds),
)
elasticnet = make_pipeline(
    RobustScaler(),
    ElasticNetCV(max_iter=10_000_000, alphas=e_alphas, cv=kfolds, l1_ratio=e_l1ratio),
)
svr = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.008, gamma=0.0003))


# Every candidate model as (label, estimator) pairs.
all_estimators = [
    ('CAT', cat),
    ('XGB', xgb),
    ('LR', LinearRegression()),
    ('Ridge', ridge),
    ('Lasso', lasso),
    ('Elastic', elasticnet),
    ('SVR', svr),
    ('LGBM', lgbm),
    ('LGBM2', lgbm2),
]


In [None]:
# scores = []
# results = []

# X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
# kfold = KFold(n_splits=5, random_state=7, shuffle=True)

# for name, estimator in all_estimators:
#     print('scoring', name, end='.'*(10-len(name)))

#     mean_score, scores = score_dataset(X, y, estimator, cv=kfold)
#     for score in scores:
#         if score < 0.25:
#             results.append({'estimator': name, 'score': score})
#     print(f'{mean_score:.5f}')

# results_df = pd.DataFrame(results)
# fig = plt.figure(figsize=(25,8))
# sns.boxplot(x=results_df.estimator, y=results_df['score'])
# plt.show()   

In [None]:
# Ensemble members; `weights` below is positional and follows this order.
# NOTE: the 'lgbm' slot holds `lgbm2`, and 'Ridge' is a plain Ridge(), not the
# `ridge` pipeline.
estimators = [
    ('cat', cat),
    ('xgb', xgb),
    ('Linear', LinearRegression()),
    ('Ridge', Ridge()),
    ('lgbm', lgbm2),
]

# Other weight sets kept in comments (trailing numbers are the recorded scores):
# weights = [0.40, 0.48, 0.05, 0.07]
# weights = [0.38, 0.50, 0.04, 0.08] # 0.11851
# weights = [0.38, 0.47, 0.04, 0.04, 0.07]
# Active set; recorded as 0.011848 — possibly a typo for 0.11848, TODO confirm.
weights = [0.38, 0.48, 0.04, 0.08, 0.02]

model = VotingRegressor(estimators=estimators, weights=weights, n_jobs=-1)

# Submission

In [None]:
# The ensemble is trained on the log of the target, so its predictions are
# exponentiated to return to the original scale.
model.fit(X, np.log(y))
log_predictions = model.predict(X_test)
# 1.009 is a fixed multiplicative factor applied to every prediction.
predictions = 1.009 * np.exp(log_predictions)

# Write the submission file: the test frame's index becomes the Id column.
output = pd.DataFrame({'Id': X_test.index, 'SalePrice': predictions})
output.to_csv('submission.csv', index=False)
print("Your submission was successfully saved!")

# THE END