In [None]:
import numpy as np
import pandas as pd
import os
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.ensemble import VotingClassifier
import torch
from torch import nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import math
from torch.utils.data import TensorDataset, DataLoader
import matplotlib.pyplot as plt
from tqdm import tqdm
import catboost
from sklearn.metrics import mean_squared_error
import scipy
from scipy import stats
from scipy.stats import norm

In [None]:
# Load the competition's test and train tables from the Kaggle input directory.
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
train = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv', sep=',')

In [None]:
# Preview the first rows instead of rendering the whole training frame.
train.head()

First of all, we should drop the most useless feature: "*Id*".

In [None]:
# Keep a copy of the test identifiers before the column is removed,
# then drop the identifier column from both frames.
test_ID = test['Id']
train = train.drop(columns='Id')
test = test.drop(columns='Id')

# Analysis of target

In [None]:
target = 'SalePrice'
# True would mean that at least one target value is missing.
print(train[target].isnull().any())
# Parameters of the normal distribution fitted to the raw target.
mu, sigma = norm.fit(train[target])
print(f'mu = {mu:.2f} and sigma = {sigma:.2f}')

In [None]:
# Use seaborn's 'whitegrid' style for the figures drawn after this call.
sns.set_style('whitegrid')

In [None]:
# Distribution of the raw target next to a normal fit, plus a Q-Q plot.
y = train[target]
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(21, 4))
# Raw f-string: "\m" and "\s" are not valid Python escapes, so the mathtext
# backslashes must reach matplotlib untouched.
fig.suptitle(rf'Original $\mu=$ {mu:.2f}, $\sigma=$ {sigma:.2f}', fontsize=16)
# NOTE: distplot is deprecated in recent seaborn releases; it is kept here
# because the `fit=` overlay is what this figure is about.
sns.distplot(y, fit=norm, ax=ax[0])
ax[0].set_title('SalePrice distribution')
ax[0].set_ylabel('Frequency')
ax[0].legend(labels=['Normal dist', 'Our dist.'])

# Q-Q plot of the target against the normal distribution.
res = stats.probplot(y, plot=ax[1])
plt.show()

> The target has a massive right tail and does not look like a normal distribution. We need to transform it!

In [None]:
# Same diagnostics as for the raw target, but after a log1p transformation.
y = train[target]
log_y = np.log1p(y)
# Fit the normal parameters on the transformed target so that the title
# describes the distribution that is actually plotted.
log_mu, log_sigma = norm.fit(log_y)
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(21, 4))
# Raw f-string keeps the mathtext backslashes intact.
fig.suptitle(rf'Log-transformation $\mu=$ {log_mu:.2f}, $\sigma=$ {log_sigma:.2f}', fontsize=16)
sns.distplot(log_y, fit=norm, ax=ax[0])
ax[0].set_title('SalePrice distribution')
ax[0].set_ylabel('Frequency')
ax[0].legend(labels=['Normal dist.', 'Our log transformed dist.'])

# Q-Q plot of the transformed target against the normal distribution.
res = stats.probplot(log_y, plot=ax[1])
plt.show()

In [None]:
# Sanity check: correlation between the original target and log_y mapped back with expm1.
y.corr(np.expm1(log_y))

That is better! :)

# Analysis of features

First of all, in this section we should fill the empty values in the features according to the data description.

### Filling empties from data description

In [None]:
# Columns whose missing values are replaced by an explicit "feature is absent"
# label, following the data description. The same mapping is applied to both
# the train and the test frame.
absent_labels = {
    'Alley': 'No alley access',
    'BsmtQual': 'No Basement',
    'BsmtCond': 'No Basement',
    'BsmtExposure': 'No Basement',
    'BsmtFinType1': 'No Basement',
    'BsmtFinType2': 'No Basement',
    'FireplaceQu': 'No Fireplace',
    'GarageType': 'No Garage',
    'GarageFinish': 'No Garage',
    'GarageQual': 'No Garage',
    'GarageCond': 'No Garage',
    'PoolQC': 'No Pool',
    'Fence': 'No Fence',
    'MiscFeature': 'None',
}

for frame in (train, test):
    for column, label in absent_labels.items():
        frame[column] = frame[column].fillna(label)

### A little feature engineering

"CentralAir" and "Street" are binary features, so we can encode them with 0 and 1. We can also create a new feature that indicates the presence of a pool.

In [None]:
# Binary helper features, built identically for train and test.
for frame in (train, test):
    # 1 when the house has a pool, i.e. PoolQC is anything but the 'No Pool' filler.
    frame['Pool_presence'] = (frame['PoolQC'] != 'No Pool').astype(int)
    # Encode CentralAir with a fixed mapping ('Y' -> 1, everything else -> 0).
    # Deriving the "positive" value from the first row of each frame would let
    # train and test end up with opposite encodings.
    # The dtype guard makes the cell safe to re-run after the column is numeric.
    if not pd.api.types.is_numeric_dtype(frame['CentralAir']):
        frame['CentralAir'] = (frame['CentralAir'] == 'Y').astype(int)

Save changed train dataset

### Feature importance evaluation with CatBoost

In [None]:
from catboost.eval.catboost_evaluation import *
from catboost.utils import create_cd

In [None]:
# Names of all object-dtype (categorical) columns, then their summary
# statistics ordered by the number of distinct values.
cat_features = train.select_dtypes(include='object').columns.tolist()
cat_summary = train[cat_features].describe().T
cat_summary.sort_values('unique', ascending=False)

In [None]:
# Swap the positions of 'SalePrice' and 'Pool_presence' in the column order
# and reindex the frame accordingly.
cols = list(train.columns)
a, b = cols.index('SalePrice'), cols.index('Pool_presence')
cols[b], cols[a] = cols[a], cols[b]
train = train[cols]
train.head()

In [None]:
# Write the reordered training frame to 'train2.csv' without the index column.
train.to_csv('train2.csv', index=False)

In [None]:
# Column position -> column name for every column of the training frame.
feature_names = {position: name for position, name in enumerate(train.columns)}
# Positions of the categorical columns (object columns plus four columns listed explicitly).
categorical_labels = cat_features + ['Fence', 'MiscFeature', 'Alley', 'PoolQC']
cat_dict = {
    position: name
    for position, name in enumerate(train.columns)
    if name in categorical_labels
}
# Position 80 is passed to create_cd as the label column, so it is not a feature name.
del feature_names[80]

In [None]:
# Write the column-description file 'train.cd': column 80 is the label, the
# keys of cat_dict are the categorical columns, and feature_names supplies
# the remaining column names.
create_cd(
    label=80,
    cat_features=cat_dict.keys(),
    feature_names=feature_names,
    output_path='train.cd'
)
# Print the generated file.
!cat ./train.cd

In [None]:
# Fold size: half of the training rows, rounded to the nearest hundred.
fold_size = int(round(len(train)/2, -2))
folds_count=10
description_file='./train.cd'
train2_file='./train2.csv'

# We can choose the best params with the grid_search function
# NOTE(review): 'task_type': 'GPU' presumably requires a GPU-enabled runtime — confirm.
learn_params={'iterations':505,
              'task_type' : 'GPU',
              'random_seed': 2, 
              'learning_rate' : 0.1, 
              'max_depth': 10, 
              'l2_leaf_reg': 9.8, 
              'loss_function': 'RMSE', 
              'max_ctr_complexity' : 2, 
              'logging_level': 'Silent', 
              'boosting_type': 'Plain',}

# Evaluator reading the exported CSV together with its column-description file.
evaluator = CatboostEvaluation(train2_file,
                               fold_size,
                               folds_count,
                               delimiter=',',
                               column_description=description_file,
                               partition_random_seed=2,
                               has_header=True,
)

# print(evaluator.get_working_dir())

# Evaluate features 1..79 with RMSE as the metric.
# NOTE(review): range(1,80) leaves out feature index 0 — confirm this is intended.
result = evaluator.eval_features(learn_config=learn_params,
                                 eval_metrics=["RMSE"],
                                 features_to_eval=range(1,80),
                                )

In [None]:
# NOTE: despite its name, `logloss_result` holds the results for the "RMSE" metric.
logloss_result = result.get_metric_results("RMSE")
logloss_result.get_baseline_comparison()

In [None]:
# Compute the baseline comparison once and keep the rows judged 'GOOD'.
baseline_comparison = logloss_result.get_baseline_comparison()
good_feature_labels = baseline_comparison[baseline_comparison['Decision'] == 'GOOD'].index
# Only the header of the exported CSV is needed to map positions to names, so
# read it once (nrows=0) instead of loading the whole file for every feature.
exported_columns = pd.read_csv(train2_file, nrows=0).columns
# The feature position is taken from the last two characters of each index label.
# NOTE(review): this parsing only works for positions below 100.
good_features = [exported_columns[int(str(label)[-2:])] for label in good_feature_labels]
len(good_features), good_features

> We will use only 'good' features

In [None]:
# Restrict both frames to the selected features (train also keeps the target).
train = train[good_features+[target]]
test = test[good_features]

First of all in this section we should fill empty values in features by data description

### Almost empty features

In [None]:
def empty_features(test, train, threshhold=0, verbose=False):
    """List the columns of ``test`` that are sparsely filled in both frames.

    A column is reported when it has a non-zero share of missing values in
    ``train`` AND in ``test``, and that share (in percent) reaches
    ``threshhold`` in at least one of the two frames.

    Parameters
    ----------
    test, train : pandas.DataFrame
        Frames to inspect; every column of ``test`` must exist in ``train``.
    threshhold : float
        Minimum percentage of missing values (0-100).
    verbose : bool
        When True, print a numbered table with the test/train percentages.

    Returns
    -------
    list
        Column names, in the order of ``test.columns``.
    """
    if verbose:
        print(' column \t test \t\t train\n', '*'*40)

    selected = []
    row_number = 1
    for name in test.columns:
        pct_train = (train[name].isnull().sum() / len(train)) * 100
        pct_test = (test[name].isnull().sum() / len(test)) * 100

        # Skip columns that are completely filled in either frame.
        if not (pct_train and pct_test):
            continue
        # Skip columns whose missing share stays below the threshold in both frames.
        if not (pct_test >= threshhold or pct_train >= threshhold):
            continue

        selected.append(name)
        if verbose:
            print(row_number, '{}{}% \t{}%'.format(name.ljust(15, ' '), round(pct_test, 3), round(pct_train, 3)))
            row_number += 1
    return selected

In [None]:
empty_features_list=empty_features(test=test, train=train, verbose=True, threshhold=80) # features with at least 80% (threshold) empty values
empty_features_list

We should drop these features.

### Features with only 1 value

In [None]:
# Count the distinct values of every column once (the count does not change
# between features) and keep the columns that hold a single value.
unique_counts = train.nunique()
only_1_value = unique_counts[unique_counts == 1].index.tolist()
only_1_value

### Correlation-Matrix with Heatmap

In [None]:
# Correlate numeric (and boolean) columns only. Selecting them explicitly keeps
# the cell working on pandas versions where DataFrame.corr() no longer drops
# object columns silently.
corr_matrix = train.select_dtypes(include=['number', 'bool']).corr()

sns.set(rc={'figure.figsize':(20,15)})
ax = sns.heatmap(corr_matrix,
                 annot = True, 
                 annot_kws = {'size': 8}, 
                 fmt = '.1f', 
                 cmap = 'PiYG', 
                 linewidths = 1, 
                )

In [None]:
 # Select upper triangle of correlation matrix
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))  # builtin bool: the np.bool alias was removed from NumPy

# Columns whose correlation with an earlier column is greater than 0.875
drop_by_corr = [column for column in upper.columns if any(upper[column] > 0.875)]
print(drop_by_corr)

We also should drop this useless feature

### Almost constant features

In [None]:
def quasi_constant(df, threshold=0, verbose=True):
    """Return the columns in which a single value dominates.

    For every column the share of each distinct (non-missing) value among all
    rows of ``df`` is computed. A column is reported when at least one value
    has a share greater than or equal to ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    threshold : float
        Minimum share (0-1) a value needs to make its column quasi-constant.
    verbose : bool
        When True, print every qualifying value together with its share.

    Returns
    -------
    list
        Names of the quasi-constant columns, in column order.
    """
    features_list = []
    for feature in df:
        # Builtin float: the np.float alias was removed from NumPy.
        shares = df[feature].value_counts() / float(len(df))
        dominant = shares[shares >= threshold]
        if dominant.empty:
            continue
        if verbose:
            print(f'for feature "{feature}":', 'value\t\tscore', sep='\n')
            for name, share in dominant.items():
                print(name, share, sep='\t', end='\n'+'*'*50+'\n')
        features_list.append(feature)
    return features_list

In [None]:
# Columns in which one value accounts for at least 95% of the rows.
quasi_constant_features = quasi_constant(train, 0.95)
quasi_constant_features

These features are almost constant. We should drop them too.

In [None]:
# Union (with possible duplicates) of every list of features flagged above.
bad_features = drop_by_corr+only_1_value+quasi_constant_features+empty_features_list
len(bad_features), bad_features

In [None]:
# Number and names of the columns currently kept in the training frame.
len(train.columns), train.columns

In [None]:
# Remove every flagged feature from the list of selected features.
good_features = [feature for feature in good_features if feature not in bad_features]
len(good_features), good_features

In [None]:
# Restrict both frames to the remaining features (train also keeps the target).
train = train[good_features+[target]]
test = test[good_features]

In [None]:
# Column count after selection vs. column count of the original training file.
len(train.columns), len(pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv', sep=',').columns)

We dropped almost 40 useless features.

---
# Pipeline function making with embedding categorical features, fillna and normalization

Trying to use embeddings for encoding categorical features

In [None]:
def pipelining_preprocessor(df1, dropcolls=None, target=None, filler=None, type='cat'): # return processed  dataFrame
    """Prepare a feature frame: split off the target, drop columns, fill gaps, encode.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Input frame; it is copied and never modified.
    dropcolls : label or list of labels, optional
        Columns to drop before any other processing.
    target : str, optional
        Name of the target column. When given, the column is removed from the
        features and returned separately.
    filler : {'pop', 'zero', 'out_of_range', None}
        How missing values are filled:
        'pop' - most frequent value for object columns, median for the others;
        'zero' - 'No info' for object columns, 0 for the others;
        'out_of_range' - 'No info' for object columns, -9999 for the others;
        None - leave missing values untouched.
    type : {'cat', 'ohe'}
        'cat' keeps object columns as they are, 'ohe' one-hot encodes them with
        ``pd.get_dummies(drop_first=True)``.

    Returns
    -------
    pandas.DataFrame, or (pandas.DataFrame, pandas.Series) when ``target`` is given.

    Raises
    ------
    ValueError
        If ``filler`` or ``type`` is not one of the allowed values.
    """
    def fill_empty_by(df, filler):
        """Fill missing values of ``df`` according to ``filler``, keeping the column order."""
        if filler is None:
            return df
        df_nums = df.select_dtypes(exclude='object')
        df_cats = df.select_dtypes(include='object')
        if filler == 'pop':
            # describe() raises on a frame without columns, so the most
            # frequent values are only looked up when object columns exist.
            obj = df_cats.describe().loc['top',:] if df_cats.shape[1] else None
            digits = df_nums.median()
        elif filler == 'zero':
            obj = 'No info'
            digits = 0
        elif filler == 'out_of_range':
            obj = 'No info'
            digits = -9999
        else:
            raise ValueError('filler vallues is not allowed ["zero", "pop", "out_of_range", None]')
        cats_filled = df_cats if obj is None else df_cats.fillna(obj)
        # Re-select df.columns to restore the original column order after the join.
        return cats_filled.join(df_nums.fillna(digits))[df.columns]

    # Reject an unknown encoding before doing any work.
    if type not in ('cat', 'ohe'):
        raise ValueError('type vallues is not allowed ["cat", "ohe"]')

    df = df1.copy()
    if target:
        y = df[target]
        df = df.drop(target, axis=1)
    if dropcolls:
        df = df.drop(dropcolls, axis=1)

    output = fill_empty_by(df, filler=filler)
    if type == 'ohe':
        output = pd.get_dummies(output, drop_first=True, prefix_sep=': ',)

    if target:
        return output,y
    else:
        return output

# Modeling

## Gradient boosting with CatBoost

In [None]:
# The target (SalePrice) is continuous, so use plain K-fold splitting:
# StratifiedKFold stratifies on class labels and does not support a regression
# target. The name `skf` is kept because the grid search below passes it as `cv`.
from sklearn.model_selection import KFold
skf = KFold(n_splits=5, shuffle=True, random_state=2)

In [None]:
# Feature matrix with gaps filled by out-of-range markers; the target is log1p-transformed.
X,y = pipelining_preprocessor(train, filler = 'out_of_range', target='SalePrice', type='cat')
y = np.log1p(y)

In [None]:
# Hold out 15% of the rows for validation (shuffled, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, shuffle=True, random_state=2)

In [None]:
# Names of the object-dtype columns, passed to CatBoost as categorical features.
cats = list(train.select_dtypes(include='object').columns)

In [None]:
# CatBoost pools for the full data, the train/validation split and the test set.
full_pool = catboost.Pool(X, label = y, cat_features = cats)
train_pool = catboost.Pool(X_train, label = y_train, cat_features = cats)
val_pool = catboost.Pool(X_val, label = y_val, cat_features = cats)
# The test frame is reordered to X's columns and filled the same way as the training features.
test_pool = catboost.Pool(pipelining_preprocessor(test[X.columns], filler = 'out_of_range', type='cat'), cat_features = cats)

In [None]:
# Candidate grid for the search; every entry is currently commented out, so the
# dictionary passed to grid_search is empty.
cat_params = {
#     'l2_leaf_reg' : [7, 8, 9],
#     'random_strength' : [1,2,3,4],
#     'learning_rate' : [0.01, 0.05, 0.005],
#     'max_ctr_complexity': [1, 2, 3],
#     'grow_policy': ['SymmetricTree', 'Depthwise', 'Lossguide']
}

# NOTE(review): task_type='GPU' presumably requires a GPU-enabled runtime — confirm.
cat_model = catboost.CatBoostRegressor(loss_function='RMSE', 
                                       random_seed=2, 
                                       max_depth = 10,
                                       learning_rate = 0.05,
                                       random_strength=1,
                                       max_ctr_complexity=1,
                                       l2_leaf_reg=8,
                                       grow_policy = 'Lossguide',
                                       task_type='GPU',
                                      )
# Search over cat_params on the full pool, using the splitter `skf` for cross-validation.
grid_search_results = cat_model.grid_search(cat_params, full_pool, 
                                            partition_random_seed=2, cv = skf, 
                                            search_by_train_test_split=True, 
                                            plot=True)

grid_search_results['params']

In [None]:
# Fresh regressor with the same hyper-parameters as the one used for the grid search.
cat_model = catboost.CatBoostRegressor(loss_function='RMSE', 
                                       random_seed=2, 
                                       max_depth = 10,
                                       learning_rate = 0.05,
                                       random_strength=1,
                                       max_ctr_complexity=1,
                                       l2_leaf_reg=8,
                                       grow_policy = 'Lossguide',
                                       task_type='GPU',
                                      )

In [None]:
# Train on the training pool, using the validation pool as the evaluation set.
cat_model.fit(train_pool, eval_set=val_pool)

## SHAP

In [None]:
import shap

In [None]:
# SHAP values of the fitted model on the training pool.
shap_values = cat_model.get_feature_importance(train_pool, type='ShapValues')
# The last column is sliced off before plotting — presumably the expected-value
# column that CatBoost appends; TODO confirm.
shap.summary_plot(shap_values[:,:-1], X_train, plot_type='bar', max_display=30)

In [None]:
# Mean absolute SHAP value per column of shap_values.
vals = np.abs(shap_values).mean(axis=0)
# zip() stops at the last feature name, so any trailing column of shap_values
# without a matching feature is ignored.
feature_importance_shap = (
    pd.DataFrame(list(zip(X_train.columns, vals)), columns=['feature', 'feature_importance_shap'])
    .sort_values(by='feature_importance_shap', ascending=False)
    .reset_index(drop=True)
)
feature_importance_shap

# Final step

In [None]:
# The model was trained on log1p(SalePrice), so map predictions back with expm1
# (vectorised) and truncate them to whole numbers.
gb_pred = np.expm1(cat_model.predict(test_pool)).astype(int).tolist()
# Reuse the identifiers saved before the Id column was dropped instead of
# reading test.csv from disk a second time.
gb_output = pd.DataFrame({'Id': test_ID,
                          'SalePrice': gb_pred})

gb_output.to_csv('submission_catboost.csv', index=False)
gb_output