In [None]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 105)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
%matplotlib inline
sns.set()

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

from subprocess import check_output

In [None]:
# setting the number of cross validations used in the Model part
nr_cv = 5
# switch for using log values for SalePrice and features
use_logvals = 1
# target used for correlation; the name must match the log-price column
# exactly as it is created on df_train ('SalePrice_Log', capital "L"),
# otherwise every df_train[target] lookup raises a KeyError
target = 'SalePrice_Log'
# only columns with correlation above this threshold value
min_val_corr = 0.4
# presumably a switch for dropping near-duplicate features -- TODO confirm where it is consumed
drop_similar = 1

In [None]:
def get_best_score(grid):
  """Print a summary of a fitted search object and return its best score.

  The returned value is the square root of the negated `grid.best_score_`
  (presumably an RMSE when the search was scored with a negated squared
  error -- confirm against the search's `scoring` argument).

  Args:
    grid: object exposing `best_score_`, `best_params_` and
      `best_estimator_` attributes.

  Returns:
    sqrt(-grid.best_score_).
  """
  rmse = np.sqrt(-grid.best_score_)
  # Same three-line report as before: score, parameters, estimator.
  for item in (rmse, grid.best_params_, grid.best_estimator_):
    print(item)
  return rmse

In [None]:
def print_cols_large_corr(df, nr_c, targ):
  """Print the columns most strongly correlated with `targ`.

  Args:
    df: DataFrame containing `targ` and the candidate feature columns.
    nr_c: number of columns to list (largest absolute correlation first).
    targ: name of the numeric column to correlate against.
  """
  # Keep only numeric/bool columns before correlating: recent pandas versions
  # raise in DataFrame.corr() on string columns instead of silently dropping
  # them, and this selection works on old and new versions alike.
  corr = df.select_dtypes(include=['number', 'bool']).corr()
  corr_abs = corr.abs()
  print (corr_abs.nlargest(nr_c, targ)[targ])

In [None]:
def plot_corr_matrix(df, nr_c, targ):
  """Show an annotated heatmap of the columns most correlated with `targ`.

  Args:
    df: DataFrame containing `targ` and the candidate feature columns.
    nr_c: number of columns to include; also scales the figure size.
    targ: name of the numeric column used to rank the other columns.
  """
  # Correlate numeric/bool columns only: recent pandas versions raise in
  # DataFrame.corr() when string columns are present.
  numeric_df = df.select_dtypes(include=['number', 'bool'])
  corr = numeric_df.corr()
  corr_abs = corr.abs()
  cols = corr_abs.nlargest(nr_c, targ)[targ].index
  # Cast to float so a mix of bool and numeric columns does not produce an
  # object array that np.corrcoef cannot handle.
  cm = np.corrcoef(numeric_df[cols].values.astype(float).T)

  plt.figure(figsize=(nr_c/1.5, nr_c/1.5))
  # Note: sns.set changes the global seaborn theme for later plots as well.
  sns.set(font_scale=1.25)
  sns.heatmap(cm, linewidths=1.5, annot=True, square=True,
              fmt='.2f', annot_kws={'size': 10},
              yticklabels=cols.values, xticklabels=cols.values)
  # Actually call show(); the bare attribute `plt.show` did nothing.
  plt.show()

In [None]:
# Load the training and test sets from fixed absolute paths.
# NOTE(review): "/content/..." looks like a Colab-style location -- confirm,
# and consider a configurable data directory so the notebook runs elsewhere.
df_train = pd.read_csv("/content/train.csv")
df_test = pd.read_csv("/content/test.csv")

In [None]:
# Print (rows, columns) of the train and test frames, separated by a rule of asterisks.
print(df_train.shape)
print("*"*50)
print(df_test.shape)

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df_train.info()
print("*"*50)
df_test.info()

In [None]:
# Display the first rows of the training frame.
df_train.head()

In [None]:
# Display summary statistics of the training frame.
df_train.describe()

In [None]:
# Display the first rows of the test frame.
df_test.head()

In [None]:
# Display summary statistics of the test frame.
df_test.describe()

In [None]:
# Plot the distribution of the raw SalePrice and print its skewness and kurtosis.
# NOTE(review): sns.distplot is deprecated in newer seaborn releases -- confirm
# the seaborn version this notebook is pinned to.
sns.distplot(df_train['SalePrice']);

print("Skewness: %f" % df_train['SalePrice'].skew())
print("Kurtosis: %f" % df_train['SalePrice'].kurt())

In [None]:
# Add the natural log of SalePrice as a new column on the training frame.
df_train['SalePrice_Log'] = np.log(df_train['SalePrice'])

# Distribution and shape statistics of the log-transformed price.
sns.distplot(df_train['SalePrice_Log']);
print("Skewness: %f" % df_train['SalePrice_Log'].skew())
print("Kurtosis: %f" % df_train['SalePrice_Log'].kurt())

# The raw price column is removed in place, so re-running this cell on the
# same frame fails at its first line because 'SalePrice' no longer exists.
df_train.drop('SalePrice', axis= 1, inplace=True)

In [None]:
# Split the training columns by dtype: object columns are treated as
# categorical, everything else as numerical.
train_dtypes = df_train.dtypes
is_object_col = train_dtypes == "object"

numerical_feats = train_dtypes[~is_object_col].index
print("Number of Numerical feature: ", len(numerical_feats))

categorical_feats = train_dtypes[is_object_col].index
print("Number of Categorical features: ", len(categorical_feats))

In [None]:
# Print the numerical column names, a separator line, then the categorical column names.
print(df_train[numerical_feats].columns)
print("*"*100)
print(df_train[categorical_feats].columns)

In [None]:
# Display the first rows of the numerical columns.
df_train[numerical_feats].head()

In [None]:
# Display the first rows of the categorical columns.
df_train[categorical_feats].head()

In [None]:
# Missing-value overview of the training frame: per-column count of nulls and
# the fraction of rows they represent, most-affected columns first.
null_flags = df_train.isnull()
null_counts = null_flags.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_flags.count()).sort_values(ascending=False)
# Place both Series side by side under the headers 'Total' and 'Percent'.
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)


In [None]:
# Columns whose missing values are replaced by the literal string 'None'.
cols_fillna = ['PoolQC', 'MiscFeature','Alley','Fence','MasVnrType','FireplaceQu',
               'GarageQual','GarageCond','GarageFinish','GarageType','Electrical',
               'KitchenQual','SaleType','Functional','Exterior2nd','Exterior1st',
               'BsmtExposure','BsmtCond','BsmtQual','BsmtFinType1','BsmtFinType2',
               'MSZoning','Utilities']

# Assign the filled column back to the frame. Calling
# df[col].fillna(..., inplace=True) operates on an intermediate Series and,
# under pandas Copy-on-Write, leaves the parent frame unchanged.
for col in cols_fillna:
  df_train[col] = df_train[col].fillna('None')
  df_test[col] = df_test[col].fillna('None')

In [None]:
# Re-compute the missing-value overview of the training frame and show the
# five most-affected columns.
null_flags = df_train.isnull()
null_counts = null_flags.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_flags.count()).sort_values(ascending=False)
# Place both Series side by side under the headers 'Total' and 'Percent'.
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(5)

In [None]:
# Fill the remaining gaps with each column's mean. numeric_only=True is
# required because both frames still contain string columns, on which
# DataFrame.mean() raises in recent pandas versions.
# NOTE(review): the test frame is filled with its own means rather than the
# training means -- confirm this is intended.
df_train.fillna(df_train.mean(numeric_only=True), inplace=True)
df_test.fillna(df_test.mean(numeric_only=True), inplace=True)

In [None]:
# Missing-value overview of the training frame once more, limited to the five
# most-affected columns.
null_flags = df_train.isnull()
null_counts = null_flags.sum()
total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_flags.count()).sort_values(ascending=False)
# Place both Series side by side under the headers 'Total' and 'Percent'.
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(5)

In [None]:
# Total number of missing cells in the training frame.
df_train.isnull().sum().sum()

In [None]:
# Total number of missing cells in the test frame.
df_test.isnull().sum().sum()

In [None]:
# One line per numerical column: padded name, skewness, kurtosis.
for col in numerical_feats:
  feat_values = df_train[col]
  name_txt = '{:15}'.format(col)
  skew_txt = 'Skewness: {:05.2f}'.format(feat_values.skew())
  kurt_txt = 'Kurtosis: {:06.2f}'.format(feat_values.kurt())
  print(name_txt, skew_txt, kurt_txt)

In [None]:
# Distribution, skewness and kurtosis of the raw GrLivArea column.
sns.distplot(df_train['GrLivArea']);

print("Skewness: %f" % df_train['GrLivArea'].skew())
print("Kurtosis: %f" % df_train['GrLivArea'].kurt())

In [None]:
# Distribution, skewness and kurtosis of the raw LotArea column.
sns.distplot(df_train['LotArea']);

print("Skewness: %f" % df_train['LotArea'].skew())
print("Kurtosis: %f" % df_train['LotArea'].kurt())

In [None]:
# Replace the two area columns by their natural logs in both frames: the
# *_Log column is added first, then the raw column is dropped in place.
for df in (df_train, df_test):
  for raw_col, log_col in (('GrLivArea', 'GrLivArea_Log'),
                           ('LotArea', 'LotArea_Log')):
    df[log_col] = np.log(df[raw_col])
    df.drop(raw_col, inplace=True, axis=1)

# Refresh the numerical column list so it reflects the renamed features.
train_dtypes = df_train.dtypes
numerical_feats = train_dtypes[train_dtypes != "object"].index

In [None]:
# Distribution, skewness and kurtosis of the log-transformed GrLivArea column.
sns.distplot(df_train['GrLivArea_Log']);

print("Skewness: %f" % df_train['GrLivArea_Log'].skew())
print("Kurtosis: %f" % df_train['GrLivArea_Log'].kurt())

In [None]:
# Distribution, skewness and kurtosis of the log-transformed LotArea column.
sns.distplot(df_train['LotArea_Log']);

print("Skewness: %f" % df_train['LotArea_Log'].skew())
print("Kurtosis: %f" % df_train['LotArea_Log'].kurt())

In [None]:
# Grid of regression plots: one panel per numerical feature against `target`,
# each titled with the Pearson r and p-value of that pair.
nr_rows=12
nr_cols=3

fig, axs = plt.subplots(nr_rows, nr_cols, figsize=(nr_cols*3.5,nr_rows*3))

li_num_feats = list(numerical_feats)
# Identifier and price columns are not plotted against the target.
li_not_plot = ['Id', 'SalePrice', 'SalePrice_Log']
li_plot_num_feats = [c for c in li_num_feats if c not in li_not_plot]

for r in range(0,nr_rows):
  for c in range(0,nr_cols):
    i = r*nr_cols+c
    # Panels beyond the number of features stay empty.
    if i < len(li_plot_num_feats):
      feat = li_plot_num_feats[i]
      # Pass x/y by keyword: newer seaborn releases no longer accept them
      # positionally, while older releases accept the keywords as well.
      sns.regplot(x=df_train[feat], y=df_train[target], ax=axs[r][c])
      stp = stats.pearsonr(df_train[feat], df_train[target])

      # ".2f" is a precision (two decimals); the previous "2f" was only a
      # minimum width, and "r" and "p" were fused by an empty separator.
      str_title = "r = " + "{0:.2f}".format(stp[0]) + "      " + "p = " + "{0:.2f}".format(stp[1])

      # A font size of 1 rendered the titles unreadable.
      axs[r][c].set_title(str_title,fontsize=11)

plt.tight_layout()
plt.show()

In [None]:
# Outlier removal: rows with the top OverallQual rating (10) whose log sale
# price is below 12.3 are dropped from the training frame.
cheap_top_quality = (df_train['OverallQual']==10) & (df_train['SalePrice_Log']<12.3)
df_train = df_train.drop(df_train.index[cheap_top_quality.values])

In [None]:
# Outlier removal: rows with a log living area above 8.3 whose log sale price
# is below 12.5 are dropped from the training frame.
cheap_large_area = (df_train['GrLivArea_Log']>8.3) & (df_train['SalePrice_Log']<12.5)
df_train = df_train.drop(df_train.index[cheap_large_area.values])