In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from matplotlib.ticker import MaxNLocator           
%matplotlib inline 
import seaborn as sns   

from scipy import stats 
from scipy.stats import skew, boxcox_normmax, norm
from scipy.special import boxcox1p 

import warnings
warnings.filterwarnings('ignore')

# Variable Identification and Descriptive Statistics


In [None]:
# Load the training CSV; the relative path follows the Kaggle kernel `../input/<dataset>/` layout.
train_data = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')

In [None]:
# Preview the first rows to confirm the file parsed as expected.
train_data.head()

In [None]:
# Inspect the inferred dtypes; the cells below split columns into numeric vs. `object`.
train_data.dtypes

In [None]:
# Numeric features: every column whose dtype is not `object`
column_dtypes = train_data.dtypes
numeric_columns = column_dtypes[column_dtypes != 'object'].index
len(numeric_columns)

In [None]:
# Categorical features: every column stored with the `object` dtype
column_dtypes = train_data.dtypes
categorical_columns = column_dtypes[column_dtypes == 'object'].index
len(categorical_columns)

In [None]:
# Summary statistics for the columns of the training frame.
train_data.describe()

In [None]:
# (rows, columns) of the raw training frame.
train_data.shape

# Univariate Analysis


In [None]:
# The target (SalePrice) plus four predictors whose distributions are examined in the next cell
imp_columns = ['SalePrice','GrLivArea', 'TotalBsmtSF', 'OverallQual', 'YearBuilt']

In [None]:
# Skewness, kurtosis and a histogram with a fitted normal curve for each column in imp_columns
sns.set_style('white')
for col in imp_columns:
    column_values = train_data[col]
    print("Skewness of ", col,": " , column_values.skew())
    print("Kurtosis of ",col,": " , column_values.kurtosis())
    print("---------------------------")
    # One figure per column so the plots do not overlay each other.
    plt.figure()
    sns.distplot(column_values, fit = norm)

## SalePrice and GrLivArea are positively skewed and not normally distributed, can be transformed using log

In [None]:
# Histogram to display skewness of every numeric variable
# Reshape to long format ('variable' / 'value' columns) so FacetGrid can draw one panel per variable.
f = pd.melt(train_data, value_vars = numeric_columns)
# Axes are not shared between panels (sharex/sharey are both False).
g = sns.FacetGrid(f, col="variable",  col_wrap=4, sharex=False, sharey=False)
# The lambda accepts the keyword arguments FacetGrid passes but does not forward them to distplot.
g.map(lambda _x, **kwargs: sns.distplot(_x, fit = norm), 'value');

## No variable is normally distributed, but LotFrontage, 1stFlrSF, GrLivArea, LotArea can be transformed using log.

# Handling Missing Values



In [None]:
# Null Values and their Percentages
null_counts = train_data.isnull().sum()
null_value_train = (
    null_counts
    .rename_axis('Column Name')
    .reset_index(name='Number of Null Values')
)
null_share = null_value_train['Number of Null Values'] / len(train_data) * 100
null_value_train['Percentage of Null Values'] = null_share
# Show the 20 columns with the largest share of missing values.
null_value_train.sort_values(by='Percentage of Null Values', ascending=False).head(20)

In [None]:
# Replacing NaN with the string 'None' in columns
## NaN in these columns represents No Pool Quality, No Garage Type, etc.
columns_fillnone = ['PoolQC', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
                    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']

# Assign the filled columns back to the frame. Calling
# `train_data[col].fillna(..., inplace=True)` fills a temporary Series, which
# leaves `train_data` unchanged when pandas Copy-on-Write is active.
train_data[columns_fillnone] = train_data[columns_fillnone].fillna('None')

In [None]:
# Updated Null Values and their Percentages
null_counts = train_data.isnull().sum()
null_value_train = (
    null_counts
    .rename_axis('Column Name')
    .reset_index(name='Number of Null Values')
)
null_share = null_value_train['Number of Null Values'] / len(train_data) * 100
null_value_train['Percentage of Null Values'] = null_share
# Show the 10 columns with the largest share of missing values.
null_value_train.sort_values(by='Percentage of Null Values', ascending=False).head(10)

In [None]:
# BoxPlot for LotFrontage, one box per neighborhood
lot_fig, lot_ax = plt.subplots(figsize=(15, 10))
sns.boxplot(data=train_data, x='LotFrontage', y='Neighborhood', ax=lot_ax);

In [None]:
# Houses in the same neighborhood are assumed to have similar lot frontage,
# masonry veneer area and masonry veneer type, so each missing value is imputed
# from the other houses in that neighborhood.
## Each column is imputed from ITS OWN values. Previously MasVnrType and
## MasVnrArea were overwritten with the imputed LotFrontage values.

# Numeric columns: neighborhood median.
for col in ['LotFrontage', 'MasVnrArea']:
    train_data[col] = train_data.groupby('Neighborhood')[col].transform(lambda x: x.fillna(x.median()))

# MasVnrType is categorical, so a median is undefined; use the neighborhood mode.
# A neighborhood with no observed type at all is left as NaN.
train_data['MasVnrType'] = train_data.groupby('Neighborhood')['MasVnrType'].transform(
    lambda x: x.fillna(x.mode().iloc[0]) if not x.mode().empty else x
)

In [None]:
# Updated Null Values and their Percentages
null_counts = train_data.isnull().sum()
null_value_train = (
    null_counts
    .rename_axis('Column Name')
    .reset_index(name='Number of Null Values')
)
null_share = null_value_train['Number of Null Values'] / len(train_data) * 100
null_value_train['Percentage of Null Values'] = null_share
# Show the 7 columns with the largest share of missing values.
null_value_train.sort_values(by='Percentage of Null Values', ascending=False).head(7)

In [None]:
# Using the mode to replace missing values for categorical data such as Electrical System
most_common_electrical = train_data['Electrical'].mode()[0]
train_data['Electrical'] = train_data['Electrical'].fillna(most_common_electrical)

In [None]:
# Replacing the remaining missing categorical values with the string 'None'
columns_fillnone = ['MiscFeature', 'Alley', 'Fence', 'FireplaceQu']

# Assign the filled columns back to the frame. Calling
# `train_data[col].fillna(..., inplace=True)` fills a temporary Series, which
# leaves `train_data` unchanged when pandas Copy-on-Write is active.
train_data[columns_fillnone] = train_data[columns_fillnone].fillna('None')

In [None]:
# Updated Null Values and their Percentages
null_counts = train_data.isnull().sum()
null_value_train = (
    null_counts
    .rename_axis('Column Name')
    .reset_index(name='Number of Null Values')
)
null_share = null_value_train['Number of Null Values'] / len(train_data) * 100
null_value_train['Percentage of Null Values'] = null_share
# Show the 3 columns with the largest share of missing values.
null_value_train.sort_values(by='Percentage of Null Values', ascending=False).head(3)

# Log Transformation



In [None]:
# Calculating Skewness and Kurtosis for numerical columns
report_line = '{:15} Skewness: {:05.2f}     Kurtosis: {:06.2f}'
for col in numeric_columns:
    column_skew = train_data[col].skew()
    column_kurt = train_data[col].kurt()
    print(report_line.format(col, column_skew, column_kurt))

In [None]:
# Performing log transformation on some positively skewed features.
# Each raw column is replaced by its `<name>_Log` counterpart, so this cell
# cannot be re-run without reloading the data.
for df in [train_data]:
    for raw_name in ('SalePrice', 'GrLivArea', 'LotArea'):
        df[raw_name + '_Log'] = np.log(df[raw_name])
        df.drop(columns=raw_name, inplace=True)

In [None]:
# Re-check skewness, kurtosis and the fitted normal curve after the log transform
log_trans_columns = ['SalePrice_Log', 'GrLivArea_Log', 'LotArea_Log']
sns.set_style('white')
for col in log_trans_columns:
    column_values = train_data[col]
    print("Skewness of ", col,": " , column_values.skew())
    print("Kurtosis of ",col,": " , column_values.kurtosis())
    print("---------------------------")
    # One figure per column so the plots do not overlay each other.
    plt.figure()
    sns.distplot(column_values, fit = norm)

In [None]:
# Correlation Matrix 1: the k features most correlated with SalePrice_Log
# Select numeric/bool columns explicitly: DataFrame.corr() raises on object
# columns in pandas >= 2.0 instead of silently skipping them.
corrmat = train_data.select_dtypes(include=['number', 'bool']).corr()
k = 10
cols = corrmat.nlargest(k, 'SalePrice_Log')['SalePrice_Log'].index
# Slice the correlations already computed rather than calling np.corrcoef,
# which returns NaN for every pair involving a column with a missing value.
cm = corrmat.loc[cols, cols].values
f, ax = plt.subplots(figsize=(10, 10))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 yticklabels=cols.values, xticklabels=cols.values, cmap='Blues', ax=ax)
plt.show()

In [None]:
# Features singled out from Correlation Matrix 1
imp_columns_corrmat1 = ['SalePrice_Log', 'OverallQual', 'GrLivArea_Log', 'GarageCars', 'TotalBsmtSF','FullBath', 'YearBuilt', 'YearRemodAdd']

# Converting YearBuilt and YearRemodAdd to str so the dtype-based column split
# no longer counts them as numeric.
train_data['YearBuilt'] = train_data['YearBuilt'].astype('str')
# Write the converted remodel year back to its own column; it used to overwrite
# YearBuilt and leave YearRemodAdd unconverted.
train_data['YearRemodAdd'] = train_data['YearRemodAdd'].astype('str')

# Bi-Variate Analysis


In [None]:
# Refresh numeric_columns after the transformations above
column_dtypes = train_data.dtypes
numeric_columns = column_dtypes[column_dtypes != 'object'].index
numeric_columns

In [None]:
# Regression Plots for Numeric Features
def srt_reg(y, df):
    """Plot `y` against every numeric feature with a cubic regression fit.

    Parameters
    ----------
    y : str
        Name of the column in `df` used for the y-axis.
    df : pandas.DataFrame
        Frame holding `y` and the features listed in the notebook-level
        `numeric_columns`.

    The grid has three panels per row and as many rows as needed, so no
    feature is silently dropped when there are more than 36 of them.
    """
    features = list(numeric_columns)
    n_cols = 3
    n_rows = max(1, -(-len(features) // n_cols))  # ceiling division
    # Keep the original per-row height (80 inches for 12 rows).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(25, 80 * n_rows / 12), squeeze=False)
    axes = axes.flatten()

    for feature, ax in zip(features, axes):
        sns.regplot(x=feature, y=y, data=df, ax=ax, order=3, ci=None, color='#e74c3c',
                    line_kws={'color': 'black'}, scatter_kws={'alpha':0.4})
        ax.tick_params(labelrotation=45)
        ax.yaxis.set_major_locator(MaxNLocator(nbins=10))

    # Hide the empty panels left over in the last row.
    for unused_ax in axes[len(features):]:
        unused_ax.set_visible(False)

    # Lay out once, after all panels are drawn, instead of on every iteration.
    fig.tight_layout()

# Regress the log-transformed sale price on each numeric feature.
srt_reg('SalePrice_Log', train_data)

In [None]:
# Correlational Analysis for numerical features
train_data_num = train_data[list(numeric_columns)]
corrmat = train_data_num.corr()
f, ax = plt.subplots(figsize=(30, 25))
# vmax caps the colour scale at 0.8.
sns.heatmap(corrmat, vmax=.8, square=True, annot=True, cmap='Blues', ax=ax);

In [None]:
## Some numerical features such as OverAllQual, GrLivArea_Log, TotalRmsAbvGrd, etc. are highly correlated to SalePrice_Log.
## Other numerical features such as Id, LotArea, OverallCond, etc. are weakly correlated to the target variable, therefore, can be dropped.

## Observations for numerical features:  
# OverallQual: sale price of the house increases with overall quality.
# OverallCondition: Most of the houses are in 5/10 condition. Does not have much effect on SalePrice
# YearBuilt: Newer buildings are generally more expensive than older ones.
# Basement: Bigger basements are increasing the price. 
# GrLivArea: This feature is linear but two outliers can be spotted.
# SaleDates: They seem to have no effect on sale prices.

In [None]:
# Box Plot Analysis for Categorical Variables
def srt_box(y, df):
    """Draw one box plot of `y` per categorical feature, boxes ordered by median.

    Parameters
    ----------
    y : str
        Name of the column in `df` used for the y-axis.
    df : pandas.DataFrame
        Frame holding `y` and the features listed in the notebook-level
        `categorical_columns`.

    The grid has three panels per row and as many rows as needed, so no
    feature is silently dropped when there are more than 42 of them.
    """
    features = list(categorical_columns)
    n_cols = 3
    n_rows = max(1, -(-len(features) // n_cols))  # ceiling division
    # Keep the original per-row height (80 inches for 14 rows).
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(25, 80 * n_rows / 14), squeeze=False)
    axes = axes.flatten()

    for feature, ax in zip(features, axes):
        # Order the categories from highest to lowest median of `y`.
        sortd = df.groupby(feature)[y].median().sort_values(ascending=False)
        sns.boxplot(x=feature,
                    y=y,
                    data=df,
                    palette='plasma',
                    order=sortd.index,
                    ax=ax)
        ax.tick_params(labelrotation=45)
        ax.yaxis.set_major_locator(MaxNLocator(nbins=18))

    # Hide the empty panels left over in the last row.
    for unused_ax in axes[len(features):]:
        unused_ax.set_visible(False)

    # Lay out once, after all panels are drawn, instead of on every iteration.
    fig.tight_layout()

# Box plots of the log-transformed sale price for each categorical feature.
srt_box('SalePrice_Log', train_data)

In [None]:
## Observations for categorical features: 

# MSZoning: #Floating village houses, has the highest median value.
            #Residental low density houses comes second with the some outliers. 
            #Residental high and low seems similar meanwhile commercial is the lowest.

# LandContour: Hillside houses seem slightly more expensive than the rest, while banked houses are the cheapest.

# Neighborhood: #Northridge Heights, Northridge and Timberland are top 3 expensive places for houses.
                #Somerset, Veenker, Crawford, Clear Creek, College Creek and Bloomington Heights seems above average.
                #Sawyer West has wide range for prices related to similar priced regions.
                #Old Town and Edwards has some outlier prices but they generally below average.
                #Briardale, Iowa DOT and Rail Road, Meadow Village are the cheapest places for houses it seems

# Conditions: #Meanwhile having wide range of values being close to North-South Railroad seems having positive effect on the price.
              #Being near or adjacent to positive off-site feature (park, greenbelt, etc.) increases the price.

# MasVnrType: Having stone masonry veneer seems better priced than having brick.

# CentralAir: Having central air system has decent positive effect on sale prices.

# GarageType: #Built-In garage typed houses are the most expensive ones.
              #Attached garage types follow the built-in ones in prices.
              #Car ports are the lowest


# Categorical to Numerical Conversion

In [None]:
# Converting some important Categorical Features to Numerical Features

# Neighborhoods mapped to hand-assigned integer ranks from 1 to 10.
neigh_map = {
    'MeadowV': 1, 'IDOTRR': 1, 'BrDale': 1,
    'BrkSide': 2, 'OldTown': 2, 'Edwards': 2,
    'Sawyer': 3, 'Blueste': 3, 'SWISU': 3, 'NPkVill': 3, 'NAmes': 3,
    'Mitchel': 4,
    'SawyerW': 5, 'NWAmes': 5, 'Gilbert': 5, 'Blmngtn': 5, 'CollgCr': 5,
    'ClearCr': 6, 'Crawfor': 6,
    'Veenker': 7, 'Somerst': 7,
    'Timber': 8,
    'StoneBr': 9,
    'NridgHt': 10, 'NoRidge': 10,
}
# Quality scales; the variants with 'None' reserve 0 for the filled-in 'None' value.
ext_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
bsm_map = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
bsmf_map = {'None': 0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
heat_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}

# (mapping, columns it applies to), in the order the columns are encoded.
ordinal_encodings = [
    (neigh_map, ['Neighborhood']),
    (ext_map, ['ExterQual', 'ExterCond']),
    (bsm_map, ['BsmtQual', 'BsmtCond']),
    (bsmf_map, ['BsmtFinType1', 'BsmtFinType2']),
    (heat_map, ['HeatingQC', 'KitchenQual']),
    (bsm_map, ['FireplaceQu', 'GarageCond', 'GarageQual']),
]
for mapping, columns in ordinal_encodings:
    for column in columns:
        # A value missing from the mapping becomes NaN, which makes astype('int') raise.
        train_data[column] = train_data[column].map(mapping).astype('int')

In [None]:
# Refresh the numeric / categorical column lists after the ordinal encoding
column_dtypes = train_data.dtypes
numeric_columns = column_dtypes[column_dtypes != 'object'].index
categorical_columns = column_dtypes[column_dtypes == 'object'].index

In [None]:
# Correlation Matrix 2: the k features most correlated with SalePrice_Log
# Select numeric/bool columns explicitly: DataFrame.corr() raises on object
# columns in pandas >= 2.0 instead of silently skipping them.
corrmat = train_data.select_dtypes(include=['number', 'bool']).corr()
k = 10
cols = corrmat.nlargest(k, 'SalePrice_Log')['SalePrice_Log'].index
# Slice the correlations already computed rather than calling np.corrcoef,
# which returns NaN for every pair involving a column with a missing value.
cm = corrmat.loc[cols, cols].values
f, ax = plt.subplots(figsize=(10, 10))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 yticklabels=cols.values, xticklabels=cols.values, cmap='Blues', ax=ax)
plt.show()

In [None]:
# Updating the list of highly correlated features
# Hand-maintained list; the outlier report below iterates over it.
imp_columns_corrmat2 = ['SalePrice_Log', 'OverallQual', 'Neighborhood', 'GrLivArea_Log', 'GarageCars', 'ExterQual','KitchenQual','BsmtQual','TotalBsmtSF']

# Handling Outliers

In [None]:
# Detecting outliers and percentages using Extreme Value Analysis
def quantile_info(qu_dataset, qu_field):
    """Print IQR-based outlier statistics for one column.

    Reports the inter-quartile range, the fences at 1.5 x IQR and 3 x IQR
    beyond the quartiles, and the percentage of rows lying outside each pair
    of fences. Nothing is returned.

    Parameters
    ----------
    qu_dataset : pandas.DataFrame
        Frame containing the column to analyse.
    qu_field : str
        Name of the column.
    """
    values = qu_dataset[qu_field]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)

    iqr = third_quartile - first_quartile
    print("Inter-Quartile Range:", iqr)

    inner_high = third_quartile + (iqr * 1.5)
    inner_low = first_quartile - (iqr * 1.5)
    print("Upper Boundary:", inner_high)
    print("Lower Boundary:", inner_low)

    extreme_high = third_quartile + (iqr * 3)
    extreme_low = first_quartile - (iqr * 3)
    print("Upper Extreme Boundary:", extreme_high)
    print("Lower Extreme Boundary:", extreme_low)

    def _percent_outside(low, high):
        # Plain ints keep the division semantics of len()-based counting.
        above = int((values > high).sum())
        below = int((values < low).sum())
        return 100 * (below + above) / len(values)

    inner_percentage = _percent_outside(inner_low, inner_high)
    print("Percentage of records out of Upper and Lower Boundaries: %.2f"% (inner_percentage))

    extreme_percentage = _percent_outside(extreme_low, extreme_high)
    print("Percentage of records out of Upper and Lower Extreme Boundaries: %.2f"% (extreme_percentage))

# Print the fence statistics for every feature in imp_columns_corrmat2.
for col in imp_columns_corrmat2:
  print("Outlier Detection for ", col, ":")
  quantile_info(train_data, col);
  print("---"*10)

In [None]:
# Removing Outliers that lie outside Upper and Lower Boundaries

def remove_outliers_quantiles(qu_dataset, qu_field, qu_fence):
    """Return the rows of `qu_dataset` whose `qu_field` lies within an IQR fence.

    Parameters
    ----------
    qu_dataset : pandas.DataFrame
        Frame to filter.
    qu_field : str
        Column the fences are computed on.
    qu_fence : str
        "inner" keeps values within 1.5 x IQR of the quartiles, "extreme"
        within 3 x IQR. Any other value returns `qu_dataset` unchanged.

    Returns
    -------
    pandas.DataFrame
        The filtered rows (bounds inclusive), or `qu_dataset` itself for an
        unrecognised fence.
    """
    values = qu_dataset[qu_field]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    iqr = third_quartile - first_quartile

    if qu_fence == "inner":
        multiplier = 1.5
    elif qu_fence == "extreme":
        multiplier = 3
    else:
        return qu_dataset

    upper_limit = third_quartile + (iqr * multiplier)
    lower_limit = first_quartile - (iqr * multiplier)
    within_fence = (values <= upper_limit) & (values >= lower_limit)
    return qu_dataset[within_fence]

# Filter one feature at a time; each call computes its fences on the frame
# returned by the previous call, so the order of the calls matters.
# NOTE(review): in the cells visible here only `train_data_new.shape` reads this
# result, and all later steps keep using `train_data` - confirm whether the
# filtered frame was meant to replace it.
train_data_new = remove_outliers_quantiles(train_data, 'SalePrice_Log', 'inner')
train_data_new = remove_outliers_quantiles(train_data_new, 'OverallQual', 'inner')
train_data_new = remove_outliers_quantiles(train_data_new, 'Neighborhood', 'inner')
train_data_new = remove_outliers_quantiles(train_data_new, 'TotalBsmtSF', 'extreme')

In [None]:
# (rows, columns) remaining after the outlier filters.
train_data_new.shape

In [None]:
# Refresh the numeric / categorical column lists (computed from train_data)
column_dtypes = train_data.dtypes
numeric_columns = column_dtypes[column_dtypes != 'object'].index
categorical_columns = column_dtypes[column_dtypes == 'object'].index

In [None]:
# NOTE(review): `cols` is whatever the most recently run correlation-matrix cell
# left behind (Correlation Matrix 2 when run top to bottom), despite the
# "corrmat3" name - confirm this is intended.
imp_columns_corrmat3 = list(cols)
imp_columns_corrmat3

# Feature Creation


In [None]:
# Creating new features by combining some features

# skipna=False keeps the NaN-propagating behaviour of chained `+`.
area_parts = ['BsmtFinSF1', 'BsmtFinSF2', '1stFlrSF', '2ndFlrSF']
train_data['TotalSF'] = train_data[area_parts].sum(axis=1, skipna=False)

# Half bathrooms count as 0.5 of a bathroom.
half_baths = 0.5 * train_data['HalfBath']
bsmt_half_baths = 0.5 * train_data['BsmtHalfBath']
train_data['TotalBathrooms'] = (
    train_data['FullBath'] + half_baths + train_data['BsmtFullBath'] + bsmt_half_baths
)

porch_parts = ['OpenPorchSF', '3SsnPorch', 'EnclosedPorch', 'ScreenPorch', 'WoodDeckSF']
train_data['TotalPorchSF'] = train_data[porch_parts].sum(axis=1, skipna=False)

In [None]:
# Merging Quality and Condition scores into one total per area of the house
quality_groups = {
    'TotalExtQual': ['ExterQual', 'ExterCond'],
    'TotalBsmQual': ['BsmtQual', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2'],
    'TotalGrgQual': ['GarageQual', 'GarageCond'],
}
for total_name, part_names in quality_groups.items():
    # skipna=False keeps the NaN-propagating behaviour of chained `+`.
    train_data[total_name] = train_data[part_names].sum(axis=1, skipna=False)

In [None]:
# 0/1 indicators: 1 when the source column is greater than zero
presence_flags = {
    'HasPool': 'PoolArea',
    'Has2ndFloor': '2ndFlrSF',
    'HasFireplace': 'Fireplaces',
}
for flag_name, source_name in presence_flags.items():
    train_data[flag_name] = (train_data[source_name] > 0).astype('int64')

In [None]:
# Dropping Features
# Columns removed in the next cell; most of them are inputs of the combined
# features built above.

drop_columns = ['BsmtFinSF1', 'BsmtFinSF2', '1stFlrSF', '2ndFlrSF', 'FullBath', 
                'HalfBath', 'BsmtFullBath','BsmtHalfBath', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
                'ScreenPorch', 'WoodDeckSF', 'ExterQual', 'ExterCond', 'BsmtQual',
                'BsmtFinType1', 'BsmtFinType2', 'BsmtCond', 'GarageQual', 
                'GarageCond','GarageArea', 'PoolArea','Fireplaces']

In [None]:
# In-place drop: re-running this cell fails because the columns are already gone.
train_data.drop(columns = drop_columns, inplace=True)

In [None]:
# Correlation Matrix 3: the k features most correlated with SalePrice_Log
# Select numeric/bool columns explicitly: DataFrame.corr() raises on object
# columns in pandas >= 2.0 instead of silently skipping them.
corrmat = train_data.select_dtypes(include=['number', 'bool']).corr()
k = 10
cols = corrmat.nlargest(k, 'SalePrice_Log')['SalePrice_Log'].index
# Slice the correlations already computed rather than calling np.corrcoef,
# which returns NaN for every pair involving a column with a missing value.
cm = corrmat.loc[cols, cols].values
f, ax = plt.subplots(figsize=(10, 10))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 yticklabels=cols.values, xticklabels=cols.values, cmap='Blues', ax=ax)
plt.show()

In [None]:
# Refresh the numeric / categorical column lists after feature creation
column_dtypes = train_data.dtypes
numeric_columns = column_dtypes[column_dtypes != 'object'].index
categorical_columns = column_dtypes[column_dtypes == 'object'].index

# Feature Transformation

In [None]:
# Skewness of every numeric column: sorted by signed skewness, then converted to magnitude
signed_skew = train_data[numeric_columns].apply(skew)
skew_data = signed_skew.sort_values(ascending=False).abs()

In [None]:
# Segregating highly skewed data
# skew_data holds absolute values, so columns skewed in either direction pass the 0.4 threshold.
high_skew = skew_data[skew_data > 0.4]
skew_index = high_skew.index

In [None]:
# Applying BoxCox Transformation for highly skewed features
# The lambda is estimated by boxcox_normmax on the column shifted by 1 and then
# passed to boxcox1p. Columns are overwritten, so re-running this cell
# transforms the already-transformed values again.
for i in skew_index:
    train_data[i] = boxcox1p(train_data[i], boxcox_normmax(train_data[i] + 1))

In [None]:
# Correlation Matrix 4: the k features most correlated with SalePrice_Log
# Select numeric/bool columns explicitly: DataFrame.corr() raises on object
# columns in pandas >= 2.0 instead of silently skipping them.
corrmat = train_data.select_dtypes(include=['number', 'bool']).corr()
k = 10
cols = corrmat.nlargest(k, 'SalePrice_Log')['SalePrice_Log'].index
# Slice the correlations already computed rather than calling np.corrcoef,
# which returns NaN for every pair involving a column with a missing value.
cm = corrmat.loc[cols, cols].values
f, ax = plt.subplots(figsize=(10, 10))
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f',
                 yticklabels=cols.values, xticklabels=cols.values, cmap='Blues', ax=ax)
plt.show()

In [None]:
# (rows, columns) of the frame after feature creation and transformation.
train_data.shape

In [None]:
# Absolute correlation of every numeric feature with SalePrice_Log, strongest first
# corrwith is restricted to numeric/bool columns; object columns cannot be correlated.
correlation = train_data.select_dtypes(include=['number', 'bool']).corrwith(train_data['SalePrice_Log'])
# Keep a flat Series: assigning `correlation.abs()` to a new 'Abs Corr' label
# would nest an entire Series inside a single element of `correlation`.
sorted_correlations = correlation.abs().sort_values(ascending=False).rename('Abs Corr')
fig, ax = plt.subplots(figsize=(10,10))
# Only features with |correlation| >= 0.5 are shown.
sns.heatmap(sorted_correlations[sorted_correlations >= .5].to_frame(), cmap='coolwarm', annot=True, vmin=-1, vmax=1, ax=ax);