In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline
# Display configuration: two decimal places, no scientific notation for NumPy arrays.
np.set_printoptions(precision=2, suppress=True)
# Use the fully qualified option key: the bare 'precision' pattern is ambiguous on
# newer pandas (it also matches 'styler.format.precision') and raises OptionError.
pd.set_option('display.precision', 2)
# Use seaborn's 'whitegrid' plot style.
sns.set(style='whitegrid')

# Input data files are available in the "../input/" directory.
# NOTE(review): this cell contains no code that lists that directory, and the data is
# read from 'data/train.csv' in the next cell — confirm the intended input location.

import os

# Any results you write to the current directory are saved as output.

In [None]:
# Load the training set; the 'Id' column becomes the row index.
input_path = 'data/train.csv'
train_data = pd.read_csv(input_path, index_col='Id')

# Split predictors from the target by column name rather than by position, so the
# split stays correct even if the target is not the last column of the file.
target_col = 'SalePrice'
features = train_data.drop(columns=[target_col])
target = train_data.loc[:, [target_col]]

print(train_data.shape)
print(features.shape)

In [None]:
# Missing Data
def summarize_missing(df):
    """Return the percentage of missing values for every column that has any.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column that contains at least one null, with a single
        'Missing Ratio' column holding the share of null rows in percent.
        Rows keep the column order of ``df``.
    """
    null_counts = df.isnull().sum()
    null_counts = null_counts[null_counts > 0]
    return pd.DataFrame({'Missing Ratio': null_counts / len(df) * 100})


# Report before imputation.
print(summarize_missing(train_data).sort_values(by='Missing Ratio', ascending=False))

# Categorical columns whose nulls are replaced by an explicit 'None' level
# (presumably a null here means the house lacks the feature — confirm against
# the data description).
none_fill_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'MasVnrType',
    'BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
]
train_data[none_fill_cols] = train_data[none_fill_cols].fillna('None')

# No GarageYrBlt means no garage. We can't impute the mean/median because that
# would incorrectly convey the existence of a garage; same reasoning for MasVnrArea.
zero_fill_cols = ['GarageYrBlt', 'MasVnrArea']
train_data[zero_fill_cols] = train_data[zero_fill_cols].fillna(0)

# Impute missing LotFrontage with the median of the house's neighborhood. If a
# whole neighborhood has no observed value its median is NaN, so fall back to
# the overall median of the observed values instead of leaving the gap.
lot_frontage = train_data['LotFrontage']
neighborhood_median = train_data.groupby('Neighborhood')['LotFrontage'].transform('median')
train_data['LotFrontage'] = (
    lot_frontage
    .fillna(neighborhood_median)
    .fillna(lot_frontage.median())
)

# Electrical: fill with the most frequent value.
train_data['Electrical'] = \
    train_data['Electrical'].fillna(train_data['Electrical'].mode()[0])

# Report after imputation; the affected column names and their ratios are also
# exposed as separate variables.
missing_data = summarize_missing(train_data)
null_features = missing_data.index
missing_ratio = missing_data['Missing Ratio']
print(missing_data.sort_values(by='Missing Ratio', ascending=False))

In [None]:
# Numerical Variable Analysis
# Keep only the numeric columns of the training frame.
num_features = train_data.select_dtypes(include=np.number)
num_feature_nms = num_features.columns
# The count is of numeric columns, so the label says so.
print('Number of numerical features : ' + str(num_feature_nms.size))

def find_spearman_corr(train_data, features, target='SalePrice', plot=True):
    """Rank features by their Spearman correlation with the target column.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame holding both the feature columns and the target column.
    features : iterable of str
        Names of the columns in ``train_data`` to correlate with the target.
    target : str, default 'SalePrice'
        Name of the column to correlate against.
    plot : bool, default True
        When true, draw a horizontal bar chart of the coefficients on a new
        20x10 matplotlib figure.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'spearman', sorted by ascending coefficient.
    """
    features = list(features)
    # Look the target up once instead of once per feature.
    target_values = train_data[target]
    spearman_corr_df = pd.DataFrame({
        'feature': features,
        'spearman': [
            train_data[f].corr(target_values, method='spearman') for f in features
        ],
    }).sort_values('spearman')

    if plot:
        plt.figure(figsize=(20, 10))
        sns.barplot(data=spearman_corr_df, y='feature', x='spearman')
    return spearman_corr_df
    
# Rank the numeric columns by their Spearman correlation with SalePrice and draw the bar chart.
# NOTE(review): num_feature_nms comes from all numeric columns of train_data, so it
# presumably includes SalePrice itself (correlation 1.0) — confirm that is intended.
sprearman_corr_df = find_spearman_corr(train_data, num_feature_nms)

In [None]:
'''
Picked these features from correlation with SalePrice.
OverallQual, GrLivArea, GarageCars, YearBuilt, GarageArea, FullBath, GarageYrBlt,
TotalBsmtSF, 1stFlrSF, YearRemodAdd, TotRmsAbvGrd, Fireplaces, OpenPorchSF, 
LotArea, MasVnrArea.

Let's check how these features correlate with one another.
'''

# Candidate numeric predictors, chosen from their correlation with SalePrice.
sig_num_cols = [
    'OverallQual', 'GrLivArea', 'GarageCars', 'YearBuilt', 'GarageArea',
    'FullBath', 'GarageYrBlt', 'TotalBsmtSF', '1stFlrSF', 'YearRemodAdd',
    'TotRmsAbvGrd', 'Fireplaces', 'OpenPorchSF', 'LotArea', 'MasVnrArea',
]

# Pairwise Spearman correlation among the candidates.
spearman_corr = train_data.loc[:, sig_num_cols].corr(method='spearman')

# Annotated square heatmap of the correlation matrix.
sns.set(font_scale=1.10)
plt.figure(figsize=(10, 10))
sns.heatmap(
    spearman_corr,
    annot=True,
    square=True,
    cmap='viridis',
    vmax=.6,
    linewidths=0.01,
    linecolor="white",
)
plt.title('Spearman Correlation between features')

In [None]:
'''
GrLivArea & TotRmsAbvGrd = 0.83
GarageCars & GarageArea = 0.85
YearBuilt & GarageYrBlt = 0.86
1stFlrSF & TotalBsmtSF = 0.83
YearRemodAdd with YearBuilt & GarageYrBlt = 0.68
GarageYrBlt & GarageArea = 0.66
GarageYrBlt & GarageCars = 0.69
FullBath & GrLivArea = 0.66


Exclude GrLivArea, GarageArea, GarageYrBlt and 1stFlrSF because each is highly
correlated with another feature that is kept.

Final Numerical Features :
'OverallQual', 'GarageCars', 'YearBuilt','FullBath', 'TotalBsmtSF', 
'YearRemodAdd', 'TotRmsAbvGrd', 'Fireplaces', 'OpenPorchSF', 'LotArea', 
'MasVnrArea'
'''

