In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline
# Display settings: print floats with two decimals and without scientific notation.
np.set_printoptions(precision=2, suppress=True)
# Use the fully qualified option name. Option keys are matched as patterns, and
# the bare 'precision' is ambiguous in recent pandas (it also matches
# 'styler.format.precision'), which makes set_option raise OptionError.
pd.set_option('display.precision', 2)
# Set seaborn's global plot style to 'whitegrid'.
sns.set(style='whitegrid')

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os

# Any results you write to the current directory are saved as output.

In [None]:
# Location of the training CSV. Set the HOUSING_TRAIN_CSV environment variable
# to point at a different copy; the default is the path this notebook was
# originally written against, so existing runs behave exactly as before.
input_path = os.environ.get(
    'HOUSING_TRAIN_CSV',
    '/Users/rjanaki/DataSets/ML-PredictHousingPrices/train.csv',
)
train_data = pd.read_csv(input_path, index_col='Id')

# Split by column name instead of by position, so the split does not silently
# depend on 'SalePrice' being the last column of the file.
target = train_data.loc[:, ['SalePrice']]
features = train_data.drop(columns='SalePrice')
print(train_data.shape)
print(features.shape)

In [None]:
# Missing Data
def missing_ratio_table(data):
    """Return a one-column frame with the percentage of missing values per column.

    Only columns of `data` that contain at least one null are listed. The
    percentage is relative to the number of rows in `data`.
    """
    null_cols = data.columns[data.isnull().any()]
    ratio = (data[null_cols].isnull().sum() / len(data)) * 100
    return pd.DataFrame({'Missing Ratio': ratio})


# Report before imputation.
missing_data = missing_ratio_table(train_data)
print(missing_data.sort_values(by='Missing Ratio', ascending=False))

# Categorical columns whose nulls are replaced by the literal category 'None'.
# NOTE(review): presumably a null here means the house lacks the feature
# (pool, alley, garage, basement, ...) - confirm against the data description.
none_fill_cols = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'MasVnrType',
    'BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
]
train_data[none_fill_cols] = train_data[none_fill_cols].fillna('None')

# No GarageYrBlt means no garage. We cannot impute the mean/median here, since
# that would incorrectly convey the existence of a garage; use 0 instead.
# Same reasoning for MasVnrArea.
zero_fill_cols = ['GarageYrBlt', 'MasVnrArea']
train_data[zero_fill_cols] = train_data[zero_fill_cols].fillna(0)

# Impute null LotFrontage values with the median of the house's neighborhood.
lot_frontage = train_data.groupby('Neighborhood')['LotFrontage'] \
    .transform(lambda x: x.fillna(x.median()))
# A neighborhood with no known LotFrontage at all has a NaN median, which would
# leave its rows unfilled; fall back to the overall median for those rows.
train_data['LotFrontage'] = lot_frontage.fillna(train_data['LotFrontage'].median())

# Electrical: fill nulls with the most frequent value.
train_data['Electrical'] = \
    train_data['Electrical'].fillna(train_data['Electrical'].mode()[0])

# Report again after imputation.
missing_data = missing_ratio_table(train_data)
print(missing_data.sort_values(by='Missing Ratio', ascending=False))

# Keep the names the original cell defined, in case later cells refer to them.
null_features = missing_data.index
missing_ratio = missing_data['Missing Ratio']



In [None]:
# Categorical Variable Analysis
# Every non-numeric column of train_data is treated as a categorical feature.
cat_features = train_data.select_dtypes(exclude=np.number)
cat_feature_nms = cat_features.columns
print(f'Number of cat features : {cat_feature_nms.size}')


def anova(data, features=None):
    """Run a one-way ANOVA of 'SalePrice' against each categorical feature.

    For every feature, the rows of `data` are grouped by the feature's values
    and `scipy.stats.f_oneway` is applied to the per-group 'SalePrice' samples.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'SalePrice' column and every column named in `features`.
    features : iterable of str, optional
        Columns to test. When omitted, the notebook-level `cat_feature_nms`
        is used, which is what this function always did before.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature' and 'pval', sorted by ascending p-value. A feature
        with fewer than two non-null levels gets a NaN p-value.
    """
    if features is None:
        features = cat_feature_nms
    features = list(features)

    pvals = []
    for col in features:
        # groupby skips null keys, so missing values never become an empty
        # group (which would turn the whole p-value into NaN). sort=False keeps
        # the groups in order of first appearance.
        groups = [
            prices.values
            for _, prices in data.groupby(col, sort=False, observed=True)['SalePrice']
        ]
        if len(groups) < 2:
            # f_oneway needs at least two samples to compare.
            pvals.append(np.nan)
        else:
            pvals.append(stats.f_oneway(*groups)[1])

    return pd.DataFrame({'feature': features, 'pval': pvals}).sort_values('pval')

# Rank the categorical features with anova() and add a 'disparity' score,
# log(1 / p-value): the smaller the p-value, the taller the bar.
a = anova(train_data).assign(
    disparity=lambda ranked: np.log(1. / ranked['pval'].values)
)
plt.subplots(figsize=(15, 10))
sns.barplot(x='feature', y='disparity', data=a)
# Assigning the return value keeps the tick objects out of the cell output.
x = plt.xticks(rotation=90)

In [None]:
# Take every categorical column from the ANOVA table, in its ranking order
# (lowest p-value first).
sig_cat_columns = a['feature']
print(f'Significant columns : {sig_cat_columns.values}')

# Build every unordered pair of those columns for the pairwise tests below.
from itertools import combinations
sig_cat_columns_combo = list(combinations(sig_cat_columns, 2))

def isCorrelated(p):
    """Label a p-value: 'YES' when it is below 0.05, otherwise 'NOT'."""
    return 'YES' if p < 0.05 else 'NOT'

from scipy.stats import chi2_contingency
from pandas import crosstab

# Pairwise chi-square test of independence between the categorical columns.
# Each pair is tested once, so only the cells whose column comes earlier in
# sig_cat_columns than their row are filled; every other cell stays NaN.
p_value_table = pd.DataFrame(index=sig_cat_columns, columns=sig_cat_columns)
for (col1, col2) in sig_cat_columns_combo:
    crosstable = crosstab(train_data[col1], train_data[col2])
    chi2, p, dof, expected = chi2_contingency(crosstable)
    # Write with a single .loc call (row=col2, column=col1). The chained form
    # p_value_table[col1][col2] = ... assigns into an intermediate Series and
    # is not guaranteed to update the frame (a silent no-op under pandas
    # Copy-on-Write).
    p_value_table.loc[col2, col1] = isCorrelated(p)

p_value_table

In [None]:
# Below are the features that are not correlated to Neighborhood:
# Neighborhood, PoolQC, Heating, MiscFeature, Condition2.
# Let's check whether they are correlated to one another.
sig_cat_columns_short = ['Neighborhood', 'PoolQC', 'Heating', 'MiscFeature', 'Condition2']

sig_cat_columns_combo_short = list(combinations(sig_cat_columns_short, 2))

# Same pairwise chi-square table as before, restricted to the short list.
# Only cells whose column comes earlier in the list than their row are filled.
p_value_table_short = pd.DataFrame(index=sig_cat_columns_short, columns=sig_cat_columns_short)

for (col1, col2) in sig_cat_columns_combo_short:
    crosstable = crosstab(train_data[col1], train_data[col2])
    chi2, p, dof, expected = chi2_contingency(crosstable)
    # Single .loc assignment (row=col2, column=col1): the chained form
    # frame[col1][col2] = ... is not guaranteed to write into the frame and is
    # a silent no-op under pandas Copy-on-Write.
    p_value_table_short.loc[col2, col1] = isCorrelated(p)

print(p_value_table_short)

# The conclusions below are kept as a string: it is the cell's final
# expression, so the notebook echoes it as the cell output.
'''
MiscFeature is correlated to PoolQC.
Condition2 is correlated to MiscFeature.
So, lets remove MiscFeature from Model.
Final categorical columns are Neighborhood, PoolQC, Heating, Condition2.
'''