In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
%matplotlib inline
# Display configuration: show two decimals and never use scientific notation
# for NumPy arrays; show two decimals for pandas objects.
np.set_printoptions(precision=2, suppress=True)
# The fully-qualified key is required: the short form 'precision' is ambiguous
# on pandas versions that also define 'styler.format.precision' and raises
# OptionError there. 'display.precision' works on old and new versions alike.
pd.set_option('display.precision', 2)
# Apply seaborn's 'whitegrid' style.
sns.set(style='whitegrid')

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the names of the entries found in ../input.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
# Load the train/test CSVs (indexed by 'Id'), separate the predictors from the
# 'SalePrice' target and stack train + test predictors into `all_data` so that
# later preprocessing is applied to both sets consistently.
train_path = '../input/train.csv'
test_path = '../input/test.csv'
train_data = pd.read_csv(train_path, index_col='Id')
test_data = pd.read_csv(test_path, index_col='Id')

# Select by name instead of by position so the split does not silently depend
# on 'SalePrice' being the last column of the file.
features = train_data.drop(columns=['SalePrice'])
# Explicit copy: `target` is modified later and must not alias `train_data`.
target = train_data[['SalePrice']].copy()

print('Train Set Size : ', train_data.shape)
print('Test Set Size : ', test_data.shape)
print('Train Features Size : ', features.shape)
num_train_rows = train_data.shape[0]
num_test_rows = test_data.shape[0]
print('Train Rows : ', num_train_rows)
print('Test Rows : ', num_test_rows)

# Train rows come first, test rows second; the original 'Id' index is dropped,
# so rows are recovered later by position (first `num_train_rows` = train).
all_data = pd.concat((features, test_data)).reset_index(drop=True)
assert len(all_data) == num_train_rows + num_test_rows

In [None]:
# Analyze SalePrice
# Work on the Series (not the one-column DataFrame) so skew()/kurt() return
# scalars; formatting a one-element Series with '%f' relies on an implicit
# float(Series) conversion that pandas has deprecated.
raw_sale_price = train_data['SalePrice']
print('Skewness of SalePrice before Log Transform : %f'% raw_sale_price.skew())
print('Kurtosis of SalePrice before Log Transform : %f'% raw_sale_price.kurt())

# Skew = 1.882876 indicates positive skew with tail to the right.
# Kurt = 6.536282 indicates heavy tails i.e. more data on tails.

#Apply Log transformation
# `target` is rebuilt from the untouched `train_data` on every run, so executing
# this cell more than once never applies the logarithm twice.
target = np.log(train_data[['SalePrice']])
print('Skewness of SalePrice after Log Transform : %f'% target['SalePrice'].skew())
print('Kurtosis of SalePrice after Log Transform : %f'% target['SalePrice'].kurt())

In [None]:
# Missing Data
def missing_ratio_table(frame):
    """Return the percentage of null values per column of `frame`.

    Only columns that contain at least one null are included. The result is a
    DataFrame with a single 'Missing Ratio' column (values in percent), indexed
    by column name, in the column order of `frame`.
    """
    null_columns = frame.columns[frame.isnull().any()]
    ratio = (frame[null_columns].isnull().sum() / len(frame)) * 100
    return pd.DataFrame({'Missing Ratio': ratio})


# Report before imputation.
missing_data = missing_ratio_table(all_data)
print(missing_data.sort_values(by='Missing Ratio',ascending=False))

# Categorical columns where a null is replaced by the explicit category 'None'.
none_fill_columns = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'MasVnrType',
    'BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1',
]
all_data[none_fill_columns] = all_data[none_fill_columns].fillna('None')

# No GarageYrBlt means no garage. We cannot impute the mean/median here because
# that would incorrectly convey the existence of a garage; the same reasoning
# applies to MasVnrArea. Nulls in the basement/garage numeric columns are
# likewise filled with 0.
zero_fill_columns = [
    'GarageYrBlt', 'MasVnrArea',
    'BsmtFullBath', 'BsmtHalfBath', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', 'GarageCars', 'GarageArea',
]
all_data[zero_fill_columns] = all_data[zero_fill_columns].fillna(0)

# Impute null LotFrontage with the median LotFrontage of the row's
# Neighborhood. If a neighborhood has no LotFrontage values at all its median
# is NaN, so fall back to the overall median for whatever is still missing.
neighborhood_median = \
    all_data.groupby('Neighborhood')['LotFrontage'].transform('median')
all_data['LotFrontage'] = all_data['LotFrontage']\
    .fillna(neighborhood_median)\
    .fillna(all_data['LotFrontage'].median())

# Columns imputed with their most frequent value.
for col in ['Electrical', 'MSZoning', 'Functional', 'KitchenQual']:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Columns imputed with a fixed category.
constant_fill = {
    'Utilities': 'ELO',
    'Exterior1st': 'Other',
    'Exterior2nd': 'Other',
    'SaleType': 'Oth',
}
for col, fill_value in constant_fill.items():
    all_data[col] = all_data[col].fillna(fill_value)

# Report after imputation (empty when nothing is missing any more).
missing_data = missing_ratio_table(all_data)
print(missing_data.sort_values(by='Missing Ratio',ascending=False))

In [None]:
# One-hot encode the categorical columns, split the encoded frame back into
# train/test rows by position, and build the design matrix (constant column
# first) that is handed to backward elimination.

# OLS is taken from statsmodels.api: recent statsmodels releases no longer
# expose the upper-case OLS class through statsmodels.formula.api.
import statsmodels.api as sm

significance = 0.05
# dtype=float keeps the dummy columns numeric. Newer pandas defaults to bool
# dummies, which would turn the mixed frame into an object array below.
all_data = pd.get_dummies(all_data, dtype=float)
train_num_features = all_data[:num_train_rows]
test_num_features = all_data[num_train_rows:]
# Use the real number of training rows instead of a hard-coded 1460.
train_num_features_ones = np.append(arr=np.ones((num_train_rows, 1)), values=train_num_features, axis=1)

# Column labels aligned with the design matrix: 'intercept' followed by the
# encoded feature names.
cols = train_num_features.columns.values
cols = np.insert(cols, 0, 'intercept')

def backwardElimination(cols, train_num_features_ones):
    """Drop regressors one at a time until every OLS p-value is acceptable.

    An OLS model of the module-level `target` on the current columns is fitted;
    while the largest p-value is greater than the module-level `significance`,
    the column holding it is removed and the model is refitted. Exactly one
    column is removed per fit, so tied p-values can no longer cause deletions
    at stale (shifted) indices. Elimination stops as soon as all p-values pass,
    or when a single column is left.

    Parameters
    ----------
    cols : array-like of labels, one per column of `train_num_features_ones`.
    train_num_features_ones : 2-D array-like design matrix (numeric values).

    Returns
    -------
    (cols, exog) : the labels and the design matrix restricted to the columns
        that were kept. The inputs are not modified.

    Side effects
    ------------
    Prints the summary of the final fitted model, i.e. the model that uses
    exactly the returned columns.
    """
    exog = np.asarray(train_num_features_ones, dtype=float)
    cols = np.asarray(cols)
    regressor_OLS = sm.OLS(endog=target, exog=exog).fit()
    # Keep at least one regressor so the final model can always be fitted.
    while exog.shape[1] > 1:
        pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
        if np.isnan(pvalues).all():
            # Nothing comparable to the threshold; leave the model as it is.
            break
        worst = int(np.nanargmax(pvalues))
        if not pvalues[worst] > significance:
            break
        exog = np.delete(exog, worst, axis=1)
        cols = np.delete(cols, worst)
        # Refit before looking at p-values again: they change once a column
        # has been removed.
        regressor_OLS = sm.OLS(endog=target, exog=exog).fit()
    print(regressor_OLS.summary())
    return cols, exog

# Run the elimination on the intercept-augmented training matrix; `cols` is
# rebound to the labels returned for the columns that were kept.
cols, train_num_features_ones_df = backwardElimination(cols, train_num_features_ones)
# Show the labels returned by the elimination.
print(cols)

In [None]:
from sklearn import metrics
from sklearn.linear_model import Ridge
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module has been removed from scikit-learn.
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Encoded columns used by the model. Declared once so the train and test
# matrices are guaranteed to share the same columns in the same order.
# NOTE(review): presumably the columns kept by the backward elimination in the
# previous cell -- confirm before changing the elimination settings.
selected_features = [
    'LotFrontage', 'LotArea', 'OverallQual',
    'OverallCond', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'TotalBsmtSF',
    'GrLivArea', 'BsmtFullBath', 'KitchenAbvGr', 'GarageCars', 'GarageArea',
    'WoodDeckSF', 'EnclosedPorch', 'ScreenPorch', 'PoolArea',
    'MSZoning_FV', 'MSZoning_RH', 'MSZoning_RL', 'MSZoning_RM', 'Street_Grvl',
    'Street_Pave', 'Alley_Grvl', 'Alley_None', 'Alley_Pave', 'LotShape_IR1',
    'LotShape_IR2', 'LotShape_IR3', 'LotShape_Reg', 'LandContour_Bnk',
    'LandContour_HLS', 'LandContour_Low', 'LandContour_Lvl', 'Utilities_AllPub',
    'Utilities_NoSeWa', 'LotConfig_CulDSac', 'LandSlope_Gtl', 'LandSlope_Mod',
    'Neighborhood_BrkSide', 'Neighborhood_Crawfor', 'Neighborhood_Edwards',
    'Neighborhood_MeadowV', 'Neighborhood_NridgHt', 'Neighborhood_StoneBr',
    'Condition1_Feedr', 'Condition1_Norm', 'Condition1_PosN', 'Condition1_RRAn',
    'Condition2_PosA', 'Condition2_PosN', 'Condition2_RRAe', 'BldgType_1Fam',
    'BldgType_2fmCon', 'BldgType_Duplex', 'BldgType_TwnhsE', 'RoofStyle_Shed',
    'RoofMatl_ClyTile', 'RoofMatl_CompShg', 'RoofMatl_Membran', 'RoofMatl_Metal',
    'RoofMatl_Roll', 'RoofMatl_Tar&Grv', 'RoofMatl_WdShake', 'RoofMatl_WdShngl',
    'Exterior1st_BrkComm', 'Exterior1st_BrkFace', 'Exterior1st_MetalSd',
    'MasVnrType_BrkCmn', 'MasVnrType_BrkFace', 'MasVnrType_None',
    'MasVnrType_Stone', 'ExterQual_Ex', 'ExterQual_Fa', 'ExterQual_Gd',
    'ExterQual_TA', 'ExterCond_TA', 'Foundation_PConc', 'Foundation_Stone',
    'BsmtQual_Ex', 'BsmtQual_Fa', 'BsmtQual_Gd', 'BsmtQual_None', 'BsmtQual_TA',
    'BsmtCond_None', 'BsmtCond_Po', 'BsmtExposure_Gd', 'BsmtFinType1_ALQ',
    'BsmtFinType1_BLQ', 'BsmtFinType1_GLQ', 'BsmtFinType1_LwQ',
    'BsmtFinType1_None', 'BsmtFinType1_Rec', 'BsmtFinType1_Unf',
    'BsmtFinType2_ALQ', 'Heating_GasA', 'Heating_GasW', 'Heating_Wall',
    'HeatingQC_Ex', 'CentralAir_N', 'CentralAir_Y', 'KitchenQual_Ex',
    'KitchenQual_Fa', 'KitchenQual_Gd', 'KitchenQual_TA', 'Functional_Maj1',
    'Functional_Min1', 'Functional_Min2', 'Functional_Mod', 'Functional_Typ',
    'FireplaceQu_Ex', 'FireplaceQu_Fa', 'FireplaceQu_Gd', 'FireplaceQu_None',
    'FireplaceQu_Po', 'FireplaceQu_TA', 'GarageCond_TA', 'PavedDrive_N',
    'PavedDrive_P', 'PavedDrive_Y', 'PoolQC_None', 'Fence_GdPrv', 'Fence_GdWo',
    'Fence_MnPrv', 'Fence_MnWw', 'Fence_None', 'SaleType_ConLD', 'SaleType_New',
    'SaleCondition_Normal',
]


def _with_intercept(encoded_features):
    """Return a copy of the `selected_features` columns of `encoded_features`
    with a constant 'intercept' column (all 1.0) inserted at position 0."""
    design = encoded_features.loc[:, selected_features].copy()
    # A scalar is broadcast to every row, so no row count has to be hard-coded.
    design.insert(0, 'intercept', 1.0)
    return design


final_num_features = _with_intercept(train_num_features)

# check whether there are any NaNs in the dataframe
train_nan_mask = final_num_features.isnull().values
print(train_nan_mask.any())

#If True, then find where NaNs exist
print(np.where(train_nan_mask))

# Hold out 40% of the training rows for validation.
X_train , X_test, y_train, y_test = train_test_split(final_num_features, target, test_size=0.4, random_state=0)

# Ridge's `normalize=True` option has been removed from scikit-learn. The
# documented equivalent is to scale the features with
# StandardScaler(with_mean=False) and multiply alpha by the number of training
# samples (normalize divided centred columns by their L2 norm = std * sqrt(n)).
ridge_alpha = 0.05
regressor = make_pipeline(
    StandardScaler(with_mean=False),
    Ridge(alpha=ridge_alpha * len(X_train)),
)
# Fit on a 1-D target so that predictions are 1-D as well.
regressor.fit(X_train, y_train.values.ravel())

# Predictions for the held-out validation rows. `target` is log(SalePrice), so
# MAE / MSE / RMSE below are measured in log-price units.
y_train_predict = regressor.predict(X_test)
print('MAE',metrics.mean_absolute_error(y_test, y_train_predict))
print('MSE',metrics.mean_squared_error(y_test, y_train_predict))
print('RMSE',np.sqrt(metrics.mean_squared_error(y_test, y_train_predict)))
# mean_squared_log_error takes logs itself, so it must be given prices, not the
# already log-transformed values (which would apply the logarithm twice).
print('RMSLE',np.sqrt(metrics.mean_squared_log_error(np.exp(y_test), np.exp(y_train_predict))))
# score() is the coefficient of determination (R^2), shown here as a percentage.
print("Accuracy --> ", regressor.score(X_test, y_test)*100)

# Predict the competition test rows and convert log-prices back to prices.
final_test_features = _with_intercept(test_num_features)
y_test_predict = regressor.predict(final_test_features)
y_test_final_predict = np.exp(y_test_predict)

# `test_data` was read with index_col='Id' and its row order is preserved in
# `test_num_features`, so its index supplies the submission Ids without
# re-reading the CSV.
my_submission = pd.DataFrame({
    'Id': test_data.index,
    'SalePrice': y_test_final_predict,
})
my_submission.to_csv('submission.csv', index=False)
