In [67]:
#load libraries
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import graphviz

In [68]:
#using House Price Prediction dataset from Kaggle 
#https://www.kaggle.com/c/neolen-house-price-prediction/overview

#load traning data
house_train = pd.read_csv('train.csv')
house_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [69]:
#how many columns and rows?
house_train.shape

(1259, 81)

In [77]:
#PREPROCESSING

#1st column (ID) is irrelevant, dropping 
house_train = house_train.iloc[: , 1:]

#find number of NA values in each column
house_train.isna().sum()

#identify columns with NA values
na = [i for i in house_train.columns if house_train[i].isnull().sum() > 0 *len(house_train)]
print(na)
#how many NAs for each column?
na_cols = house_train[['MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']]
print(na_cols.isna().sum())

#removing columns with missing values because either (1) column has very high # of NAs or (2) column is a small detail e.g. 'masonry veneer type'
house_train = house_train.drop(labels=['MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature'], axis=1)

['Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature']
MasVnrType         7
MasVnrArea         7
BsmtQual          35
BsmtCond          35
BsmtExposure      36
BsmtFinType1      35
BsmtFinType2      36
FireplaceQu      599
GarageType        71
GarageYrBlt       71
GarageFinish      71
GarageQual        71
GarageCond        71
PoolQC          1255
Fence           1019
MiscFeature     1208
dtype: int64


In [101]:
#inspect column types
house_train.dtypes.value_counts()

#select columns by 'object' type
house_train.select_dtypes('object').columns.to_list()

#converting categorical data columns into dummy variables
test = pd.get_dummies(house_train, columns=['Street','Alley','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','ExterQual','ExterCond','Foundation','Heating','HeatingQC','CentralAir','Electrical','KitchenQual','Functional','PavedDrive','SaleType','SaleCondition'], drop_first=True)
test.head()

Unnamed: 0,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,8450,7,5,2003,2003,706,0,150,856,856,...,0,0,0,0,1,0,0,0,1,0
1,9600,6,8,1976,1976,978,0,284,1262,1262,...,0,0,0,0,1,0,0,0,1,0
2,11250,7,5,2001,2002,486,0,434,920,920,...,0,0,0,0,1,0,0,0,1,0
3,9550,7,5,1915,1970,216,0,540,756,961,...,0,0,0,0,1,0,0,0,0,0
4,14260,8,5,2000,2000,655,0,490,1145,1145,...,0,0,0,0,1,0,0,0,1,0


In [90]:
house_train.corr()['SalePrice'].sort_values(ascending = False)

SalePrice        1.000000
OverallQual      0.790786
GrLivArea        0.733480
GarageCars       0.642079
TotalBsmtSF      0.641946
GarageArea       0.625499
1stFlrSF         0.617868
FullBath         0.566303
TotRmsAbvGrd     0.552707
YearBuilt        0.524407
YearRemodAdd     0.512674
Fireplaces       0.460081
BsmtFinSF1       0.409673
2ndFlrSF         0.335212
WoodDeckSF       0.316402
OpenPorchSF      0.315975
HalfBath         0.298473
LotArea          0.267146
BsmtFullBath     0.223671
BsmtUnfSF        0.207560
BedroomAbvGr     0.170714
ScreenPorch      0.123616
PoolArea         0.101972
MoSold           0.049208
3SsnPorch        0.020845
BsmtFinSF2       0.000474
BsmtHalfBath    -0.018857
LowQualFinSF    -0.026166
MiscVal         -0.031333
YrSold          -0.036798
OverallCond     -0.083382
EnclosedPorch   -0.118917
KitchenAbvGr    -0.133655
Name: SalePrice, dtype: float64