In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
from sklearn.model_selection import train_test_split


pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
%matplotlib inline
plt.style.use('bmh')
warnings.filterwarnings('ignore')

In [5]:
# Read the training CSV (path is relative to the notebook's working directory),
# report its (rows, columns) shape, and preview the first five rows.
df = pd.read_csv(filepath_or_buffer="data/train.csv")
print(df.shape)
df.head(5)

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition: train first, then test.
for partition in (train_df, test_df):
    print(partition.shape)

(1168, 81)
(292, 81)


# Missing Value Imputation

In [11]:
# Per-column summary of `df`: dtype, number of missing values, number of
# distinct non-null values, and number of non-null entries.
info_df = (
    pd.DataFrame({
        'DataType': df.dtypes,
        'MissingVal': df.isnull().sum(),
        'NUnique': df.nunique(),
        'Count': df.count(),
    })
    # Name the index so reset_index() turns it into a 'Feature' column.
    .rename_axis('Feature')
    .reset_index()
)
info_df

Unnamed: 0,Feature,DataType,MissingVal,NUnique,Count
0,Id,int64,0,1460,1460
1,MSSubClass,int64,0,15,1460
2,MSZoning,object,0,5,1460
3,LotFrontage,float64,259,110,1201
4,LotArea,int64,0,1073,1460
5,Street,object,0,2,1460
6,Alley,object,1369,2,91
7,LotShape,object,0,4,1460
8,LandContour,object,0,4,1460
9,Utilities,object,0,2,1460


In [18]:
# Keep only the summary rows for features that have at least one missing value.
missing_features = info_df.loc[info_df['MissingVal'].gt(0)]
missing_features

Unnamed: 0,Feature,DataType,MissingVal,NUnique,Count
3,LotFrontage,float64,259,110,1201
6,Alley,object,1369,2,91
25,MasVnrType,object,8,4,1452
26,MasVnrArea,float64,8,327,1452
30,BsmtQual,object,37,4,1423
31,BsmtCond,object,37,4,1423
32,BsmtExposure,object,38,4,1422
33,BsmtFinType1,object,37,6,1423
35,BsmtFinType2,object,38,6,1422
42,Electrical,object,1,5,1459


In [22]:
# Names of the features with missing values whose recorded dtype is float64.
numeric_cols = missing_features.loc[
    missing_features['DataType'] == 'float64',
    'Feature',
].tolist()

# Print one feature name per line.
for col in numeric_cols:
    print(col)

LotFrontage
MasVnrArea
GarageYrBlt


In [23]:
# Names of the features with missing values whose recorded dtype is object.
categorical_cols = missing_features.loc[
    missing_features['DataType'] == 'object',
    'Feature',
].tolist()

# Print one feature name per line.
for col in categorical_cols:
    print(col)

Alley
MasVnrType
BsmtQual
BsmtCond
BsmtExposure
BsmtFinType1
BsmtFinType2
Electrical
FireplaceQu
GarageType
GarageFinish
GarageQual
GarageCond
PoolQC
Fence
MiscFeature


In [21]:
# Frequency of each 'Electrical' category, most common first. Missing values
# are excluded (dropna=True is the default), so the counts cover non-null rows only.
df['Electrical'].value_counts(sort=True, ascending=False, dropna=True)

SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64