# Data Preprocessing
***
***
## Import Packages and Data

In [1]:
"""
Import Packages
"""

import numpy as np
import pandas as pd
import dataClean as dc

In [2]:
"""
Import the data
"""
# Read the training CSV from the data directory and preview the first rows.
path = 'data/'
train_file = path + 'train.csv'
data = pd.read_csv(train_file)
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


***
***
## Data Cleaning

In [3]:
# Remove the `Id` column. `errors='ignore'` keeps this cell re-runnable:
# because the result is assigned back to `data`, a second execution would
# otherwise raise KeyError once `Id` is already gone.
data = data.drop(columns='Id', errors='ignore')

# Number of missing values per column.
data.isna().sum()

MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
Street             0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 80, dtype: int64

***
### Drop columns missing more than 50% of their data

In [4]:
# Ask the local dataClean helper which columns are mostly empty. The printed
# result below is a dict mapping column name -> percentage of missing values.
major_miss = dc.majorMissing(data)
print(major_miss)

{'Alley': '93.7671%', 'PoolQC': '99.5205%', 'Fence': '80.7534%', 'MiscFeature': '96.3014%'}


In [5]:
# Remove the four mostly-empty columns reported in the previous output.
# `DataFrame.drop` returns a new frame, so `data` itself is left untouched.
sparse_cols = ['Alley', 'PoolQC', 'Fence', 'MiscFeature']
clean = data.drop(columns=sparse_cols)
print(clean.shape)

(1460, 76)


****
### Replace missing data

In [6]:
# Report the remaining columns that contain missing values. The printed
# result below is a dict mapping column name -> percentage missing; its keys
# drive the imputation loop in the following cell.
missing_data = dc.checkNAN(clean)
print(missing_data)

{'LotFrontage': '17.7397%', 'MasVnrType': '0.5479%', 'MasVnrArea': '0.5479%', 'BsmtQual': '2.5342%', 'BsmtCond': '2.5342%', 'BsmtExposure': '2.6027%', 'BsmtFinType1': '2.5342%', 'BsmtFinType2': '2.6027%', 'Electrical': '0.0685%', 'FireplaceQu': '47.2603%', 'GarageType': '5.5479%', 'GarageYrBlt': '5.5479%', 'GarageFinish': '5.5479%', 'GarageQual': '5.5479%', 'GarageCond': '5.5479%'}


In [7]:
# Fill the gaps in every column listed in `missing_data`:
#   * numeric columns     -> column mean
#   * non-numeric columns -> most frequent value (mode)
# The branch tests "is this numeric?" rather than "is this dtype == object?"
# so that text columns stored with a non-object dtype (e.g. pandas' string
# dtype or `category`) are still imputed with the mode instead of falling
# through to `mean()`, which is undefined for them.
for key in missing_data.keys():
    if pd.api.types.is_numeric_dtype(clean[key]):
        clean[key] = clean[key].fillna(clean[key].mean())
    else:
        clean[key] = clean[key].fillna(clean[key].mode()[0])

In [8]:
# Re-run the missing-value check after imputation; the output below shows the
# helper reporting that no missing values remain.
print(dc.checkNAN(clean))

You got a clean Dataframe


***
***
## Convert Features to Numerical Type

In [9]:
# Collect the categorical column names via the local dataClean helper; the
# printed list below is what gets one-hot encoded in the following cell.
categs = dc.getCatCols(clean)
print(categs)

['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']


In [10]:
# One-hot encode every categorical column and append the indicator columns
# to `clean` (the original text columns are kept; they are filtered out in
# the following cell).
#
# * dtype is pinned to uint8: since pandas 2.0 `get_dummies` returns bool by
#   default, which would make the later `select_dtypes(... 'uint8')` split in
#   the scaling step treat the indicators as ordinary numeric features.
# * Indicators that already exist in `clean` are skipped so that re-running
#   this cell does not append duplicate columns.
# * All pieces are concatenated once instead of copying the growing frame on
#   every loop iteration.
dummy_frames = []
for col in categs:
    ones = pd.get_dummies(clean[col], prefix=col, dtype=np.uint8)
    dummy_frames.append(ones.loc[:, ~ones.columns.isin(clean.columns)])
clean = pd.concat([clean] + dummy_frames, axis=1)
print(clean.shape)

(1460, 315)


In [11]:
# Keep only the non-object columns, i.e. drop the raw text columns now that
# their one-hot indicators have been appended.
clean_ints = clean.select_dtypes(exclude=['object'])
print(clean_ints.shape)

(1460, 276)


***
***
## Normalise Data

In [12]:
from sklearn.preprocessing import StandardScaler

# Split the frame into one-hot indicator columns (left as 0/1) and the
# remaining numeric columns (standardised). Indicators are uint8 on older
# pandas and bool on pandas >= 2.0, so both dtypes are treated as indicators;
# keying on uint8 alone would silently standardise the dummies on newer pandas.
indicator_dtypes = ['uint8', 'bool']
numeric_part = clean_ints.select_dtypes(exclude=indicator_dtypes)
dummy_part = clean_ints.select_dtypes(include=indicator_dtypes)

# NOTE: SalePrice has not been removed from `clean_ints`, so the target is
# standardised together with the features.
scaler = StandardScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(numeric_part),
                         columns=numeric_part.columns,
                         # keep the original row labels so the column-wise
                         # concat below aligns row-for-row even if the index
                         # is not a default RangeIndex
                         index=numeric_part.index)
print(scaled_df.head())
final = pd.concat([scaled_df, dummy_part],
                  axis=1)
print(final.head())
print(final.shape)

   MSSubClass  LotFrontage   LotArea  OverallQual  OverallCond  YearBuilt  \
0    0.073375    -0.229372 -0.207142     0.651479    -0.517200   1.050994   
1   -0.872563     0.451936 -0.091886    -0.071836     2.179628   0.156734   
2    0.073375    -0.093110  0.073480     0.651479    -0.517200   0.984752   
3    0.309859    -0.456474 -0.096897     0.651479    -0.517200  -1.863632   
4    0.073375     0.633618  0.375148     1.374795    -0.517200   0.951632   

   YearRemodAdd  MasVnrArea  BsmtFinSF1  BsmtFinSF2  ...  WoodDeckSF  \
0      0.878668    0.511418    0.575425   -0.288653  ...   -0.752176   
1     -0.429577   -0.574410    1.171992   -0.288653  ...    1.626195   
2      0.830215    0.323060    0.092907   -0.288653  ...   -0.752176   
3     -0.720298   -0.574410   -0.499274   -0.288653  ...   -0.752176   
4      0.733308    1.364570    0.463568   -0.288653  ...    0.780197   

   OpenPorchSF  EnclosedPorch  3SsnPorch  ScreenPorch  PoolArea   MiscVal  \
0     0.216503      -0.3593

**Preprocessing is done. Now we can focus on the neural network.**
***
***
## Write Data to Disk

In [13]:
"""
Saving data to disk
"""
# Persist both frames next to the input data. The directory comes from the
# `path` variable set when the CSV was loaded, so inputs and outputs cannot
# drift apart if the data location changes.
#   scaled_final.pkl - standardised numeric columns + one-hot indicators
#   clean.pkl        - imputed frame, still containing the raw text columns
final.to_pickle(path + 'scaled_final.pkl')
clean.to_pickle(path + 'clean.pkl')