# Clean the data

**First, import packages for data cleaning and read the data**

In [1]:
from scipy.stats.mstats import mode
import pandas as pd
import numpy as np
import time
from sklearn.preprocessing import LabelEncoder

"""
Read Data
"""
# Load the two splits from disk.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Separate the target column from the training features and remember
# how many training rows there are.
target = train['SalePrice']
train = train.drop('SalePrice', axis=1)
trainlen = len(train)

**Combine the train and test set for cleaning**

In [2]:
# Stack the first five rows of each split to preview the combined layout.
df1, df2 = train.head(), test.head()
pd.concat([df1, df2], ignore_index=True)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,2,2008,WD,Normal
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,5,2007,WD,Normal
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,9,2008,WD,Normal
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,2,2006,WD,Abnorml
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,12,2008,WD,Normal
5,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
6,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
7,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
8,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
9,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [3]:
# Stack train on top of test with a fresh 0..n-1 index, then discard the
# 'Id' and 'Utilities' columns.
alldata = (
    pd.concat([train, test], ignore_index=True)
    .drop(['Id', 'Utilities'], axis=1)
)
alldata.dtypes

MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
Alley             object
LotShape          object
LandContour       object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
OverallQual        int64
OverallCond        int64
YearBuilt          int64
YearRemodAdd       int64
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
MasVnrArea       float64
ExterQual         object
ExterCond         object
Foundation        object
BsmtQual          object
BsmtCond          object
                  ...   
HalfBath           int64
BedroomAbvGr       int64
KitchenAbvGr       int64
KitchenQual       object
TotRmsAbvGrd       int64
Functional        object
Fireplaces         int64
FireplaceQu       object
GarageType        object


**Convert all int columns except `MSSubClass` to floats for XGBoost**

In [4]:
# Cast every int64 column except 'MSSubClass' to float64.
# `.ix` was removed in pandas 1.0. Assigning through `alldata[cols] = ...`
# replaces the columns, so the float64 dtype is kept (an in-place
# `.loc[:, cols] = ...` write can retain the old int64 dtype in recent pandas).
int_cols = alldata.columns[(alldata.dtypes == 'int64') & (alldata.columns != 'MSSubClass')]
alldata[int_cols] = alldata[int_cols].astype('float64')

In [5]:
# Preview the first 20 rows of the combined frame.
alldata.head(20)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450.0,Pave,,Reg,Lvl,Inside,Gtl,...,0.0,0.0,,,,0.0,2.0,2008.0,WD,Normal
1,20,RL,80.0,9600.0,Pave,,Reg,Lvl,FR2,Gtl,...,0.0,0.0,,,,0.0,5.0,2007.0,WD,Normal
2,60,RL,68.0,11250.0,Pave,,IR1,Lvl,Inside,Gtl,...,0.0,0.0,,,,0.0,9.0,2008.0,WD,Normal
3,70,RL,60.0,9550.0,Pave,,IR1,Lvl,Corner,Gtl,...,0.0,0.0,,,,0.0,2.0,2006.0,WD,Abnorml
4,60,RL,84.0,14260.0,Pave,,IR1,Lvl,FR2,Gtl,...,0.0,0.0,,,,0.0,12.0,2008.0,WD,Normal
5,50,RL,85.0,14115.0,Pave,,IR1,Lvl,Inside,Gtl,...,0.0,0.0,,MnPrv,Shed,700.0,10.0,2009.0,WD,Normal
6,20,RL,75.0,10084.0,Pave,,Reg,Lvl,Inside,Gtl,...,0.0,0.0,,,,0.0,8.0,2007.0,WD,Normal
7,60,RL,,10382.0,Pave,,IR1,Lvl,Corner,Gtl,...,0.0,0.0,,,Shed,350.0,11.0,2009.0,WD,Normal
8,50,RM,51.0,6120.0,Pave,,Reg,Lvl,Inside,Gtl,...,0.0,0.0,,,,0.0,4.0,2008.0,WD,Abnorml
9,190,RL,50.0,7420.0,Pave,,Reg,Lvl,Corner,Gtl,...,0.0,0.0,,,,0.0,1.0,2008.0,WD,Normal


**Save lightly prepared data (no encoding)**

In [6]:

# Split the combined frame back into its two parts by position: the first
# `trainlen` rows are the training rows, the remainder are the test rows.
# (`.ix` was removed in pandas 1.0; `.iloc` makes the positional intent explicit.)
train = alldata.iloc[:trainlen, :]
test = alldata.iloc[trainlen:, :]

# Write both parts out before any imputation or encoding is applied.
test.to_csv('data/test_prepared_light.csv')
train.to_csv('data/train_prepared_light.csv')

**Handle the NA values: following the txt data descriptions, some variables are filled with 0 and others with the column median**

In [6]:
# Columns whose missing values are replaced by the column median.
fMedlist = ['LotFrontage']
# Columns whose missing values are replaced by 0.
fArealist = ['MasVnrArea', 'TotalBsmtSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
             'BsmtFullBath', 'BsmtHalfBath', 'Fireplaces', 'GarageArea',
             'GarageYrBlt', 'GarageCars']

# `.ix` was removed in pandas 1.0, so each column is filled with `fillna`
# and assigned back by name.
for i in fArealist:
    alldata[i] = alldata[i].fillna(0)

for i in fMedlist:
    # Series.median() ignores NaN by default, so this is the median of the
    # observed values only.
    alldata[i] = alldata[i].fillna(alldata[i].median())

**Transforming Data**
Use integers to encode categorical data.

In [7]:
# Preview the first 20 rows after the missing-value fill.
alldata.head(20)

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,RL,65.0,8450.0,Pave,,Reg,Lvl,Inside,Gtl,...,0.0,0.0,,,,0.0,2.0,2008.0,WD,Normal
1,20,RL,80.0,9600.0,Pave,,Reg,Lvl,FR2,Gtl,...,0.0,0.0,,,,0.0,5.0,2007.0,WD,Normal
2,60,RL,68.0,11250.0,Pave,,IR1,Lvl,Inside,Gtl,...,0.0,0.0,,,,0.0,9.0,2008.0,WD,Normal
3,70,RL,60.0,9550.0,Pave,,IR1,Lvl,Corner,Gtl,...,0.0,0.0,,,,0.0,2.0,2006.0,WD,Abnorml
4,60,RL,84.0,14260.0,Pave,,IR1,Lvl,FR2,Gtl,...,0.0,0.0,,,,0.0,12.0,2008.0,WD,Normal
5,50,RL,85.0,14115.0,Pave,,IR1,Lvl,Inside,Gtl,...,0.0,0.0,,MnPrv,Shed,700.0,10.0,2009.0,WD,Normal
6,20,RL,75.0,10084.0,Pave,,Reg,Lvl,Inside,Gtl,...,0.0,0.0,,,,0.0,8.0,2007.0,WD,Normal
7,60,RL,68.0,10382.0,Pave,,IR1,Lvl,Corner,Gtl,...,0.0,0.0,,,Shed,350.0,11.0,2009.0,WD,Normal
8,50,RM,51.0,6120.0,Pave,,Reg,Lvl,Inside,Gtl,...,0.0,0.0,,,,0.0,4.0,2008.0,WD,Abnorml
9,190,RL,50.0,7420.0,Pave,,Reg,Lvl,Corner,Gtl,...,0.0,0.0,,,,0.0,1.0,2008.0,WD,Normal


**Encode categorical data**

In [8]:
le = LabelEncoder()

# Columns treated as categorical: object columns plus any column still stored
# as int64. `nacount_category` is the subset that contains missing values.
is_categorical = (alldata.dtypes == 'int64') | (alldata.dtypes == 'object')
nacount_category = np.array(alldata.columns[is_categorical & (pd.isnull(alldata).sum() > 0)])
category = np.array(alldata.columns[is_categorical])

Bsmtset = set(['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'])
MasVnrset = set(['MasVnrType'])
Garageset = set(['GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond'])
Fireplaceset = set(['FireplaceQu'])
Poolset = set(['PoolQC'])
NAset = set(['Fence', 'MiscFeature', 'Alley'])

# Map each descriptive column to the numeric column that is checked for 0
# when deciding whether a missing value becomes 'Empty'.
indicator_by_set = (
    (Bsmtset, 'TotalBsmtSF'),
    (MasVnrset, 'MasVnrArea'),
    (Garageset, 'GarageArea'),
    (Fireplaceset, 'Fireplaces'),
    (Poolset, 'PoolArea'),
)
indicator_for = {}
for member_cols, indicator in indicator_by_set:
    for member in member_cols:
        indicator_for[member] = indicator

# Put 0 and null values in the same category
for i in nacount_category:
    if i in NAset:
        # For these columns every missing value becomes 'Empty'.
        alldata.loc[alldata[i].isnull(), i] = 'Empty'
        continue

    if i in indicator_for:
        # Missing value and the paired numeric column is 0 -> 'Empty'.
        absent = alldata[i].isnull() & (alldata[indicator_for[i]] == 0)
        alldata.loc[absent, i] = 'Empty'

    # Any value still missing gets the most frequent value of the column.
    # NOTE(review): the mode is computed after the 'Empty' fill, so 'Empty'
    # itself can be chosen here when it is the most common value - confirm
    # this is intended.
    alldata.loc[alldata[i].isnull(), i] = alldata[i].value_counts().index[0]

# Replace every categorical column by integer codes. Assigning with
# `alldata[i] = ...` swaps in the new column, so the integer dtype produced by
# the encoder is what gets stored. (`.ix` was removed in pandas 1.0.)
for i in category:
    alldata[i] = le.fit_transform(alldata[i])

# Split back by position: first `trainlen` rows are train, the rest are test.
train = alldata.iloc[:trainlen, :]
test = alldata.iloc[trainlen:, :]

In [9]:
# Preview the first rows of the encoded frame.
alldata.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,5,3,65.0,8450.0,1,0,3,3,4,0,...,0.0,0.0,0,0,0,0.0,2.0,2008.0,8,4
1,0,3,80.0,9600.0,1,0,3,3,2,0,...,0.0,0.0,0,0,0,0.0,5.0,2007.0,8,4
2,5,3,68.0,11250.0,1,0,0,3,4,0,...,0.0,0.0,0,0,0,0.0,9.0,2008.0,8,4
3,6,3,60.0,9550.0,1,0,0,3,0,0,...,0.0,0.0,0,0,0,0.0,2.0,2006.0,8,0
4,5,3,84.0,14260.0,1,0,0,3,2,0,...,0.0,0.0,0,0,0,0.0,12.0,2008.0,8,4


**Export data**

In [10]:
# Write the encoded train and test splits to disk (row index included).
for prepared, out_path in ((train, 'data/train_prepared.csv'),
                           (test, 'data/test_prepared.csv')):
    prepared.to_csv(out_path)

In [11]:
# Preview the first rows of the training split.
train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,LotConfig,LandSlope,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,5,3,65.0,8450.0,1,0,3,3,4,0,...,0.0,0.0,0,0,0,0.0,2.0,2008.0,8,4
1,0,3,80.0,9600.0,1,0,3,3,2,0,...,0.0,0.0,0,0,0,0.0,5.0,2007.0,8,4
2,5,3,68.0,11250.0,1,0,0,3,4,0,...,0.0,0.0,0,0,0,0.0,9.0,2008.0,8,4
3,6,3,60.0,9550.0,1,0,0,3,0,0,...,0.0,0.0,0,0,0,0.0,2.0,2006.0,8,0
4,5,3,84.0,14260.0,1,0,0,3,2,0,...,0.0,0.0,0,0,0,0.0,12.0,2008.0,8,4


In [21]:
# Write the target values as a single labelled column without the row index.
# `header` is documented as a bool or a list of column aliases; a bare string
# only works by being truthy, so the label is passed as a one-element list.
target.to_csv('data/train_target.csv', header=['SalePrice'], index=False)