# Data Preprocessing

In [17]:
# Loading Libraries
import pandas as pd
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder

### 1. Importing Data

In [8]:
data = pd.read_csv('data/train.csv')

In [16]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [15]:
#Selecting Categorical Columns
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)
data_categorical = data[categorical_columns]
data_categorical.head()

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


According to the type of information contained in each columns, we will divide the categorical variables into three different lists:

**ordinal:** variables which categories can be transformed into an ordinal sequence of numbers
**binary:** variables that have only two possible categories
**other:** Other tye of columns that cannot be encoded as ordinal or binary

In [None]:
ordinal = ['LotShape', 'Utilities', 'LandSlope', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'PavedDrive' ]
binary = ['Street', 'CentralAir']
other = ['MSSubClass', 'MSZoning', 'Alley', 'LandContour', 'LotConfig', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation', 'Heating', 'Electrical', 'GarageType', 'MiscFeature', 'SaleType', 'SaleCondition']

In [21]:
#Converting ordinal variables into numerical

LotShape = {'Reg': 4, 'IR1': 3, 'IR2': 2, 'IR3': 1}
Utilities = {'AllPub': 4, 'NoSewr': 3, 'NoSeWa': 2, 'ELO': 1}
LandSlope = {'Gtl': 1, 'Mod': 2, 'Sev': 3}
ExterQual = {'Ex': 5, 'Gd': 4, 'Ta': 3, 'Fa': 2, 'Po': 1}
ExterCond = {'Ex': 5, 'Gd': 4, 'Ta': 3, 'Fa': 2, 'Po': 1}
BsmtQual = {'Ex': 6, 'Gd': 5, 'Ta': 4, 'Fa': 3, 'Po': 1, 'NA': 0}
BsmtCond = {'Ex': 6, 'Gd': 5, 'Ta': 4, 'Fa': 3, 'Po': 1, 'NA': 0}
BsmtExposure = {'Ex': 5, 'Gd': 4, 'Ta': 3, 'Fa': 2, 'Po': 1}
BsmtFinType1 = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'Lwq': 2, 'Unf': 1, 'Na':0}
BsmtFinType2 = {'GLQ': 6, 'ALQ': 5, 'BLQ': 4, 'Rec': 3, 'Lwq': 2, 'Unf': 1, 'Na':0}
HeatingQC = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
KitchenQual = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1}
Functional = {'Typ': 8, 'Min1': 7, 'Min2': 6, 'Mod': 5, 'Maj1': 4, 'Maj2': 3, 'Sev': 2, 'Sal': 1}
FireplaceQu = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}
GarageFinish = {'Fin': 3, 'RFn': 2, 'Unf': 1, 'NA': 0}
GarageQual = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}
GarageCond = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'NA': 0}
PoolQC = {'Ex': 4, 'Gd': 3, 'TA': 2, 'Fa': 1, 'NA': 0}
Fence = {'GdPrv': 4, 'MnPrv': 3, 'GdWo': 2, 'MnWw': 1, 'NA': 0}

data['LotShape'] = data['LotShape'].map(LotShape)
data['Utilities'] = data['Utilities'].map(Utilities)
data['LandSlope'] = data['LandSlope'].map(LandSlope)
data['ExterQual'] = data['ExterQual'].map(ExterQual)
data['ExterCond'] = data['ExterCond'].map(ExterCond)
data['BsmtQual'] = data['BsmtQual'].map(BsmtQual)
data['BsmtCond'] = data['BsmtCond'].map(BsmtCond)
data['BsmtExposure'] = data['BsmtExposure'].map(BsmtExposure)
data['BsmtFinType1'] = data['BsmtFinType1'].map(BsmtFinType1)
data['BsmtFinType2'] = data['BsmtFinType2'].map(BsmtFinType2)
data['HeatingQC'] = data['HeatingQC'].map(HeatingQC)
data['KitchenQual'] = data['KitchenQual'].map(KitchenQual)
data['Functional'] = data['Functional'].map(Functional)
data['FireplaceQu'] = data['FireplaceQu'].map(FireplaceQu)
data['GarageFinish'] = data['GarageFinish'].map(GarageFinish)
data['GarageQual'] = data['GarageQual'].map(GarageQual)
data['GarageCond'] = data['GarageCond'].map(GarageCond)
data['PoolQC'] = data['PoolQC'].map(PoolQC)
data['Fence'] = data['Fence'].map(Fence)

In [23]:
#Converting binary variables into numerical

Street = {'Grvl': 0, 'Pave': 1}
CentralAir = {'N': 0, 'Y': 1}

data['Street'] = data['Street'].map(Street)
data['CentralAir'] = data['CentralAir'].map(CentralAir)

In [24]:
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,1,,4,Lvl,4,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,1,,4,Lvl,4,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,1,,3,Lvl,4,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,1,,3,Lvl,4,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,1,,3,Lvl,4,...,0,,,,0,12,2008,WD,Normal,250000
