In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 

%matplotlib inline

In [2]:
housing = pd.read_csv('./data/train.csv')

## Deal With Missing Values

In [3]:
housing.isnull().sum().sort_values(ascending=False)

PoolQC         1453
MiscFeature    1406
Alley          1369
Fence          1179
FireplaceQu     690
               ... 
ExterQual         0
Exterior2nd       0
Exterior1st       0
RoofMatl          0
SalePrice         0
Length: 81, dtype: int64

What should we do with all this missing data?

- Get rid of the corresponding houses.
-  Get rid of the whole attribute or remove the whole column.
- Set the missing values to some value (zero, the mean, the median, etc.).


In [4]:
# Imputing Missing Values
housing_processed = housing

# Categorical columns:
cat_cols_fill_none = ['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
                     'GarageCond', 'GarageQual', 'GarageFinish', 'GarageType',
                     'BsmtFinType2', 'BsmtExposure', 'BsmtFinType1', 'BsmtQual', 'BsmtCond',
                     'MasVnrType']

# Replace missing values for categorical columns with None
for cat in cat_cols_fill_none:
    housing_processed[cat] = housing_processed[cat].fillna("None")
    
# Group by neighborhood and fill in missing value by the median LotFrontage of all the neighborhood
housing_processed['LotFrontage'] = housing_processed.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))    

# Garage: GarageYrBlt, GarageArea and GarageCars these are numerical columns, replace with zero
for col in ['GarageYrBlt', 'GarageArea', 'GarageCars']:
    housing_processed[col] = housing_processed[col].fillna(int(0))
    
# MasVnrArea : replace with zero
housing_processed['MasVnrArea'] = housing_processed['MasVnrArea'].fillna(int(0))

# Use the mode value 
housing_processed['Electrical'] = housing_processed['Electrical'].fillna(housing_processed['Electrical']).mode()[0]

# There is no need of Utilities so let's just drop this column
housing_processed = housing_processed.drop(['Utilities'], axis=1)

# Get the count again to verify that we do not have any more missing values
housing_processed.isnull().apply(sum).max()

0

## Deal With Outliers

In [5]:
num_attributes = housing_processed.select_dtypes(exclude='object')

high_quant = housing_processed.quantile(.999)

for i in num_attributes.columns:
    housing_processed = housing_processed.drop(housing_processed[i][housing_processed[i]>high_quant[i]].index)

housing_processed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1421 entries, 0 to 1457
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1421 non-null   int64  
 1   MSSubClass     1421 non-null   int64  
 2   MSZoning       1421 non-null   object 
 3   LotFrontage    1421 non-null   float64
 4   LotArea        1421 non-null   int64  
 5   Street         1421 non-null   object 
 6   Alley          1421 non-null   object 
 7   LotShape       1421 non-null   object 
 8   LandContour    1421 non-null   object 
 9   LotConfig      1421 non-null   object 
 10  LandSlope      1421 non-null   object 
 11  Neighborhood   1421 non-null   object 
 12  Condition1     1421 non-null   object 
 13  Condition2     1421 non-null   object 
 14  BldgType       1421 non-null   object 
 15  HouseStyle     1421 non-null   object 
 16  OverallQual    1421 non-null   int64  
 17  OverallCond    1421 non-null   int64  
 18  YearBuil

## Deal With Correlated Attributes

In [6]:
#### Remove highly correlated features
# Remove attributes that were identified for excluding when viewing scatter plots & corr values
attributes_drop = ['MiscVal', 'MoSold', 'YrSold', 'BsmtFinSF2','BsmtHalfBath','MSSubClass',
                   'GarageArea', 'GarageYrBlt', '3SsnPorch']

housing_processed = housing_processed.drop(attributes_drop, axis=1)

## Handle Text And Categorical Attributes

In [7]:
#### Transforming Cat variables
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder()
housing_processed_1hot = cat_encoder.fit_transform(housing_processed)
housing_processed_1hot

<1421x8750 sparse matrix of type '<class 'numpy.float64'>'
	with 100891 stored elements in Compressed Sparse Row format>

## Feature Scaling

- Min-max scaling (also known as normalization) `MinMaxScaler `
- Standardization `StandardScaler`