In [1]:
import pandas as pd
import numpy as np
import collections

from sklearn import preprocessing
from sklearn.impute import KNNImputer

In [2]:
# Read the training and test tables from the working directory.
train_data, test_data = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)
# Last expression of the cell: preview the first rows of the training frame.
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# train_data.describe().T

In [4]:
# Check for nulls and data types
# train_data.info()

In [5]:
# check the shape
# train_data.shape

## Null value imputation

In [6]:
# check for nulls
# One boolean per column: True when the column holds at least one missing value.
null_flags = train_data.isnull().any()
for col in train_data.columns:
    if not null_flags[col]:
        continue
    print(f"Found null in : {col}")


Found null in : LotFrontage
Found null in : Alley
Found null in : MasVnrType
Found null in : MasVnrArea
Found null in : BsmtQual
Found null in : BsmtCond
Found null in : BsmtExposure
Found null in : BsmtFinType1
Found null in : BsmtFinType2
Found null in : Electrical
Found null in : FireplaceQu
Found null in : GarageType
Found null in : GarageYrBlt
Found null in : GarageFinish
Found null in : GarageQual
Found null in : GarageCond
Found null in : PoolQC
Found null in : Fence
Found null in : MiscFeature


In [7]:
# find and fill missing values in GarageYrBlt
# Where GarageYrBlt is missing, fall back to the year the house itself was built
# (presumably these are houses without a garage -- TODO confirm).
# fillna aligns on the row index, so the fill no longer relies on the row label
# being exactly Id - 1.
# The test frame receives the same fill so that both frames are prepared
# identically before GarageYrBlt is binned further down.
for frame in (train_data, test_data):
    frame['GarageYrBlt'] = frame['GarageYrBlt'].fillna(frame['YearBuilt'])

In [8]:
# Numeric columns whose gaps are filled by num_imputer.
na_imputed_num_features = ['LotFrontage', 'MasVnrArea']

# Categorical columns whose gaps are replaced with the literal 'unknown'.
na_imputed_cat_features = [
    'Alley', 'MasVnrType',
    'BsmtCond', 'BsmtQual', 'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1',
    'Electrical', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]

# Year-valued columns that are replaced by quartile labels.
binned_features = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']

# impute numerics with knn
def num_imputer(train_df,test_df,columns):
    """Fill gaps in numeric columns with a KNN imputer fitted on the training frame.

    A single imputer is fitted on all ``columns`` together. Fitting a separate
    imputer on each column in isolation gives the KNN step no other feature to
    measure distances on, so every gap would simply receive that column's
    training mean. With only one column in ``columns`` that fallback still
    applies.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame. The imputer is fitted on its ``columns``; those columns
        are overwritten in place.
    test_df : pandas.DataFrame
        Test frame. Transformed with the imputer fitted on ``train_df``; its
        ``columns`` are overwritten in place.
    columns : list of str
        Names of the numeric columns to impute. An empty list is a no-op.

    Returns
    -------
    tuple
        ``(train_df, test_df)`` -- the same objects that were passed in.

    Raises
    ------
    ValueError
        If the imputer returns fewer columns than requested (it drops columns
        that have no observed value in the training frame), because the
        results could then no longer be matched to column names.
    """
    columns = list(columns)
    if not columns:
        # Nothing to impute; fitting on an empty selection would raise.
        return train_df, test_df

    imputer = KNNImputer(n_neighbors=5)
    imputer.fit(train_df[columns])
    train_filled = imputer.transform(train_df[columns])
    test_filled = imputer.transform(test_df[columns])

    if train_filled.shape[1] != len(columns):
        raise ValueError(
            "KNNImputer dropped at least one of the requested columns "
            "(a column without any observed training value?)"
        )

    for position, col in enumerate(columns):
        # The cast truncates the fractional part of imputed values.
        train_df[col] = train_filled[:, position].astype('int')
        test_df[col] = test_filled[:, position].astype('int')
    return train_df, test_df

# impute cats with unknown
def cat_imputer(train_df,test_df,columns):
    """Replace missing entries of the given columns with the string 'unknown'.

    Both frames are modified in place and handed back as the tuple
    ``(train_df, test_df)``.
    """
    for name in columns:
        for frame in (train_df, test_df):
            frame[name] = frame[name].fillna('unknown')
    return train_df, test_df
    
# bin the year feature
def bin_features(df,columns,reference_df=None):
    """Replace each listed column with its quartile label (1 = lowest, 4 = highest).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are overwritten in place with categorical labels.
    columns : iterable of str
        Names of the columns to bin.
    reference_df : pandas.DataFrame, optional
        Frame the quartile edges are computed from. When omitted, the edges come
        from ``df`` itself. Pass the raw training values when binning another
        frame, so that a given value receives the same label in both frames.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    ValueError
        Propagated from ``pandas.qcut`` when the quartile edges are not unique.
    """
    quantiles = [0, .25, .5, .75, 1.]
    labels = [1, 2, 3, 4]
    for ft in columns:
        if reference_df is None:
            df[ft] = pd.qcut(df[ft], q=quantiles, labels=labels)
            continue
        _, edges = pd.qcut(reference_df[ft], q=quantiles, labels=labels, retbins=True)
        # Open the outermost edges so values outside the reference range still
        # fall into the first or last bin instead of becoming NaN.
        edges = edges.astype('float64')
        edges[0], edges[-1] = -np.inf, np.inf
        df[ft] = pd.cut(df[ft], bins=edges, labels=labels)
    return df


train_data,test_data=num_imputer(train_data, test_data,na_imputed_num_features)
train_data,test_data=cat_imputer(train_data, test_data,na_imputed_cat_features)
# bin_features overwrites the year columns with labels, so keep the raw training
# values first: the test frame must be cut on the training quartiles, not its own.
train_year_values=train_data[binned_features].copy()
train_data=bin_features(train_data,binned_features)
test_data=bin_features(test_data,binned_features,reference_df=train_year_values)

In [9]:
# check for nulls
# One boolean per column: True when the column still holds a missing value.
remaining_null_flags = train_data.isnull().any()
for col in train_data.columns:
    if not remaining_null_flags[col]:
        continue
    print(f"Found null in : {col}")

In [10]:
# Preview the training frame after the imputation and binning cell above.
train_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65,8450,Pave,unknown,Reg,Lvl,AllPub,...,0,unknown,unknown,unknown,0,2,2,WD,Normal,208500
1,2,20,RL,80,9600,Pave,unknown,Reg,Lvl,AllPub,...,0,unknown,unknown,unknown,0,5,1,WD,Normal,181500
2,3,60,RL,68,11250,Pave,unknown,IR1,Lvl,AllPub,...,0,unknown,unknown,unknown,0,9,2,WD,Normal,223500
3,4,70,RL,60,9550,Pave,unknown,IR1,Lvl,AllPub,...,0,unknown,unknown,unknown,0,2,1,WD,Abnorml,140000
4,5,60,RL,84,14260,Pave,unknown,IR1,Lvl,AllPub,...,0,unknown,unknown,unknown,0,12,2,WD,Normal,250000


### Feature Encoding 

In [10]:
# 
def ordinal_encoder(df,columns):
    """Overwrite each listed column with integer codes from an OrdinalEncoder.

    A fresh encoder is fitted per column on ``df`` itself. ``df`` is modified
    in place and returned.
    """
    for name in columns:
        single_column = df[[name]]
        codes = preprocessing.OrdinalEncoder().fit(single_column).transform(single_column)
        # The encoder yields floats; store them first, then cast the column to int.
        df[name] = codes
        df[name] = df[name].astype('int')
    return df



# ordinal_encoded_columns=['MSSubClass']
# train_data=ordinal_encoder(train_data,ordinal_encoded_columns)

def label_encoder(df,columns):
    """Overwrite each listed column with integer category codes.

    The encoding is the same per-column ``OrdinalEncoder`` fit/transform that
    ``ordinal_encoder`` performs, so this function delegates to it instead of
    keeping a second copy of the same loop.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``columns`` are overwritten in place.
    columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    return ordinal_encoder(df, columns)

# label_encoded_columns=['MSSubClass','MSZoning','Street','Alley','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle']
# Categorical columns that are turned into integer codes.
label_encoded_columns = [
    'Alley', 'MasVnrType',
    'BsmtCond', 'BsmtQual', 'BsmtExposure', 'BsmtFinType2', 'BsmtFinType1',
    'Electrical', 'FireplaceQu',
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC', 'Fence', 'MiscFeature',
]
# NOTE(review): only train_data is encoded here; no visible cell encodes test_data.
train_data = label_encoder(train_data, label_encoded_columns)


train_data.head()



Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd
0,5,3,65,8450,1,2,3,3,0,4,0,5,2,2,0,5,7,5,4,3
1,0,3,80,9600,1,2,3,3,0,2,0,24,1,2,0,2,6,8,3,2
2,5,3,68,11250,1,2,0,3,0,4,0,5,2,2,0,5,7,5,4,3
3,6,3,60,9550,1,2,0,3,0,0,0,6,2,2,0,5,7,5,1,2
4,5,3,84,14260,1,2,0,3,0,2,0,15,2,2,0,5,8,5,3,3


In [96]:
# # check the train test data consistency
# def train_test_distribution_check(train_data,test_data,columns):
#     print("-"*70)
#     print("| Checking for value consistency in train and test |")
#     print("-"*70)
#     for col in columns:
#         # check if the data type is same
#         if train_data[col].dtype==test_data[col].dtype:
#             print(f"{col} - data type matched!")
#             pass
#         else:
#             print(f"{col} - data type mismatch!")
#         # check if the unique values in train and test are same
#         train_feature_values=list(train_data[col].unique())
#         test_feature_values=list(test_data[col].unique())
#         if collections.Counter(train_feature_values)==collections.Counter(test_feature_values):
#             print(f"{col} - feature values matched!")
#         else:
#             print(f"{col} - unseen feature values found in test")          
#             train_feature_values=list(train_data[col].unique())
#             test_feature_values=list(test_data[col].unique())
#             for val in test_feature_values:
#                 if val not in train_feature_values:
#                     print(val)    
#         print("-"*10)
#     return


# categorical_columns=['MSSubClass','MSZoning','Street']
# train_test_distribution_check(train_data,test_data,categorical_columns)

In [111]:

# Presumably the columns to standard-scale later; no scaling happens in the visible cells.
standard_scaled_features = ['LotFrontage', 'LotArea']
# Inspect the distinct lot areas in the training frame.
train_data['LotArea'].unique()

array([ 8450,  9600, 11250, ..., 17217, 13175,  9717])

In [106]:

# Preview the training frame.
# NOTE(review): the saved output below shows float LotFrontage and empty Alley
# values, although the imputation cell fills both; together with the
# out-of-order execution counts this output appears to come from a different
# kernel state -- re-run with Restart & Run All to refresh it.
train_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd
0,5,3,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,4,3
1,0,3,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,3,2
2,5,3,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,4,3
3,6,3,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1,2
4,5,3,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,3,3


In [79]:
# Print the distinct MSSubClass values in ascending order.
# NOTE(review): the saved output shows codes 0-14, which implies MSSubClass was
# ordinal-encoded, but the only visible encoding call for it is commented out.
tmp = sorted(train_data.MSSubClass.unique())
print(tmp)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]


In [None]:
**Observations**

* Only 1460 data points (small dataset)
* 80 features (needs feature engineering)
* No nulls remain after imputation (the raw training data had nulls in 19 columns)

**Features**

* MSSubClass - not ordinal - label encoding
* MSZoning - not ordinal - label encoding

In [None]:
# create lists of categorical and numerical columns
# categorical_columns=[]
# numerical_columns=[]
# for col in train_data.columns:
#     if train_data[col].dtype == 'int64' or train_data[col].dtype == 'float64':
#         numerical_columns.append(col)
#     elif train_data[col].dtype == 'object':
#         categorical_columns.append(col)