# Feature Engineering with Open-Source

This is a notebook in which I replace all of the hand-made transformations with the equivalent open-source APIs.

## Reproducibility: Setting seed

Even in this scenario, setting a seed for reproducibility is very important.

In [1]:
# Import libraries

#Data manipulation
import pandas as pd
import numpy as np

#Data visualization
import matplotlib.pyplot as plt

# Saving the pipeline
import joblib

# From Scikit-learn
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, Binarizer

# From feature-engine
from feature_engine.imputation import (AddMissingIndicator, MeanMedianImputer, CategoricalImputer)
from feature_engine.encoding import (RareLabelEncoder, OrdinalEncoder)
from feature_engine.transformation import (LogTransformer, YeoJohnsonTransformer)
from feature_engine.selection import DropFeatures
from feature_engine.wrappers import SklearnTransformerWrapper

# Show every column when a wide DataFrame is displayed.
# Call set_option on the public `pd` namespace directly instead of going
# through the incidental `pd.pandas` attribute.
pd.set_option('display.max_columns', None)


## Importing Data

In [2]:
# Read the raw training data
data = pd.read_csv('../data/train.csv')

# Report (rows, columns) before any column is removed
print(data.shape)

# Discard the Id column
data = data.drop(columns='Id')

# Preview the first 5 rows
data.head()

(1460, 81)


Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


## Separate data into train and test

In [3]:
# Separate predictors from the target, then hold out 10% of the rows for
# testing; random_state is fixed so the split is reproducible.
features = data.drop(columns='SalePrice')
target = data['SalePrice']

X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.1,
    random_state=0,
)

X_train.shape, X_test.shape

((1314, 79), (146, 79))

## Feature Engineering

Apply Log to target

In [4]:
# Natural log of the target for both splits.
# NOTE: re-running this cell applies the log a second time.
y_train, y_test = np.log(y_train), np.log(y_test)

## Missing Values 

Categorical Variables

In [5]:
# Object-typed columns are treated as categorical; MSSubClass is appended
# to the list explicitly.
cat_vars = [col for col in data.columns if data[col].dtype == 'O'] + ['MSSubClass']

# Cast every listed column to object in both splits
for frame in (X_train, X_test):
    frame[cat_vars] = frame[cat_vars].astype('O')

# Number of categorical variables
len(cat_vars)



44

In [6]:
# Categorical variables that have at least one missing value in the train set
cat_vars_with_na = [col for col in cat_vars if X_train[col].isna().any()]

# Fraction of missing values per variable, largest first
X_train[cat_vars_with_na].isna().mean().sort_values(ascending=False)



PoolQC          0.995434
MiscFeature     0.961187
Alley           0.938356
Fence           0.814307
MasVnrType      0.601218
FireplaceQu     0.472603
GarageType      0.056317
GarageFinish    0.056317
GarageQual      0.056317
GarageCond      0.056317
BsmtExposure    0.025114
BsmtFinType2    0.025114
BsmtQual        0.024353
BsmtCond        0.024353
BsmtFinType1    0.024353
Electrical      0.000761
dtype: float64

In [7]:
# Fraction of missing values per categorical variable (train set)
na_fraction = X_train[cat_vars_with_na].isnull().mean()

# Variables to impute with the string 'Missing': more than 10% of rows missing
with_string_missing = [var for var in cat_vars_with_na if na_fraction[var] > 0.1]

# Variables to impute with the most frequent category: all the others.
# Defined as the complement so that a variable sitting at exactly 10% is not
# left out of both lists (the previous `> 0.1` / `< 0.1` pair skipped it).
with_frequent_category = [var for var in cat_vars_with_na if var not in with_string_missing]

In [8]:
# Imputer that fills NA with the label 'Missing' for the selected variables
cat_imputer_missing = CategoricalImputer(
    imputation_method='missing',
    variables=with_string_missing,
)

# Learn the imputation values from the train set only
cat_imputer_missing.fit(X_train)

# Inspect the learned variable -> fill-value mapping
cat_imputer_missing.imputer_dict_

{'Alley': 'Missing',
 'MasVnrType': 'Missing',
 'FireplaceQu': 'Missing',
 'PoolQC': 'Missing',
 'Fence': 'Missing',
 'MiscFeature': 'Missing'}

In [9]:
# Apply the fitted 'Missing' imputer to both splits
X_train, X_test = (
    cat_imputer_missing.transform(X_train),
    cat_imputer_missing.transform(X_test),
)

In [10]:
# Imputer that fills NA with the most frequent category of each variable
cat_imputer_frequent = CategoricalImputer(
    imputation_method='frequent',
    variables=with_frequent_category,
)

# Learn the most frequent categories from the train set only
cat_imputer_frequent.fit(X_train)

# Inspect the learned variable -> fill-value mapping
cat_imputer_frequent.imputer_dict_

## This class can be stored in joblib and used in the production environment

{'BsmtQual': 'TA',
 'BsmtCond': 'TA',
 'BsmtExposure': 'No',
 'BsmtFinType1': 'Unf',
 'BsmtFinType2': 'Unf',
 'Electrical': 'SBrkr',
 'GarageType': 'Attchd',
 'GarageFinish': 'Unf',
 'GarageQual': 'TA',
 'GarageCond': 'TA'}

In [11]:
# Replace NA with the most frequent category in both splits
X_train, X_test = (
    cat_imputer_frequent.transform(X_train),
    cat_imputer_frequent.transform(X_test),
)

In [12]:
# Sanity check: count of remaining missing values per imputed variable
remaining_na = X_train[cat_vars_with_na].isna().sum()
remaining_na

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Electrical      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

## Numerical Variables

In [13]:
# Numerical variables: every non-object column, excluding the target name
num_vars = [
    col
    for col in X_train.columns
    if col != 'SalePrice' and X_train[col].dtype != 'O'
]

# Number of numerical variables
len(num_vars)

36

In [14]:
# Numerical variables that have at least one missing value in the train set
num_vars_with_na = [col for col in num_vars if X_train[col].isna().any()]

# Fraction of missing values per variable, largest first
X_train[num_vars_with_na].isna().mean().sort_values(ascending=False)

LotFrontage    0.177321
GarageYrBlt    0.056317
MasVnrArea     0.004566
dtype: float64

In [15]:
# Binary "was missing" flags for the numerical variables with NA
missing_ind = AddMissingIndicator(variables=num_vars_with_na)

# Fit on the train set only
missing_ind.fit(X_train)

# Add the indicator columns to both splits
X_train, X_test = missing_ind.transform(X_train), missing_ind.transform(X_test)

# Inspect the new indicator columns
na_flag_cols = ['LotFrontage_na', 'GarageYrBlt_na', 'MasVnrArea_na']
X_train[na_flag_cols].head()

Unnamed: 0,LotFrontage_na,GarageYrBlt_na,MasVnrArea_na
930,0,0,0
656,0,0,0
45,0,0,0
1348,1,0,0
55,0,0,0


In [16]:
# Imputer that fills NA in the numerical variables with the mean
mean_imputer = MeanMedianImputer(
    imputation_method='mean',
    variables=num_vars_with_na,
)

# Learn the means from the train set only
mean_imputer.fit(X_train)

# Inspect the learned variable -> mean mapping
mean_imputer.imputer_dict_

{'LotFrontage': 69.87974098057354,
 'MasVnrArea': 103.7974006116208,
 'GarageYrBlt': 1978.2959677419356}

In [17]:
# Apply the fitted mean imputer to both splits
X_train, X_test = mean_imputer.transform(X_train), mean_imputer.transform(X_test)

# Sanity check: count of remaining missing values per imputed variable
X_train[num_vars_with_na].isna().sum()

LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

## Temporal Variables

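Note: the code below computes the elapsed years with a small helper function (`elapsed_years`) rather than feature-engine's `CombineWithReferenceFeature`, and then removes `YrSold` with `DropFeatures`.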

In [18]:
def elapsed_years(df, var):
    """Return a copy of ``df`` where ``var`` holds the years elapsed until the sale.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``YrSold`` column and the year column ``var``.
    var : str
        Name of the year column to replace with ``YrSold - var``.

    Returns
    -------
    pandas.DataFrame
        A new frame; the frame passed in is left unmodified.
    """
    # Work on a copy so the caller's frame is not mutated as a side effect
    df = df.copy()
    # Difference between the year the house was sold and the year variable
    df[var] = df['YrSold'] - df[var]
    return df

In [19]:
# Year columns that are converted to "years elapsed until the sale"
temporal_vars = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']

for var in temporal_vars:
    X_train, X_test = elapsed_years(X_train, var), elapsed_years(X_test, var)

In [20]:
# YrSold is removed from both splits
drop_features = DropFeatures(features_to_drop=['YrSold'])

X_train, X_test = (
    drop_features.fit_transform(X_train),
    drop_features.transform(X_test),
)

## Numerical Variable Transformation

In [21]:
# Log-transform the selected numerical variables
log_vars = ['LotFrontage', '1stFlrSF', 'GrLivArea']
log_transformer = LogTransformer(variables=log_vars)

X_train, X_test = (
    log_transformer.fit_transform(X_train),
    log_transformer.transform(X_test),
)

In [22]:
# Log-transformed variables that contain nulls in the train set (expect none)
[col for col in ('LotFrontage', '1stFlrSF', 'GrLivArea') if X_train[col].isna().any()]

[]

In [23]:
# Log-transformed variables that contain nulls in the test set (expect none)
[col for col in ('LotFrontage', '1stFlrSF', 'GrLivArea') if X_test[col].isna().any()]

[]

## Yeo-Johnson Transformation

In [24]:
# Yeo-Johnson transformation of LotArea; the parameter is learned on train
yeo_johnson_transformer = YeoJohnsonTransformer(variables=['LotArea'])

X_train, X_test = (
    yeo_johnson_transformer.fit_transform(X_train),
    yeo_johnson_transformer.transform(X_test),
)

# Inspect the learned lambda per variable
yeo_johnson_transformer.lambda_dict_

{'LotArea': 0.017755558882009546}

In [25]:
# Yeo-Johnson-transformed variables with nulls in the train set (expect none)
[col for col in ('LotArea',) if X_train[col].isna().any()]

[]

In [26]:
# Sanity check for null values in the test set after the Yeo-Johnson step.
# This must inspect 'LotArea' (the variable just transformed); the previous
# version re-checked the log-transformed variables by copy-paste mistake.
[var for var in ['LotArea'] if X_test[var].isnull().sum() > 0]

[]

## Skewed Variables

In [27]:
# Variables that are binarized: values above 0 become 1, the rest 0
skewed = ['BsmtFinSF2', 'LowQualFinSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'MiscVal']

binarizer = SklearnTransformerWrapper(
    transformer=Binarizer(threshold=0),
    variables=skewed,
)

X_train, X_test = binarizer.fit_transform(X_train), binarizer.transform(X_test)

# Preview the binarized columns
X_train[skewed].head()

Unnamed: 0,BsmtFinSF2,LowQualFinSF,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal
930,0,0,0,0,0,0
656,0,0,0,0,0,0
45,0,0,0,0,0,0
1348,0,0,0,0,0,0
55,0,0,0,1,0,0


## Categorical Variables

Some of these mappings are applied manually.

In [28]:
# Hand-written ordinal mappings: each label is replaced by a number.
# Labels that are not a key of the mapping become NaN (behaviour of Series.map).

# Quality scale
qual_mappings = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5, 'Missing':0, 'NA':0}
qual_vars = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond']

# Basement exposure scale
exposure_mappings = {'No': 1, 'Mn': 2, 'Av': 3, 'Gd': 4, 'Missing':0, 'NA':0}

# Basement finish scale
finish_mappings = {'Missing':0, 'NA':0, 'Unf': 1, 'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
finish_vars = ['BsmtFinType1', 'BsmtFinType2']

# Garage finish scale
garage_mappings = {'Missing':0, 'NA':0, 'Unf': 1, 'RFn': 2, 'Fin': 3}

# Fence scale
fence_mappings = {'Missing':0, 'NA':0, 'MnWw': 1, 'GdWo': 2, 'MnPrv': 3, 'GdPrv': 4}

# (columns, mapping) pairs, applied in the same order to train and test
ordinal_plan = [
    (qual_vars, qual_mappings),
    (['BsmtExposure'], exposure_mappings),
    (finish_vars, finish_mappings),
    (['GarageFinish'], garage_mappings),
    (['Fence'], fence_mappings),
]

for columns, mapping in ordinal_plan:
    for var in columns:
        X_train[var] = X_train[var].map(mapping)
        X_test[var] = X_test[var].map(mapping)

## Removing Rare Labels

In [29]:
# All variables that were mapped to an ordinal scale in the previous cell
qual_vars = qual_vars + finish_vars + ['BsmtExposure', 'GarageFinish', 'Fence']

# Remaining categorical variables: everything in cat_vars not mapped above
cat_others = [var for var in cat_vars if var not in qual_vars]

# Cast the remaining categorical columns to object in both splits.
# (A bare `len(cat_others)` used to sit here; as it was not the last
# expression of the cell it was never displayed, so it has been removed.)
X_train[cat_others] = X_train[cat_others].astype('O')
X_test[cat_others] = X_test[cat_others].astype('O')

In [30]:
# Rare-label encoder for the remaining categorical variables
rare_encoder = RareLabelEncoder(
    tol=0.01,
    n_categories=1,
    variables=cat_others,
)

# Learn the frequent labels from the train set only
rare_encoder.fit(X_train)

# Inspect the labels kept per variable
rare_encoder.encoder_dict_

{'MSZoning': ['RL', 'RM', 'FV', 'RH'],
 'Street': ['Pave'],
 'Alley': ['Missing', 'Grvl', 'Pave'],
 'LotShape': ['Reg', 'IR1', 'IR2'],
 'LandContour': ['Lvl', 'Bnk', 'HLS', 'Low'],
 'Utilities': ['AllPub'],
 'LotConfig': ['Inside', 'Corner', 'CulDSac', 'FR2'],
 'LandSlope': ['Gtl', 'Mod'],
 'Neighborhood': ['NAmes',
  'CollgCr',
  'OldTown',
  'Edwards',
  'Somerst',
  'NridgHt',
  'Gilbert',
  'Sawyer',
  'NWAmes',
  'BrkSide',
  'SawyerW',
  'Crawfor',
  'Mitchel',
  'Timber',
  'NoRidge',
  'IDOTRR',
  'ClearCr',
  'SWISU',
  'StoneBr',
  'Blmngtn',
  'MeadowV',
  'BrDale'],
 'Condition1': ['Norm', 'Feedr', 'Artery', 'RRAn', 'PosN'],
 'Condition2': ['Norm'],
 'BldgType': ['1Fam', 'TwnhsE', 'Duplex', 'Twnhs', '2fmCon'],
 'HouseStyle': ['1Story', '2Story', '1.5Fin', 'SLvl', 'SFoyer'],
 'RoofStyle': ['Gable', 'Hip'],
 'RoofMatl': ['CompShg'],
 'Exterior1st': ['VinylSd',
  'HdBoard',
  'Wd Sdng',
  'MetalSd',
  'Plywood',
  'CemntBd',
  'BrkFace',
  'Stucco',
  'WdShing',
  'AsbShng'],


In [31]:
# Apply the fitted rare-label encoder to both splits
X_train, X_test = rare_encoder.transform(X_train), rare_encoder.transform(X_test)

## Encoding categorical variables

In [32]:
# Ordinal encoder using the 'ordered' method, which is fitted with the target
cat_encoder = OrdinalEncoder(
    encoding_method='ordered',
    variables=cat_others,
)

# Learn the label -> integer mappings from the train set and its target
cat_encoder.fit(X_train, y_train)

# Inspect the learned mappings
cat_encoder.encoder_dict_

{'MSZoning': {'Rare': 0, 'RM': 1, 'RH': 2, 'RL': 3, 'FV': 4},
 'Street': {'Rare': 0, 'Pave': 1},
 'Alley': {'Grvl': 0, 'Pave': 1, 'Missing': 2},
 'LotShape': {'Reg': 0, 'IR1': 1, 'Rare': 2, 'IR2': 3},
 'LandContour': {'Bnk': 0, 'Lvl': 1, 'Low': 2, 'HLS': 3},
 'Utilities': {'Rare': 0, 'AllPub': 1},
 'LotConfig': {'Inside': 0, 'FR2': 1, 'Corner': 2, 'Rare': 3, 'CulDSac': 4},
 'LandSlope': {'Gtl': 0, 'Mod': 1, 'Rare': 2},
 'Neighborhood': {'IDOTRR': 0,
  'MeadowV': 1,
  'BrDale': 2,
  'Edwards': 3,
  'BrkSide': 4,
  'OldTown': 5,
  'Sawyer': 6,
  'SWISU': 7,
  'NAmes': 8,
  'Mitchel': 9,
  'SawyerW': 10,
  'Rare': 11,
  'NWAmes': 12,
  'Gilbert': 13,
  'Blmngtn': 14,
  'CollgCr': 15,
  'Crawfor': 16,
  'ClearCr': 17,
  'Somerst': 18,
  'Timber': 19,
  'StoneBr': 20,
  'NridgHt': 21,
  'NoRidge': 22},
 'Condition1': {'Artery': 0,
  'Feedr': 1,
  'Norm': 2,
  'RRAn': 3,
  'Rare': 4,
  'PosN': 5},
 'Condition2': {'Rare': 0, 'Norm': 1},
 'BldgType': {'2fmCon': 0, 'Duplex': 1, 'Twnhs': 2, '1Fa

In [33]:
# Apply the fitted ordinal encoder to both splits
X_train, X_test = cat_encoder.transform(X_train), cat_encoder.transform(X_test)

## Feature Scaling

In [34]:
# Create the scaler
scaler = MinMaxScaler()

# Fit the scaler to the train set only
scaler.fit(X_train)

# Transform the train and test sets and rebuild the DataFrames.
# Pass `index=` as well as `columns=`: without it the rebuilt frames get a
# fresh 0..n-1 index and no longer line up with y_train / y_test, which
# still carry the original row labels from the split.
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns, index=X_train.index)

X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns, index=X_test.index)
