# Tutorial for using the package `fast-ml` 

This package is as good as having a junior Data Scientist working for you. Most of the commonly used EDA steps, Missing Data Imputation techniques, and Feature Engineering steps are covered in a ready-to-use format.

## Part 5. Feature Engineering for Categorical Variables / Categorical Encodings



#### 1. Import the imputer classes from the `missing_data_imputation` module
`from fast_ml.missing_data_imputation import MissingDataImputer_Categorical, MissingDataImputer_Numerical`

#### 2. Define the imputer object. 
* For Categorical variables use `MissingDataImputer_Categorical`
* For Numerical variables use `MissingDataImputer_Numerical`

`cat_imputer = MissingDataImputer_Categorical(method = 'frequent')`

#### 3. Fit the object on your dataframe and provide a list of variables
`cat_imputer.fit(train, variables = ['BsmtQual'])`

#### 4. Apply the transform method to the train / test datasets
`train = cat_imputer.transform(train)`
<br>&<br>
`test = cat_imputer.transform(test)`

#### 5. A parameter dictionary is created that stores the values used for imputation. It can be viewed as
`cat_imputer.param_dict_`


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from fast_ml.feature_engineering_categorical import FeatureEngineering_Categorical

%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [6]:
# Load the house-prices data; the path is relative to the notebook's working directory
df = pd.read_csv('../data/house_prices.csv')
# (number of rows, number of columns)
df.shape

(1460, 81)

In [3]:
# Preview the first five rows
df.head(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [7]:
# pandas dtype names grouped as numeric vs. categorical
# NOTE(review): neither list is referenced in the visible cells - confirm they are still needed
numeric_type = ['float64', 'int64']
category_type = ['object']

## Start Feature Engineering for Categorical Variables

## Categorical Variables

### 1. BsmtQual 

In [3]:
# Before imputation: category counts for BsmtQual.
# value_counts() leaves out NaN by default, so rows with a missing value are not listed.
df['BsmtQual'].value_counts()

TA    649
Gd    618
Ex    121
Fa     35
Name: BsmtQual, dtype: int64

In [None]:
# The notebook's import cell only brings in FeatureEngineering_Categorical, so the
# imputer class is imported here to keep this cell runnable on a fresh kernel.
from fast_ml.missing_data_imputation import MissingDataImputer_Categorical

# Learn the replacement for BsmtQual with the 'frequent' strategy ...
cat_imputer1 = MissingDataImputer_Categorical(method='frequent')
cat_imputer1.fit(df, variables=['BsmtQual'])

# ... and apply it to the dataframe
df = cat_imputer1.transform(df)

In [None]:
# Show the value stored by fit() for each fitted variable
cat_imputer1.param_dict_

In [8]:
# After imputation: the previously missing rows are now counted under the
# category that was used as the replacement
df['BsmtQual'].value_counts()

TA    686
Gd    618
Ex    121
Fa     35
Name: BsmtQual, dtype: int64

In [9]:
# transform() also created the indicator column BsmtQual_nan
# (presumably 1 where BsmtQual was missing before imputation, 0 otherwise - matches the counts shown)
df['BsmtQual_nan'].value_counts()

0    1423
1      37
Name: BsmtQual_nan, dtype: int64

### 2. FireplaceQu

In [3]:
# Before imputation: category counts for FireplaceQu.
# value_counts() leaves out NaN by default, so rows with a missing value are not listed.
df['FireplaceQu'].value_counts()

Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: FireplaceQu, dtype: int64

In [4]:
# The notebook's import cell only brings in FeatureEngineering_Categorical, so the
# imputer class is imported here to keep this cell runnable on a fresh kernel.
from fast_ml.missing_data_imputation import MissingDataImputer_Categorical

# Replace missing FireplaceQu values with the explicit category 'Missing'
cat_imputer2 = MissingDataImputer_Categorical(method='custom_value', value='Missing')
cat_imputer2.fit(df, variables=['FireplaceQu'])

# Show the replacement value stored for each fitted variable
print(cat_imputer2.param_dict_)

df = cat_imputer2.transform(df)

In [8]:
# After imputation: rows that had no value now appear under the 'Missing' category
df['FireplaceQu'].value_counts()

Missing    690
Gd         380
TA         313
Fa          33
Ex          24
Po          20
Name: FireplaceQu, dtype: int64

In [9]:
# Indicator column created by transform()
# (presumably 1 where FireplaceQu was missing before imputation, 0 otherwise - matches the counts shown)
df['FireplaceQu_nan'].value_counts()

0    770
1    690
Name: FireplaceQu_nan, dtype: int64

In [4]:
# Names of the categorical column explored below and of the target column
c, target = 'FireplaceQu', 'SalePrice'

In [8]:
# Absolute frequency of every category of column `c`, stored under the column name
param_dict_ = {c: df[c].value_counts().to_dict()}
param_dict_

{'FireplaceQu': {'Gd': 380, 'TA': 313, 'Fa': 33, 'Ex': 24, 'Po': 20}}

In [13]:
# Relative frequency of every category: its count divided by the total number of rows
# (the denominator also counts rows where `c` is missing)
param_dict_[c] = df[c].value_counts().div(len(df)).to_dict()
param_dict_

{'FireplaceQu': {'Gd': 0.2602739726027397,
  'TA': 0.21438356164383562,
  'Fa': 0.022602739726027398,
  'Ex': 0.01643835616438356,
  'Po': 0.0136986301369863}}

In [None]:
class FeatureEngineering_Categorical:
    '''
    Work-in-progress transformer for categorical variables.

    The fit / transform logic currently fills missing values; the encoding
    strategies are accepted by name but do not learn anything yet.

    Parameters:
        method : strategy name, one of
            'custom_value'             : missing values are replaced by `value`
            'frequency' / 'freq'       : missing values are replaced by the single most
                                         frequent category of each variable
            'one-hot', 'label' / 'int' : accepted, but no parameters are learned, so
                                         transform returns the dataframe unchanged
            'random'                   : fit learns nothing; transform delegates to
                                         `__random_imputer__`
        add_indicator : True / False. If True then a new binary variable will be created of the name "var_nan"
                        which will take value 1 if there's a missing value in var or
                        0 if there's no missing value in var
        value : replacement used when method = 'custom_value'
        random_state : stored on the instance; not read by fit or transform

    Attributes:
        param_dict_ : dict {variable name: replacement value}, created by fit
                      (not created when method = 'random')
    '''

    def __init__(self, method, add_indicator=True, value='Missing', random_state=1):
        self.method = method
        self.value = value
        self.random_state = random_state
        self.add_indicator = add_indicator

    def fit(self, df, variables):
        '''
        Learn the replacement value for every variable in `variables`.

        Parameters:
            df : dataframe containing the variables
            variables : list of column names to fit

        Returns:
            self, so that calls can be chained

        Raises:
            ValueError : if `method` is not a recognised strategy, or if, for
                         'frequency' / 'freq', a variable has no non-missing value
                         or has more than one most frequent category
        '''
        if self.method == 'random':
            # Nothing is learned up front for the random strategy
            return self

        if self.method in ('one-hot', 'label', 'int'):
            # Placeholder strategies: no parameters are learned yet
            self.param_dict_ = {}

        elif self.method in ('frequency', 'freq'):
            learned = {}
            for var in variables:
                modes = df[var].mode()

                # mode() is empty when the column holds only missing values
                if len(modes) == 0:
                    raise ValueError(f'Variable {var} has no non-missing values')
                # Careful : because some variable can have multiple modes
                if len(modes) > 1:
                    raise ValueError(f'Variable {var} contains multiple frequent categories')

                learned[var] = modes.iloc[0]
            self.param_dict_ = learned

        elif self.method == 'custom_value':
            self.param_dict_ = {var: self.value for var in variables}

        else:
            # Fail here rather than later in transform with a missing param_dict_
            raise ValueError(
                f"Unknown method {self.method!r}; expected one of 'one-hot', 'label', "
                "'int', 'frequency', 'freq', 'custom_value', 'random'"
            )

        return self

    def transform(self, df):
        '''
        Fill the missing values of every fitted variable.

        The dataframe passed in is modified (columns are reassigned on it) and
        the same object is returned.

        Parameters:
            df : dataframe containing the fitted variables

        Returns:
            the dataframe with missing values replaced and, when `add_indicator`
            is set, one extra "<var>_nan" column per fitted variable
        '''
        if self.method == 'random':
            # NOTE(review): __random_imputer__ is not defined in the visible part of
            # this class - confirm it exists before using method='random'
            return self.__random_imputer__(df)

        for var, fill_value in self.param_dict_.items():
            # Add indicator before filling, while the missing values are still visible
            if self.add_indicator:
                df[var + '_nan'] = np.where(df[var].isnull(), 1, 0)
            # Reassign the column instead of fillna(inplace=True) on a selected column,
            # which pandas deprecates and which has no effect under copy-on-write
            df[var] = df[var].fillna(fill_value)

        return df