### Handling Categorical Features 
 
 
Notebook context: Snippets of python script in handling categorical features

The objective of this notebook is to summarize the frequent types of categorical feature encoding 

 + One hot encoding
 + Hash encoding
 + Label encoding
 + Count encoding
 + Target encoding
 

Dataset to Download

https://www.kaggle.com/c/house-prices-advanced-regression-techniques
    

In [2]:
import pandas as pd
from  sklearn import model_selection
from sklearn.feature_extraction import FeatureHasher
from sklearn.preprocessing import LabelEncoder

df_house = pd.read_csv("data/housingprices/train.csv")
df_house.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
df_house.MSZoning.value_counts()

RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64

### One Hot Encoding with Pandas

In [4]:
pd.get_dummies(df_house[["MSZoning"]], "MSZoning", drop_first = False).head()

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM
0,0,0,0,1,0
1,0,0,0,1,0
2,0,0,0,1,0
3,0,0,0,1,0
4,0,0,0,1,0


In [5]:
## If the categorical column is needed for further analysis, 
## pd.concat function should help

def fn_ohe(pd_dataframe, var_names, drop_variables = False, drop_first_column = False):
    '''
    pd_dataframe: dataframe to one hot encode
    var_names: column names to one hot encode
    drop_variables: if True, drop variable names after one hot encode
    drop_first_column: if false, its OHE and if true, its dummy variable encoding
    '''

    if drop_variables:
        return pd.get_dummies(pd_dataframe, var_names, drop_first = drop_first_column)
    else:
        return pd.concat([pd_dataframe, 
        pd.get_dummies(pd_dataframe[var_names], var_names, drop_first = drop_first_column)], 
        axis = 1)

fn_ohe(df_house[["MSZoning", "Street", "Id"]], ["MSZoning", "Street"], drop_variables = False, drop_first_column = False ).head()


Unnamed: 0,MSZoning,Street,Id,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave
0,RL,Pave,1,0,0,0,1,0,0,1
1,RL,Pave,2,0,0,0,1,0,0,1
2,RL,Pave,3,0,0,0,1,0,0,1
3,RL,Pave,4,0,0,0,1,0,0,1
4,RL,Pave,5,0,0,0,1,0,0,1


In [14]:
len(df_house['YearBuilt'].to_list())

1460

In [None]:
[(df_house['YearBuilt'].apply(lambda x: str(x)).tolist())]

### Hash Encoding with sklearn

In [None]:
def getHashEncode(compute_df, var_name, n_features = 5):
    '''
    compute_df: dataframe to compute hash encoding
    var_name: Variable name to be hashed
    n_features: Number of hashed features as output
    '''
    h = FeatureHasher(n_features= n_features, input_type='string')
    z = [compute_df[var_name].apply(lambda x: str(x)).tolist()]
    f = h.transform(z)
    fh = []
    for i in range(1, n_features+1):
        fh.append("fh" + str(i))

    df = pd.DataFrame(f.toarray(), columns= fh)
    return(pd.concat([compute_df[var_name], df], axis=1))

getHashEncode(df_house, "YearBuilt", 16).head()


In [33]:
df_house['YearBuilt'].nunique()

112

In [27]:
h = FeatureHasher(n_features= 16, input_type='string')

In [29]:
z = df_house['YearBuilt'].apply(lambda x: str(x)).tolist()

In [34]:
h.transform(z)

<1x16 sparse matrix of type '<class 'numpy.float64'>'
	with 16 stored elements in Compressed Sparse Row format>

### Label Encoding

In [9]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

MultiColumnLabelEncoder(["MSZoning", "Street"]).fit_transform(df_house[["MSZoning", "Street", "Id"]]).head()    

Unnamed: 0,MSZoning,Street,Id
0,3,1,1
1,3,1,2
2,3,1,3
3,3,1,4
4,3,1,5


### Count Encoding

In [23]:
def getCountVar(compute_df, count_df, var_name, count_var = "Id"):
    '''
    compute_df : Data frame to count encode
    count_df : Data frame from which the counts should be taken
    var_name : categorical variable for count encoding
    count_var : some other variable from the dataset (used as dummy variable to get count)
    '''
    grouped_df = count_df.groupby(var_name, as_index=False)[count_var].agg('count')
    grouped_df.columns = [var_name, "var_count"]
    merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
    return merged_df
        

getCountVar(df_house[["MSZoning", "Id", "Street"]], df_house[["MSZoning", "Id", "Street"]], "MSZoning").head()

Unnamed: 0,MSZoning,Id,Street,var_count
0,RL,1,Pave,1151
1,RL,2,Pave,1151
2,RL,3,Pave,1151
3,RL,4,Pave,1151
4,RL,5,Pave,1151


In [22]:
df_house.MSZoning.value_counts()

RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64

### Target Encoding

In [None]:

def getDVEncodeVar(compute_df, target_df, var_name, target_var="RESPONDERS", min_cutoff=1):
    if type(var_name) != type([]):
        var_name = [var_name]
    grouped_df = target_df.groupby(var_name)[target_var].agg(["mean"]).reset_index()
    grouped_df.columns = var_name + ["mean_value"]
    merged_df = pd.merge(compute_df, grouped_df, how="left", on=var_name)
    merged_df.fillna(-1, inplace=True)
    return list(merged_df["mean_value"])


def do_target_encode(compute_df, var_names):
    kf = model_selection.KFold(n_splits=5, shuffle=True, random_state=2018)
    for col in var_names:
        train_enc_values = np.zeros(compute_df.shape[0])
        test_enc_values = 0
        for dev_index, val_index in kf.split(train_df):
            new_train_df = train_df[[col, target_var]]
            dev_X, val_X = new_train_df.iloc[dev_index], new_train_df.iloc[val_index]
            train_enc_values[val_index] =  np.array( getDVEncodeVar(val_X[[col]], dev_X, col))
            test_enc_values += np.array( getDVEncodeVar(test_df[[col]], dev_X, col))
        test_enc_values /= 5.
        train_df[col + "_enc"] = train_enc_values
        test_df[col + "_enc"] = test_enc_values
        print train_df[col + "_enc"].describe()
        print test_df[col + "_enc"].describe()