# Categorical Feature Encoding Challenge II

This is a project that shows different ways to deal with categorical columns.

The data for this project is from Kaggle (https://www.kaggle.com/c/cat-in-the-dat-ii).

This project deals with:
    1. Stratified K-Fold CV
    2. Model Selection
    3. One Hot Encoding
    4. Label Encoding
    5. Target Encoding



In [1]:
import pandas as pd

In [3]:
# Load the training CSV into a DataFrame.
train_df = pd.read_csv('../input/train.csv')

In [5]:
# Preview five random rows; the fixed seed keeps the preview identical on every re-run.
train_df.sample(5, random_state=42)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
207966,207966,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,India,...,8737d6b84,2.0,Contributor,Freezing,b,H,AG,3.0,1.0,0
469045,469045,0.0,0.0,1.0,T,N,Red,Triangle,Axolotl,Canada,...,7700eee3d,1.0,Grandmaster,Cold,f,N,nj,6.0,3.0,0
277550,277550,0.0,0.0,0.0,F,Y,Red,Polygon,Snake,Costa Rica,...,d2b326d92,2.0,Contributor,Freezing,c,B,Pk,2.0,2.0,0
371430,371430,1.0,0.0,0.0,F,N,Blue,Triangle,Lion,India,...,ca3d7c36d,1.0,Novice,Freezing,k,H,TZ,5.0,12.0,0
193786,193786,0.0,0.0,0.0,F,Y,Red,Polygon,Hamster,India,...,b813f2af5,2.0,Grandmaster,Cold,i,N,SS,7.0,11.0,0


In [6]:
# Number of rows per target class.
train_df["target"].value_counts()

0    487677
1    112323
Name: target, dtype: int64

In [10]:
import plotly.express as px

In [12]:
# Bar chart of the target class counts, with explicit axis labels and a title
# so the figure can be read on its own.
target_counts = train_df["target"].value_counts()
px.bar(
    # Class labels as strings so the x axis is treated as categorical.
    x=target_counts.index.astype(str).tolist(),
    y=target_counts.to_numpy(),
    labels={"x": "target", "y": "count"},
    title="Target class distribution",
)

In [13]:
# Integer code for each ord_2 label. Codes are assigned in the order the labels
# are listed here, which is not their temperature order.
mapping = {
    label: code
    for code, label in enumerate(
        ["Freezing", "Warm", "Cold", "Boiling Hot", "Hot", "Lava Hot"]
    )
}

In [14]:
# Replace the ord_2 labels with their integer codes from `mapping`.
# Assigning via train_df["ord_2"] swaps in the mapped column, so it takes the
# numeric dtype produced by map(); with train_df.loc[:, "ord_2"] = ... recent
# pandas writes into the existing object column and keeps its dtype.
# Labels that are not keys of `mapping` (including missing values) become NaN.
# NOTE: run this once per load -- mapping already-encoded values turns them into NaN.
train_df["ord_2"] = train_df["ord_2"].map(mapping)


In [16]:
# Row count per encoded ord_2 value (NaN rows are excluded by value_counts).
train_df["ord_2"].value_counts()

0.0    142726
1.0    124239
2.0     97822
3.0     84790
4.0     67508
5.0     64840
Name: ord_2, dtype: int64

In [18]:
import pandas as pd
from sklearn import preprocessing

# read the data
df = pd.read_csv("../input/train.csv")

# fill NaN values in ord_2 column so that missing values become
# their own "NONE" category (LabelEncoder needs uniform, non-null input)
df["ord_2"] = df["ord_2"].fillna("NONE")

# initialize LabelEncoder
lbl_enc = preprocessing.LabelEncoder()

# fit label encoder and transform values on ord_2 column
# P.S: do not use fit_transform like this in general:
# fit first, then transform
# The column is assigned with df["ord_2"] = ... (not df.loc[:, "ord_2"] = ...)
# so it is replaced outright and takes the encoder's integer dtype instead of
# keeping the original object dtype.
df["ord_2"] = lbl_enc.fit_transform(df["ord_2"].values)

In [19]:
# Number of rows (non-null ids) in each encoded ord_2 category.
df.groupby("ord_2")["id"].count()

ord_2
0     84790
1     97822
2    142726
3     67508
4     64840
5     18075
6    124239
Name: id, dtype: int64

In [22]:
# Combine ord_1 and ord_2 into one "<ord_1>_<ord_2>" string feature.
# astype(str) turns a missing ord_1 into the literal text "nan".
df["new_feature"] = df["ord_1"].astype(str) + "_" + df["ord_2"].astype(str)

In [23]:
# Frequency of each combined ord_1/ord_2 category.
df['new_feature'].value_counts()

Novice_2         38233
Novice_6         33263
Expert_2         33249
Expert_6         28900
Novice_1         26271
Contributor_2    26082
Expert_1         22956
Grandmaster_2    22818
Contributor_6    22774
Novice_0         22718
Grandmaster_6    19899
Expert_0         19477
Master_2         18035
Novice_3         17850
Contributor_1    17734
Novice_4         17373
Expert_3         15792
Master_6         15734
Contributor_0    15634
Grandmaster_1    15464
Expert_4         15078
Grandmaster_0    13623
Contributor_3    12428
Master_1         12364
Contributor_4    11919
Grandmaster_3    10805
Master_0         10800
Grandmaster_4    10363
Master_3          8594
Master_4          8209
Novice_5          4889
nan_2             4309
Expert_5          4225
nan_6             3669
Contributor_5     3250
nan_1             3033
Grandmaster_5     2894
nan_0             2538
Master_5          2262
nan_3             2039
nan_4             1898
nan_5              555
Name: new_feature, dtype: int64

In [27]:
# Read the data again from disk: `df` now holds the CSV contents,
# replacing the frame that earlier cells had modified.
df = pd.read_csv("../input/train.csv")

In [28]:
# Frequency of each raw ord_2 label (missing values are not counted).
df["ord_2"].value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
Name: ord_2, dtype: int64

In [29]:
# Same counts, with missing values shown as their own "None" category.
# fillna returns a new Series here, so df itself is left unchanged.
df["ord_2"].fillna("None").value_counts()

Freezing       142726
Warm           124239
Cold            97822
Boiling Hot     84790
Hot             67508
Lava Hot        64840
None            18075
Name: ord_2, dtype: int64

In [31]:
import pandas as pd
from sklearn import preprocessing


# read training data
train = pd.read_csv("../input/train.csv")

# read test data
test = pd.read_csv("../input/test.csv")

# create a fake target column for test data
# since this column doesn't exist;
# -1 is the marker used below to split the frames apart again
test["target"] = -1

# concatenate both training and test data so that every
# encoder sees the categories from both sets
data = pd.concat([train, test]).reset_index(drop=True)

# make a list of features we are interested in
# id and target are columns we should not encode
features = [x for x in train.columns if x not in ["id", "target"]]

# loop over the features list
for feat in features:
    # create a new instance of LabelEncoder for each feature
    lbl_enc = preprocessing.LabelEncoder()

    # note the trick here
    # since it's categorical data, we fillna with a string
    # and we convert all the data to string type
    # so, no matter if it's int or float, it's converted to string
    # int/float but categorical!!!
    temp_col = data[feat].fillna("NONE").astype(str).values

    # we can use fit_transform here as we do not
    # have any extra test data that we need to
    # transform on separately.
    # Assign with data[feat] = ... (not data.loc[:, feat] = ...) so the column
    # is replaced and takes the encoder's integer dtype; writing through .loc
    # keeps the original object/float dtype on recent pandas.
    data[feat] = lbl_enc.fit_transform(temp_col)

# split the training and test data again using the fake target marker
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

In [33]:
# Row count per label-encoded ord_2 category in the training split.
train["ord_2"].value_counts()

2    142726
6    124239
1     97822
0     84790
3     67508
4     64840
5     18075
Name: ord_2, dtype: int64

In [34]:
# Making a RARE class: first give missing ord_4 values their own "NONE" label
df["ord_4"] = df["ord_4"].fillna("NONE")

In [37]:
# Collapse infrequent ord_4 categories into a single "RARE" bucket.
RARE_THRESHOLD = 2000  # categories with fewer rows than this are merged

# Map every row to the frequency of its own category. Series.map leaves rows
# whose value has no count (e.g. a remaining NaN) as NaN, which compares False
# below, instead of raising a KeyError like a label lookup would.
ord_4_counts = df["ord_4"].value_counts()
is_rare = df["ord_4"].map(ord_4_counts) < RARE_THRESHOLD

# Overwrite only the rows that belong to a rare category.
df.loc[is_rare, "ord_4"] = "RARE"

In [38]:
# Category frequencies after the rare categories were merged.
df["ord_4"].value_counts()

N       39978
P       37890
Y       36657
A       36633
R       33045
U       32897
M       32504
X       32347
C       32112
H       31189
Q       30145
T       29723
O       25610
B       25212
E       21871
K       21676
I       19805
NONE    17930
D       17284
F       16721
W        8268
Z        5790
S        4595
RARE     3607
G        3404
V        3107
Name: ord_4, dtype: int64