# Categorical Feature Encoding Introduction

A common task in machine learning pipelines is encoding categorical variables for a given algorithm in a format that allows as much useful signal as possible to be captured. Because this is such a common task and important skill to master, we've put together a dataset that contains only categorical features, and includes:

* binary features
* low- and high-cardinality nominal features
* low- and high-cardinality ordinal features
* (potentially) cyclical features

![](https://www.kdnuggets.com/wp-content/uploads/woman-yelling-cat-data-science-business.jpg)


# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's "ggplot" style sheet to the figures drawn below
plt.style.use("ggplot")

In [None]:
# Load the three competition files: training rows, test rows and the
# sample submission that later receives the predictions.
train_df, test_df, submission_df = map(
    pd.read_csv,
    (
        "../input/cat-in-the-dat/train.csv",
        "../input/cat-in-the-dat/test.csv",
        "../input/cat-in-the-dat/sample_submission.csv",
    ),
)

In [None]:
# Preview the first rows of the training data
train_df.head()

In [None]:
# Row counts of the train and test sets, then the train column count.
# Each label names the frame whose shape is actually printed.
print("Number of observations in the train data: ", train_df.shape[0])
print("Number of observations in the test data: ", test_df.shape[0])
print("Number of columns in the train data: ", train_df.shape[1])

# Data Analysis

<img src="https://pbs.twimg.com/media/EJ9uzwgUYAINWfK?format=jpg&name=medium" alt="Drawing" style="width: 700px;"/>

In [None]:
#checking for missing values: number of NaN entries in each training column

train_df.isna().sum()

In [None]:
#basic stats of the data (describe() with its default column selection)
train_df.describe()

In [None]:
#basic stats of the string (object dtype) columns

train_df.describe(include = "object")

## Helper Functions to plot the Numerical and Categorical Variables

In [None]:
def NumericalVariables_targetPlots(df, segment_by, target_var="Attrition"):
    """Draw two side-by-side charts for one column of ``df``.

    Left panel: number of rows per value of ``segment_by``, split by
    ``target_var``. Right panel: frequency histogram of ``segment_by``
    (no KDE overlay).

    Parameters
    ----------
    df : DataFrame containing both columns.
    segment_by : name of the column to analyse.
    target_var : name of the column used as hue in the count plot.
    """
    _, (count_ax, hist_ax) = plt.subplots(ncols=2, figsize=(14, 6))

    # count plot of the variable, one bar colour per target value
    sns.countplot(data=df, x=segment_by, hue=target_var, ax=count_ax)
    count_ax.set_title("Comparision of " + segment_by + " vs " + target_var)

    # plain histogram of the variable
    hist_ax.set_title("Distribution of "+segment_by)
    hist_ax.set_ylabel("Frequency")
    sns.distplot(a=df[segment_by], kde=False, ax=hist_ax)

    plt.show()

In [None]:
def CategoricalVariables_targetPlots(df, segment_by,invert_axis = False, target_var = "target"):
    """Plot a categorical column against the target in two side-by-side charts.

    Left panel: number of rows per category of ``segment_by``, split by
    ``target_var``. Right panel: mean of ``target_var`` per category.

    Parameters
    ----------
    df : DataFrame containing ``segment_by`` and ``target_var``.
    segment_by : name of the categorical column to analyse.
    invert_axis : when true, categories are placed on the y-axis so that
        long category names do not overlap.
    target_var : name of the target column; used both as the hue of the
        count plot and as the value averaged in the bar plot.
    """
    fig, (count_ax, rate_ax) = plt.subplots(ncols=2, figsize=(14, 6))

    # Both charts are bound to an explicit axis and both honour target_var.
    # The rate label is put on whichever axis carries the target mean.
    if invert_axis:
        sns.countplot(y=segment_by, hue=target_var, data=df, ax=count_ax)
        sns.barplot(y=segment_by, x=target_var, data=df, ci=None, ax=rate_ax)
        rate_ax.set_xlabel("Relative Target Representation")
    else:
        sns.countplot(x=segment_by, hue=target_var, data=df, ax=count_ax)
        sns.barplot(x=segment_by, y=target_var, data=df, ci=None, ax=rate_ax)
        rate_ax.set_ylabel("Relative Target Representation")

    count_ax.set_title("Comparision of " + segment_by + " vs " + "Target")
    rate_ax.set_title("Target rate by {}".format(segment_by))

    plt.tight_layout()
    plt.show()

## Binary Variables - Numerical & Categorical

In [None]:
# bin_0 vs. target, drawn with the numerical-variable helper
NumericalVariables_targetPlots(df=train_df, segment_by="bin_0", target_var="target")

In [None]:
# bin_1 vs. target, drawn with the numerical-variable helper
NumericalVariables_targetPlots(df=train_df, segment_by="bin_1", target_var="target")

In [None]:
# bin_2 vs. target, drawn with the numerical-variable helper
NumericalVariables_targetPlots(df=train_df, segment_by="bin_2", target_var="target")

In [None]:
# bin_3: counts per category and target rate per category
CategoricalVariables_targetPlots(df=train_df, segment_by="bin_3")

In [None]:
# bin_4: counts per category and target rate per category
CategoricalVariables_targetPlots(df=train_df, segment_by="bin_4")

## Nominal Variables

In [None]:
# nom_0: counts per category and target rate per category
CategoricalVariables_targetPlots(df=train_df, segment_by="nom_0")

In [None]:
# nom_1: counts per category and target rate per category
CategoricalVariables_targetPlots(df=train_df, segment_by="nom_1")

In [None]:
# nom_2: counts per category and target rate per category
CategoricalVariables_targetPlots(df=train_df, segment_by="nom_2")

In [None]:
# nom_3: counts per category and target rate per category
CategoricalVariables_targetPlots(df=train_df, segment_by="nom_3")

In [None]:
# nom_4: counts per category and target rate per category
CategoricalVariables_targetPlots(df=train_df, segment_by="nom_4")

## Ordinal Variables

In [None]:
#changing the color scheme: switch matplotlib to the "fivethirtyeight" style sheet
plt.style.use("fivethirtyeight")

<img src="https://matplotlib.org/3.1.0/_images/sphx_glr_style_sheets_reference_008.png" alt="Drawing" style="width: 1000px;"/>
<img src="https://matplotlib.org/3.1.0/_images/sphx_glr_style_sheets_reference_009.png" alt="Drawing" style="width: 1000px;"/>

In [None]:
# ord_0 vs. target, drawn with the numerical-variable helper
NumericalVariables_targetPlots(df=train_df, segment_by="ord_0", target_var="target")

In [None]:
# ord_1: counts per category and target rate per category
CategoricalVariables_targetPlots(df=train_df, segment_by="ord_1")

In [None]:
# ord_2: counts per category and target rate per category
CategoricalVariables_targetPlots(df=train_df, segment_by="ord_2")

In [None]:
# day vs. target, drawn with the numerical-variable helper
NumericalVariables_targetPlots(df=train_df, segment_by="day", target_var="target")

In [None]:
# month vs. target, drawn with the numerical-variable helper
NumericalVariables_targetPlots(df=train_df, segment_by="month", target_var="target")

## Target Variable

In [None]:
# Matplotlib 3.6 renamed the bundled "seaborn" style sheet to "seaborn-v0_8"
# and later releases no longer accept the old name, so use whichever exists.
plt.style.use("seaborn" if "seaborn" in plt.style.available else "seaborn-v0_8")

# Share of each target class, drawn as horizontal bars
train_df.target.value_counts(normalize = True).plot(kind = "barh")
plt.title("Distribution of Target Variable")
plt.show()

> The target variable is imbalanced, with a ratio of roughly 7:3. Sampling techniques such as undersampling, oversampling, and SMOTE can be used to handle the imbalanced data.

**Data Analysis Observations:**
* `bin_3` and `bin_4` are binary variables holding T/F and Y/N values. We can convert them to 0 or 1.
* `nom_0` to `nom_4` are not ordinal variables. We can create dummy variables for these columns using one-hot encoding.
* `nom_5` to `nom_9` have high cardinality.
* `ord_1` and `ord_2` contain ordinal data. We can encode these variables manually.
* `ord_3` to `ord_5` can be encoded using a label encoder.
* `day` and `month` can be encoded using sine and cosine values, as they are cyclic in nature.

# Feature Encoding Techniques

## Binary Encoding 
- Manually converting T/F --> 1/0

In [None]:
# Recode the two string-valued binary columns as integers.
# bin_3: "F" becomes 0, every other value 1. bin_4: "N" becomes 0, every other value 1.
train_df["bin_3"] = (train_df["bin_3"] != 'F').astype("int64")
train_df["bin_4"] = (train_df["bin_4"] != 'N').astype("int64")

# same recoding for the test data
test_df["bin_3"] = (test_df["bin_3"] != "F").astype("int64")
test_df["bin_4"] = (test_df["bin_4"] != "N").astype("int64")

## One Hot Encoding - Nominal Features
- One-hot encoding is used for columns whose values have no specific order of preference: each value only denotes a category, even when the column has been label encoded as numbers.
- Left as numbers, such codes can confuse a machine learning model by suggesting an order; to avoid this, the column should be one-hot encoded.
- We get k-1 dummies out of k categorical levels by removing the first level.

In [None]:
# The five low-cardinality nominal columns: nom_0 ... nom_4
nominal_cat_var = ["nom_" + str(i) for i in range(0,5)]

# k-1 dummy columns per nominal column (the first level is dropped)
train_temp_df = pd.get_dummies(train_df, columns = nominal_cat_var, drop_first = True)
test_temp_df = pd.get_dummies(test_df, columns = nominal_cat_var, drop_first = True)

# The two frames are encoded independently, so a category present in only one
# of them would produce different dummy columns. Give the test frame exactly
# the training feature columns: dummies missing from test become all-zero
# columns, dummies for levels never seen in training are dropped.
feature_columns = [c for c in train_temp_df.columns if c != "target"]
test_temp_df = test_temp_df.reindex(columns = feature_columns, fill_value = 0)

In [None]:
# Column counts of both frames once the dummy columns have been added
print("Number of columns in the train data after one hot encoding: ", len(train_temp_df.columns))
print("Number of columns in the test data after one hot encoding: ", len(test_temp_df.columns))

## Ordinal Encoding
- OrdinalEncoder converts each string value to a whole number. The first unique value in your column becomes 1, the second becomes 2, the third becomes 3, and so on.

In [None]:
# Manual rank encodings for ord_1 and ord_2 (1 = lowest level).

# ord_1 follows the Kaggle progression tiers:
# Novice < Contributor < Expert < Master < Grandmaster
ord1_mapping = {'Novice': 1, 'Contributor': 2, 'Expert': 3, 'Master': 4, 'Grandmaster': 5}

# ord_2 runs from coldest to hottest
ord2_mapping = {'Freezing': 1, 'Cold': 2, 'Warm': 3, 'Hot': 4, 'Boiling Hot': 5, 'Lava Hot': 6}

In [None]:
# Replace the ord_1 / ord_2 labels with their integer ranks in both frames
train_temp_df = train_temp_df.assign(
    ord_1=train_temp_df["ord_1"].map(ord1_mapping),
    ord_2=train_temp_df["ord_2"].map(ord2_mapping),
)

test_temp_df = test_temp_df.assign(
    ord_1=test_temp_df["ord_1"].map(ord1_mapping),
    ord_2=test_temp_df["ord_2"].map(ord2_mapping),
)

In [None]:
# Encode "ord_3" and "ord_4" with the integer codes pandas assigns to the
# categories of the training column; the same map is applied to the test data.

for col in ["ord_3", "ord_4"]:
    # A category's code is its position in `.cat.categories`, so the map can
    # be built from the distinct categories instead of pairing every row with
    # its code.
    ord_map = {
        label: code
        for code, label in enumerate(train_temp_df[col].astype('category').cat.categories)
    }
    train_temp_df[col] = train_temp_df[col].map(ord_map).astype(int)
    # Test values that never occur in the training column come out as NaN.
    test_temp_df[col] = test_temp_df[col].map(ord_map)

## Ordinal - High Cardinality Variables

In [None]:
import string

In [None]:
# Encode 'ord_5' from the positions of its letters in string.ascii_letters
# Source:- https://www.kaggle.com/c/cat-in-the-dat/discussion/105702#latest-607652

# Option 1: sum the 1-based positions of all letters of the value
# (a character outside string.ascii_letters adds 0, because find() gives -1)
train_temp_df['ord_5_oe_add'] = train_temp_df['ord_5'].apply(
    lambda code: sum(string.ascii_letters.find(ch) + 1 for ch in code)
)
test_temp_df['ord_5_oe_add'] = test_temp_df['ord_5'].apply(
    lambda code: sum(string.ascii_letters.find(ch) + 1 for ch in code)
)

# the raw column is dropped once its numeric version exists
train_temp_df.drop(columns=['ord_5'], inplace=True)
test_temp_df.drop(columns=['ord_5'], inplace=True)

## Nominal - High Cardinality Variables

### Feature hashing (a.k.a the hashing trick)

- Feature hashing is a very cool technique for representing categories in a “one-hot encoding style” sparse matrix, but with far fewer dimensions. In feature hashing we apply a hash function to the category and then represent it by the resulting index.

- [Using categorical data in machine learning with python](https://blog.myyellowroad.com/using-categorical-data-in-machine-learning-with-python-from-dummy-variables-to-deep-category-66041f734512)
- [Don’t be tricked by the Hashing Trick](https://booking.ai/dont-be-tricked-by-the-hashing-trick-192a6aae3087)

In [None]:
# Names of the five high-cardinality nominal columns: nom_5 ... nom_9
nominal_highcat_var = [f"nom_{idx}" for idx in range(5, 10)]
nominal_highcat_var

In [None]:
import category_encoders  as ce
from sklearn.feature_extraction import FeatureHasher


In [None]:
#I got this solution from @kabure and @Giba
import zlib


def _stable_bucket(value, n_buckets=5000):
    """Return a bucket index in ``[0, n_buckets)`` for ``value``.

    The index is the CRC-32 of the value's UTF-8 text, modulo ``n_buckets``.
    The built-in ``hash()`` is not used because its result for strings is
    salted per interpreter process, which would give the same category a
    different code on every run of the notebook.
    """
    return zlib.crc32(str(value).encode("utf-8")) % n_buckets


# One hashed integer column per high-cardinality nominal column
for col in nominal_highcat_var:
    train_temp_df[f'hash_{col}'] = train_temp_df[col].apply(_stable_bucket)
    test_temp_df[f'hash_{col}'] = test_temp_df[col].apply(_stable_bucket)

In [None]:
# The raw nom_5 ... nom_9 columns are removed in place from both frames
# now that their hashed versions exist
train_temp_df.drop(columns=['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'], inplace=True)
test_temp_df.drop(columns=['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'], inplace=True)

In [None]:
# Preview the encoded training frame
train_temp_df.head()

In [None]:
# Preview the encoded test frame
test_temp_df.head()

## Split the Data

In [None]:
# Target vector, then the feature matrices: the row identifier is removed
# from both frames and the target from the training features.
y_train = train_temp_df['target']
X_train = train_temp_df.drop(columns=['id', 'target'])
X_test = test_temp_df.drop(columns=['id'])

In [None]:
# Shapes (rows, columns) of the training features and of the test features
print('Input training dimension:', X_train.shape)
print('Test data dimension:', X_test.shape)

# Modeling & Evaluation

In [None]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier,BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.metrics import roc_auc_score,roc_curve

In [None]:
# One single-step pipeline per candidate classifier, keyed by a short name.
# The same names are used as keys of the hyperparameter grids.

pipelines = {
    # liblinear handles both the 'l1' and the 'l2' penalty searched for this
    # model; scikit-learn's default solver (lbfgs since 0.22) rejects 'l1'.
    'logistic' : make_pipeline(LogisticRegression(random_state = 123, solver = 'liblinear')),
    'decisiontree' : make_pipeline(DecisionTreeClassifier(random_state = 123)),
    'randomforest': make_pipeline(RandomForestClassifier(random_state = 123)),
    'adaboost': make_pipeline(AdaBoostClassifier(random_state = 123))
}

In [None]:
#get the all possible parameters for a model

#pipelines["adaboost"].get_params().keys()

In [None]:
# Search grids, one per pipeline. Keys follow the make_pipeline convention
# "<lowercased estimator class name>__<parameter>".

#logistic hyperparameters
logistic_hyperparameters = {
    'logisticregression__C' : [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'logisticregression__penalty' : ['l1', 'l2']
}

#ada boost hyperparameters
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases;
# confirm it is accepted by the installed version.
ab_hyperparameters = {
    'adaboostclassifier__n_estimators' : [100, 200, 400, 600, 800],
    'adaboostclassifier__learning_rate' : [0.0001, 0.001, 0.01, 0.1, 0.2, 0.3,1],
    'adaboostclassifier__algorithm': ['SAMME', 'SAMME.R']
}

#decision tree hyperparameters
decisiontree_hyperparameters = {
    "decisiontreeclassifier__max_depth": np.arange(3,12),
    "decisiontreeclassifier__max_features": np.arange(3,10),
    "decisiontreeclassifier__min_samples_split": [2,3,4,5,6,7,8,9,10,11,12,13,14,15],
    "decisiontreeclassifier__min_samples_leaf" : np.arange(1,3)
}

#random forest hyperparameters

rf_hyperparameters = {
    'randomforestclassifier__n_estimators' : [100, 200, 400, 600, 800],
    # 'auto' is left out: for classifiers it is an alias of 'sqrt' (so it only
    # repeated the same fits) and newer scikit-learn releases reject it.
    'randomforestclassifier__max_features' : ['sqrt', 'log2'],
    # NOTE(review): num = 1 makes linspace return only its start value, so the
    # single depth searched is 3 -- confirm that this is intended.
    'randomforestclassifier__max_depth' : [int(x) for x in np.linspace(3, 10, num = 1)],
    'randomforestclassifier__min_samples_split' : np.arange(2, 10)
}

In [None]:
# Look-up table from pipeline name to the grid searched for that pipeline
hyperparameters = dict(
    adaboost=ab_hyperparameters,
    randomforest=rf_hyperparameters,
    logistic=logistic_hyperparameters,
    decisiontree=decisiontree_hyperparameters,
)

# Fit and Tune Model

In [None]:
# 3-fold stratified cross-validation. shuffle=True is required for
# random_state to take effect; current scikit-learn raises a ValueError when
# random_state is set while shuffle is left at its default of False.
k = StratifiedKFold(n_splits=3, shuffle=True, random_state=123)

In [None]:
# Fitted grid-search object of every pipeline, keyed by pipeline name
fitted_models = {}

for name, pipeline in pipelines.items():
    print("------- ", name, ' ---------')

    # grid search over this pipeline's hyperparameters, scored by ROC AUC on
    # the folds defined by `k`; training scores are recorded as well
    model = GridSearchCV(
        estimator=pipeline,
        param_grid=hyperparameters[name],
        scoring="roc_auc",
        cv=k,
        n_jobs=-1,
        verbose=2,
        return_train_score=True,
    )

    # fit() hands back the search object itself, which is what gets stored
    fitted_models[name] = model.fit(X_train, y_train)

    print(name, 'has been fitted.')

In [None]:
## Collect the best cross-validated ROC AUC of every fitted search

names = list(fitted_models)
results = [np.round(search.best_score_, 4) for search in fitted_models.values()]

# fitted search object -> its rounded best score (used to pick the winner)
best_estimator_dict = dict(zip(fitted_models.values(), results))

for name, model_built in fitted_models.items():
    # average of the mean training scores over all parameter combinations
    print("Mean AUC Score of "+ name + " :", np.round(model_built.cv_results_["mean_train_score"].mean(),4))
    # best mean cross-validation score found by the search
    print("Best AUC Score of "+ name + " :", np.round(model_built.best_score_,4))

In [None]:
# Horizontal bar chart of the best AUC reached by each model
roc_auc_importance = pd.Series(data=results, index=names)
roc_auc_importance.plot.barh(cmap="viridis")
plt.title("Mean AUC_ROC Score Based Cross Validation for Different Models")
plt.xlabel("AUC ROC Score")
plt.ylabel("Model Name")
plt.show()

In [None]:
# The fitted search with the highest best AUC ROC score
# (on a tie the first one in the dictionary wins)
best_fittedobject = max(best_estimator_dict.items(), key=lambda entry: entry[1])[0]

In [None]:
#get the best model from the best fittedobject
# steps[0][1] is the estimator object stored in the pipeline's first step

best_model = best_fittedobject.best_estimator_.steps[0][1]
best_model

In [None]:
#best model params: the hyperparameter combination selected by the winning grid search

best_fittedobject.best_params_

In [None]:
#plot roc curve

def plot_roc( actual, probs ):
    """Draw the ROC curve of the scores ``probs`` against the labels ``actual``.

    The legend reports the area under the curve; the dashed diagonal from
    (0, 0) to (1, 1) is drawn as a reference line.
    """
    false_pos_rate, true_pos_rate, _ = roc_curve(actual, probs, drop_intermediate=False)
    area = roc_auc_score(actual, probs)

    plt.figure(figsize=(7, 7))
    plt.plot(false_pos_rate, true_pos_rate, label='ROC curve (area = %0.2f)' % area)
    plt.plot([0, 1], [0, 1], 'k--')

    # axis limits, labels, title and legend
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate or [1 - True Negative Rate]')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic plot')
    plt.legend(loc="lower right")
    plt.show()

In [None]:
#AUC ROC curve on the training data, based on the best model's predicted
#probabilities (column 1 of predict_proba)

plot_roc(y_train, best_model.predict_proba(X_train)[:,1])

In [None]:
#plot the feature importance

# Only the lookup of `feature_importances_` is guarded, and only against the
# attribute being absent, so errors raised while plotting are not hidden.
try:
    feat_importances = pd.Series(best_model.feature_importances_, index=X_train.columns)
except AttributeError:
    print("Best Model doesn't support Feature Importance Plot (Logisitc Regression)")
else:
    # the five largest importances as horizontal bars
    feat_importances.nlargest(5).plot(kind='barh')
    plt.title("Feature Importance")
    plt.xlabel("Relative Importance")
    plt.ylabel("Variable Name")
    plt.show()

In [None]:
# Average of the mean training scores over every hyperparameter combination
# tried by the winning search
best_fittedobject.cv_results_["mean_train_score"].mean()

In [None]:
#predictions: column 1 of predict_proba for every test row

y_preds = best_model.predict_proba(X_test)[:,1]

In [None]:
# Put the predictions into the submission frame's "target" column and write
# the frame to disk with a header row and without the index
submission_df = submission_df.assign(target=y_preds)
submission_df.to_csv(path_or_buf='best_submission.csv', index=False, header=True)

## Upvote the Kernel if you liked it. 

<img src="https://pbs.twimg.com/media/EJ9LyBZXUAAhk0-?format=jpg&name=900x900" alt="Drawing" style="width: 500px;"/>