In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 align="center">Categorical Feature Encoding Challenge</h1>

<h2 align="center"> Binary classification, with every feature a categorical</h2>

<hr style="border: 1px solid #000" width="70%">

<img src="https://storage.googleapis.com/kaggle-media/competitions/playground/cat_in_dat/cat7.jpg" style="display: block; margin: 0 auto" width="500px" height="500px"/>

<hr style="border: 1px solid #000">


# Table of Content

* [1. Introduction](#1.-Introduction)
    * [1.1. Goals](#1.1.-Goals)
    * [1.2. Libraries & Tools](#1.2.-Libraries-&-Tools)
* [2. Exploratory Data Analysis](#2.-Exploratory-Data-Analysis)
    * [2.1. Overview of the data](#2.1.-Overview-of-the-data)
    * [2.2. The Categorical Variables](#2.2.-The-Categorical-Variables)
    * [2.3. Binary Variables](#2.3.-Binary-Variables)
    * [2.4. Ordinal Variables](#2.4.--Ordinal-Variables)
    * [2.5. Norminal Variables](#2.5.--Norminal-Variables)
    * [2.6. Difference between Label Encoding and OneHotEncoding (OHE)](#2.6.-Difference-between-Label-Encoding-and-OneHotEncoding-(OHE))
        * [2.6.1. Label Encoding](#2.6.1.-Label-Encoding)
        * [2.6.2. OneHotEncoding](#2.6.2.-OneHotEncoding)
    * [2.7. Day and Month Variables](#2.7.-Day-and-Month-Variables)
* [3. Model Evaluation on Test-Set](#Model-Evaluation-on-Test-Set)
* [4. Conclusion](#4.-Conclusion)
* [References](#References)

## Kindly upvote if you find the kernel helpful :) 

# 1. Introduction

I started working through the book [Hands-On Machine Learning with Scikit-Learn, Keras, and TensorFlow](https://www.oreilly.com/library/view/hands-on-machine-learning/9781492032632/) 2nd Edition by [Aurélien Géron](https://www.oreilly.com/people/aurelien-geron/). I wrote my first notebook on [housing prices](https://www.kaggle.com/ganiyuolalekan/house-prices-prediction-beginner) after studying the second chapter; then my system and I were separated for a while, but now we're back together :). I hope to finish this project as soon as I can.

I have just finished the third chapter, which is all about <b style="color: #DE3163">Classification</b> in Machine Learning. Aurélien taught binary and multi-class classification, so, as in the last project, I am going to use the dataset of this challenge to practice binary classification, and the second part of the project to practice multi-class classification. So here we go again.

**_Description and context:_**

#### Is there a cat in your data?

A common task in machine learning pipelines is encoding categorical variables for a given algorithm in a format that allows as much useful signal as possible to be captured.

Because this is such a common task and important skill to master, we've put together a dataset that contains only categorical features, and includes:

* binary features
* low- and high-cardinality nominal features
* low- and high-cardinality ordinal features
* (potentially) cyclical features

## 1.1. Goals

I want to start by appreciating [KDJ2020](https://www.kaggle.com/dkomyagin), [Shahules](https://www.kaggle.com/shahules) and [Prashant Manshani](https://www.kaggle.com/prazhant); their notebooks really helped me understand the various kinds of categorical data and how to handle them. Their notebooks also played no small part in my final score on the competition being <b style="color: #DE3163">0.80464</b>. [References](#References) to their notebooks can be found below.

My goals in this **notebook** are to:

1. Discover and visualize the data to gain insights.
2. Prepare the data for Machine Learning algorithms.
3. Select a model and train it.
4. Fine-tune the model.
5. Present my solution.

## 1.2. Libraries & Tools

In [None]:
import scipy
import random
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, LabelEncoder, StandardScaler

In [None]:
def gen_color():
    """
    Generates a random colour as a hex string of the form "#rrggbb".
    """

    hex_digits = list("0123456789abcdef")
    picked = []

    # Six independent draws, one per hex digit of the colour code.
    for _ in range(6):
        picked.append(random.choice(hex_digits))

    return "#" + "".join(picked)

def pie_plot(column, data=None, title=None, ax=None, fontsize=15, explode=0,
             autopct='%1.1f%%', shadow=None, figsize=(10, 6), colors=None, color_with_label=None):
    """
    Draws a pie plot of how often each unique value occurs in a column.

    column: either a column name to look up in 'data', or an object that
            provides value_counts() itself (e.g. a pandas Series).
    data: DataFrame holding 'column'; required when 'column' is a name.
    title: optional title, drawn with the given 'fontsize'.
    ax: Axes to draw on; when omitted a new figure of size 'figsize' is created.
    explode: a single offset that is applied to every wedge.
    autopct, shadow: forwarded unchanged to the pie call.
    colors: explicit list of wedge colours, in value_counts() order.
    color_with_label: mapping of value -> colour; takes precedence over 'colors'.
            When neither is given, random colours are generated.

    raises: ValueError when 'column' is a name but no 'data' was supplied.
    returns: None, the plot is drawn as a side effect.
    """

    if isinstance(column, str):
        if data is None:
            raise ValueError(
                f"'data' is required when 'column' is given as a name ({column!r})"
            )
        target = data[column].value_counts()
    else:
        target = column.value_counts()

    # Same offset for every wedge.
    explode = [explode] * len(target)

    if color_with_label is not None:
        colors = [color_with_label[key] for key in target.index]
    elif colors is None:
        colors = [gen_color() for _ in range(len(target))]

    # Stand-alone use: create the figure and axes here so that a single
    # drawing path serves both cases.
    if ax is None:
        _, ax = plt.subplots(figsize=figsize)

    if title is not None:
        ax.set_title(title, fontsize=fontsize)

    ax.pie(target, labels=target.index, autopct=autopct, shadow=shadow, explode=explode, colors=colors)
        

def bar_plot(column, data=None, title=None, ax=None, fontsize=15, figsize=(10, 6), color='b'):
    """
    Draws a bar plot of how often each unique value occurs in a column.

    column: either a column name to look up in 'data', or an object that
            provides value_counts() itself (e.g. a pandas Series).
    data: DataFrame holding 'column'; required when 'column' is a name.
    title: optional title, drawn with the given 'fontsize'.
    ax: Axes to draw on; when omitted a new figure of size 'figsize' is created.
    color: bar colour, forwarded unchanged to the bar call.

    raises: ValueError when 'column' is a name but no 'data' was supplied.
    returns: None, the plot is drawn as a side effect.
    """

    if isinstance(column, str):
        if data is None:
            raise ValueError(
                f"'data' is required when 'column' is given as a name ({column!r})"
            )
        target = data[column].value_counts()
    else:
        target = column.value_counts()

    # Stand-alone use: create the figure and axes here so that a single
    # drawing path serves both cases.
    if ax is None:
        _, ax = plt.subplots(figsize=figsize)

    if title is not None:
        ax.set_title(title, fontsize=fontsize)

    ax.bar(target.index, target, color=color)
    

def compare_plots(shape, columns, titles=None, data=None, kind='pie', explode=0, color=None,
                  fontsize=15, autopct='%1.1f%%', figsize=(20, 10), shadow=None, color_with_label=None):
    """
    Draws a grid of pie or bar plots, one per entry of 'columns'.

    shape: (rows, cols) of the subplot grid.
    columns: column names (looked up in 'data') or Series, one per subplot.
    titles: optional list of titles, indexed in parallel with 'columns'.
    kind: 'pie' or 'bar'.
    color: for 'pie' it is passed on as the list of wedge colours; for 'bar'
           it is the bar colour and defaults to 'b'.
    explode, autopct, shadow, color_with_label: pie-only options, passed to pie_plot.
    fontsize, figsize: title font size and overall figure size.

    Grid cells beyond the number of 'columns' are hidden instead of failing.

    raises: TypeError when 'kind' is neither 'pie' nor 'bar' (checked before
            any figure is created).
    returns: None, the figure is drawn as a side effect.
    """

    # Validate first so that a bad 'kind' does not leave an empty figure behind.
    if kind not in ('pie', 'bar'):
        raise TypeError(f"unsupported kind {kind!r}; expected 'pie' or 'bar'")

    bar_color = 'b' if color is None else color

    # squeeze=False keeps 'axes' a 2-D array even for a (1, 1) grid, where
    # plt.subplots would otherwise return a bare Axes without .ravel().
    fig, axes = plt.subplots(*shape, figsize=figsize, squeeze=False)

    for i, ax in enumerate(axes.ravel()):
        if i >= len(columns):
            # More grid cells than columns: hide the unused cell.
            ax.set_visible(False)
            continue

        title = titles[i] if titles is not None else None

        if kind == 'pie':
            pie_plot(columns[i], data=data, title=title, ax=ax, fontsize=fontsize, colors=color,
                     autopct=autopct, figsize=figsize, explode=explode, shadow=shadow,
                     color_with_label=color_with_label)
        else:
            bar_plot(columns[i], data=data, title=title, ax=ax, fontsize=fontsize, figsize=figsize,
                     color=bar_color)

def prep_data(train_data, test_data):
    """
    Encodes the train and test frames together and splits them back apart.

    The function relies on names created in other cells of the notebook: the
    column lists (binary_data, ordinal_data, norminal_data, day_n_month), the
    ord_1..ord_4 mapper dicts, and the encoder/scaler objects
    (ordinal_encoder, ordinal_scaler, ohe_encoder, dm_scaler). Every encoder
    and scaler is re-fitted here on the combined train + test rows.

    returns: (train_matrix, test_matrix) as CSR sparse matrices whose columns
            are: binary + ordinal features, the one-hot encoded nominal
            features, then day and month.
    """

    n_train = train_data.shape[0]
    combined = pd.concat([train_data, test_data])

    leading_columns = binary_data + ordinal_data
    trailing_columns = day_n_month

    # Binary features: bin_3 (F/T) and bin_4 (N/Y) become 0/1 integers
    combined['bin_3'] = combined['bin_3'].replace(to_replace=['F', 'T'], value=['0', '1']).astype(int)
    combined['bin_4'] = combined['bin_4'].replace(to_replace=['Y', 'N'], value=['1', '0']).astype(int)

    # Ordinal features: ord_0 is shifted down by one, ord_1..ord_4 go through
    # their hand-written rank dictionaries
    combined['ord_0'] = combined['ord_0'].sub(1)

    rank_maps = {
        'ord_1': mapper_ord_1,
        'ord_2': mapper_ord_2,
        'ord_3': mapper_ord_3,
        'ord_4': mapper_ord_4,
    }
    for name, rank_map in rank_maps.items():
        combined[name] = combined[name].replace(rank_map)

    # ord_5 has high cardinality, so it is encoded by the ordinal encoder
    ord_5_column = combined['ord_5'].values.reshape(-1, 1)
    combined['ord_5'] = ordinal_encoder.fit_transform(ord_5_column)

    # Scale all ordinal columns together
    combined[ordinal_data] = ordinal_scaler.fit_transform(combined[ordinal_data])

    # One hot encoding of every column listed in norminal_data
    nominal_one_hot = ohe_encoder.fit_transform(combined[norminal_data].values)

    # Day and month: shift down by one, then scale
    for name in ('day', 'month'):
        combined[name] = combined[name] - 1

    combined[day_n_month] = dm_scaler.fit_transform(combined[day_n_month])

    # The dense parts are wrapped as sparse matrices so they can be stacked
    # next to the one-hot block
    leading_block = scipy.sparse.coo_matrix(
        combined.loc[:, leading_columns].to_numpy()
    ).astype('float64')
    trailing_block = scipy.sparse.coo_matrix(
        combined.loc[:, trailing_columns].to_numpy()
    ).astype('float64')

    stacked = scipy.sparse.hstack([leading_block, nominal_one_hot, trailing_block]).tocsr()

    # Rows keep the concat order: train rows first, test rows after
    return stacked[:n_train], stacked[n_train:]

# 2. Exploratory Data Analysis

The data-set has already been divided into two distinct sets - the train and test sets. We'll start by loading the training set and performing data analysis on it.


## 2.1. Overview of the data

The data-set consists of only categorical data, and can be said to be broken into 3 distinct parts

* **The Binary Data** - labelled bin_: These are categorical variables consisting of binary data, i.e. True/False, 1's/0's, Yes/No, etc.

* **The Ordinal Data** - labelled ord_: These are categorical variables consisting of ordered data. They are a finite list of categories with some form of order between them. As with the days of the week: what comes next after Tuesday?

* **The Nominal Data** - labelled nom_: These are categorical variables, finite like the ordinal data but with no order.

The day and month column of the data-set can be classified under the ordinal data.

In [None]:
# Load both splits with the 'id' column as the row index; the submission cell
# later reuses test_data.index as the ids of the predictions.
train_data = pd.read_csv("../input/cat-in-the-dat/train.csv", index_col="id")
test_data = pd.read_csv("../input/cat-in-the-dat/test.csv", index_col="id")

In [None]:
# Split the label off the features. The guard makes the cell safe to re-run:
# once 'target' has been removed from train_data, the existing train_target
# is kept instead of the cell failing with an AttributeError.
if 'target' in train_data.columns:
    train_target = train_data.pop('target')

In [None]:
# Preview the first rows of the feature frame.
train_data.head()

In [None]:
# Column labels together with the number of feature columns.
train_data.columns, len(train_data.columns)

In [None]:
# True when at least one feature row is an exact repeat of an earlier row.
train_data.duplicated().any()

## 2.2. The Categorical Variables

Let's list out all the information about the variables that we have


In [None]:
# Per-column dtype and non-null count.
train_data.info()

In [None]:
# Report how many distinct values each feature column holds.
for col, values in train_data.items():
    print(f"{col}: items_length = {len(values.unique())}")

In order for a model to effectively work with the data-set it has to be reduced/scaled to figures that are easy enough for the ML model to work with.

## 2.3. Binary Variables

We start with the binary variables

In [None]:
# Names of the five binary feature columns: bin_0 ... bin_4.
binary_data = ['bin_' + str(i) for i in range(5)]

# Independent working copy, so the recoding below leaves train_data untouched.
binary_categorical_data = train_data[binary_data].copy()

In [None]:
# Preview the binary columns as loaded, before bin_3 / bin_4 are recoded.
binary_categorical_data.head()

In [None]:
# bin_3 holds F/T and bin_4 holds N/Y; both are recoded to the integers 0/1.
for col, letters, digits in (
    ('bin_3', ['F', 'T'], ['0', '1']),
    ('bin_4', ['Y', 'N'], ['1', '0']),
):
    recoded = binary_categorical_data[col].replace(to_replace=letters, value=digits)
    binary_categorical_data[col] = recoded.astype(int)

In [None]:
# Preview the binary columns again, now that bin_3 / bin_4 have been recoded.
binary_categorical_data.head()

In [None]:
# One pie chart per binary column, with a fixed colour per class so that the
# five charts can be compared at a glance.
titles = ["ratio of 0's and 1's in " + b for b in binary_data]

colors = {0: "#FF5733", 1: "#2471A3"}

compare_plots(
    (1, 5),
    binary_data,
    titles=titles,
    data=binary_categorical_data,
    color_with_label=colors,
)

## 2.4.  Ordinal Variables

Given that ordinal data are ordered, we try to maintain this order and then scale the features.

In [None]:
# Names of the six ordinal feature columns: ord_0 ... ord_5.
ordinal_data = ['ord_' + str(i) for i in range(6)]

# Independent working copy, so the encoding below leaves train_data untouched.
ordinal_categorical_data = train_data[ordinal_data].copy()

# Show the distinct values of every ordinal column.
for ordinal in ordinal_data:
    print(f"\nfor {ordinal}: uniques are {ordinal_categorical_data[ordinal].unique()}\n    ")

In [None]:
# One bar chart per ordinal column, laid out on a 3 x 2 grid.
titles = ["Distribution in " + name for name in ordinal_data]

compare_plots(
    (3, 2),
    ordinal_data,
    titles=titles,
    data=ordinal_categorical_data,
    kind='bar',
    color="#2471A3",
)

In [None]:
# Every column is derived from train_data rather than from the working copy,
# so re-running this cell does not shift ord_0 a second time.
ordinal_categorical_data['ord_0'] = train_data['ord_0'] - 1

# Hand-written rank of each category, lowest first.
mapper_ord_1 = {'Novice': 0, 'Contributor': 1, 'Expert': 2, 'Master': 3, 'Grandmaster': 4}

mapper_ord_2 = {'Freezing': 0, 'Cold': 1, 'Warm': 2, 'Hot': 3,'Boiling Hot': 4, 'Lava Hot': 5}

# ord_3 uses the letters a..o and ord_4 the letters A..Z; the rank of a letter
# is its position in the alphabet.
mapper_ord_3 = {letter: rank for rank, letter in enumerate('abcdefghijklmno')}

mapper_ord_4 = {letter: rank for rank, letter in enumerate('ABCDEFGHIJKLMNOPQRSTUVWXYZ')}

for col, mapper in zip(['ord_1', 'ord_2', 'ord_3', 'ord_4'],
                       [mapper_ord_1, mapper_ord_2, mapper_ord_3, mapper_ord_4]):
    ordinal_categorical_data[col] = train_data[col].replace(mapper)

ordinal_categorical_data.head()

For ord_5 we have high cardinality. Let's apply `OrdinalEncoder` with `categories='auto'` to it.

In [None]:
# Credits for the ord_5 high cardinality code:
# https://www.kaggle.com/gogo827jz/catboost-baseline-with-feature-importance

ordinal_encoder = OrdinalEncoder(categories='auto')

# The encoder expects a 2-D input, hence the single-column matrix.
ord_5_matrix = ordinal_categorical_data[['ord_5']].values

ordinal_categorical_data['ord_5'] = ordinal_encoder.fit_transform(ord_5_matrix)

ordinal_categorical_data.head()

In [None]:
# Scale the six ordinal columns. The same scaler object is used again by
# prep_data, which re-fits it on the combined train + test rows.
ordinal_scaler = StandardScaler()

ordinal_categorical_data[ordinal_data] = ordinal_scaler.fit_transform(ordinal_categorical_data)

ordinal_categorical_data.head()

## 2.5.  Norminal Variables

The nominal data, on the other hand, is a totally different case. Since these variables are not ordered, either label encoding or one-hot encoding can be applied to them; the two are compared below.

In [None]:
# Names of the ten nominal feature columns: nom_0 ... nom_9.
norminal_data = ['nom_' + str(i) for i in range(10)]

norminal_categorical_data = train_data[norminal_data].copy()

# Cardinality of every nominal column.
for col in norminal_data:
    print(f"{col}: items_length = {len(pd.unique(train_data[col]))}")

## 2.6. Difference between Label Encoding and OneHotEncoding (OHE)

Although the label encoder and the one-hot encoder perform a similar function, they don't necessarily give the same result.

### 2.6.1. Label Encoding

Label encoding assigns each unique value a different integer. This encoding scheme assumes the categories are ordered.

![](https://miro.medium.com/max/996/1*K5JbqxIwwPmtiSNQhjLPRg.png)

### 2.6.2. OneHotEncoding

One-hot encoding creates new columns indicating the presence/absence of each possible value in the original variable. Unlike label encoding, one-hot encoding does not assume an ordering of the categories. Thus, you can expect this approach to work particularly well if there is no clear ordering in the categorical variables.

![](https://miro.medium.com/max/878/1*WXpoiS7HXRC-uwJPYsy1Dg.png)

For the nominal variables, we'll be making use of OHE...



In [None]:
# One-hot encode all ten nominal columns of the training rows. The same
# encoder object is used again by prep_data, which re-fits it on the combined
# train + test rows.
ohe_encoder = OneHotEncoder()

nom_0_9_matrix = norminal_categorical_data[norminal_data].values
ohe_trans = ohe_encoder.fit_transform(nom_0_9_matrix)

# Display the encoded matrix summary as the cell output.
ohe_trans

## 2.7. Day and Month Variables

Now let's evaluate the day and month variables

In [None]:
day_n_month = ['day', 'month']

# Independent working copy, so the scaling below leaves train_data untouched.
day_month_data = train_data[day_n_month].copy()

# Show the distinct values of both columns.
for dm in day_n_month:
    print(f"\nfor {dm}: uniques are {day_month_data[dm].unique()}\n    ")

In [None]:
# Shift both columns down by one. The values are derived from train_data
# rather than from the working copy, so re-running this cell does not shift
# them a second time.
day_month_data['day'] = train_data['day'] - 1
day_month_data['month'] = train_data['month'] - 1

day_month_data.head()

In [None]:
# Scale the day and month columns. The same scaler object is used again by
# prep_data, which re-fits it on the combined train + test rows.
dm_scaler = StandardScaler()

day_month_data[day_n_month] = dm_scaler.fit_transform(day_month_data)

day_month_data.head()

**Putting it all together!!!**

In [None]:
# prep_data receives private copies of both frames; what comes back is the
# pair of encoded sparse matrices, in the same row order as the frames.
train_d, test_d = prep_data(train_data.copy(), test_data.copy())

train_d.shape, test_d.shape

# Model Evaluation on Test-Set

We'll be making use of the Logistic classifier model for this evaluation.

I have already fine-tuned and tested the model, so I will just apply it to the data to get the result.

In [None]:
# Hyper-parameters come from the author's earlier tuning (not shown in this
# notebook). The model is trained on the encoded training matrix.
log_clf = LogisticRegression(C=0.1, max_iter=1000, n_jobs=-2)

log_clf.fit(train_d, train_target)

In [None]:
# Keep only the second probability column, i.e. the probability of the class
# listed second in log_clf.classes_.
predictions = log_clf.predict_proba(test_d)[:,1]

In [None]:
# Predicted probabilities as a positionally indexed Series.
target = pd.Series(predictions, name="target")

# Submission layout: one "target" column, indexed by the test ids under the
# index name "id".
result = pd.DataFrame(
    {"target": target.to_numpy()},
    index=pd.Index(test_data.index, name="id"),
)

# result.to_csv("cat-in-the-dat/my_submission.csv")

In [None]:
# Preview the first rows of the submission frame.
result.head()

# References

* https://www.kaggle.com/dkomyagin/cat-in-the-dat-0-80285-private-lb-solution
* https://www.kaggle.com/shahules/an-overview-of-encoding-techniques
* https://www.kaggle.com/prazhant/a-detailed-guide-to-different-encoding-schemes


### Previous Notebook

* [House Prices Prediction (Beginner)](https://www.kaggle.com/ganiyuolalekan/house-prices-prediction-beginner)

<br><br>
<b><a style="color: #283747" href="#Table-of-Content">Back to Top</a></b>