# Categorical Feature Encoding Challenge EDA for Everyone

### This is a basic EDA notebook for everyone (including beginners). If you think it's useful, please upvote. 😊

### I also shared [Top 1st Place Solution on Private LB Notebook](https://www.kaggle.com/werooring/top-1st-place-solution-on-private-lb)

- [Competition Link](https://www.kaggle.com/c/cat-in-the-dat/)  
- [Reference EDA notebook](https://www.kaggle.com/kabure/eda-feat-engineering-encode-conquer)

## Look Around Data

In [None]:
import numpy as np
import pandas as pd

# Load the competition files; the 'id' column is used as the index of each frame
train = pd.read_csv('/kaggle/input/cat-in-the-dat/train.csv', index_col='id')
test = pd.read_csv('/kaggle/input/cat-in-the-dat/test.csv', index_col='id')
submission = pd.read_csv('/kaggle/input/cat-in-the-dat/sample_submission.csv', index_col='id')

In [None]:
# (rows, columns) of the training and test sets
train.shape, test.shape

In [None]:
# Preview the first rows of the training data
train.head()

In [None]:
# Preview the first rows of the sample submission
submission.head()

### Create Feature Summary Table

#### Step 1: Create Data Frame by Feature

In [None]:
# One row per feature (the index holds the feature names), one column with its dtype
summary = pd.DataFrame(train.dtypes, columns=['dtypes'])
summary.head()

#### Step 2: Rename the column after resetting the index

In [None]:
# Move the feature names out of the index into a regular column,
# then give that column a descriptive name.
summary = summary.reset_index().rename(columns={'index': 'Feature'})
summary.head()

#### Step 3: Add missing-value counts, unique-value counts, and sample values from the first three rows

In [None]:
# Number of missing values per feature
summary['Missing'] = train.isnull().sum().values
# Number of distinct values per feature
summary['Uniques'] = train.nunique().values
# Sample values from the first three rows.
# iloc selects by position; loc would look up the *labels* 0, 1, 2 in the
# 'id' index, which only matches the first three rows if ids start at 0.
summary['First Value'] = train.iloc[0].values
summary['Second Value'] = train.iloc[1].values
summary['Third Value'] = train.iloc[2].values
summary.head()

#### Full code

In [None]:
# Step 1: Create a DataFrame with one row per feature and its dtype
summary = pd.DataFrame(train.dtypes, columns=['dtypes'])

# Step 2: Rename the column after resetting the index
# 2.1 Reset index (feature names move from the index into an 'index' column)
summary = summary.reset_index()
# 2.2 Rename column
summary = summary.rename(columns={'index': 'Feature'})

# Step 3: Add missing-value counts, unique-value counts, and sample values
# from the first three rows.
# Missing values by feature
summary['Missing'] = train.isnull().sum().values
# Unique values by feature
summary['Uniques'] = train.nunique().values
# Values of the first three rows.
# iloc selects by position; loc would look up the *labels* 0, 1, 2 in the
# 'id' index, which only matches the first three rows if ids start at 0.
summary['First Value'] = train.iloc[0].values
summary['Second Value'] = train.iloc[1].values
summary['Third Value'] = train.iloc[2].values

In [None]:
# Display the full feature summary table
summary

#### ord_0, ord_1, ord_2 unique values

In [None]:
# Print the distinct values of ord_0, ord_1 and ord_2
for i in range(0, 3):
    col = f'ord_{i}'
    print(f'{col} unique values: {train[col].unique()}')

#### ord_3, ord_4, ord_5 unique values

In [None]:
# Print the distinct values of ord_3, ord_4 and ord_5
for i in (3, 4, 5):
    col = f'ord_{i}'
    print(f'{col} unique values: {train[col].unique()}')

#### day, month, target unique values

In [None]:
# Distinct values of the two date features and of the target
print('day unique values:', train['day'].unique())
print('month unique values:', train['month'].unique())
print('target unique values:', train['target'].unique())

## Visualize Feature and Target Values

In [None]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

### Target Values Distribution

In [None]:
# Global font size for the plots that follow
mpl.rc('font', size=15)
# New 7x6 inch figure; the count plot below is drawn onto it
plt.figure(figsize=(7, 6))

# Bar chart of how many rows have each target value
ax = sns.countplot(data=train, x='target')
ax.set_title('Target Distribution');

In [None]:
# Inspect the geometry of the first bar drawn on the axes; these values are
# what the text placement in the following cells is computed from.
rectangle = ax.patches[0] # First Rectangle object
print('Rectangle Height:', rectangle.get_height())
print('Rectangle Width:', rectangle.get_width())
print('Rectangle x-axis of the left border:', rectangle.get_x())

In [None]:
# Where the percentage label will be placed for this bar:
# horizontally at the bar's centre, vertically 1500 units above its top.
print('x-coordinate of text:', rectangle.get_x() + rectangle.get_width()/2.0)
print('y-coordinate of text:', rectangle.get_height() + 1500)

In [None]:
mpl.rc('font', size=15)
plt.figure(figsize=(7, 6))

ax = sns.countplot(data=train, x='target')

# Number of training rows; also read later by write_percent()
total_size = len(train)

# Walk over the bars and write each one's share of the data above it
for bar in ax.patches:
    count = bar.get_height()                        # bar height == row count
    center_x = bar.get_x() + bar.get_width()/2.0    # horizontal centre of the bar
    share = count/total_size*100                    # percentage of all rows

    ax.text(x=center_x,
            y=count + 1500,                         # a little above the bar top
            s='{:1.1f}%'.format(share),
            ha='center')

ax.set_title('Target Distribution');

### Binary Features Distribution

#### Step 1: Prepare a Figure with n rows and m columns of subplots

In [None]:
mpl.rc('font', size=12)
figure, axes = plt.subplots(nrows=3, ncols=2) # Create a three-row, two-column Figure
figure.set_size_inches(10, 16) # Set Figure size
plt.subplots_adjust(wspace=0.4, hspace=0.3) # Set margins between subplots

#### Step 2: Assign Subplot to Each Axis
Step 2-1 : Create a function to obtain subplot rows and column positions

In [None]:
def get_row_col_idx(idx, num_cols=2):
    '''Return the (row, column) position of the idx-th subplot.

    Subplots are numbered left to right, top to bottom, starting at 0.

    idx      : zero-based subplot number
    num_cols : number of columns in the subplot grid (defaults to 2,
               the layout used throughout this notebook)
    '''
    # divmod gives (idx // num_cols, idx % num_cols) in one step
    return divmod(idx, num_cols)

Step 2-2 : Assign Binary Feature List

In [None]:
# Binary feature columns; each one gets its own subplot in the cells below
bin_cols = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']

Step 2-3 : Assign a subplot to each axis

In [None]:
mpl.rc('font', size=12)
figure, axes = plt.subplots(nrows=3, ncols=2)
figure.set_size_inches(10, 16)
plt.subplots_adjust(wspace=0.4, hspace=0.3)

# One count plot per binary feature, split by target value.
# Five features on a 3x2 grid: the last subplot stays empty.
for idx, col in enumerate(bin_cols):
    # (row, column) tuple indexes straight into the 2-D axes array
    ax = axes[get_row_col_idx(idx)]

    sns.countplot(data=train, x=col, hue='target',
                  palette='pastel',  # bar colours
                  ax=ax)

    ax.set_title(f'{col} Distribution by Target')

#### Step 3: Show Ratio on a Graph

Step 3-1 : Create a function that displays target values ratio at the top of the bar graph.

In [None]:
def write_percent(ax, total=None):
    '''Write each bar's share of the data (in percent) just above the bar.

    ax    : Axes whose ``patches`` are the bars to annotate
    total : denominator for the percentage. When omitted, the notebook-level
            ``total_size`` variable is used, exactly as before, so existing
            ``write_percent(ax)`` calls keep working.
    '''
    import math

    if total is None:
        # Falls back to the global assigned in the target-distribution cell;
        # that cell must have been run first.
        total = total_size

    for patch in ax.patches:
        height = patch.get_height()
        width = patch.get_width()

        # Skip patches that are not real bars: zero-width placeholders
        # (NOTE(review): some seaborn releases appear to add these for
        # legend handles - confirm against the installed version) and
        # bars whose height is NaN, which would render as 'nan%'.
        if width == 0 or math.isnan(height):
            continue

        left_coord = patch.get_x()
        percent = height/total*100

        ax.text(left_coord + width/2.0,      # horizontal centre of the bar
                height+500,                  # slightly above the bar top
                '{:1.1f}%'.format(percent),
                ha='center')

Step 3-2 : Plot a binary feature distribution by target value ratio

In [None]:
figure, axes = plt.subplots(nrows=3, ncols=2)
figure.set_size_inches(10, 16)
plt.subplots_adjust(wspace=0.4, hspace=0.3)

# Same grid of count plots as before, now with a percentage above every bar
for idx, col in enumerate(bin_cols):
    ax = axes[get_row_col_idx(idx)]

    sns.countplot(data=train, x=col, hue='target',
                  palette='pastel',
                  ax=ax)

    write_percent(ax)  # annotate bars before setting the title
    ax.set_title(f'{col} Distribution by Target')

### Nominal Features Distribution

#### Step 1: Create a crosstab generation function

In [None]:
# Row counts for every (nom_0 value, target value) combination
pd.crosstab(train['nom_0'], train['target'])

In [None]:
# normalize='index' turns each row into fractions that sum to 1
pd.crosstab(train['nom_0'], train['target'], normalize='index')

In [None]:
# Row-normalized crosstab scaled to percentages
crosstab = pd.crosstab(train['nom_0'], train['target'], normalize='index')*100
# The result of reset_index() is only displayed; `crosstab` itself is not reassigned
crosstab.reset_index()

In [None]:
# Percentage of rows with target == 1 for each nom_0 value
crosstab[1]

In [None]:
def get_crosstab(df, col):
    '''Return the per-value target percentages of feature `col`.

    Each row of the result describes one distinct value of `col`; the
    columns named after the target values hold the share (0-100) of rows
    with that target, and `col` itself is a regular column, not the index.
    '''
    row_fractions = pd.crosstab(df[col], df['target'], normalize='index')
    return (row_fractions * 100).reset_index()

#### Step 2: Create a point plot generation function

In [None]:
def plot_pointplot(ax, col, crosstab):
    '''Overlay the target-1 ratio of each category as a point plot.

    ax       : Axes that already holds the count plot for `col`
    col      : name of the feature column in `crosstab`
    crosstab : frame with a `col` column and a column labelled 1 holding
               the percentage of target == 1 per category
               (the shape produced by get_crosstab)
    '''
    # Second y-axis that shares ax's x-axis, so ratios get their own scale
    ax2 = ax.twinx()
    # Draw explicitly on the twin axis instead of relying on it being
    # matplotlib's "current" axes.
    sns.pointplot(x=col, y=1, data=crosstab,
                  order=crosstab[col].values, # Point Plot Order
                  color='black', # Point Plot Color
                  legend=False,
                  ax=ax2)
    ax2.set_ylim(crosstab[1].min()-5, crosstab[1].max()*1.1) # Set y-axis range
    ax2.set_ylabel("Target 1 Ratio(%)")

#### Step 3: Create a function that plots each feature's distribution together with its target-1 ratio point plot

In [None]:
def plot_cat_dist_with_true_ratio(df, cols, num_rows, num_cols, size=(15, 20)):
    '''Plot the distribution of each feature with its target-1 ratio overlaid.

    df       : DataFrame containing the features and a 'target' column
    cols     : feature column names, one subplot each
    num_rows : number of subplot rows
    num_cols : number of subplot columns
    size     : (width, height) of the whole figure in inches
    '''
    # squeeze=False always returns a 2-D axes array, so single-row and
    # single-column grids are indexed the same way as full grids.
    figure, axes = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False)
    figure.set_size_inches(size[0], size[1])

    for idx, col in enumerate(cols):
        crosstab = get_crosstab(df, col) # create crosstab

        # Fill the grid left to right, top to bottom, for any column count
        row_idx, col_idx = divmod(idx, num_cols)
        ax = axes[row_idx, col_idx]

        # Plot the frame passed in, not the notebook-level `train`
        sns.countplot(x=col, data=df,
                      order=crosstab[col].values,
                      color='skyblue',
                      ax=ax)

        # NOTE: write_percent divides by the notebook-level total_size, so
        # the percentages assume df has as many rows as the training set.
        write_percent(ax) # Indicate ratio by unique values

        plot_pointplot(ax, col, crosstab) # plot Pointplot

        ax.set_title(f'{col} Distribution') # Plot title

    plt.subplots_adjust(hspace=0.3, wspace=0.45)

In [None]:
nom_cols = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'] # Nominal features
# Five features on a 3x2 grid (the last subplot stays empty)
plot_cat_dist_with_true_ratio(train, nom_cols, num_rows=3, num_cols=2)

### Ordinal Features Distribution


ord_0, ord_1, ord_2, ord_3 distribution

In [None]:
ord_cols = ['ord_0', 'ord_1', 'ord_2', 'ord_3'] # Ordinal features
# Four features on a 2x2 grid
plot_cat_dist_with_true_ratio(train, ord_cols, 
                              num_rows=2, num_cols=2, size=(15, 12))

In [None]:
from pandas.api.types import CategoricalDtype 

# ord_1: values listed in ascending order, then cast to an ordered categorical
ord_1_value = ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster']
ord_1_dtype = CategoricalDtype(categories=ord_1_value, ordered=True)
train['ord_1'] = train['ord_1'].astype(ord_1_dtype)

# ord_2: same treatment
ord_2_value = ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot']
ord_2_dtype = CategoricalDtype(categories=ord_2_value, ordered=True)
train['ord_2'] = train['ord_2'].astype(ord_2_dtype)

In [None]:
# Redraw the ordinal plots now that ord_1 and ord_2 are ordered categoricals
plot_cat_dist_with_true_ratio(train, ord_cols, num_rows=2, num_cols=2, size=(15, 12))

ord_4, ord_5 distribution

In [None]:
# Two features stacked vertically in a single column of subplots
plot_cat_dist_with_true_ratio(train, ['ord_4', 'ord_5'], num_rows=2, num_cols=1, size=(15, 12))

### Date Features Distribution

In [None]:
# Date features, one subplot per feature stacked vertically
date_cols = ['day', 'month']
plot_cat_dist_with_true_ratio(train, date_cols, num_rows=2, num_cols=1, size=(15, 12))