### Import Dependencies

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

| Variable Type | Preferred Encoding | Why?|
|-----|-----|----|
| Nominal | One-Hot Encoding | No inherent order -> avoids implying false ordinal relationships|
| Ordinal | Label Encoding | Preserves order -> small integers represent increasing levels

- Gender -> Nominal
- Geography -> Nominal
- CreditScoreBins -> Ordinal

In [2]:
df = pd.read_csv('data/processed/ChurnModelling_Binning_Applied.csv')
df.head()

Unnamed: 0,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins
0,France,Female,42.0,2,0.0,1,1,1,101348.88,1,Fair
1,Spain,Female,41.0,1,83807.86,1,0,1,112542.58,0,Fair
2,France,Female,42.0,8,159660.8,3,1,0,113931.57,1,Poor
3,France,Female,38.91,1,0.0,2,0,0,93826.63,0,Good
4,Spain,Female,43.0,2,125510.82,1,1,1,79084.1,0,Excellent


### Nominal Variables

In [3]:
nominal_variables = ['Geography, Gender']

geography_dummies = pd.get_dummies(df['Geography'], prefix='Geography')
gender_dummies = pd.get_dummies(df['Gender'], prefix='Gender')

df_encoded = pd.concat([df, geography_dummies], axis=1)
del df_encoded['Geography']

df_encoded = pd.concat([df_encoded, gender_dummies], axis=1)
del df_encoded['Gender']

df_encoded

Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,42.00,2,0.00,1,1,1,101348.88,1,Fair,True,False,False,True,False
1,41.00,1,83807.86,1,0,1,112542.58,0,Fair,False,False,True,True,False
2,42.00,8,159660.80,3,1,0,113931.57,1,Poor,True,False,False,True,False
3,38.91,1,0.00,2,0,0,93826.63,0,Good,True,False,False,True,False
4,43.00,2,125510.82,1,1,1,79084.10,0,Excellent,False,False,True,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,39.00,5,0.00,2,1,0,96270.64,0,Very Good,True,False,False,False,True
9996,35.00,10,57369.61,1,1,1,101699.77,0,Poor,True,False,False,False,True
9997,36.00,7,0.00,1,0,1,42085.58,1,Good,True,False,False,True,False
9998,42.00,3,75075.31,2,1,0,92888.52,1,Very Good,False,True,False,False,True


### Ordinal Variables

In [4]:
encode_dict_creditscore = {
  'Poor': 0,
  'Fair': 1,
  'Good': 2,
  'Very Good': 3,
  'Excellent': 4            
}

df_encoded['CreditScoreBins'] = df_encoded['CreditScoreBins'].map(encode_dict_creditscore)
df_encoded.head(10)

Unnamed: 0,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,CreditScoreBins,Geography_France,Geography_Germany,Geography_Spain,Gender_Female,Gender_Male
0,42.0,2,0.0,1,1,1,101348.88,1,1,True,False,False,True,False
1,41.0,1,83807.86,1,0,1,112542.58,0,1,False,False,True,True,False
2,42.0,8,159660.8,3,1,0,113931.57,1,0,True,False,False,True,False
3,38.91,1,0.0,2,0,0,93826.63,0,2,True,False,False,True,False
4,43.0,2,125510.82,1,1,1,79084.1,0,4,False,False,True,True,False
5,44.0,8,113755.78,2,1,0,149756.71,1,1,False,False,True,False,True
6,50.0,7,0.0,2,1,1,10062.8,0,4,True,False,False,False,True
7,29.0,4,115046.74,4,1,0,119346.88,1,0,False,True,False,True,False
8,44.0,4,142051.07,2,0,1,74940.5,0,0,True,False,False,False,True
9,27.0,2,134603.88,1,1,1,71725.73,0,2,True,False,False,False,True


In [5]:
df_encoded.to_csv(
    'data/processed/ChurnModelling_Encoded.csv',index=False)