In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for folder, _, files in os.walk('/kaggle/input'):
    for name in files:
        # Print the full path of every file found below the input directory
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the BankChurners CSV from the Kaggle input directory and preview the first rows
csv_path = '/kaggle/input/credit-card-customers/BankChurners.csv'
raw_data = pd.read_csv(csv_path)
raw_data.head()

# The dataset

## What the owner of the dataset says

* **CLIENTNUM**: Client number. Unique identifier for the customer holding the account
* **Attrition_Flag**: Internal event (customer activity) variable - if the account is closed then 1 else 0 (**Target**)
* **Customer_Age**: Demographic variable - Customer's Age in Years
* **Gender**: Demographic variable - M=Male, F=Female
* **Dependent_count**: Demographic variable - Number of dependents
* **Education_Level**: Demographic variable - Educational Qualification of the account holder (example: high school, college graduate, etc.)
* **Marital_Status**: Demographic variable - Married, Single, Divorced, Unknown
* **Income_Category**: Demographic variable - Annual Income Category of the account holder (< $40K, $40K - 60K, $60K - $80K, $80K-$120K, > $120K, Unknown)
* **Card_Category**: Product Variable - Type of Card (Blue, Silver, Gold, Platinum)
* **Months_on_book**: Period of relationship with bank
* **Total_relationship_count**: Total no. of products held by the customer
* **Months_Inactive_12_mon**: No. of months inactive in the last 12 months
* **Contacts_Count_12_mon**: No. of Contacts in the last 12 months
* **Credit_Limit**: Credit Limit on the Credit Card
* **Total_Revolving_Bal**: Total Revolving Balance on the Credit Card
* **Avg_Open_To_Buy**: Open to Buy Credit Line (Average of last 12 months)
* **Total_Amt_Chng_Q4_Q1**: Change in Transaction Amount (Q4 over Q1)
* **Total_Trans_Amt**: Total Transaction Amount (Last 12 months)
* **Total_Trans_Ct**: Total Transaction Count (Last 12 months)
* **Total_Ct_Chng_Q4_Q1**: Change in Transaction Count (Q4 over Q1)
* **Avg_Utilization_Ratio**: Average Card Utilization Ratio

And then the owner advises not to use the last 2 columns.

## Data preprocessing

* Drop the CLIENTNUM column and the last 2 columns, as it makes no sense to use them in the model
* Let's see the describe method and check for NaN and/or strange values
* Map the Attrition_Flag to 1: churned, 0: active; Gender to 0: male, 1: female (Let's do this before the get_dummies method, so it's not necessary to create dummies for them)
* One-hot encoding for the Education_Level, Marital_Status, Income_Category and Card_Category columns

In [None]:
# Work on a copy so raw_data stays untouched; discard the first column and the
# final two columns, none of which are used by the model
unused_columns = list(raw_data.columns[[0, -2, -1]])
df = raw_data.copy().drop(columns=unused_columns)
# Summary statistics for every column, numeric and categorical alike
df.describe(include='all')

In [None]:
# Number of missing (NaN) entries in each column
df.isnull().sum(axis=0)

In [None]:
# Mapping the Attrition_Flag and Gender columns to integer codes.
# The labels are read from raw_data rather than from df so that re-running this
# cell is harmless: mapping the already-encoded df columns a second time would
# turn every entry into NaN, because 0 and 1 are not keys of the dictionaries.
attrition_codes = {'Existing Customer': 0, 'Attrited Customer': 1}
gender_codes = {'M': 0, 'F': 1}

df['Attrition_Flag'] = raw_data['Attrition_Flag'].map(attrition_codes)
df['Gender'] = raw_data['Gender'].map(gender_codes)

# A label missing from the dictionaries would silently become NaN, so check for it
assert df[['Attrition_Flag', 'Gender']].notna().all().all(), 'unmapped category found'

df.head()

In [None]:
# Checking the share of "Unknown" values in the education level, marital status and income columns
def unknown_share(column):
    """Return the fraction of non-null entries in `column` equal to 'Unknown'.

    Comparing and summing (instead of indexing value_counts() by 'Unknown')
    yields 0.0 rather than a KeyError when the column holds no 'Unknown' entry,
    e.g. when this cell is re-run after the unknowns have been replaced.
    """
    return (column == 'Unknown').sum() / column.count()


ed_level_unknown = unknown_share(df['Education_Level'])
marital_unknown = unknown_share(df['Marital_Status'])
# Fixed: this used to count the 'Unknown' rows of Marital_Status by mistake
income_unknown = unknown_share(df['Income_Category'])

# The values are fractions, so format them as percentages to match the labels
print(f'Education Level % of unknown values: {ed_level_unknown:.2%}')
print(f'Marital Status % of unknown values: {marital_unknown:.2%}')
print(f'Income Category % of unknown values: {income_unknown:.2%}')

In [None]:
# Let's check the distribution of values on those categories
# Three stacked panels sharing the y axis so the counts are directly comparable
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharey=True, figsize=(12, 18))

panels = [
    (ax1, 'Income_Category', 'Income Category'),
    (ax2, 'Marital_Status', 'Marital Status'),
    (ax3, 'Education_Level', 'Education Level'),
]
for ax, column, title in panels:
    ax.hist(df[column])
    ax.set_title(title)

# plt.show() instead of fig.show(): with the inline notebook backend fig.show()
# only emits a "non-GUI backend" warning, while plt.show() renders the figure
plt.show()

In [None]:
# Fold the 'Unknown' entries of each column into that column's most frequent value
for column in ('Education_Level', 'Marital_Status', 'Income_Category'):
    most_frequent = df[column].mode()[0]
    df[column] = df[column].replace('Unknown', most_frequent)

In [None]:
# Getting dummies for the categorical columns: Education_Level, Marital_Status, Income_Category and Card_Category
# dtype=float is explicit because pandas 2.x emits boolean dummy columns by
# default; a frame mixing bool and numeric columns converts to an object array,
# which the Keras normalization layer cannot consume further down.
df = pd.get_dummies(df, dtype=float)
print(df.columns)
df.head()

# The dummies are good!

All the columns we wanted dummies for are now one-hot encoded. But there's one caveat: the "Unknown" values for education level, marital status and income were replaced with each column's most frequent category before encoding. I wonder how that's going to impact the model. Speaking of it, let's create it!

In [None]:
# 80/20 split: draw the training rows at random (fixed seed), keep the rest for testing
train_dataset = df.sample(frac=0.8, random_state=0)
test_dataset = df.drop(index=train_dataset.index)

# Print mean and std of every column for both subsets to compare their distributions
for subset in (train_dataset, test_dataset):
    print(subset.describe().T[['mean', 'std']])

In [None]:
# Separate the target column (Attrition_Flag) from the input features;
# train_dataset and test_dataset themselves are left unchanged
train_labels = train_dataset['Attrition_Flag'].copy()
test_labels = test_dataset['Attrition_Flag'].copy()

train_features = train_dataset.drop(columns='Attrition_Flag')
test_features = test_dataset.drop(columns='Attrition_Flag')

In [None]:
# Normalization layer fitted on the training features only,
# so no statistics from the test set leak into the model
normalizer = preprocessing.Normalization()
normalizer.adapt(train_features.to_numpy())

In [None]:
# Model
# Fix TensorFlow's global seed so weight initialisation and batch shuffling are
# repeatable between runs (as far as TensorFlow allows); the train/test split
# already uses random_state=0.
tf.random.set_seed(0)

width = 5  # number of units in each hidden layer

# Adapted normalizer -> two softplus hidden layers -> one output unit with no activation
model = keras.models.Sequential([
    normalizer,
    layers.Dense(units=width, activation='softplus'),
    layers.Dense(units=width, activation='softplus'),
    layers.Dense(units=1),
])

model.summary()

# Compiling and Fitting

# Training hyper-parameters
learning_rate = 0.005
epochs = 50
batch_size = 100

# NOTE(review): the target is a 0/1 flag, yet the model is trained as a regression
# (MSE on an output with no activation). A sigmoid output with
# keras.losses.BinaryCrossentropy() is the usual choice for churn classification;
# left unchanged here so the reported numbers stay comparable.
loss = keras.losses.MeanSquaredError()
# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# current Keras releases no longer accept.
optimizer = keras.optimizers.Adam(learning_rate=learning_rate)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy']
)

# 20% of the training rows are held out for validation; verbose=2 prints one line per epoch
history = model.fit(
    train_features,
    train_labels,
    validation_split=0.2,
    verbose=2,
    epochs=epochs,
    batch_size=batch_size
)

# Training and validation loss per epoch, drawn on the same axes
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve], label=curve)
plt.xlabel('Epoch')
plt.ylabel('Error [Attrition_Flag]')
plt.grid(True)
plt.legend()

In [None]:
# Score the trained model on the held-out test set; the cell output is the
# value returned by evaluate (the loss and the compiled metrics)
model.evaluate(test_features, test_labels, verbose=2)