Using this [dataset](https://www.kaggle.com/shivan118/churn-modeling-dataset), I'm going to try several resampling approaches for making predictions on an imbalanced dataset.

Using the nb_black formatter.

In [None]:
!pip install nb_black -q

In [None]:
%load_ext nb_black

# Importing dataset and mini-EDA

In [None]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import plotly.express as px
import plotly.figure_factory as ff
import os
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Load the churn CSV, then drop the RowNumber, CustomerId and Surname columns
# before previewing the first rows.
data = pd.read_csv("/kaggle/input/churn-modeling-dataset/Churn_Modelling.csv")
data = data.drop(columns=["RowNumber", "CustomerId", "Surname"])
data.head()

In [None]:
def plot_hist(data, column):
    """Plot a numerical column.

    Shows a histogram of ``column`` coloured by the ``Exited`` column, followed
    by a table with the column's ``describe()`` statistics laid out in one row.
    """
    histogram = px.histogram(data, x=column, color="Exited")
    histogram.show()

    # describe() yields a Series; turn it into a single-row frame for the table.
    summary = data[column].describe().to_frame().T
    stats_table = ff.create_table(summary)
    stats_table.show()


def plot_count(data, column):
    """Plot a categorical column.

    Counts the ``Exited`` values within each category of ``column`` and shows
    them as a grouped bar chart, with each bar labelled by its count.
    """
    counts = (
        data.groupby(column)["Exited"]
        .value_counts()
        # Name the counts before flattening the index so the resulting column
        # cannot clash with the "Exited" index level.
        .rename("Count")
        .reset_index()
    )
    chart = px.bar(
        counts, x=column, y="Count", color="Exited", text="Count", barmode="group"
    )
    chart.show()

# Data profile

- There are 10 feature columns plus the target;
- No missing values;
- Exited column is the target;

### Columns meaning
- CreditScore: The customer's credit score;
- Geography: The customer's country;
- Gender: The customer's sex;
- Age: The customer's age;
- Tenure: How long the person has been a customer;
- Balance: How much money the customer has in the bank;
- NumOfProducts: How many products the customer uses;
- HasCrCard: Does the customer have a credit card?
- IsActiveMember: Is the customer an active member?
- EstimatedSalary: What is the customer's estimated salary?
- Exited: Client churn flag


In [None]:
# Summarise the frame: column names, dtypes and non-null counts.
data.info()

# EDA

## CreditScore

In [None]:
# CreditScore histogram coloured by Exited, followed by its describe() table.
plot_hist(data, "CreditScore")

## Geography

In [None]:
# Grouped bar chart of Exited counts for each Geography value.
plot_count(data, "Geography")

## Gender           

In [None]:
# Grouped bar chart of Exited counts for each Gender value.
plot_count(data, "Gender")

## Age

In [None]:
# Age histogram coloured by Exited, followed by its describe() table.
plot_hist(data, "Age")

## Tenure

In [None]:
# Tenure histogram coloured by Exited, followed by its describe() table.
plot_hist(data, "Tenure")

## Balance          

In [None]:
# Balance histogram coloured by Exited, followed by its describe() table.
plot_hist(data, "Balance")

## NumOfProducts

In [None]:
# NumOfProducts histogram coloured by Exited, followed by its describe() table.
# NOTE(review): presumably a small integer count, so plot_count may read
# better than a histogram here — confirm.
plot_hist(data, "NumOfProducts")

## HasCrCard

In [None]:
# HasCrCard histogram coloured by Exited, followed by its describe() table.
# NOTE(review): the column is described as a yes/no question, so it is
# presumably a 0/1 flag; plot_count may be the more natural view — confirm.
plot_hist(data, "HasCrCard")

## IsActiveMember

In [None]:
# IsActiveMember histogram coloured by Exited, followed by its describe() table.
# NOTE(review): the column is described as a yes/no question, so it is
# presumably a 0/1 flag; plot_count may be the more natural view — confirm.
plot_hist(data, "IsActiveMember")

## EstimatedSalary

In [None]:
# EstimatedSalary histogram coloured by Exited, followed by its describe() table.
plot_hist(data, "EstimatedSalary")

# Data formatting

- StandardScaler -> Standardize features by removing the mean and scaling to unit variance. The standard score of a sample x is calculated as z = (x - u) / s, where u is the mean and s is the standard deviation of the feature.

- LabelEncoder -> Encode target labels with value between 0 and n_classes-1.

- OneHotEncoder -> Encode categorical features as a one-hot numeric array.



In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder

enc = OneHotEncoder(handle_unknown="ignore")
stander_scaler = StandardScaler()
label_encoder = LabelEncoder()

# Build the feature matrix by stacking four column groups side by side:
# one-hot Geography, standardized numeric columns, label-encoded Gender and
# the two columns that are used as they are.
# NOTE(review): the encoders and the scaler are fitted on the full dataset,
# i.e. before any train/test split — confirm this is acceptable for the
# comparison being made.
X = np.concatenate(
    (
        ## OneHotEncoder: one indicator column per Geography category
        enc.fit_transform(data[["Geography"]]).toarray(),
        ## Standard Scaler
        stander_scaler.fit_transform(
            data[
                [
                    "CreditScore",
                    "Age",
                    "Tenure",
                    "Balance",
                    "NumOfProducts",
                    "EstimatedSalary",
                ]
            ]
        ),
        ## LabelEncoder works on a 1-D array of labels, so pass the Series
        ## (not a one-column frame) and reshape the result into a column.
        label_encoder.fit_transform(data["Gender"]).reshape(-1, 1),
        ## No formatting
        data[["HasCrCard", "IsActiveMember"]].values,
    ),
    axis=1,
)

# Target vector: the churn flag.
y = data.Exited.values
X.shape

Getting the names of our new columns after the transformation...

In [None]:
# Labels for the transformed table: the fitted one-hot category names first,
# then the scaled numeric columns, Gender, the two untouched columns and the
# target.
columns = [
    *enc.categories_[0],
    "CreditScore",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "EstimatedSalary",
    "Gender",
    "HasCrCard",
    "IsActiveMember",
    "Exited",
]

## Correlation Matrix

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Put the features and the target into one labelled frame and compute the
# pairwise correlations between all of its columns.
table = pd.DataFrame(np.concatenate([X, y.reshape(-1, 1)], axis=1))
table.columns = columns
table = table.corr()
with sns.axes_style("white"):
    # Boolean mask that hides the diagonal and the upper triangle, which only
    # mirror the lower triangle of the symmetric correlation matrix.
    mask = np.triu(np.ones_like(table, dtype=bool))
    plt.figure(figsize=(10, 10))
    sns.heatmap(
        table.round(2),
        cmap="Reds",
        mask=mask,
        vmax=table.max().max(),
        vmin=table.min().min(),
        linewidths=0.5,
        annot=True,
        annot_kws={"size": 12},
    ).set_title("Correlation Matrix - Churn Modelling dataset")

# Train ANN

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv2D
import keras


def get_model(input_dim=12):
    """Build the uncompiled fully connected binary classifier.

    The network is a stack of ReLU ``Dense`` layers (200, 150, then six of
    100 units) with three ``Dropout(0.2)`` layers in between, ending in a
    single sigmoid unit.

    Parameters
    ----------
    input_dim : int, default 12
        Number of input features fed to the first layer. The default keeps the
        width that was previously hard-coded (presumably the number of columns
        of the ``X`` matrix built earlier in this notebook — confirm).

    Returns
    -------
    Sequential
        A new, uncompiled model; every call creates fresh weights.
    """
    return Sequential(
        [
            Dense(units=200, input_dim=input_dim, activation="relu"),
            Dense(150, activation="relu"),
            Dropout(0.2),
            Dense(100, activation="relu"),
            Dense(100, activation="relu"),
            Dropout(0.2),
            Dense(100, activation="relu"),
            Dense(100, activation="relu"),
            Dense(100, activation="relu"),
            Dropout(0.2),
            Dense(100, activation="relu"),
            # Single sigmoid output for the binary target.
            Dense(1, activation="sigmoid"),
        ]
    )


def train_ann(X, y):
    """Train a fresh model on (X, y) and display its hold-out metrics.

    The data is split 90/10 with a fixed seed, a new model from ``get_model``
    is trained for 50 epochs on the 90% part, and the loss, MSE and accuracy
    measured on the remaining 10% are shown as a one-row plotly table.

    Parameters
    ----------
    X : array-like
        Feature matrix.
    y : array-like
        Binary target values aligned with the rows of ``X``.

    Returns
    -------
    None
        The metrics are only displayed, not returned.

    NOTE(review): the split happens inside this function, so when the caller
    passes resampled data the held-out 10% contains resampled rows as well —
    consider splitting before resampling.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42
    )

    model = get_model()

    model.compile(
        optimizer="adam", loss="binary_crossentropy", metrics=["mse", "accuracy"],
    )

    # Training. The per-epoch history is not kept and nothing is printed, so no
    # validation data is passed: the hold-out metrics come from evaluate() below.
    model.fit(
        X_train,
        y_train,
        batch_size=10,
        epochs=50,
        verbose=0,
    )

    # evaluate() returns the loss followed by the compiled metrics, in order.
    loss, mse, acc = model.evaluate(X_test, y_test, verbose=0)
    fig = ff.create_table(
        pd.DataFrame([(loss, mse, acc)], columns=["Loss", "MSE", "Accuracy"]),
    )
    fig.show()

# Over-sampling

## SMOTE
Class to perform over-sampling using SMOTE.

This object is an implementation of SMOTE (Synthetic Minority Over-sampling Technique) as presented in [Chawla et al., 2002](https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html#r001eabbe5dd7-1).

Read more in the [User Guide](https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html#smote-adasyn).



In [None]:
from imblearn.over_sampling import SMOTE

# Keep the resampled arrays under their own names: rebinding X and y here would
# make every later resampling cell start from SMOTE's output instead of the
# original imbalanced data.
X_res, y_res = SMOTE(random_state=42).fit_resample(X, y)

train_ann(X_res, y_res)

## RandomOverSampler

Class to perform random over-sampling.

Object to over-sample the minority class(es) by picking samples at random with replacement.

Read more in the [User Guide](https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html#random-over-sampler).

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Resample into separate names so that X and y are not overwritten and the
# resampling cells that follow do not inherit this cell's output.
X_res, y_res = RandomOverSampler(random_state=42).fit_resample(X, y)

train_ann(X_res, y_res)

## BorderlineSMOTE

Over-sampling using Borderline SMOTE.

This algorithm is a variant of the original SMOTE algorithm, proposed in [the Borderline-SMOTE paper](https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.BorderlineSMOTE.html#r63962efaf197-2). Borderline samples are detected and used to generate new synthetic samples.

In [None]:
from imblearn.over_sampling import BorderlineSMOTE

# Resample into separate names so that X and y are not overwritten and the
# resampling cells that follow do not inherit this cell's output.
X_res, y_res = BorderlineSMOTE(random_state=42).fit_resample(X, y)

train_ann(X_res, y_res)

## ADASYN
Perform over-sampling using Adaptive Synthetic (ADASYN) sampling approach for imbalanced datasets.

Read more in the [User Guide](https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html#smote-adasyn).

In [None]:
from imblearn.over_sampling import ADASYN

# Resample into separate names so that X and y are not overwritten and the
# resampling cells that follow do not inherit this cell's output.
X_res, y_res = ADASYN(random_state=42).fit_resample(X, y)

train_ann(X_res, y_res)

## KMeansSMOTE
Apply KMeans clustering before over-sampling with SMOTE.

This is an implementation of the algorithm described in [the KMeans-SMOTE paper](https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.KMeansSMOTE.html#rea5937a049dc-1).

Read more in the [User Guide](https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html#smote-adasyn).

In [None]:
from imblearn.over_sampling import KMeansSMOTE

# Resample into separate names so that X and y are not overwritten and the
# resampling cells that follow do not inherit this cell's output.
# NOTE(review): with its default settings KMeansSMOTE may fail to find a
# cluster with enough minority samples on imbalanced input; if it raises,
# tune cluster_balance_threshold or the KMeans estimator — confirm on this data.
X_res, y_res = KMeansSMOTE(random_state=42).fit_resample(X, y)

train_ann(X_res, y_res)

## SVMSMOTE
Over-sampling using SVM-SMOTE.

Variant of the SMOTE algorithm which uses an SVM to detect the samples used for generating new synthetic samples, as proposed in [the SVM-SMOTE paper](https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SVMSMOTE.html#r88acb9955f91-2).

Read more in the [User Guide](https://imbalanced-learn.readthedocs.io/en/stable/over_sampling.html#smote-adasyn).

In [None]:
from imblearn.over_sampling import SVMSMOTE

# Resample into separate names so that X and y are not overwritten and the
# resampling cells that follow do not inherit this cell's output.
X_res, y_res = SVMSMOTE(random_state=42).fit_resample(X, y)

train_ann(X_res, y_res)

# Under-Sampling

## ClusterCentroids
Perform under-sampling by generating centroids based on clustering methods.

Method that under samples the majority class by replacing a cluster of majority samples by the cluster centroid of a KMeans algorithm. This algorithm keeps N majority samples by fitting the KMeans algorithm with N cluster to the majority class and using the coordinates of the N cluster centroids as the new majority samples.

In [None]:
from imblearn.under_sampling import ClusterCentroids

# Resample into separate names so that X and y are not overwritten and the
# resampling cells that follow do not inherit this cell's output.
X_res, y_res = ClusterCentroids(random_state=42).fit_resample(X, y)

train_ann(X_res, y_res)

## AllKNN

Class to perform under-sampling based on the AllKNN method.


In [None]:
from imblearn.under_sampling import AllKNN

# Resample into separate names so that X and y are not overwritten and the
# resampling cells that follow do not inherit this cell's output.
X_res, y_res = AllKNN().fit_resample(X, y)

train_ann(X_res, y_res)

## NeighbourhoodCleaningRule

Class performing under-sampling based on the neighbourhood cleaning rule.

Read more in the [User Guide](https://imbalanced-learn.readthedocs.io/en/stable/under_sampling.html#condensed-nearest-neighbors).


In [None]:
from imblearn.under_sampling import NeighbourhoodCleaningRule

# Resample into separate names so that X and y are not overwritten and the
# resampling cells that follow do not inherit this cell's output.
X_res, y_res = NeighbourhoodCleaningRule().fit_resample(X, y)

train_ann(X_res, y_res)

## RandomUnderSampler
Class to perform random under-sampling.

Under-sample the majority class(es) by randomly picking samples with or without replacement.

Read more in the [User Guide](https://imbalanced-learn.readthedocs.io/en/stable/under_sampling.html#controlled-under-sampling).

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Resample into separate names so that X and y are not overwritten by this
# cell. The seed matches the other random samplers in this notebook so the
# selection is reproducible.
X_res, y_res = RandomUnderSampler(random_state=42).fit_resample(X, y)

train_ann(X_res, y_res)