# Churn Prediction and Exploratory Data Analysis

In [None]:
# Load Kaggle file

! pip install -q kaggle

from google.colab import files

files.upload()

! mkdir ~/.kaggle

! cp kaggle.json ~/.kaggle/

! chmod 600 ~/.kaggle/kaggle.json

# Download dataset from kaggle

! kaggle datasets download 'radheshyamkollipara/bank-customer-churn'

! mkdir Data

! unzip bank-customer-churn.zip -d Data

In [None]:
import pandas as pd

# Read the customer records extracted into ./Data
df = pd.read_csv("./Data/Customer-Churn-Records.csv")

# Missing values: absolute count per column, plus the percentage for the
# columns that actually have gaps
missing_counts = df.isna().sum()
missing_percentage = missing_counts / len(df) * 100
missing_percentage = missing_percentage[missing_percentage != 0]
print(missing_counts, missing_percentage)

# Drop the columns that are not used in the rest of the notebook
df = df.drop(columns=["Surname", "CustomerId", "RowNumber"])

In [None]:
# Number of rows per value of the Exited target column
exited_counts = df["Exited"].value_counts()
print(exited_counts)

In [None]:
# Column groups used by the plotting cells: numerical vs. categorical fields
numerical = [
    "CreditScore",
    "Age",
    "Tenure",
    "Balance",
    "NumOfProducts",
    "EstimatedSalary",
    "Satisfaction Score",
    "Point Earned",
]
categorical = [
    "Geography",
    "Gender",
    "Card Type",
    "IsActiveMember",
    "Complain",
]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib and seaborn graphs style

sns.set(style="darkgrid", context="paper")
plt.style.use("dark_background")
plt.rcParams.update({"grid.linewidth": 0.5, "grid.alpha": 0.5})

%matplotlib inline

In [None]:
# Bar chart of the Exited column (how many customers have left)

sns.set_context("paper")
pal = sns.color_palette("rocket", n_colors=2)  # one colour per bar
ax = sns.countplot(x="Exited", data=df, palette=pal)

# Write each bar's count above it
for bars in ax.containers:
    ax.bar_label(bars)

In [None]:
# Histograms for all the numerical fields

# One colour per histogram, consumed in order by next(pal)
pal = iter(sns.color_palette("rocket", n_colors=len(numerical)))

# Two rows of panels. The column count is rounded UP so that an odd number of
# fields still fits in the grid (rounding down would make plt.subplot raise
# on the last index).
n_cols = (len(numerical) + 1) // 2

plt.figure(figsize=(14, 8))
for i, col in enumerate(numerical):
    plt.subplot(2, n_cols, i + 1)
    sns.histplot(
        x=str(col),
        data=df,
        color=next(pal),
        bins=20
    )
plt.tight_layout()

In [None]:
# Count plots for the categorical fields, laid out on a two-row grid

n_cols = len(categorical) // 2 + 1

plt.figure(figsize=(10, 8))
for position, col in enumerate(categorical, start=1):
    # One colour per distinct value of the current field
    pal = sns.color_palette("rocket", n_colors=df[col].nunique(dropna=False))
    ax = plt.subplot(2, n_cols, position)
    sns.countplot(data=df, x=str(col), palette=pal, ax=ax)
    # Write each bar's count above it
    for bars in ax.containers:
        ax.bar_label(bars)
    # Replace any legend on this panel with an empty, frameless one
    ax.legend([], [], frameon=False)
plt.tight_layout()

In [None]:
# Heatmap of the pairwise correlations between the numeric columns

correlation_matrix = df.corr(numeric_only=True)

plt.figure(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,  # print the coefficient inside each cell
    xticklabels=correlation_matrix.columns,
    yticklabels=correlation_matrix.columns,
)

In [None]:
# Violin plots of Age against Complain, one panel per Gender
# (follow-up on the correlation observed in the matrix above)

pal = sns.color_palette("rocket", n_colors=2)

g = sns.FacetGrid(data=df, col="Gender", height=4, aspect=0.8)
g.map_dataframe(sns.violinplot, x="Complain", y="Age", palette=pal)

In [None]:
# Box plots of Age for every categorical field, laid out on a two-row grid

n_cols = len(categorical) // 2 + 1

plt.figure(figsize=(12, 6))
for position, col in enumerate(categorical, start=1):
    # One colour per distinct value of the current field
    pal = sns.color_palette("rocket", n_colors=df[col].nunique(dropna=False))
    plt.subplot(2, n_cols, position)
    sns.boxplot(data=df, x=str(col), y="Age", palette=pal)
    # Replace any legend on this panel with an empty, frameless one
    plt.legend([], [], frameon=False)
plt.tight_layout()

In [None]:
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# (display name, estimator) pairs evaluated later in the notebook.
# The tree-based estimators draw random numbers while fitting, so they get a
# fixed seed to make their scores reproducible between runs. SVC and
# LogisticRegression are left at their defaults.
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('SVM', SVC()),
    ('Logistic Regression', LogisticRegression())
]

In [None]:
def fit_score_model(model, model_name, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its test RMSE and score.

    Parameters
    ----------
    model
        Estimator exposing ``fit``, ``predict`` and ``score``.
    model_name
        Label used in the printed report line.
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels.

    Labels may be a pandas Series, a 1-D array or a column vector; they are
    flattened to 1-D before being handed to the estimator. Nothing is
    returned; the result is printed.
    """
    import numpy as np  # local import keeps this cell self-contained

    # np.asarray(...).ravel() instead of Series.ravel(), which pandas
    # deprecated in 2.2.
    y_train_flat = np.asarray(y_train).ravel()
    y_test_flat = np.asarray(y_test).ravel()

    model.fit(X_train, y_train_flat)  # Fit the model
    predictions = model.predict(X_test)  # Predict the values

    # Root Mean Square Error. The square root is taken here because the
    # `squared=False` keyword of mean_squared_error was deprecated in
    # scikit-learn 1.4 and removed in 1.6.
    test_rmse = mean_squared_error(y_test_flat, predictions) ** 0.5

    score = model.score(X_test, y_test_flat)  # Calculate the score

    print(f"Model Name: {model_name}, RMSE: {test_rmse}, Score: {score}")

In [None]:
# Text-valued columns that get one-hot encoded in the next cell
features_to_label = df[
    ['Geography', 'Gender', 'Card Type']
]

In [None]:
# One-hot encode every column of the selected categorical frame
df_encoded = pd.get_dummies(
    features_to_label,
    columns=list(features_to_label.columns),
)

In [None]:
# Integer-code each categorical column. The same encoder object is refitted
# column by column, so afterwards `le` only holds the classes of the last one.
le = preprocessing.LabelEncoder()
df_cat = features_to_label.apply(le.fit_transform, axis=0)

In [None]:
# Swap the three text columns for their one-hot encoded counterparts:
# drop them, then attach the dummy columns row by row via the index.
numerical_features = df.drop(columns=['Geography', 'Gender', 'Card Type'])
df = numerical_features.merge(df_encoded, left_index=True, right_index=True)

In [None]:
# Features and target
X = df.drop(['Exited'], axis=1)
y = df['Exited']

# Split BEFORE scaling so the scaler never sees the held-out rows.
# stratify=y keeps the proportion of each Exited class the same in both
# splits; the fixed seed makes the split reproducible.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)  # statistics learned from the training rows only
X_test = scaler.transform(X_test_raw)  # reuse the training statistics
X_normalized = scaler.transform(X)  # whole dataset on the same scale, kept under its original name

for model_name, model in models:
    fit_score_model(model, model_name, X_train, y_train, X_test, y_test)