In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Libraries Used

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix

## Import the Dataset

In [None]:
# Location of the BankChurners CSV inside the Kaggle input directory.
DATA_PATH = "/kaggle/input/credit-card-customers/BankChurners.csv"
df = pd.read_csv(DATA_PATH)
# Preview the first rows.
df.head()

## Exploratory Data Analysis

In [None]:
# Summarise the freshly loaded frame: column names, non-null counts and dtypes.
df.info()

----
There are a total of 10127 observations and 23 columns. Initial observation: columns 22 and 23 won't be useful.

---

In [None]:
# Drop columns 22 and 23, i.e. the last two columns of the frame.
# NOTE: the selection is positional, so running this cell a second time
# would remove two further columns.
trailing_cols = df.columns[[-1, -2]]
df = df.drop(columns=trailing_cols)

In [None]:
# Re-check the frame summary after the last two columns were dropped.
df.info()

In [None]:
# Relative frequency of each value of the target variable
attrition_share = df['Attrition_Flag'].value_counts(normalize=True)
print(attrition_share)

In [None]:
# Bar chart of the number of observations per target value
sns.countplot(data=df, x='Attrition_Flag')

In [None]:
# Histogram of Customer_Age with a kernel density estimate overlaid
sns.displot(data=df, x='Customer_Age', kde=True)

In [None]:
# Relative frequency of each Gender value
gender_share = df['Gender'].value_counts(normalize=True)
gender_share

In [None]:
# Bar chart of the number of observations per Gender value
sns.countplot(data=df, x='Gender')

In [None]:
# Relative frequency of each Card_Category value
card_share = df['Card_Category'].value_counts(normalize=True)
card_share

In [None]:
# Bar chart of the number of observations per Card_Category value
sns.countplot(data=df, x='Card_Category')

In [None]:
# Relative frequency of each Education_Level value
education_share = df['Education_Level'].value_counts(normalize=True)
education_share

In [None]:
# Bar chart of the number of observations per Education_Level value,
# drawn on a wider figure so the category labels have room.
plt.figure(figsize=(9, 6))
sns.countplot(data=df, x='Education_Level')

In [None]:
# Relative frequency of each Dependent_count value
dependent_share = df['Dependent_count'].value_counts(normalize=True)
dependent_share

In [None]:
# Bar chart of the number of observations per Dependent_count value
sns.countplot(data=df, x='Dependent_count')

In [None]:
# Relative frequency of each Marital_Status value
marital_share = df['Marital_Status'].value_counts(normalize=True)
marital_share

In [None]:
# Bar chart of the number of observations per Marital_Status value
sns.countplot(data=df, x='Marital_Status')

In [None]:
# Relative frequency of each Income_Category value
income_share = df['Income_Category'].value_counts(normalize=True)
income_share

In [None]:
# Bar chart of the number of observations per Income_Category value,
# drawn on a wider figure so the category labels have room.
plt.figure(figsize=(9, 6))
sns.countplot(data=df, x='Income_Category')

In [None]:
# Relative frequency of each Months_on_book value
tenure_share = df['Months_on_book'].value_counts(normalize=True)
tenure_share

## Data Cleaning and Feature Engineering

In [None]:
# Encode the target as an integer flag: attrited -> 1, existing -> 0.
# Values that are not keys of the mapping are left unchanged by replace().
target_codes = {'Attrited Customer': 1, 'Existing Customer': 0}
df['Attrition_Flag'] = df['Attrition_Flag'].replace(target_codes)

In [None]:
# Encode Gender as an integer flag: 'F' -> 1, 'M' -> 0.
# Values that are not keys of the mapping are left unchanged by replace().
gender_codes = {'F': 1, 'M': 0}
df['Gender'] = df['Gender'].replace(gender_codes)

In [None]:
# One-hot encode the four categorical columns and append the indicator
# columns to df. For each source column one level is left out:
# 'Unknown' for the first three, 'Platinum' for Card_Category.
dummy_specs = [
    ('Education_Level', 'Unknown'),
    ('Income_Category', 'Unknown'),
    ('Marital_Status', 'Unknown'),
    ('Card_Category', 'Platinum'),
]
for source_col, omitted_level in dummy_specs:
    indicators = pd.get_dummies(df[source_col]).drop(columns=[omitted_level])
    df = pd.concat([df, indicators], axis=1)

In [None]:
# Preview the first rows, now including the appended indicator columns.
df.head()

In [None]:
# Remove the four categorical columns that were replaced by indicator
# columns, together with the CLIENTNUM column.
obsolete_cols = [
    'Education_Level',
    'Income_Category',
    'Marital_Status',
    'Card_Category',
    'CLIENTNUM',
]
df = df.drop(columns=obsolete_cols)

In [None]:
# Summarise the frame again after encoding and dropping columns.
df.info()

In [None]:
# Pairwise correlation matrix of the columns of df
# (pandas' default method is Pearson correlation).
df.corr()

## Feature Engineering

In [None]:
# Principal Component Analysis: project the predictor columns onto
# N_COMPONENTS principal components.
N_COMPONENTS = 4
pca_model = PCA(n_components=N_COMPONENTS)
# Exclude the target before fitting. The PC-* columns are later used as model
# inputs, so fitting PCA on a frame that still contains Attrition_Flag would
# leak the label into the features.
pca_inputs = df.drop(columns=['Attrition_Flag'])
# NOTE(review): PCA is fitted on all rows (the train/test split happens later)
# and no scaling step is visible in this notebook; consider fitting on the
# training rows only and standardising the inputs first.
pc_matrix = pca_model.fit_transform(pca_inputs)

In [None]:
# Concatenate the PCA features onto the dataframe as columns PC-0 .. PC-(N_COMPONENTS-1).
pc_columns = [f'PC-{i}' for i in range(N_COMPONENTS)]
# Reuse df's index so the column-wise concat aligns row by row even when
# df's index is not the default 0..n-1 range (otherwise rows would be
# matched by label and padded with NaN).
pc_frame = pd.DataFrame(pc_matrix, columns=pc_columns, index=df.index)
df_with_pcs = pd.concat([df, pc_frame], axis=1)

In [None]:
# Separate the predictors (X) from the target (y)
X_features = [
    'Total_Trans_Ct',
    'PC-3',
    'PC-1',
    'PC-0',
    'PC-2',
    'Total_Ct_Chng_Q4_Q1',
    'Total_Relationship_Count',
]
X = df_with_pcs.loc[:, X_features]
y = df_with_pcs.loc[:, 'Attrition_Flag']

## Train-Test Split

In [None]:
# Hold out 25% of the rows as the test set; random_state fixes the shuffle.
split_parts = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_parts

## Modelling

In [None]:
# Train a random forest (10 trees, Gini criterion, fixed seed) on the training set
forest_params = {
    'n_estimators': 10,
    'criterion': 'gini',
    'random_state': 0,
}
classifier = RandomForestClassifier(**forest_params)
classifier.fit(X_train, y_train)

## Prediction

In [None]:
# Predict the class label for every row of the test set with the fitted forest.
y_pred = classifier.predict(X_test)

## Evaluation

In [None]:
# Confusion matrix of true vs. predicted labels on the test set
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

In [None]:
# F1 score on the test set, rounded to two decimals.
# scikit-learn's signature is f1_score(y_true, y_pred): pass the ground truth
# first, in the same order as the confusion_matrix call.
np.round(f1(y_test, y_pred), 2)