# Loading Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBRegressor
from catboost import CatBoostClassifier

# Loading the dataset

In [None]:
# Read the BankChurners CSV (path is relative to the notebook's working directory) into a DataFrame.
data_bc = pd.read_csv('../input/credit-card-customers/BankChurners.csv')

# Viewing all Columns in play

In [None]:
# Summary statistics of the raw frame; stored (not displayed) so that
# data_desc.columns can be compared against data_bc.columns further down.
data_desc = data_bc.describe()

# Getting view of data

In [None]:
# Preview the first rows of the raw frame.
data_bc.head()

In [None]:
# Remove the two 'Naive_Bayes_Classifier_*' columns from the frame.
# The frame is reassigned (no inplace=True) and errors='ignore' is used so the
# cell is idempotent: re-running it after the columns are already gone no
# longer raises a KeyError.
naive_bayes_cols = [
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
    'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
]
data_bc = data_bc.drop(columns=naive_bayes_cols, errors='ignore')

# Recompute the summary so that it only covers the remaining columns.
data_desc = data_bc.describe()

# Finding out Columns that have numerical and non-numerical values

In [None]:
# Columns that appear in the describe() summary computed above.
data_desc.columns

In [None]:
# All columns of the frame, for comparison with the summary's columns.
data_bc.columns

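Comparing the two outputs above, the columns that appear in `data_bc` but not in `data_desc` are the non-numerical columns (`describe()` summarises only numeric columns by default).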

# One hot encoding for non numerical columns

In [None]:
# One-hot encode the categorical columns.
# dtype is pinned to np.uint8 because the default dummy dtype changed from
# uint8 to bool in pandas 2.0. With bool dummies, describe() below would leave
# the encoded columns out and the CSV round-trip of the target further down
# would produce True/False labels instead of 0/1.
categorical_cols = ['Attrition_Flag', 'Gender', 'Education_Level',
                    'Marital_Status', 'Income_Category', 'Card_Category']
data_bc = pd.get_dummies(data_bc, columns=categorical_cols, dtype=np.uint8)

# Summary statistics, now including the encoded columns.
data_bc.describe()

In [None]:
# Number of missing values in each column.
data_bc.isnull().sum()

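Dropping one dummy column from each one-hot-encoded feature: the dropped level is implied when all remaining dummies of that feature are 0, so keeping it would only add a redundant column.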

In [None]:
# One level of every encoded feature is removed; the result is stored under a
# new name so that data_bc itself stays untouched.
reference_levels = [
    'Attrition_Flag_Attrited Customer',
    'Gender_F',
    'Education_Level_College',
    'Marital_Status_Divorced',
    'Income_Category_Unknown',
    'Card_Category_Blue',
]
data_bc1 = data_bc.drop(labels=reference_levels, axis='columns')

In [None]:
# Columns remaining after the reference levels were dropped.
data_bc1.columns

# Preparing Test and Train data

In [None]:
# Feature columns fed to every model below.
# CLIENTNUM is left out: by its name it is a client identifier rather than a
# customer attribute, so it carries no generalisable signal and, being
# unscaled, can distort distance-based models such as KNN.
feature_cols = [
    'Customer_Age', 'Dependent_count', 'Months_on_book',
    'Total_Relationship_Count', 'Months_Inactive_12_mon',
    'Contacts_Count_12_mon', 'Credit_Limit', 'Total_Revolving_Bal',
    'Avg_Open_To_Buy', 'Total_Amt_Chng_Q4_Q1', 'Total_Trans_Amt',
    'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1', 'Avg_Utilization_Ratio', 'Gender_M',
    'Education_Level_Doctorate', 'Education_Level_Graduate',
    'Education_Level_High School', 'Education_Level_Post-Graduate',
    'Education_Level_Uneducated', 'Education_Level_Unknown',
    'Marital_Status_Married', 'Marital_Status_Single',
    'Marital_Status_Unknown', 'Income_Category_$120K +',
    'Income_Category_$40K - $60K', 'Income_Category_$60K - $80K',
    'Income_Category_$80K - $120K', 'Income_Category_Less than $40K',
    'Card_Category_Gold', 'Card_Category_Platinum', 'Card_Category_Silver',
]
X = data_bc1[feature_cols]

# Target: the 'Existing Customer' dummy (the 'Attrited Customer' dummy was dropped earlier).
# Kept as a one-column DataFrame so that the saved CSV is laid out as index + label.
Y = data_bc1[['Attrition_Flag_Existing Customer']]

# Fixed random_state so the split is the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0)

# Persist the split; to_csv writes the row index as the first column by default.
X_train.to_csv('X_train.csv')
X_test.to_csv('X_test.csv')

Y_train.to_csv('Y_train.csv')
Y_test.to_csv('Y_test.csv')

# Loading saved data

In [None]:
# The CSVs were written with the row index as their first column. Reading that
# column back as the index (index_col=0) keeps it out of the feature matrix;
# without it the old row number becomes an 'Unnamed: 0' feature.
X_train = pd.read_csv('./X_train.csv', index_col=0)
X_test = pd.read_csv('./X_test.csv', index_col=0)

# Labels as 1-D integer arrays. The explicit int cast also covers files in
# which the label was stored as True/False.
Y_train = pd.read_csv('./Y_train.csv', index_col=0).iloc[:, 0].astype(int).to_numpy()
Y_test = pd.read_csv('./Y_test.csv', index_col=0).iloc[:, 0].astype(int).to_numpy()

# Logistic Regression

In [None]:
# Logistic regression with C=1000 and a generous iteration budget.
log_reg = LogisticRegression(C=1000, max_iter=50000)
log_reg.fit(X_train, Y_train)

# Mean accuracy on the training data and on the held-out data.
train_accuracy = log_reg.score(X_train, Y_train)
test_accuracy = log_reg.score(X_test, Y_test)

report_lines = [
    '--------------------------------------------------------------------------',
    'Logistic Regression:',
    'Traning Model accruracy scores: {:.3f}'.format(train_accuracy),
    'Test Model accruracy scores: {:.3f}'.format(test_accuracy),
    '--------------------------------------------------------------------------',
]
print('\n'.join(report_lines))

# KNN method

In [None]:
# k-nearest-neighbours classifier using the 20 nearest training rows.
KNN = KNeighborsClassifier(n_neighbors=20)
KNN.fit(X_train, Y_train)
Y_pred = KNN.predict(X_test)  # predicted class label for every test row

# True and predicted labels side by side.
ActVPred = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
print(ActVPred)

# Error metrics (with 0/1 labels the absolute and squared errors coincide).
mse = metrics.mean_squared_error(Y_test, Y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# Share of test rows whose predicted label equals the true label; replaces the
# former row-by-row iterrows() counting loop.
knn_accuracy = metrics.accuracy_score(Y_test, Y_pred)

print('--------------------------------------------------------------------------')
print('KNN:')
print('Model accruracy scores: {:.3f}'.format(knn_accuracy))

# RandomForest Classifier

In [None]:
# Random forest of 500 trees trained on all available cores. random_state is
# pinned so that the accuracy and the feature ranking below are reproducible.
Clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=0)
Clf.fit(X_train, Y_train)
Y_pred = Clf.predict(X_test)

# True and predicted labels side by side.
ActVPred = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
print(ActVPred)

# Error metrics (with 0/1 labels the absolute and squared errors coincide).
mse = metrics.mean_squared_error(Y_test, Y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# Share of test rows whose predicted label equals the true label; replaces the
# former row-by-row iterrows() counting loop.
rf_accuracy = metrics.accuracy_score(Y_test, Y_pred)

print('--------------------------------------------------------------------------')
print('Random Forest Classifier:')
print('Model accruracy scores: {:.3f}'.format(rf_accuracy))

# Importance of each feature (column 0) next to its name (column 1), largest first.
compare1 = pd.DataFrame({0: Clf.feature_importances_, 1: X_test.columns})

print('Feature importance: ')
print(compare1.sort_values(by=0, ascending=False))

# Neural Network

`solver`: {'lbfgs', 'sgd', 'adam'}, default='adam'
The solver used for weight optimization.

- 'lbfgs' is an optimizer in the family of quasi-Newton methods.

- 'sgd' refers to stochastic gradient descent.

- 'adam' refers to a stochastic gradient-based optimizer proposed by Kingma, Diederik, and Jimmy Ba.

Note: The default solver ‘adam’ works pretty well on relatively large datasets (with thousands of training samples or more) in terms of both training time and validation score. For small datasets, however, ‘lbfgs’ can converge faster and perform better.

In [None]:
# Multi-layer perceptron with two hidden layers (5 and 2 units) trained with SGD;
# random_state is fixed so the weight initialisation is repeatable.
NN = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1, max_iter=1000)
NN.fit(X_train, Y_train)

Y_pred = NN.predict(X_test)

# True and predicted labels side by side.
ActVPred = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
print(ActVPred)

# Error metrics (with 0/1 labels the absolute and squared errors coincide).
mse = metrics.mean_squared_error(Y_test, Y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# Share of test rows whose predicted label equals the true label; replaces the
# former row-by-row iterrows() counting loop.
nn_accuracy = metrics.accuracy_score(Y_test, Y_pred)

print('--------------------------------------------------------------------------')
# This heading used to read 'Random Forest Classifier:', which attributed the
# network's score to the wrong model.
print('Neural Network:')
print('Model accruracy scores: {:.3f}'.format(nn_accuracy))

# RandomForestRegressor

In [None]:
# Using the best model from Grid Search CV
model = RandomForestRegressor(max_depth=15, random_state=42)

model.fit(X_train, Y_train)

# Continuous scores; they are turned into class labels by thresholding below.
Y_pred = model.predict(X_test)

# True labels next to the raw regression outputs.
ActVPred = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
print(ActVPred)

# Regression error of the raw scores against the 0/1 labels.
mse = metrics.mean_squared_error(Y_test, Y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# Scores below 0.5 count as class 0, everything else as class 1 (the same rule
# the former row-by-row loop applied); accuracy is the share of matching labels.
rfr_labels = np.where(Y_pred < 0.5, 0, 1)
rfr_accuracy = metrics.accuracy_score(Y_test, rfr_labels)

print('--------------------------------------------------------------------------')
print('RandomForestRegressor:')
print('Model accruracy scores: {:.3f}'.format(rfr_accuracy))

# Naive Bayes

In [None]:
# Gaussian naive Bayes with default settings (GaussianNB is imported in the
# import cell at the top of the notebook).
GNB = GaussianNB()

GNB.fit(X_train, Y_train)
Y_pred = GNB.predict(X_test)

# True and predicted labels side by side.
ActVPred = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
print(ActVPred)

# Error metrics (with 0/1 labels the absolute and squared errors coincide).
mse = metrics.mean_squared_error(Y_test, Y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# Predictions expressed as plain 0/1 labels (below 0.5 -> 0, otherwise 1).
Visual_rep = np.where(np.asarray(Y_pred) < 0.5, 0, 1).tolist()

# Share of test rows whose label matches; replaces the former row-by-row
# iterrows() counting loop.
gnb_accuracy = metrics.accuracy_score(Y_test, Visual_rep)

print('--------------------------------------------------------------------------')
print('Naive Bayes:')
print('Model accruracy scores: {:.3f}'.format(gnb_accuracy))


# XGBoost

In [None]:
# Gradient-boosted regressor with default settings; its continuous output is
# converted into class labels by thresholding below.
model = XGBRegressor()
model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

# True labels next to the raw regression outputs.
ActVPred = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
print(ActVPred)

# Regression error of the raw scores against the 0/1 labels.
mse = metrics.mean_squared_error(Y_test, Y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# Thresholded 0/1 labels (below 0.5 -> 0, otherwise 1). Kept under the name
# Visual_rep because the confusion-matrix cell that follows reads it.
Visual_rep = np.where(np.asarray(Y_pred) < 0.5, 0, 1).tolist()

# Share of test rows whose thresholded label matches the true label; replaces
# the former row-by-row iterrows() counting loop.
xgb_accuracy = metrics.accuracy_score(Y_test, Visual_rep)

print('--------------------------------------------------------------------------')
print('XGBoost:')
print('Model accruracy scores: {:.3f}'.format(xgb_accuracy))


# Representation of the prediction

In [None]:
# plt.subplots returns a (figure, axes) pair; unpack it and hand the axes to
# seaborn explicitly instead of relying on the implicit "current axes".
fig, ax = plt.subplots(figsize=(10, 10))

# Visual_rep is passed first, so the rows of the matrix follow the predicted
# labels and the columns follow the actual labels, matching the tick labels below.
sns.heatmap(confusion_matrix(Visual_rep, Y_test), annot=True, cmap='coolwarm', fmt='d', ax=ax)
ax.set_title('Prediction On Original Data With XGBoost Confusion Matrix', fontsize=18)
ax.set_xticklabels(['Churn', 'Not Churn'], fontsize=18)
ax.set_yticklabels(['Predicted Churn', 'Predicted Not Churn'], fontsize=18)

plt.show()

# CatBoost

In [None]:
# CatBoost classifier with training output suppressed.
Cat = CatBoostClassifier(silent = True)

# The return value of fit() is kept under its original name.
details = Cat.fit(X_train, Y_train)
Y_pred = Cat.predict(X_test)

# True and predicted labels side by side.
ActVPred = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
print(ActVPred)

# Error metrics (with 0/1 labels the absolute and squared errors coincide).
mse = metrics.mean_squared_error(Y_test, Y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(Y_test, Y_pred))
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

# Predictions as plain 0/1 labels (below 0.5 -> 0, otherwise 1). Kept under the
# name Visual_rep because the confusion-matrix cell that follows reads it.
Visual_rep = np.where(np.asarray(Y_pred) < 0.5, 0, 1).tolist()

# Share of test rows whose label matches; replaces the former row-by-row
# iterrows() counting loop.
cat_accuracy = metrics.accuracy_score(Y_test, Visual_rep)

print('--------------------------------------------------------------------------')
print('CatBoost:')
print('Model accruracy scores: {:.3f}'.format(cat_accuracy))


# Representation of the prediction

In [None]:
# plt.subplots returns a (figure, axes) pair; unpack it and hand the axes to
# seaborn explicitly instead of relying on the implicit "current axes".
fig, ax = plt.subplots(figsize=(10, 10))

# Visual_rep is passed first, so the rows of the matrix follow the predicted
# labels and the columns follow the actual labels, matching the tick labels below.
sns.heatmap(confusion_matrix(Visual_rep, Y_test), annot=True, cmap='coolwarm', fmt='d', ax=ax)
ax.set_title('Prediction On Original Data With CatBoost Confusion Matrix', fontsize=18)
ax.set_xticklabels(['Churn', 'Not Churn'], fontsize=18)
ax.set_yticklabels(['Predicted Churn', 'Predicted Not Churn'], fontsize=18)

plt.show()