In [None]:
#Suppressing all warnings
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [None]:
# Load the bank customer churn dataset from the Kaggle input directory
# and preview the first rows.
data = pd.read_csv(
    '../input/predicting-churn-for-bank-customers/Churn_Modelling.csv'
)
data.head()

In [None]:
# Preview only: the result is not assigned, so `data` itself still
# contains RowNumber after this cell runs.
data.drop(columns='RowNumber').head()

In [None]:
data['Geography'].value_counts()

In [None]:
# Checking For Null Values

In [None]:
data.isnull().sum()

In [None]:
data.shape

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
# Exploratory Data Analysis (EDA)

In [None]:
fig=px.pie(data,names='Exited',)
fig.show()

In [None]:
fig=px.pie(data,names='Gender',hole=0.5)
fig.show()

In [None]:
# One histogram per feature, drawn in the same order as before.
# A single loop replaces six copy-pasted cells that differed only in the
# column name; `fig` ends up bound to the last figure, as it did originally.
histogram_columns = [
    'Age',
    'Balance',
    'HasCrCard',
    'IsActiveMember',
    'EstimatedSalary',
    'CreditScore',
]
for column in histogram_columns:
    fig = px.histogram(data, x=column)
    fig.show()

In [None]:
# Bar chart of the Exited flag against Age, using the dark plotly theme.
fig = px.bar(
    data,
    x='Age',
    y='Exited',
    template='plotly_dark',
)
fig.show()

In [None]:
#fig=px.bar(data,x='CreditScore',y='Exited',template='plotly_dark')
#fig.show()

In [None]:
#fig=px.bar(data,x='IsActiveMember',y='HasCrCard',template='plotly_dark')
#fig.show()

In [None]:
data['HasCrCard'].value_counts()

In [None]:
data.head()

In [None]:
data['Geography'].value_counts()

In [None]:
# Build the modelling frame in a single step that always starts from the
# untouched `data`, so the cell can be re-run without KeyErrors from columns
# that an earlier run already dropped.
#  - RowNumber and Surname are removed.
#  - Geography and Gender are replaced by drop-first dummy columns, appended
#    after the remaining columns (Geography_Germany, Geography_Spain,
#    Gender_Male).
df = pd.get_dummies(
    data.drop(columns=['RowNumber', 'Surname']),
    columns=['Geography', 'Gender'],
    drop_first=True,
)
df.head()

In [None]:
# Annotated correlation heatmap of every column in the modelling frame.
f, ax = plt.subplots(figsize=(11, 11))
correlations = df.corr()
sns.heatmap(correlations, annot=True, ax=ax)

In [None]:
df.columns

In [None]:
X = df[['CreditScore', 'Age', 'Tenure', 'Balance',
       'NumOfProducts', 'HasCrCard', 'IsActiveMember', 'EstimatedSalary',
        'Geography_Germany', 'Geography_Spain', 'Gender_Male']]
y = df['Exited']

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=42)
X_train.shape, X_test.shape

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score ,roc_auc_score, plot_roc_curve
from sklearn.tree import plot_tree

In [None]:
def get_dt_graph(dt_classifier):
    """Draw one fitted decision tree on a large matplotlib figure.

    Parameters
    ----------
    dt_classifier
        A fitted tree estimator accepted by ``sklearn.tree.plot_tree``
        (for example one element of a random forest's ``estimators_``).

    Notes
    -----
    Feature names are read from the notebook-level ``X`` frame. They are
    passed as a plain list because some scikit-learn releases validate
    ``feature_names`` as a list and reject a pandas ``Index``.
    """
    plt.figure(figsize=(60,30))
    plot_tree(
        dt_classifier,
        feature_names=list(X.columns),
        class_names=['Not exited', "Exited"],
        filled=True,
    )
    

In [None]:
def evaluate_model(dt_classifier):
    """Print accuracy, confusion matrix and ROC AUC for the train and test splits.

    Parameters
    ----------
    dt_classifier
        A fitted binary classifier exposing ``predict`` and ``predict_proba``.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and
    ``y_test``. ROC AUC is computed from the predicted probability of the
    positive class rather than from hard labels: hard labels reduce the ROC
    curve to a single operating point and misstate the ranking quality of
    the model.
    """
    splits = (("Train", X_train, y_train), ("Test", X_test, y_test))
    for position, (label, features, target) in enumerate(splits):
        if position:
            # Visual separator between the train and test reports.
            print("-"*50)

        # Predict once per split and reuse the result for every metric.
        predicted = dt_classifier.predict(features)
        # Column 1 of predict_proba is the probability of the greater class label.
        positive_score = dt_classifier.predict_proba(features)[:, 1]

        print(f"{label} Accuracy :", accuracy_score(target, predicted))
        print(f"{label} Confusion Matrix:")
        print(confusion_matrix(target, predicted))
        print(f"{label} ROC_AUC :", roc_auc_score(target, positive_score))
    

In [None]:
rf = RandomForestClassifier(random_state=42, n_estimators=10, max_depth=3)

In [None]:
rf.fit(X_train, y_train)

In [None]:
sample_tree = rf.estimators_[0]

In [None]:
get_dt_graph(sample_tree)


In [None]:
evaluate_model(rf)

### Grid search for hyper-parameter tuning

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Hyper-parameter grid explored by the grid search.
params = dict(
    max_depth=[1, 2, 5, 10, 20],
    min_samples_leaf=[5, 10, 20, 50, 100],
    max_features=[2, 3, 4],
    n_estimators=[10, 30, 50, 100, 200],
)

In [None]:
classifier_rf = RandomForestClassifier(random_state=42, n_jobs=-1)

In [None]:
grid_search = GridSearchCV(estimator=classifier_rf, param_grid=params, 
                          cv=4, n_jobs=-1, verbose=1, scoring = "roc_auc")

In [None]:
%%time
# Fit the grid search on the training split; %%time reports how long the
# whole cell takes (the magic must stay on the first line of the cell).
grid_search.fit(X_train,y_train)

In [None]:
# Estimator exposed by the search as its best candidate; the bare name on the
# last line displays its configuration.
rf_best = grid_search.best_estimator_
rf_best

In [None]:
# Report train/test metrics for the tuned forest.
evaluate_model(rf_best)

In [None]:
# Feature importances of the tuned forest, most important first.
importance_by_feature = {
    "Varname": X_train.columns,
    "Imp": rf_best.feature_importances_,
}
imp_df = pd.DataFrame(importance_by_feature)
imp_df.sort_values("Imp", ascending=False)

In [None]:
import lime
from lime import lime_tabular

# LIME explainer fitted on the training features.
#  - The frame is cast to a float array explicitly: with boolean dummy columns
#    a plain np.array(X_train) can come out as an object-dtype matrix.
#  - feature_names is passed as a plain list rather than a pandas Index.
#  - class_names labels the two outcomes in the rendered explanation.
#  - random_state seeds LIME's random perturbation sampling so explanations
#    are reproducible between runs.
interpretor=lime_tabular.LimeTabularExplainer(
    training_data=X_train.to_numpy(dtype=float),
    feature_names=list(X_train.columns),
    class_names=['Not exited', 'Exited'],
    mode='classification',
    random_state=42,
)

In [None]:
# Explain the tuned forest's prediction for one test customer (row 10).
# The row is handed to LIME as a 1-D float array instead of a pandas Series:
# a Series taken across mixed-dtype columns may be object dtype, and LIME
# indexes the row by integer position.
exp=interpretor.explain_instance(
    data_row=X_test.iloc[10].to_numpy(dtype=float),
    predict_fn=rf_best.predict_proba
)
exp.show_in_notebook(show_table=True)

In [None]:
#import pickle

In [None]:
#with open('HDClassifierRF.pkl','wb')as pickle_file:
   # pickle.dump(rf_best,pickle_file)