# Import Libraries and Data

In [None]:
import ibis
import pandas as pd
import os

In [None]:
import getpass 
  
# Prompt for a password without echoing it and bind it to `p`.
# NOTE(review): `p` is not referenced again in this notebook; the Impala
# connection cell calls getpass.getpass() itself, so the user is prompted twice.
try: 
    p = getpass.getpass() 
except Exception as error: 
    # Report the failure instead of aborting the cell; `p` stays undefined here.
    print('ERROR', error) 
else: 
    print('Password entered:') 

### Import Data from Impala

In [None]:
# HDFS client handed to the Impala connection below; the host comes from the environment.
# NOTE(review): 50070 is presumably the NameNode WebHDFS port — confirm for this cluster.
hdfs = ibis.hdfs_connect(host=os.environ['HDFS_HOST'], port=50070)


In [None]:
# Open the Impala connection. Host and login come from environment variables
# and the password is requested interactively, so no secret lives in the notebook.
client_impala = ibis.impala.connect(
    host=os.environ['IP_IMPALA'],
    port=21050,
    hdfs_client=hdfs,
    user=os.environ['CHANDIMA_LOGIN'],
    password=getpass.getpass(),
    auth_mechanism='PLAIN',
)

In [None]:
%%time
# `requete` ("query") is the ibis expression for the full open_data.clean_bank table;
# execute() runs it on Impala and the result is kept as `df` for the rest of the notebook.
requete =client_impala.sql('SELECT * FROM open_data.clean_bank')
df = requete.execute()

In [None]:
# Preview the first rows of the loaded table.
df.head()

In [None]:
# (rows, columns) of the loaded table.
df.shape

In [None]:
# Column types, to tell numeric features from text columns.
df.dtypes

### Check correlation between the features

In [None]:
import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline
# Apply seaborn's "whitegrid" style to every plot that follows.
sns.set(style="whitegrid")

In [None]:
# Correlation heatmap of the numeric features.
# Text columns (e.g. surname, geography, gender) are excluded explicitly:
# pandas >= 2.0 no longer drops them silently and DataFrame.corr() raises on them.
plt.figure(figsize=(10,5))
sns.heatmap(df.select_dtypes(include=["number", "bool"]).corr())

# Identify the Distribution of Data

In [None]:
# Distribution of `age`, using only rows that have no missing values.
copy = df.dropna()
sns.distplot(copy["age"])

In [None]:
# Distribution of `balance`, using only rows that have no missing values.
copy = df.dropna()
sns.distplot(copy["balance"])

In [None]:
# Distribution of `estimatedsalary`, using only rows that have no missing values.
copy = df.dropna()
sns.distplot(copy["estimatedsalary"])

In [None]:
# Distribution of `tenure`, using only rows that have no missing values.
copy = df.dropna()
sns.distplot(copy["tenure"])

In [None]:
# Distribution of `isactivemember`, using only rows that have no missing values.
copy = df.dropna()
sns.distplot(copy["isactivemember"])

Age and Balance are approximately normally distributed, while Tenure and isactivemember are uniformly distributed. Knowing these distributions helps when deciding how to replace null values:
either by the mean or by the median.

# Plots and Visualization

### Churn according to Gender

In [None]:
# Churn counts per gender. The column is passed as x=... because seaborn >= 0.12
# treats the first positional argument as `data`, not as the x variable.
sns.countplot(x='gender', data=df, hue='exited').set_title('Churn with Gender')

### Churn according to Geography

In [None]:
# Churn counts per geography. The column is passed as x=... because seaborn >= 0.12
# treats the first positional argument as `data`, not as the x variable.
sns.countplot(x='geography', data=df, hue='exited').set_title('Churn According to Geography')

### Churn according to Age Group

In [None]:
# Take an independent copy: a column is added to age_cat afterwards, and writing
# into a plain slice of df triggers pandas' SettingWithCopyWarning.
age_cat = df[['age','exited']].copy()

In [None]:
# Preview the age / exited pair used for the age-group analysis.
age_cat.head()

In [None]:
# Bin edges (`r`) and one label per bin (`g`). pd.cut uses right-closed intervals
# by default, so the bands are (0, 15], (15, 25], ..., (55, 120].
r = [0, 15, 25, 35, 45, 55,120]
g = ['Children 0 - 15','Teenagers 16-25','Youth 26-35','Adult 36- 45','Mature 46- 55','Old 56+']
# `g` is reused later as the tick labels of the age-category bar plot.
age_cat['Age_Category'] = pd.cut(age_cat['age'], bins=r, labels=g)

In [None]:
# Mean of `exited` per age band.
# observed=False is stated explicitly so that empty bands are kept: the plot
# relabels its ticks with all six entries of `g`, and newer pandas versions
# change the default to drop unobserved categories.
df_3 = age_cat.groupby('Age_Category', observed=False).mean().reset_index()

plt.figure(figsize=(6,8))
sns.barplot(x='Age_Category', y='exited', data=df_3,palette='plasma').set_title('Churn vs Age Category')
# Keep the existing tick positions, but rotate the long labels so they do not overlap.
plt.xticks(plt.xticks()[0],g , rotation=45, ha="right")
plt.tight_layout()
plt.show()


In [None]:
# Distinct values taken by `numofproducts`.
print(set(df['numofproducts']))

# Feature Engineering

### One-Hot encoding our categorical attributes

In [None]:
# One-hot encode the categorical columns.
# dtype is pinned to uint8 (the historical default): pandas >= 2.0 emits bool
# dummies, which turn the later `.values` feature matrix into an object array.
list_cat = ['geography', 'gender']
training_data = pd.get_dummies(df, columns=list_cat, prefix=list_cat, dtype="uint8")

In [None]:
# Check the new geography_* / gender_* dummy columns.
training_data.head()

### Feature Rescaling

In [None]:
from sklearn.preprocessing import StandardScaler
import numpy as np
# Scaler instance used by the feature-rescaling cell that follows.
scaler = StandardScaler()

In [None]:
# Standardise the continuous features (zero mean, unit variance per column).
# The scaler is fitted once on all four columns, so it stays usable afterwards
# to transform new rows; re-fitting it column by column left it fitted on
# `estimatedsalary` only. The scaled values are the same either way, since
# StandardScaler works independently on each column.
# NOTE(review): the scaler is fitted before the train/test split, so test-set
# statistics leak into the training features; consider fitting on the training
# rows only.
scaled_columns = ["creditscore", "age", "balance", "estimatedsalary"]
training_data[scaled_columns] = scaler.fit_transform(training_data[scaled_columns])

In [None]:
# Confirm the four continuous columns are now on a standardised scale.
training_data.head()

# Split the Data Frame

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the rows as the final test set. The seed is fixed (as for the
# validation split later on) so that a re-run reproduces the same partition.
train, test = train_test_split(training_data, test_size=0.2, random_state=0)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

In [None]:
# Build the model inputs: drop the target and the identifier/text columns,
# then hand plain NumPy arrays to scikit-learn.
non_feature_columns = ['exited', 'rownumber', 'customerid', 'surname']

X_train = train.drop(columns=non_feature_columns).values
y_train = train["exited"].values
X_test = test.drop(columns=non_feature_columns).values

In [None]:
# Check that features and target have matching row counts and that train/test share a column count.
X_train.shape, y_train.shape, X_test.shape

### Import sklearn algorithms and libraries 

Use different classification algorithms. Use libraries to measure the model performance and accuracy.

In [None]:
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [None]:
from sklearn.metrics import make_scorer, accuracy_score 

In [None]:
from sklearn.model_selection import GridSearchCV

### Create validation Data Set: to avoid overfitting

In [None]:
# Carve a validation fold (20%, fixed seed) out of the training rows; models are
# tuned on X_training and compared on X_valid.
X_training, X_valid, y_training, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=0,
)

In [None]:
from sklearn.metrics import precision_score

In [None]:
from sklearn.metrics import recall_score

In [None]:
from sklearn.metrics import f1_score

### Random Forest Classifier

In [None]:
# Tune a random forest by cross-validated grid search on the training fold,
# then score the best configuration on the validation fold.
rf = RandomForestClassifier()

rf_parameters = {
    "n_estimators": [2, 4, 8, 10, 15, 20, 25, 30],
    "criterion": ["gini", "entropy"],
    # "auto" is not listed: for classifiers it was an alias of "sqrt" and
    # scikit-learn >= 1.3 rejects it, so it only produced duplicate or failing fits.
    "max_features": ["sqrt", "log2"],
    "max_depth": [2, 3, 5, 10],
    "min_samples_split": [2, 3, 5, 7, 10],
}

grid_rf = GridSearchCV(rf, rf_parameters, scoring=make_scorer(accuracy_score))
grid_rf.fit(X_training, y_training)

# Keep the winning configuration and fit it on the whole training fold.
rf = grid_rf.best_estimator_

rf.fit(X_training, y_training)
pred_rf = rf.predict(X_valid)
acc_rf = accuracy_score(y_valid, pred_rf)

print("The Score of Random Forest is: " + str(acc_rf))

In [None]:
# Support-weighted precision of the random forest on the validation fold.
precision_rf = precision_score(y_true=y_valid, y_pred=pred_rf, average='weighted')

In [None]:
# Report the validation precision computed in the previous cell.
print("The Precision of Random Forest is: " + str(precision_rf))

In [None]:
# Support-weighted recall of the random forest on the validation fold.
recall_rf = recall_score(y_true=y_valid, y_pred=pred_rf, average='weighted')
print("The Recall of Random Forest is: " + str(recall_rf))

In [None]:
# Support-weighted F1 score of the random forest on the validation fold.
F1_rf = f1_score(y_true=y_valid, y_pred=pred_rf, average='weighted')
print("The F1 score of Random Forest is: " + str(F1_rf))

### Logistic Regression

In [None]:
# Tune logistic regression by cross-validated grid search on the training fold,
# then score the best configuration on the validation fold.
logreg = LogisticRegression()

# Only valid penalty/solver pairs are searched. Every listed solver supports
# "l2"; "elasticnet" is implemented by "saga" alone and needs an explicit
# l1_ratio, so it gets its own sub-grid instead of producing failing fits.
logreg_parameters = [
    {
        "penalty": ["l2"],
        "solver": ["newton-cg", "lbfgs", "liblinear", "sag", "saga"],
        "max_iter": [4000],
        "multi_class": ["ovr"],
    },
    {
        "penalty": ["elasticnet"],
        "solver": ["saga"],
        "l1_ratio": [0.5],
        "max_iter": [4000],
        "multi_class": ["ovr"],
    },
]

grid_logreg = GridSearchCV(logreg, logreg_parameters, scoring=make_scorer(accuracy_score))
grid_logreg.fit(X_training, y_training)

# Keep the winning configuration and fit it on the whole training fold.
logreg = grid_logreg.best_estimator_

logreg.fit(X_training, y_training)
pred_logreg = logreg.predict(X_valid)
acc_logreg = accuracy_score(y_valid, pred_logreg)

print("The Score of Logistic Regression is: " + str(acc_logreg))

In [None]:
# Support-weighted precision of logistic regression on the validation fold.
precision_lg = precision_score(y_true=y_valid, y_pred=pred_logreg, average='weighted')
print("The precision of Logistic Regression is: " + str(precision_lg))

In [None]:
# Support-weighted recall of logistic regression on the validation fold.
recall_lg = recall_score(y_true=y_valid, y_pred=pred_logreg, average='weighted')
print("The recall of Logistic Regression is: " + str(recall_lg))

In [None]:
# Support-weighted F1 of logistic regression on the validation fold.
# f1_score is the right metric here; recall_score would just repeat the recall value.
F1_lg = f1_score(y_valid, pred_logreg, average='weighted')
print("The F1 score of Logistic Regression is: " + str(F1_lg))

### KNN

In [None]:
# Tune a k-nearest-neighbours classifier by cross-validated grid search on the
# training fold, then score the best configuration on the validation fold.
knn = KNeighborsClassifier()

knn_parameters = {
    "n_neighbors": [5, 6, 7, 8, 9, 10],
    "leaf_size": [2, 4, 6, 10, 15, 20, 30, 50],
    "weights": ["uniform", "distance"],
    "algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
}

grid_knn = GridSearchCV(
    estimator=knn,
    param_grid=knn_parameters,
    scoring=make_scorer(accuracy_score),
)
grid_knn.fit(X_training, y_training)

# Keep the winning configuration and fit it on the whole training fold.
knn = grid_knn.best_estimator_
knn.fit(X_training, y_training)

pred_knn = knn.predict(X_valid)
acc_knn = accuracy_score(y_valid, pred_knn)

print("The Score of KNeighbors is: " + str(acc_knn))

In [None]:
# Support-weighted precision of KNN on the validation fold.
precision_knn = precision_score(y_true=y_valid, y_pred=pred_knn, average='weighted')
print("The precision of KNN is: " + str(precision_knn))

In [None]:
# Support-weighted recall of KNN on the validation fold.
recall_knn = recall_score(y_true=y_valid, y_pred=pred_knn, average='weighted')
print("The recall of KNN is: " + str(recall_knn))

In [None]:
# Support-weighted F1 of KNN on the validation fold.
# f1_score is the right metric here; recall_score would just repeat the recall value.
F1_knn = f1_score(y_valid, pred_knn, average='weighted')
print("The F1 score of KNN is: " + str(F1_knn))

### GaussianNB

In [None]:
# Gaussian naive Bayes has nothing to tune here: the empty grid makes
# GridSearchCV evaluate the default model only, keeping this cell parallel to
# the other models.
gnb = GaussianNB()

gnb_parameters = {}

grid_gnb = GridSearchCV(
    estimator=gnb,
    param_grid=gnb_parameters,
    scoring=make_scorer(accuracy_score),
)
grid_gnb.fit(X_training, y_training)

# Keep the selected estimator and fit it on the whole training fold.
gnb = grid_gnb.best_estimator_
gnb.fit(X_training, y_training)

pred_gnb = gnb.predict(X_valid)
acc_gnb = accuracy_score(y_valid, pred_gnb)

print("The Score of Gaussian NB is: " + str(acc_gnb))

In [None]:
# Support-weighted precision of Gaussian NB on the validation fold.
precision_gnb = precision_score(y_true=y_valid, y_pred=pred_gnb, average='weighted')
print("The precision of Gaussian NB is: " + str(precision_gnb))

In [None]:
# Support-weighted recall of Gaussian NB on the validation fold.
recall_gnb = recall_score(y_true=y_valid, y_pred=pred_gnb, average='weighted')
print("The recall of Gaussian NB is: " + str(recall_gnb))

In [None]:
# Support-weighted F1 of Gaussian NB on the validation fold.
# f1_score is the right metric here; recall_score would just repeat the recall value.
F1_gnb = f1_score(y_valid, pred_gnb, average='weighted')
print("The F1 score of Gaussian NB is: " + str(F1_gnb))

### Decision Tree

In [None]:
# Tune a decision tree by cross-validated grid search on the training fold,
# then score the best configuration on the validation fold.
dt = DecisionTreeClassifier()

dt_parameters = {
    # "auto" is not listed: for classifiers it was an alias of "sqrt" and
    # scikit-learn >= 1.3 rejects it, so it only produced duplicate or failing fits.
    "max_features": ["sqrt", "log2"],
    "min_samples_split": [2, 3, 5, 9, 12, 13, 14, 15, 20],
    "min_samples_leaf": [2, 5, 6, 9, 10],
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
}

grid_dt = GridSearchCV(dt, dt_parameters, scoring=make_scorer(accuracy_score))
grid_dt.fit(X_training, y_training)

# Keep the winning configuration and fit it on the whole training fold.
dt = grid_dt.best_estimator_

dt.fit(X_training, y_training)
pred_dt = dt.predict(X_valid)
acc_dt = accuracy_score(y_valid, pred_dt)

print("The Score of Decision Tree is: " + str(acc_dt))

In [None]:
# Support-weighted precision of the decision tree on the validation fold.
precision_dt = precision_score(y_true=y_valid, y_pred=pred_dt, average='weighted')
print("The precision of DT is: " + str(precision_dt))

In [None]:
# Support-weighted recall of the decision tree on the validation fold.
recall_dt = recall_score(y_true=y_valid, y_pred=pred_dt, average='weighted')
print("The recall of DT is: " + str(recall_dt))

In [None]:
# Support-weighted F1 of the decision tree on the validation fold.
# f1_score is the right metric here; recall_score would just repeat the recall value.
F1_dt = f1_score(y_valid, pred_dt, average='weighted')
print("The F1 score of DT is: " + str(F1_dt))

### Linear SVC

In [None]:
# Tune a linear SVM by cross-validated grid search on the training fold,
# then score the best configuration on the validation fold.
linsvc = LinearSVC()

linsvc_parameters = {
    "penalty": ["l2"],
    "multi_class": ["ovr", "crammer_singer"],
    "fit_intercept": [True, False],
    "C": [1, 10],
    # High iteration cap for the solver.
    "max_iter": [100000],
}

grid_linsvc = GridSearchCV(
    estimator=linsvc,
    param_grid=linsvc_parameters,
    scoring=make_scorer(accuracy_score),
)
grid_linsvc.fit(X_training, y_training)

# Keep the winning configuration and fit it on the whole training fold.
linsvc = grid_linsvc.best_estimator_
linsvc.fit(X_training, y_training)

pred_linsvc = linsvc.predict(X_valid)
acc_linsvc = accuracy_score(y_valid, pred_linsvc)

print("The Score of LinearSVC is: " + str(acc_linsvc))

In [None]:
# Support-weighted precision of the linear SVC on the validation fold.
precision_svc = precision_score(y_true=y_valid, y_pred=pred_linsvc, average='weighted')
print("The precision of Linear SVC is: " + str(precision_svc))

In [None]:
# Support-weighted recall of the linear SVC on the validation fold.
recall_svc = recall_score(y_true=y_valid, y_pred=pred_linsvc, average='weighted')
print("The recall of Linear SVC is: " + str(recall_svc))

In [None]:
# Support-weighted F1 of the linear SVC on the validation fold.
# f1_score is the right metric here; recall_score would just repeat the recall value.
F1_svc = f1_score(y_valid, pred_linsvc, average='weighted')
print("The F1 score of Linear SVC is: " + str(F1_svc))

### XGBoost 

In [None]:
from xgboost import XGBClassifier

# Tune an XGBoost classifier by cross-validated grid search on the training
# fold, then score the best configuration on the validation fold.
xg_clf = XGBClassifier()

# "binary:logistic" is the objective for a two-class target such as `exited`.
# "reg:linear" is a deprecated squared-error *regression* objective and does not
# belong on a classifier.
parameters_xg = {"objective": ["binary:logistic"], "n_estimators": [5, 10, 15, 20]}

grid_xg = GridSearchCV(xg_clf, parameters_xg, scoring=make_scorer(accuracy_score))
grid_xg.fit(X_training, y_training)

# Keep the winning configuration and fit it on the whole training fold.
xg_clf = grid_xg.best_estimator_

xg_clf.fit(X_training, y_training)
pred_xg = xg_clf.predict(X_valid)
acc_xg = accuracy_score(y_valid, pred_xg)

print("The Score for XGBoost is: " + str(acc_xg))

In [None]:
# Support-weighted precision of XGBoost on the validation fold.
precision_xg = precision_score(y_true=y_valid, y_pred=pred_xg, average='weighted')
print("The precision of xgboost is: " + str(precision_xg))

In [None]:
# Support-weighted recall of XGBoost on the validation fold.
recall_xg = recall_score(y_true=y_valid, y_pred=pred_xg, average='weighted')
print("The recall of xgboost is: " + str(recall_xg))

In [None]:
# Support-weighted F1 of XGBoost on the validation fold.
# f1_score is the right metric here; recall_score would just repeat the recall value.
F1_xg = f1_score(y_valid, pred_xg, average='weighted')
print("The F1 score of xgboost is: " + str(F1_xg))

## Model Performance Comparison

In [None]:
# Summary table: one row per model, one column per validation metric.
performance_columns = ['Model', 'Accuracy_Score', 'Precision', 'Recall', 'F1 Score']

model_performance = pd.DataFrame(
    {
        "Model": [
            "Random Forest Classifier", "Logistic Regression", "KNN",
            "GaussianNB", "DecisionTree Classifier", "Linear SVC", "XGBoost",
        ],
        "Accuracy_Score": [acc_rf, acc_logreg, acc_knn, acc_gnb, acc_dt, acc_linsvc, acc_xg],
        "Precision": [precision_rf, precision_lg, precision_knn, precision_gnb,
                      precision_dt, precision_svc, precision_xg],
        "Recall": [recall_rf, recall_lg, recall_knn, recall_gnb, recall_dt, recall_svc, recall_xg],
        "F1 Score": [F1_rf, F1_lg, F1_knn, F1_gnb, F1_dt, F1_svc, F1_xg],
    },
    columns=performance_columns,
)

# Show the models ranked by precision, best first (the stored table keeps its original order).
model_performance.sort_values(by="Precision", ascending=False)

We have trained the models and measured their accuracy, precision, recall and F1 score. According to the results obtained, the best model is
XGBoost, because it has high precision, recall and F1 score. We can use this model to make predictions on unseen data.

In [None]:
# Refit the selected XGBoost model on the full training set (training + validation rows).
xg_clf.fit(X_train, y_train)

In [None]:
# Predictions for the held-out test rows.
result_sumbission = xg_clf.predict(X_test)

In [None]:
# True labels of the held-out test rows, for side-by-side comparison with the predictions.
y_test = test["exited"].values

In [None]:
# Per-customer result table: identifiers, the true label and the model's prediction.
submission_columns = ['Id', 'Customer_name', 'expected_Exited', 'predicted_Exited']

submission2 = pd.DataFrame(
    {
        "Id": test["customerid"],
        "Customer_name": test["surname"],
        "expected_Exited": y_test,
        "predicted_Exited": result_sumbission.round(),
    },
    columns=submission_columns,
)

In [None]:
# Spot-check the first 20 expected vs. predicted labels.
submission2.head(20)

In [None]:
from sklearn.metrics import confusion_matrix


In [None]:
# Confusion matrix of the XGBoost predictions made on the validation fold.
cm = confusion_matrix(y_true=y_valid, y_pred=pred_xg)
cm

In [None]:
# Draw the confusion matrix as an annotated heatmap.
ax = plt.subplot()
# annot=True writes the value in each cell; fmt='d' prints the integer counts
# in full instead of abbreviated scientific notation.
sns.heatmap(cm, annot=True, fmt='d', ax=ax)

# labels, title and ticks
ax.set_xlabel('Predicted labels')
ax.set_ylabel('Actual labels')
ax.set_title('Confusion Matrix')
# confusion_matrix orders rows and columns by sorted label (0, then 1) and the
# heatmap draws row 0 at the top, so both axes are labelled 0, 1 in that order.
ax.xaxis.set_ticklabels(['0', '1'])
ax.yaxis.set_ticklabels(['0', '1']);

# Save the churn_bank_customers.csv file to HDFS

In [None]:
from hdfs import InsecureClient
# HDFS client used to upload the results; the user name comes from the environment.
# NOTE(review): the endpoint is hard-coded here, whereas the earlier ibis HDFS
# connection reads its host from os.environ['HDFS_HOST'] — confirm both point at
# the same NameNode and consider using the environment variable here too.
client_hdfs = InsecureClient('http://192.168.56.10:50070', user=os.environ['CHANDIMA_LOGIN'])

In [None]:
# Writing Dataframe to hdfs
with client_hdfs.write('/user/chandima.pondapelage/Bank/result_csv/churn_bank_customers.csv',encoding = 'utf-8') as writer:
    submission2.to_csv(writer)