# Classification Algorithm for Banking

In [None]:
#Import Libraries

In [1]:
# To enable plotting graphs in Jupyter notebook
%matplotlib inline 
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from zipfile import ZipFile
import urllib.request
from io import BytesIO

# Download the project archive and keep it in memory. The context manager
# closes the HTTP response as soon as its bytes have been read.
with urllib.request.urlopen('https://s3.amazonaws.com/projex.dezyre.com/classification-algorithms-for-digital-transformation-in-banking/materials/data.zip') as folder:
    # NOTE: `zipfile` is the opened archive object (read by the loading cell), not the stdlib module.
    zipfile = ZipFile(BytesIO(folder.read()))
# List the archive members so the CSV paths are visible
zipfile.namelist()

['data/Data1.csv', 'data/Data2.csv']

In [None]:
# Load customer data from the two CSV members of the downloaded archive.
# Each member is opened in a `with` block so its handle is closed after parsing.
with zipfile.open("data/Data1.csv") as csv_member:
    data1 = pd.read_csv(csv_member)
with zipfile.open("data/Data2.csv") as csv_member:
    data2 = pd.read_csv(csv_member)

In [None]:
# Report (rows, columns) of each raw table before merging
for frame in (data1, data2):
    print(frame.shape)

In [None]:
# Combine both tables on the customer ID; the inner join keeps only IDs present in both frames
cust_data = pd.merge(data1, data2, on='ID', how='inner')

In [None]:
# Shape of the merged frame as (rows, columns)
print(cust_data.shape)

In [None]:
# Inspect the dtype pandas inferred for every column
cust_data.dtypes

### Comment: As all data attributes are quantitative, we don't need any data transformation here.

In [None]:
# Summary statistics, shown with one row per attribute
cust_data.describe().T

In [None]:
# ID is only a row identifier and does not help learning, so remove it
cust_data = cust_data.drop('ID', axis=1)

In [None]:
cust_data.shape  # (rows, columns) after dropping ID

In [None]:
# Count missing values per column
cust_data.isna().sum()

### Comment: The LoanOnCard attribute has 20 null values, which is only 0.4% of the data. Moreover, it is the target class, so we can't replace the null values with the mean or mode. We can remove these rows from our dataset.

In [None]:
# Discard every row that contains at least one missing value
cust_data = cust_data.dropna(axis=0, how='any')

In [None]:
cust_data.shape  # (rows, columns) after removing rows with missing values

## Exploratory Data Analysis

In [None]:
# Bar chart of how many customers fall into each target class
sns.countplot(data=cust_data, x='LoanOnCard');

### Calculate target class data percentage

In [None]:
# Count each target label and report its share of the labelled rows
loan_on_card = cust_data['LoanOnCard']
n_true = int((loan_on_card == 1.0).sum())
n_false = int((loan_on_card == 0.0).sum())
n_labelled = n_true + n_false
print("No. of true cases: {0} ({1:2.2f}%)".format(n_true, (n_true / n_labelled) * 100 ))
print("No. of false cases: {0} ({1:2.2f}%)".format(n_false, (n_false / n_labelled) * 100))

## Comment: Data imbalance is a typical problem in machine learning. Later we shall examine its impact when we develop ML models.

In [None]:
# MonthlyAverageSpend against HighestSpend, coloured by target class;
# the x axis (HighestSpend) is switched to a log scale.
g = sns.scatterplot(
    data=cust_data,
    x="HighestSpend",
    y="MonthlyAverageSpend",
    hue="LoanOnCard",
    legend='full',
)
g.set(xscale="log")


In [None]:
# Mortgage distribution, one panel per target class. Each panel is titled
# with the class it shows so the two histograms can be told apart.
fig, ax = plt.subplots(1, 2)
for panel, class_value in zip(ax, (0.0, 1.0)):
    sns.histplot(cust_data.loc[cust_data.LoanOnCard == class_value, 'Mortgage'], ax = panel)
    panel.set_title(f'LoanOnCard = {class_value:.0f}')
plt.show()

In [None]:
# FixedDepositAccount distribution, one panel per target class. Each panel is
# titled with the class it shows so the two histograms can be told apart.
fig, ax = plt.subplots(1, 2)
for panel, class_value in zip(ax, (0.0, 1.0)):
    sns.histplot(cust_data.loc[cust_data.LoanOnCard == class_value, 'FixedDepositAccount'], ax = panel)
    panel.set_title(f'LoanOnCard = {class_value:.0f}')
plt.show()

In [None]:
# Every column except the last one (presumably the LoanOnCard target — verify column order)
columns = cust_data.columns[:-1].tolist()
# One 100-bin histogram per selected column, arranged on a 14 x 2 grid
cust_data[columns].hist(stacked=False, bins=100, figsize=(12,30), layout=(14,2));

In [None]:
# Pairwise plots of all attributes, coloured by the target class
sns.pairplot(cust_data, height=3, hue = 'LoanOnCard')

### ZipCode has no meaningful relationship with the other variables or with learning, hence we drop it from the list of independent variables.

### Age and Customer Since carry similar information. We will verify this through correlation analysis.


In [None]:
# ZipCode is judged uninformative in the notes above, so remove it
cust_data = cust_data.drop('ZipCode', axis=1)

In [None]:
# Correlation analysis: pairwise correlation matrix of the remaining columns
corr = cust_data.corr()
corr

In [None]:
# Heatmap of the correlation matrix; each cell is annotated with its coefficient (2 decimals)
fig,ax = plt.subplots(figsize=(10, 10))   
sns.heatmap(cust_data.corr(), ax=ax, annot=True, linewidths=0.05, fmt= '.2f',cmap="magma") # the colour intensity encodes the correlation value
plt.show()

In [None]:
# Remove Age, which the notes above flag as carrying similar information to another attribute
cust_data = cust_data.drop('Age', axis=1)

In [None]:
cust_data.shape  # (rows, columns) of the frame used for modelling

In [None]:
cust_data.head(10)  # preview the first 10 rows of the modelling frame

In [None]:
from sklearn.model_selection import train_test_split

# Separate the predictors from the target
X = cust_data.drop(columns='LoanOnCard')   # predictor feature columns
Y = cust_data.LoanOnCard                   # target class (1=True, 0=False)

# Hold out 30% of the rows for testing; the fixed seed (any number works) makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=1, test_size=0.3)

x_train.head()

In [3]:
!pip install mlfoundry --upgrade



In [None]:

import mlfoundry as mlf

# Tracking server address passed to the mlfoundry client
TRACKING_URL = 'https://projectpro.truefoundry.com'
# Client handle used by the create_run calls in this notebook
mlf_api = mlf.get_client(TRACKING_URL)



Please get your API key from https://projectpro.truefoundry.com/settings


## Logistic Regression

In [None]:
# import model and metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, precision_score, recall_score, f1_score, roc_curve
from sklearn import metrics

# Fit the baseline (unweighted) logistic regression on the training split
model = LogisticRegression(solver="liblinear")
model.fit(x_train, y_train)

# Hard class predictions on the held-out split
y_predict = model.predict(x_test)

# Label each coefficient with its feature name so a weight can be read against
# the predictor it belongs to; the intercept goes in the last column.
coef_df = pd.DataFrame(model.coef_, columns=x_train.columns)
coef_df['intercept'] = model.intercept_
print(coef_df)

In [None]:
# Score of the baseline model on the test split, as returned by the estimator's own .score()
model_score = model.score(x_test, y_test)
print(model_score)

In [None]:
# Test-split report for the baseline logistic regression
report = (
    ('Accuracy Score: {}', accuracy_score),
    ('Confusion Matrix: \n{}', confusion_matrix),
    ('Area Under Curve: {}', roc_auc_score),
    ('Recall score: {}', recall_score),
    ('Precision score: {}', precision_score),
    ('f1 score: {}', f1_score),
)
for template, scorer in report:
    print(template.format(scorer(y_test, y_predict)))

In [None]:
# Open a tracking run for the baseline logistic regression
mlf_run = mlf_api.create_run(project_name='digital-transformation-in-banking', run_name="logistic-reg-model")

# Each key is paired with the scorer of the same name (precision-score ->
# precision_score, recall-score -> recall_score) so the tracked values are
# stored under the correct label.
# NOTE(review): "auc" is computed from hard class labels; scoring the
# predict_proba output instead would give the threshold-free AUC.
metrics_dict = {
    "accuracy": accuracy_score(y_test,y_predict),
    "auc": roc_auc_score(y_test, y_predict),
    "precision-score": precision_score(y_test,y_predict),
    "recall-score": recall_score(y_test,y_predict),
    "f1-score": f1_score(y_test,y_predict)
}

# Attach metrics, hyperparameters and the fitted estimator to the run
mlf_run.log_metrics(metrics_dict)
mlf_run.log_params(model.get_params())
mlf_run.log_model(model, framework=mlf.ModelFramework.SKLEARN)


In [None]:
# Draw the test-split confusion matrix, attach the figure to the run, then render it
mat = confusion_matrix(y_test, y_predict)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=mat).plot()
mlf_run.log_plots({"confusion-matrix": plt}, step=1)
plt.show()

In [None]:
# ROC curve built from the second predict_proba column (probability of class 1)
yproba = model.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, yproba)
plt.plot(fpr, tpr, label="ROC curve")
mlf_run.log_plots({"ROC-curve": plt}, step=1)
plt.show()

In [None]:
# Store the test features together with predictions and ground truth, then close the run
mlf_run.log_dataset(
    dataset_name='test_dataset',
    features=x_test,
    actuals=y_test,
    predictions=y_predict,
    only_stats=False,
)
mlf_run.end()


## Weighted Logistic Regression to handle class imbalance


In [None]:
# Class weights: class 1 is weighted twice as heavily as class 0
w = {0: 1, 1: 2}

# Train the weighted variant, then overwrite y_predict with its test predictions
model_weighted = LogisticRegression(class_weight=w, solver="liblinear").fit(x_train, y_train)
y_predict = model_weighted.predict(x_test)

In [None]:
# Test-split report for the class-weighted logistic regression
report = (
    ('Accuracy Score: {}', accuracy_score),
    ('Confusion Matrix: \n{}', confusion_matrix),
    ('Area Under Curve: {}', roc_auc_score),
    ('Recall score: {}', recall_score),
    ('Precision score: {}', precision_score),
    ('f1 score: {}', f1_score),
)
for template, scorer in report:
    print(template.format(scorer(y_test, y_predict)))

In [None]:
# Open a separate tracking run for the class-weighted model
mlf_run = mlf_api.create_run(project_name='digital-transformation-in-banking', run_name="logistic-reg-model-with-class-weight")

In [None]:
# Each key is paired with the scorer of the same name (precision-score ->
# precision_score, recall-score -> recall_score) so the tracked values are
# stored under the correct label.
# NOTE(review): "auc" is computed from hard class labels; scoring the
# predict_proba output instead would give the threshold-free AUC.
metrics_dict = {
    "accuracy": accuracy_score(y_test,y_predict),
    "auc": roc_auc_score(y_test, y_predict),
    "precision-score": precision_score(y_test,y_predict),
    "recall-score": recall_score(y_test,y_predict),
    "f1-score": f1_score(y_test,y_predict)
}
# Attach metrics, hyperparameters and the fitted estimator to the run
mlf_run.log_metrics(metrics_dict)
mlf_run.log_params(model_weighted.get_params())
mlf_run.log_model(model_weighted, framework=mlf.ModelFramework.SKLEARN)

In [None]:
# Confusion matrix of the weighted model: draw it, attach it to the run, render it
mat = confusion_matrix(y_test, y_predict)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=mat).plot()
mlf_run.log_plots({"confusion-matrix": plt}, step=1)
plt.show()

In [None]:
# ROC curve built from the second predict_proba column (probability of class 1)
yproba = model_weighted.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, yproba)
plt.plot(fpr, tpr, label="ROC curve")
mlf_run.log_plots({"ROC-curve": plt}, step=1)
plt.show()

In [None]:
# Store the test features together with predictions and ground truth, then close the run
mlf_run.log_dataset(
    dataset_name='test_dataset',
    features=x_test,
    actuals=y_test,
    predictions=y_predict,
    only_stats=False,
)

mlf_run.end()

### Although the accuracy decreases, AUC and recall increase significantly, so this is the better model. Hence we select "model_weighted".

## Train Naive Bayes algorithm

In [None]:
from sklearn.naive_bayes import GaussianNB # using Gaussian algorithm from Naive Bayes

# create the model
# NOTE(review): the `diab_` prefix looks like a leftover from another notebook; the model is fitted on the banking customer features.
diab_model = GaussianNB()

# Fit on the unscaled training split
diab_model.fit(x_train, y_train)

### Performance with training data

In [None]:
# Accuracy on the same rows the model was fitted on
diab_train_predict = diab_model.predict(x_train)

from sklearn import metrics

train_accuracy = metrics.accuracy_score(y_train, diab_train_predict)
print("Model Accuracy: {0:.4f}".format(train_accuracy))
print()

### Performance with testing data


In [None]:
# Accuracy on the held-out rows; y_predict now holds the Naive Bayes predictions
y_predict = diab_model.predict(x_test)

from sklearn import metrics

test_accuracy = metrics.accuracy_score(y_test, y_predict)
print("Model Accuracy: {0:.4f}".format(test_accuracy))
print()

In [None]:
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_predict)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_predict)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_predict)}')
print(f'Recall score: {recall_score(y_test,y_predict)}')

In [None]:
# Open a tracking run for the Gaussian Naive Bayes model
mlf_run = mlf_api.create_run(project_name='digital-transformation-in-banking', run_name="naive-bayes")

# Each key is paired with the scorer of the same name (precision-score ->
# precision_score, recall-score -> recall_score) so the tracked values are
# stored under the correct label.
# NOTE(review): "auc" is computed from hard class labels; scoring the
# predict_proba output instead would give the threshold-free AUC.
metrics_dict = {
    "accuracy": accuracy_score(y_test,y_predict),
    "auc": roc_auc_score(y_test, y_predict),
    "precision-score": precision_score(y_test,y_predict),
    "recall-score": recall_score(y_test,y_predict),
    "f1-score": f1_score(y_test,y_predict)
}
# Attach metrics, hyperparameters and the fitted estimator to the run
mlf_run.log_metrics(metrics_dict)
mlf_run.log_params(diab_model.get_params())
mlf_run.log_model(diab_model, framework=mlf.ModelFramework.SKLEARN)

In [None]:
# Naive Bayes confusion matrix: draw it, attach it to the run, render it
mat = confusion_matrix(y_test, y_predict)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=mat).plot()
mlf_run.log_plots({"confusion-matrix": plt}, step=1)
plt.show()

In [None]:
# ROC curve built from the second predict_proba column (probability of class 1)
yproba = diab_model.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, yproba)
plt.plot(fpr, tpr, label="ROC curve")
mlf_run.log_plots({"ROC-curve": plt}, step=1)
plt.show()

In [None]:
# Store the test features together with predictions and ground truth, then close the run
mlf_run.log_dataset(
    dataset_name='test_dataset',
    features=x_test,
    actuals=y_test,
    predictions=y_predict,
    only_stats=False,
)
mlf_run.end()

### Use of class priors for imbalanced data

In [None]:
# Override the class priors instead of estimating them from the data. Priors
# follow the order of the sorted class labels: 0.1 for class 0, 0.9 for class 1.
diab_model_cp = GaussianNB(priors=[0.1, 0.9])
# y_train is passed as-is: it is already one-dimensional, and Series.ravel()
# is deprecated in recent pandas releases.
diab_model_cp.fit(x_train, y_train)
y_predict = diab_model_cp.predict(x_test)

In [None]:
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_predict)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_predict)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_predict)}')
print(f'Recall score: {recall_score(y_test,y_predict)}')

In [None]:
# Open a tracking run for the Naive Bayes model with explicit priors
mlf_run = mlf_api.create_run(project_name='digital-transformation-in-banking', run_name="naive-bayes-with-priors")

# Each key is paired with the scorer of the same name (precision-score ->
# precision_score, recall-score -> recall_score) so the tracked values are
# stored under the correct label.
# NOTE(review): "auc" is computed from hard class labels; scoring the
# predict_proba output instead would give the threshold-free AUC.
metrics_dict = {
    "accuracy": accuracy_score(y_test,y_predict),
    "auc": roc_auc_score(y_test, y_predict),
    "precision-score": precision_score(y_test,y_predict),
    "recall-score": recall_score(y_test,y_predict),
    "f1-score": f1_score(y_test,y_predict)
}
# Attach metrics, hyperparameters and the fitted estimator to the run
mlf_run.log_metrics(metrics_dict)
mlf_run.log_params(diab_model_cp.get_params())
mlf_run.log_model(diab_model_cp, framework=mlf.ModelFramework.SKLEARN)

In [None]:
# Confusion matrix of the priors model: draw it, attach it to the run, render it
mat = confusion_matrix(y_test, y_predict)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=mat).plot()
mlf_run.log_plots({"confusion-matrix": plt}, step=1)
plt.show()

In [None]:
# ROC curve built from the second predict_proba column (probability of class 1)
yproba = diab_model_cp.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, yproba)
plt.plot(fpr, tpr, label="ROC curve")
mlf_run.log_plots({"ROC-curve": plt}, step=1)
plt.show()

In [None]:
# Store the test features together with predictions and ground truth, then close the run
mlf_run.log_dataset(
    dataset_name='test_dataset',
    features=x_test,
    actuals=y_test,
    predictions=y_predict,
    only_stats=False,
)
mlf_run.end()

## Support Vector Machines

In [None]:
from sklearn import svm

# SVM fitted on the raw (unscaled) features; y_predict holds its test predictions
clf = svm.SVC(C=10, gamma=0.25).fit(x_train, y_train)
y_predict = clf.predict(x_test)

In [None]:
### gamma measures the influence of a single data point; it is the inverse of the distance over which that influence reaches.
### C is the penalty for wrong classifications.

In [None]:
# Test-split report for the SVM trained on unscaled features
report = (
    ('Accuracy Score: {}', accuracy_score),
    ('Confusion Matrix: \n{}', confusion_matrix),
    ('Area Under Curve: {}', roc_auc_score),
    ('Recall score: {}', recall_score),
    ('Precision score: {}', precision_score),
    ('f1 score: {}', f1_score),
)
for template, scorer in report:
    print(template.format(scorer(y_test, y_predict)))

In [None]:
from scipy.stats import zscore
# NOTE(review): the z-scores are computed over all rows of X before any train/test split,
# so rows that later form the test set contribute to each column's mean and std.
XScaled  = X.apply(zscore)  # convert all attributes to Z scale 
XScaled.describe()

In [None]:
# Split the scaled features with the same test_size and seed as the unscaled split
x_trains, x_tests, y_trains, y_tests = train_test_split(XScaled, Y, test_size=0.3, random_state=1)

In [None]:
# SVM on the z-scored features; probability=True makes predict_proba available on the fitted model
clf = svm.SVC(gamma=0.25, C=10, probability=True)
clf.fit(x_trains , y_trains)
# Predictions for the scaled test split are kept under their own name
y_predicts = clf.predict(x_tests)

In [None]:
# Test-split report for the SVM trained on z-scored features
report = (
    ('Accuracy Score: {}', accuracy_score),
    ('Confusion Matrix: \n{}', confusion_matrix),
    ('Area Under Curve: {}', roc_auc_score),
    ('Recall score: {}', recall_score),
    ('Precision score: {}', precision_score),
    ('f1 score: {}', f1_score),
)
for template, scorer in report:
    print(template.format(scorer(y_tests, y_predicts)))

In [None]:
# Open a tracking run for the SVM trained on z-scored features
mlf_run = mlf_api.create_run(project_name='digital-transformation-in-banking', run_name="svm")

# `clf` is the model fitted on the scaled split, so the metrics must come from
# its own predictions (y_predicts on y_tests), not from the earlier unscaled
# SVM's y_predict. Each key is paired with the scorer of the same name.
# NOTE(review): "auc" is computed from hard class labels; scoring the
# predict_proba output instead would give the threshold-free AUC.
metrics_dict = {
    "accuracy": accuracy_score(y_tests,y_predicts),
    "auc": roc_auc_score(y_tests, y_predicts),
    "precision-score": precision_score(y_tests,y_predicts),
    "recall-score": recall_score(y_tests,y_predicts),
    "f1-score": f1_score(y_tests,y_predicts)
}
# Attach metrics, hyperparameters and the fitted estimator to the run
mlf_run.log_metrics(metrics_dict)
mlf_run.log_params(clf.get_params())
mlf_run.log_model(clf, framework=mlf.ModelFramework.SKLEARN)

In [None]:
# Confusion matrix of the scaled SVM: uses y_predicts (predictions of the
# model being logged), not y_predict from the earlier unscaled SVM.
mat = confusion_matrix(y_tests, y_predicts)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=mat)
disp.plot()
mlf_run.log_plots({"confusion-matrix": plt}, step=1)
plt.show()

In [None]:
# `clf` was fitted on z-scored features, so it must be scored on the scaled
# test split (x_tests); feeding it the raw x_test gives meaningless probabilities.
yproba = clf.predict_proba(x_tests)[:, 1]
fpr, tpr, _ = roc_curve(y_tests, yproba)
plt.plot(fpr, tpr, label="ROC curve")
mlf_run.log_plots({"ROC-curve": plt}, step=1)
plt.show()

In [None]:
# Log the data the scaled SVM was actually evaluated on: the z-scored test
# features and that model's predictions, then close the run.
mlf_run.log_dataset(
    dataset_name = 'test_dataset',
    features = x_tests,
    predictions = y_predicts,
    actuals = y_tests,
    only_stats = False,
)
mlf_run.end()

## Decision Tree Classifier

In [None]:
# Build decision tree model
from sklearn.tree import DecisionTreeClassifier

dTree = DecisionTreeClassifier(criterion = 'gini', random_state=1)
dTree.fit(x_train, y_train)

In [None]:
# Scoring our DT
print(dTree.score(x_train, y_train))
print(dTree.score(x_test, y_test))

In [None]:
# Test-split report for the unrestricted decision tree
y_predict = dTree.predict(x_test)
report = (
    ('Accuracy Score: {}', accuracy_score),
    ('Confusion Matrix: \n{}', confusion_matrix),
    ('Area Under Curve: {}', roc_auc_score),
    ('Recall score: {}', recall_score),
    ('Precision score: {}', precision_score),
    ('f1 score: {}', f1_score),
)
for template, scorer in report:
    print(template.format(scorer(y_test, y_predict)))

In [None]:
# Reduce over-fitting (regularization) by capping the tree depth at 5
dTreeR = DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=1)
dTreeR.fit(x_train, y_train)
# Training score first, then test score
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(dTreeR.score(features, labels))

In [None]:
# Test-split report for the depth-limited decision tree
y_predictR = dTreeR.predict(x_test)
report = (
    ('Accuracy Score: {}', accuracy_score),
    ('Confusion Matrix: \n{}', confusion_matrix),
    ('Area Under Curve: {}', roc_auc_score),
    ('Recall score: {}', recall_score),
    ('Precision score: {}', precision_score),
    ('f1 score: {}', f1_score),
)
for template, scorer in report:
    print(template.format(scorer(y_test, y_predictR)))

In [None]:
# Decision Tree Visualize
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# A separate depth-3 tree is fitted just for the diagram
dTreeR3 = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1)
dTreeR3.fit(x_train, y_train)

fn = list(x_train.columns)   # feature names shown in the split nodes
cn = ['0', '1']              # class names shown in the nodes
fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(4, 4), dpi=300)
plot_tree(dTreeR3, feature_names=fn, class_names=cn, filled=True)

# Save the rendered diagram next to the notebook
fig.savefig('tree.png')

In [None]:
# Open a tracking run for the depth-limited decision tree
mlf_run = mlf_api.create_run(project_name='digital-transformation-in-banking', run_name="decision-tree")

# The metrics are computed from y_predictR, i.e. the depth-limited tree dTreeR,
# so dTreeR (not the unrestricted dTree) is the model whose parameters and
# weights are logged. Each key is paired with the scorer of the same name.
# NOTE(review): "auc" is computed from hard class labels; scoring the
# predict_proba output instead would give the threshold-free AUC.
metrics_dict = {
    "accuracy": accuracy_score(y_test,y_predictR),
    "auc": roc_auc_score(y_test, y_predictR),
    "precision-score": precision_score(y_test,y_predictR),
    "recall-score": recall_score(y_test,y_predictR),
    "f1-score": f1_score(y_test,y_predictR)
}
mlf_run.log_metrics(metrics_dict)
mlf_run.log_params(dTreeR.get_params())
mlf_run.log_model(dTreeR, framework=mlf.ModelFramework.SKLEARN)

In [None]:
# Confusion matrix of the depth-limited tree: draw it, attach it to the run, render it
mat = confusion_matrix(y_test, y_predictR)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=mat).plot()
mlf_run.log_plots({"confusion-matrix": plt}, step=1)
plt.show()

In [None]:
# ROC curve for the decision-tree run: probabilities come from dTreeR, the
# tree whose metrics are logged in this run. (`clf` is the SVM, and was
# fitted on scaled features, so it must not be used here.)
yproba = dTreeR.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, yproba)
plt.plot(fpr, tpr, label="ROC curve")
mlf_run.log_plots({"ROC-curve": plt}, step=1)
plt.show()

In [None]:
# Store the test features with the depth-limited tree's predictions and the ground truth, then close the run
mlf_run.log_dataset(
    dataset_name='test_dataset',
    features=x_test,
    actuals=y_test,
    predictions=y_predictR,
    only_stats=False,
)
mlf_run.end()

## Ensemble Learning: Random forest classifier


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with library-default settings apart from the fixed seed
rfcl = RandomForestClassifier(random_state=1).fit(x_train, y_train)
y_predict = rfcl.predict(x_test)

In [None]:
# performance
print(f'Accuracy Score: {accuracy_score(y_test,y_predict)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_test, y_predict)}')
print(f'Area Under Curve: {roc_auc_score(y_test, y_predict)}')
print(f'Recall score: {recall_score(y_test,y_predict)}')
print(f'Precision score: {precision_score(y_test,y_predict)}')
print(f'f1 score: {f1_score(y_test,y_predict)}')

In [None]:
# Open a tracking run for the random forest
mlf_run = mlf_api.create_run(project_name='digital-transformation-in-banking', run_name="random-forest")

# Each key is paired with the scorer of the same name (precision-score ->
# precision_score, recall-score -> recall_score) so the tracked values are
# stored under the correct label.
# NOTE(review): "auc" is computed from hard class labels; scoring the
# predict_proba output instead would give the threshold-free AUC.
metrics_dict = {
    "accuracy": accuracy_score(y_test,y_predict),
    "auc": roc_auc_score(y_test, y_predict),
    "precision-score": precision_score(y_test,y_predict),
    "recall-score": recall_score(y_test,y_predict),
    "f1-score": f1_score(y_test,y_predict)
}
# Attach metrics, hyperparameters and the fitted estimator to the run
mlf_run.log_metrics(metrics_dict)
mlf_run.log_params(rfcl.get_params())
mlf_run.log_model(rfcl, framework=mlf.ModelFramework.SKLEARN)

In [None]:
# Random-forest confusion matrix: draw it, attach it to the run, render it
mat = confusion_matrix(y_test, y_predict)
disp = metrics.ConfusionMatrixDisplay(confusion_matrix=mat).plot()
mlf_run.log_plots({"confusion-matrix": plt}, step=1)
plt.show()

In [None]:
# ROC curve built from the second predict_proba column (probability of class 1)
yproba = rfcl.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_test, yproba)
plt.plot(fpr, tpr, label="ROC curve")
mlf_run.log_plots({"ROC-curve": plt}, step=1)
plt.show()

In [None]:
# Store the test features together with predictions and ground truth, then close the run
mlf_run.log_dataset(
    dataset_name='test_dataset',
    features=x_test,
    actuals=y_test,
    predictions=y_predict,
    only_stats=False,
)
mlf_run.end()

## Unbalanced Data Handling

In [None]:
# Install imbalanced-learn if you have not used before
!pip install imbalanced-learn

In [None]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from collections import Counter

# Class distribution before resampling
counter = Counter(Y)
print(counter)

# Resampling pipeline: SMOTE over-samples the minority class first
# (sampling_strategy=0.3), then the majority class is randomly under-sampled
# (sampling_strategy=0.5). Both steps are seeded so that a fresh run
# reproduces exactly the same resampled rows.
over = SMOTE(sampling_strategy=0.3, random_state=1)
under = RandomUnderSampler(sampling_strategy=0.5, random_state=1)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# Resample the scaled data set.
# NOTE(review): every row of XScaled is resampled here, before any train/test split.
Xb, Yb = pipeline.fit_resample(XScaled, Y)

# Class distribution after resampling
counter = Counter(Yb)
print(counter)

In [None]:
# Split the scaled data FIRST, then rebalance only the training portion.
# Splitting an already-resampled data set leaks information: SMOTE points
# interpolated from rows that end up in the test set would be trained on, and
# the test set itself would contain synthetic rows, inflating every score.
# The test split therefore keeps the original, imbalanced class distribution.
x_train_imb, x_testb, y_train_imb, y_testb = train_test_split(XScaled, Y, test_size=0.3, random_state=1)
# 1 is just any random seed number
x_trainb, y_trainb = pipeline.fit_resample(x_train_imb, y_train_imb)

## SVM with balanced Data

In [None]:
# SVM fitted on the rebalanced training split, evaluated on its matching test split
clf = svm.SVC(C=10, gamma=0.25).fit(x_trainb, y_trainb)
y_predictb = clf.predict(x_testb)

In [None]:
# performance
print(f'Accuracy Score: {accuracy_score(y_testb,y_predictb)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_testb, y_predictb)}')
print(f'Area Under Curve: {roc_auc_score(y_testb, y_predictb)}')
print(f'Recall score: {recall_score(y_testb,y_predictb)}')
print(f'Precision score: {precision_score(y_testb,y_predictb)}')
print(f'f1 score: {f1_score(y_testb,y_predictb)}')

## Random Forest classifier with Balanced Data

In [None]:
# Random forest refitted on the rebalanced training split; rfcl and y_predict are overwritten
rfcl = RandomForestClassifier(random_state=1).fit(x_trainb, y_trainb)
y_predict = rfcl.predict(x_testb)

In [None]:
# performance
print(f'Accuracy Score: {accuracy_score(y_testb,y_predict)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_testb, y_predict)}')
print(f'Area Under Curve: {roc_auc_score(y_testb, y_predict)}')
print(f'Recall score: {recall_score(y_testb,y_predict)}')
print(f'Precision score: {precision_score(y_testb,y_predict)}')
print(f'f1 score: {f1_score(y_testb,y_predict)}')

### Choosing hyperparameters using Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
# Search space: 4 values of C x 4 values of gamma x 3 kernels, tried exhaustively
param_grid = {'C': [0.1,1, 10, 100], 'gamma': [1,0.1,0.25,0.01],'kernel': ['rbf', 'poly', 'sigmoid']}
# refit=True retrains the best combination at the end; no `scoring` argument is passed,
# so candidates are ranked by the estimator's default score.
grid = GridSearchCV(svm.SVC(),param_grid,refit=True,verbose=2)
grid.fit(x_trainb,y_trainb)
# Show the refitted estimator with the winning hyperparameters
print(grid.best_estimator_)

## Pickle the model


In [None]:
# Pickle model file
import pickle
filename = 'finalized_model.sav'
# The context manager flushes and closes the file even if serialisation fails
with open(filename, 'wb') as model_file:
    pickle.dump(rfcl, model_file)

## Load model from pickle file and use

In [None]:
# Checking the pickled model: reload it and confirm it still scores the test split.
# NOTE: pickle.load can execute arbitrary code, so only load files you created yourself.
with open(filename, 'rb') as model_file:   # handle is closed once the model is loaded
    loaded_model = pickle.load(model_file)
result = loaded_model.predict(x_testb)
# performance
print(f'Accuracy Score: {accuracy_score(y_testb,result)}')
print(f'Confusion Matrix: \n{confusion_matrix(y_testb, result)}')
print(f'Area Under Curve: {roc_auc_score(y_testb, result)}')
print(f'Recall score: {recall_score(y_testb,result)}')
print(f'Precision score: {precision_score(y_testb,result)}')
print(f'f1 score: {f1_score(y_testb,result)}')

# Conclusion: 
### We have built models using logistic regression, Naive Bayes, support vector machine, decision tree and random forest classifiers. This data set is highly imbalanced, hence accuracy can't be a good measure; we have therefore used precision, recall, and AUC to determine the better model.
### We used the class-weight technique to handle unbalanced data and observed that model performance improved when class weights were considered.
### Scaling/data transformation plays a major role when we work on SVM. 
### We have also explored undersampling and oversampling technique like SMOTE to handle data imbalance.
### We have also performed hyperparameter tuning using Grid Search.
### We have also seen how to systematically improve a model.
