# Modelling

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
from pathlib import Path

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from tabulate import tabulate

## Import the sampled CSV data

In [2]:
# Load every pre-sampled dataset from the Data/ folder. The paths are listed
# in the same order as the names they are unpacked into below.
sample_files = (
    'Data/data_og.csv',
    'Data/data_nm1.csv',
    'Data/data_nm2.csv',
    'Data/data_nm3.csv',
    'Data/data_rus.csv',
    'Data/data_ros.csv',
    'Data/data_smote.csv',
    'Data/data_smoteen.csv',
)
(df_og, df_nm1, df_nm2, df_nm3,
 df_rus, df_ros, df_smote, df_smoteen) = map(pd.read_csv, sample_files)

## Logistic Regression

In [3]:
def modellr(df_t,m):
    """Fit logistic regression on one sampled dataset and record test metrics.

    The last column of ``df_t`` is used as the target and every preceding
    column as a feature. The data is split 75/25 with a fixed seed, the
    features are standardised using statistics learned from the training
    portion only, and one result row is appended to the module-level
    ``disp`` list.

    Parameters
    ----------
    df_t : pandas.DataFrame
        Dataset whose last column is the label. The metric calls rely on
        scikit-learn's default averaging, so the label is presumably
        binary 0/1 -- confirm against the sampling notebook.
    m : str
        Name of the sampling technique; stored as the first cell of the row.

    Returns
    -------
    None
        The row ``[m, accuracy%, precision, recall, f1]`` (all strings) is
        appended to ``disp`` as a side effect.
    """
    # NOTE(review): the test portion is taken from the same frame that was
    # passed in, so for resampled inputs the scores describe resampled data --
    # confirm whether a held-out original test set is intended.
    X = df_t.iloc[:, :-1]
    y = df_t.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )

    # Feature scaling: bind the scaled arrays to new objects instead of
    # writing them back into the split frames with ``.loc[:, :] =``, which
    # raises SettingWithCopyWarning and depends on deprecated dtype upcasting
    # when a feature column is integer-typed.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Training the model
    classifier = LogisticRegression(random_state=0)
    classifier.fit(X_train, y_train)

    # Predict
    y_pred = classifier.predict(X_test)

    # Metrics: zero_division=0 is passed to all three ratio metrics so an
    # undefined score is reported as 0 without a warning, matching what the
    # precision call already did.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    disp.append([
        m,
        str(round(accuracy * 100, 2)) + "%",
        str(round(precision, 2)),
        str(round(recall, 2)),
        str(round(f1, 2)),
    ])

In [4]:
# Evaluate logistic regression on every sampled dataset.
# modellr appends one metrics row per call to the module-level `disp` list.
names = ['Original','Near Miss1', 'Near Miss2','Near Miss3','Random UnderSampling','Random Sampling','Smote','Smoteen' ]
data_sample_set = [df_og,df_nm1,df_nm2,df_nm3,df_rus,df_ros,df_smote,df_smoteen]

# allmodels collects one [model name, result rows] pair per classifier.
allmodels = []
disp = []

for frame, label in zip(data_sample_set, names):
    modellr(frame, label)

allmodels.append(["Logistic Regression",disp])

In [5]:
# Render the rows currently held in `disp` as a plain-text table.
table_headers = ["Sampling Technique", "Accuracy", "Precision","Recall","F1 Score"]
print("Model used : Logistic Regression\n")
print(tabulate(disp, headers=table_headers))

Model used : Logistic Regression

Sampling Technique    Accuracy      Precision    Recall    F1 Score
--------------------  ----------  -----------  --------  ----------
Original              98.69%             0         0           0
Near Miss1            67.24%             0.7       0.62        0.66
Near Miss2            79.31%             0.87      0.7         0.77
Near Miss3            62.07%             0.62      0.64        0.63
Random UnderSampling  71.98%             0.75      0.67        0.71
Random Sampling       76.15%             0.78      0.73        0.75
Smote                 88.06%             0.84      0.93        0.89
Smoteen               90.56%             0.89      0.94        0.91


## KNN

In [6]:
def modelknn(df_t,m):
    """Fit a 5-nearest-neighbours classifier on one dataset and record metrics.

    The last column of ``df_t`` is used as the target and every preceding
    column as a feature. The data is split 75/25 with a fixed seed, the
    features are standardised using statistics learned from the training
    portion only, and one result row is appended to the module-level
    ``disp`` list.

    Parameters
    ----------
    df_t : pandas.DataFrame
        Dataset whose last column is the label. The metric calls rely on
        scikit-learn's default averaging, so the label is presumably
        binary 0/1 -- confirm against the sampling notebook.
    m : str
        Name of the sampling technique; stored as the first cell of the row.

    Returns
    -------
    None
        The row ``[m, accuracy%, precision, recall, f1]`` (all strings) is
        appended to ``disp`` as a side effect.
    """
    # NOTE(review): the test portion is taken from the same frame that was
    # passed in, so for resampled inputs the scores describe resampled data --
    # confirm whether a held-out original test set is intended.
    X = df_t.iloc[:, :-1]
    y = df_t.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )

    # Feature scaling: bind the scaled arrays to new objects instead of
    # writing them back into the split frames with ``.loc[:, :] =``, which
    # raises SettingWithCopyWarning and depends on deprecated dtype upcasting
    # when a feature column is integer-typed.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Training the model (Minkowski metric with p=2, 5 neighbours)
    classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    classifier.fit(X_train, y_train)

    # Predict
    y_pred = classifier.predict(X_test)

    # Metrics: zero_division=0 is passed to all three ratio metrics so an
    # undefined score is reported as 0 without a warning, matching what the
    # precision call already did.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    disp.append([
        m,
        str(round(accuracy * 100, 2)) + "%",
        str(round(precision, 2)),
        str(round(recall, 2)),
        str(round(f1, 2)),
    ])
    
# Evaluate KNN on every sampled dataset.
# modelknn appends one metrics row per call to the module-level `disp` list.
names = ['Original','Near Miss1', 'Near Miss2','Near Miss3','Random UnderSampling','Random Sampling','Smote','Smoteen' ]
data_sample_set = [df_og,df_nm1,df_nm2,df_nm3,df_rus,df_ros,df_smote,df_smoteen]

# Rebind `disp` to a new list (rather than clearing it) so any list already
# stored in `allmodels` keeps its rows.
disp = []

for frame, label in zip(data_sample_set, names):
    modelknn(frame, label)

allmodels.append(["KNN",disp])

In [7]:
# Render the rows currently held in `disp` as a plain-text table.
table_headers = ["Sampling Technique", "Accuracy", "Precision","Recall","F1 Score"]
print("Model used : KNN\n")
print(tabulate(disp, headers=table_headers))

Model used : KNN

Sampling Technique    Accuracy      Precision    Recall    F1 Score
--------------------  ----------  -----------  --------  ----------
Original              98.64%             0         0           0
Near Miss1            76.72%             0.84      0.67        0.75
Near Miss2            92.24%             0.99      0.86        0.92
Near Miss3            65.3%              0.69      0.57        0.63
Random UnderSampling  70.26%             0.71      0.7         0.71
Random Sampling       97.25%             0.95      1           0.97
Smote                 94.94%             0.92      0.98        0.95
Smoteen               97.31%             0.96      0.99        0.98


## Decision Tree

In [8]:
def modeldt(df_t,m):
    """Fit an entropy-based decision tree on one dataset and record metrics.

    The last column of ``df_t`` is used as the target and every preceding
    column as a feature. The data is split 75/25 with a fixed seed, the
    features are standardised using statistics learned from the training
    portion only, and one result row is appended to the module-level
    ``disp`` list.

    Parameters
    ----------
    df_t : pandas.DataFrame
        Dataset whose last column is the label. The metric calls rely on
        scikit-learn's default averaging, so the label is presumably
        binary 0/1 -- confirm against the sampling notebook.
    m : str
        Name of the sampling technique; stored as the first cell of the row.

    Returns
    -------
    None
        The row ``[m, accuracy%, precision, recall, f1]`` (all strings) is
        appended to ``disp`` as a side effect.
    """
    # Kept as a function-local import, as in the original cell, because the
    # notebook's import cell does not bring this class into scope.
    from sklearn.tree import DecisionTreeClassifier

    # NOTE(review): the test portion is taken from the same frame that was
    # passed in, so for resampled inputs the scores describe resampled data --
    # confirm whether a held-out original test set is intended.
    X = df_t.iloc[:, :-1]
    y = df_t.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )

    # Feature scaling: bind the scaled arrays to new objects instead of
    # writing them back into the split frames with ``.loc[:, :] =``, which
    # raises SettingWithCopyWarning and depends on deprecated dtype upcasting
    # when a feature column is integer-typed.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Training the model
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)

    # Predict
    y_pred = classifier.predict(X_test)

    # Metrics: zero_division=0 is passed to all three ratio metrics so an
    # undefined score is reported as 0 without a warning, matching what the
    # precision call already did.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    disp.append([
        m,
        str(round(accuracy * 100, 2)) + "%",
        str(round(precision, 2)),
        str(round(recall, 2)),
        str(round(f1, 2)),
    ])
    
# Evaluate the decision tree on every sampled dataset.
# modeldt appends one metrics row per call to the module-level `disp` list.
names = ['Original','Near Miss1', 'Near Miss2','Near Miss3','Random UnderSampling','Random Sampling','Smote','Smoteen' ]
data_sample_set = [df_og,df_nm1,df_nm2,df_nm3,df_rus,df_ros,df_smote,df_smoteen]

# Rebind `disp` to a new list (rather than clearing it) so any list already
# stored in `allmodels` keeps its rows.
disp = []

for frame, label in zip(data_sample_set, names):
    modeldt(frame, label)

allmodels.append(["Decision Tree",disp])

In [9]:
# Render the rows currently held in `disp` as a plain-text table.
table_headers = ["Sampling Technique", "Accuracy", "Precision","Recall","F1 Score"]
print("Model used : Decision Tree\n")
print(tabulate(disp, headers=table_headers))

Model used : Decision Tree

Sampling Technique    Accuracy      Precision    Recall    F1 Score
--------------------  ----------  -----------  --------  ----------
Original              97.3%              0.06      0.07        0.06
Near Miss1            79.31%             0.8       0.79        0.79
Near Miss2            93.53%             0.94      0.94        0.94
Near Miss3            62.28%             0.64      0.59        0.61
Random UnderSampling  62.72%             0.63      0.63        0.63
Random Sampling       98.91%             0.98      1           0.99
Smote                 96.89%             0.96      0.98        0.97
Smoteen               98.48%             0.98      0.99        0.99


## Naive Bayes

In [None]:
def modelNB(df_t,m):
    """Fit Gaussian naive Bayes on one sampled dataset and record metrics.

    The last column of ``df_t`` is used as the target and every preceding
    column as a feature. The data is split 75/25 with a fixed seed, the
    features are standardised using statistics learned from the training
    portion only, and one result row is appended to the module-level
    ``disp`` list.

    Parameters
    ----------
    df_t : pandas.DataFrame
        Dataset whose last column is the label. The metric calls rely on
        scikit-learn's default averaging, so the label is presumably
        binary 0/1 -- confirm against the sampling notebook.
    m : str
        Name of the sampling technique; stored as the first cell of the row.

    Returns
    -------
    None
        The row ``[m, accuracy%, precision, recall, f1]`` (all strings) is
        appended to ``disp`` as a side effect.
    """
    # Kept as a function-local import, as in the original cell, because the
    # notebook's import cell does not bring this class into scope.
    from sklearn.naive_bayes import GaussianNB

    # NOTE(review): the test portion is taken from the same frame that was
    # passed in, so for resampled inputs the scores describe resampled data --
    # confirm whether a held-out original test set is intended.
    X = df_t.iloc[:, :-1]
    y = df_t.iloc[:, -1]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=0
    )

    # Feature scaling: bind the scaled arrays to new objects instead of
    # writing them back into the split frames with ``.loc[:, :] =``, which
    # raises SettingWithCopyWarning and depends on deprecated dtype upcasting
    # when a feature column is integer-typed.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)

    # Training the model
    classifier = GaussianNB()
    classifier.fit(X_train, y_train)

    # Predict
    y_pred = classifier.predict(X_test)

    # Metrics: zero_division=0 is passed to all three ratio metrics so an
    # undefined score is reported as 0 without a warning, matching what the
    # precision call already did.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    disp.append([
        m,
        str(round(accuracy * 100, 2)) + "%",
        str(round(precision, 2)),
        str(round(recall, 2)),
        str(round(f1, 2)),
    ])
    
# Evaluate Gaussian naive Bayes on every sampled dataset.
# modelNB appends one metrics row per call to the module-level `disp` list.
names = ['Original','Near Miss1', 'Near Miss2','Near Miss3','Random UnderSampling','Random Sampling','Smote','Smoteen' ]
data_sample_set = [df_og,df_nm1,df_nm2,df_nm3,df_rus,df_ros,df_smote,df_smoteen]

# Rebind `disp` to a new list (rather than clearing it) so any list already
# stored in `allmodels` keeps its rows.
disp = []

for frame, label in zip(data_sample_set, names):
    modelNB(frame, label)

allmodels.append(["Naive Bayes",disp])

In [None]:
# Render the rows currently held in `disp` as a plain-text table.
table_headers = ["Sampling Technique", "Accuracy", "Precision","Recall","F1 Score"]
print("Model used : Naive Bayes\n")
print(tabulate(disp, headers=table_headers))