In [4]:
#importing required libraries for project
import numpy as np 
from numpy import argmax
import pandas as pd 
import os
import matplotlib.pyplot as plt 
import matplotlib.ticker as mticker 
from matplotlib.ticker import ScalarFormatter 
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, auc, precision_recall_curve
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, log_loss, brier_score_loss, classification_report
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from imblearn.over_sampling import SMOTE
import datetime
from sklearn.pipeline import Pipeline



In [5]:
#importing and reading data
data = pd.read_csv('Fraud_Analysis_Data.csv')
data

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0


#  Data Preprocessing

In [6]:
#Average time delta between transactions not used in calculations
def list_diff(x_input):
    """Return the mean difference between consecutive elements of ``x_input``.

    Parameters
    ----------
    x_input : sequence of numbers
        Values in the order they were observed (here: the ``step`` values of
        one account's transactions).

    Returns
    -------
    int or numpy.floating
        ``0`` when fewer than two values are supplied (no gap can be
        measured); otherwise the arithmetic mean of ``x[n] - x[n-1]``.
    """
    if len(x_input) < 2:
        return 0
    # np.diff produces all consecutive differences in one vectorised call.
    return np.mean(np.diff(x_input))
    
#Delta before last transaction not used in calculations
def delta_last(x_input):
    """Return the gap between the last two elements of ``x_input``.

    Gives ``0`` when ``x_input`` holds fewer than two elements, because there
    is no previous value to compare against.
    """
    has_previous = len(x_input) >= 2
    return x_input[-1] - x_input[-2] if has_previous else 0

def data_preprocessing(data):
    """Add per-account aggregates, account-type flags and calendar features.

    The input frame is copied first, so the caller's DataFrame is left
    untouched (previously the aggregate and type columns were written into
    the frame that was passed in).

    Parameters
    ----------
    data : pandas.DataFrame
        Transactions table. The columns read here are ``step``, ``amount``,
        ``nameOrig`` and ``nameDest``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns plus:
        ``New_TotalOrig`` / ``New_TotalDest`` (transaction counts per account),
        ``New_TotalMeanOrig`` / ``New_TotalMeanDest`` (mean amount per account),
        ``New_TotalSumOrig`` / ``New_TotalSumDest`` (total amount per account),
        ``New_TypeOrig`` / ``New_TypeDest`` (first character of the account name),
        ``New_Delta_Time_Tr_*`` / ``New_Delta_Last_Tr_*`` (mean and last gap in
        ``step`` between an account's transactions),
        ``res_data``, ``date``, ``day_of_week``, ``hour``, ``month``.
        ``nameOrig`` / ``nameDest`` are returned without their first character.

    Notes
    -----
    According to the original author's comments, most of the engineered
    columns are exploratory and are not used in the model calculations.
    """
    data = data.copy()

    # Number of transactions sent by each originator / received by each recipient.
    data['New_TotalOrig'] = data['nameOrig'].map(data['nameOrig'].value_counts())
    data['New_TotalDest'] = data['nameDest'].map(data['nameDest'].value_counts())

    # Mean transaction amount per account. transform() broadcasts the group
    # statistic back onto every row, replacing the groupby -> dict -> map detour.
    data['New_TotalMeanOrig'] = data.groupby('nameOrig')['amount'].transform('mean')
    data['New_TotalMeanDest'] = data.groupby('nameDest')['amount'].transform('mean')

    # Total transaction amount per account. The string 'sum' is used instead of
    # the Python builtin, which recent pandas versions deprecate inside agg().
    data['New_TotalSumOrig'] = data.groupby('nameOrig')['amount'].transform('sum')
    data['New_TotalSumDest'] = data.groupby('nameDest')['amount'].transform('sum')

    # Account type == first letter of the account name.
    data['New_TypeOrig'] = data['nameOrig'].str[0]
    data['New_TypeDest'] = data['nameDest'].str[0]

    # Per-originator list of steps (in row order), attached to every row of that
    # account so the gap statistics can be computed row-wise.
    steps_by_orig = data.groupby('nameOrig')['step'].apply(list).reset_index(name='info')
    data = pd.merge(data, steps_by_orig, how='left', on='nameOrig')
    data['New_Delta_Time_Tr_Orig'] = data['info'].apply(list_diff)
    data['New_Delta_Last_Tr_Orig'] = data['info'].apply(delta_last)

    # Same gap statistics for the recipient account.
    steps_by_dest = data.groupby('nameDest')['step'].apply(list).reset_index(name='info_2')
    data = pd.merge(data, steps_by_dest, how='left', on='nameDest')
    data['New_Delta_Time_Tr_Dest'] = data['info_2'].apply(list_diff)
    data['New_Delta_Last_Tr_Dest'] = data['info_2'].apply(delta_last)

    # The helper list columns are no longer needed.
    data = data.drop(columns=['info', 'info_2'])

    # Strip the type letter now that it lives in New_TypeOrig / New_TypeDest.
    data['nameOrig'] = data['nameOrig'].str[1:]
    data['nameDest'] = data['nameDest'].str[1:]

    # Interpret `step` as a number of hours after the reference origin
    # 2000-01-01; the calendar fields below are relative to that origin.
    data['res_data'] = pd.to_datetime(data['step'], unit='h', origin=pd.Timestamp('2000-01-01'))

    # Date, day of week, hour and month derived from the timestamp.
    data['date'] = data.res_data.dt.date
    data['day_of_week'] = data.res_data.dt.dayofweek
    data['hour'] = data.res_data.dt.hour
    data['month'] = data.res_data.dt.month

    return data
        

In [None]:
#loading the preprocesses data
data_f = data_preprocessing(data)
data_f

In [None]:
data_f.columns

# EDA

Let's check if there are duplicates in the data

In [None]:
#printing duplicate values
print('Number of duplicates are : ', data_f.duplicated().sum())

let's check for gaps in the data

In [None]:
#checking for null values
data_f.isnull().sum()

let's plot Correlation Matrix 

In [None]:
#plotting Correlation Heatmap
def heatmap_eda(data):
    """Plot a lower-triangle correlation heatmap of the original transaction columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns selected below. Non-numeric columns in
        that selection are left out of the correlation.
    """
    data_heat = data[['step', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
                      'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
                      'isFlaggedFraud']]
    # Correlation is only defined for numeric columns. Selecting them explicitly
    # avoids the error pandas >= 2.0 raises when corr() meets string columns,
    # and the matrix is computed once instead of twice.
    corr = data_heat.select_dtypes(include='number').corr()

    plt.figure(figsize=(6, 4))
    # Mask the upper triangle and the diagonal. The builtin bool is used
    # because the np.bool alias no longer exists in current NumPy releases.
    mask = np.triu(np.ones_like(corr, dtype=bool))
    heatmap = sns.heatmap(corr, mask=mask, vmin=-1, vmax=1, center=0, annot=False, cmap="Set2")
    heatmap.set_title('Correlation Heatmap', pad=12)
    plt.show()

heatmap_eda(data_f)

Let's make a barplot to see the fraud and non fraud transactions in different transactions type

In [None]:
#plotting barplot to check for fraud and non fraud transactions
def countplot_eda(data_f):
    """Plot transaction counts per ``type``, split by ``isFraud``, on a log y-axis.

    Parameters
    ----------
    data_f : pandas.DataFrame
        Frame with at least the ``type`` and ``isFraud`` columns.
    """
    sns.set_style('whitegrid')
    sns.set_context('notebook')
    plt.figure(figsize=(8, 4))
    # Plot the frame that was passed in; the body previously read an undefined
    # global named `data_plot`, which fails on a fresh kernel.
    count_plot = sns.countplot(data=data_f, x='type', hue='isFraud', palette="pastel")
    count_plot.set_xlabel('Type')
    count_plot.set_ylabel('Count')
    # Log scale keeps the rare fraud bars visible next to the non-fraud bars.
    count_plot.set_yscale('log')
    # Show plain numbers instead of powers of ten on the log axis.
    count_plot.yaxis.set_major_formatter(mticker.ScalarFormatter())
    plt.show()

countplot_eda(data_f)

* As we can see, fraud transactions are done in TRANSFER and CASH_OUT transaction type.

In [None]:
#plotting histogram 
def histplot_eda(data, sample_size=100000):
    """Plot the log-scaled distribution of ``amount``, split by ``isFraud``.

    The whole body now lives inside the function. Previously only the
    ``plt.figure`` call was indented, so the plot was drawn at cell level from
    the global ``data`` and calling the function produced an empty figure.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least the ``amount`` and ``isFraud`` columns.
    sample_size : int, default 100000
        Number of leading rows used for the histogram/KDE (keeps plotting
        fast). The annotated means are computed on the full frame.
    """
    plt.figure(figsize=(10, 6))
    # One colour per class, shared by the histogram and the mean markers so
    # each vertical line matches the distribution it belongs to.
    palette = sns.color_palette("Set2", 2)
    hist_plot = sns.histplot(data=data[:sample_size],
                             x='amount',
                             hue='isFraud',
                             hue_order=[0, 1],
                             kde=True,
                             element='step',
                             palette=palette,
                             log_scale=True)
    hist_plot.set_ylabel('Number of Observations')
    hist_plot.set_xlabel('Amount')
    mean_value_f = data[data['isFraud']==False]['amount'].mean()
    mean_value_t = data[data['isFraud']==True]['amount'].mean()
    hist_plot.axvline(x=mean_value_f, color=palette[0])
    hist_plot.axvline(x=mean_value_t, color=palette[1])
    hist_plot.annotate(f'Mean amount for regular transactions: ${mean_value_f:,.2f}',
                       xy=(0.1, 0.5),
                       xycoords='axes fraction')
    hist_plot.annotate(f'Mean amount for fraudulent transactions: ${mean_value_t:,.2f}',
                       xy=(0.1, 0.3),
                       xycoords='axes fraction')
    # Plain (non-scientific) tick labels on the log-scaled x axis.
    hist_plot.xaxis.set_major_formatter(mticker.ScalarFormatter())
    hist_plot.ticklabel_format(style='plain', axis='x')
    plt.show()
    
histplot_eda(data_f)

* As we can see, the transaction amount is significantly higher for fraud transactions.

In [None]:
# Counts of originator (left) and recipient (right) account types, split by
# fraud label, on a log y-axis.
fig, (ax_1, ax_2) = plt.subplots(1, 2, figsize=(12, 4))
for axis, type_column in ((ax_1, 'New_TypeOrig'), (ax_2, 'New_TypeDest')):
    sns.countplot(data=data_f,
                  x=type_column,
                  hue='isFraud',
                  palette="pastel",
                  ax=axis).set_yscale('log')
fig.tight_layout()
ax_1.set(title='Orig')
ax_2.set(title='Dest')
plt.show()

**Let's see how transactions are distributed over time**

In [None]:
# Two stacked histograms of transaction timestamps: fraud on top, non-fraud below.
fig = plt.figure()
fig.set_size_inches(20, 14)
# add_subplot makes the new axes "current", so the plt.hist call that follows draws into it.
ax_1 = fig.add_subplot(2, 1, 1)
# The bin edges are the distinct timestamps of the whole data set, so both
# panels share the same bins.
plt.hist(data_f[data_f['isFraud']==1]['res_data'], 
         bins=data_f['res_data'].unique(), 
         color = "blue")
ax_2 = fig.add_subplot(2, 1, 2)
plt.hist(data_f[data_f['isFraud']==0]['res_data'], 
         bins=data_f['res_data'].unique(),
         color = "blue")
fig.tight_layout()
ax_1.set(title = 'Number of Fraudulent Transactions')
ax_2.set(title = 'Number of Non-Fraudulent Transactions')
plt.show()

In [None]:
# Transaction counts per weekday: fraud (left) vs non-fraud (right).
weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
fig, (ax_1, ax_2) = plt.subplots(1, 2, figsize=(16, 6))

for axis, fraud_flag in ((ax_1, 1), (ax_2, 0)):
    counts_per_day = data_f[data_f['isFraud'] == fraud_flag].groupby(['day_of_week']).size()
    axis.plot(counts_per_day)
    axis.set_xticks(range(7))
    axis.set_xticklabels(weekday_labels, rotation=30)

fig.tight_layout()
ax_1.set(title='Number of fraudulent transactions by day of the week')
ax_2.set(title='Number of non-fraudulent transactions by day of the week')
plt.show()

* We see the maximum number of fraudulent transactions on Monday and minimum number of fraudulent transactions on Thursday
* We see the maximum number of non-fraudulent transactions on Saturday and the minimum number of non-fraudulent transactions on Wednesday

In [None]:
# Transaction counts per hour of day: fraud (left) vs non-fraud (right).
fig = plt.figure()
fig.set_size_inches(16, 6)

panels = []
for position, fraud_flag in enumerate((1, 0), start=1):
    # The freshly added subplot becomes the current axes for the plt.* calls below.
    panels.append(fig.add_subplot(1, 2, position))
    plt.plot(data_f[data_f['isFraud'] == fraud_flag].groupby(['hour']).size())
    plt.xticks(rotation=30)
ax_1, ax_2 = panels

fig.tight_layout()
ax_1.set(title='Number of fraudulent transactions by hour')
ax_2.set(title='Number of non-fraudulent transactions by hour')
plt.show()

In [None]:
# Transaction counts by hour of day, one line per day of the week
# (counts, not averages: each cell is the number of rows for that day/hour).
# Fraudulent transactions first: count rows per (day_of_week, hour) ...
frame_1 = data_f[data_f['isFraud']==1].groupby(['day_of_week', 'hour'], as_index=False)['amount'].count()
# ... then reshape to hours as rows and weekdays as columns so that
# DataFrame.plot draws one line per weekday.
frame_1 = frame_1.pivot(index='hour', columns='day_of_week', values='amount')
frame_1.plot(figsize=(12, 4),color=['#1f77b4', '#ff7f0e', '#2ca02c', 
                                   '#d62728', '#9467bd','#8c564b', '#e377c2'])
plt.xticks(rotation=30)
plt.title('Number of fraudulent transactions by hour for each day of the week')
# NOTE(review): the legend labels assume all seven weekdays are present as
# columns, in order, with 0 = Monday - confirm against the pivot's columns.
plt.legend(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])

# Same chart for non-fraudulent transactions.
frame_2 = data_f[data_f['isFraud']==0].groupby(['day_of_week', 'hour'], as_index=False)['amount'].count()
frame_2 = frame_2.pivot(index='hour', columns='day_of_week', values='amount')
frame_2.plot(figsize=(12, 4),color=['#1f77b4', '#ff7f0e', '#2ca02c', 
                                   '#d62728', '#9467bd','#8c564b', '#e377c2'])
plt.xticks(rotation=30)
plt.title('Number of non-fraudulent transactions by hour for each day of the week')
plt.legend(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])

plt.show()

* It can be seen that the peak almost always falls at the same time on almost any day of the week, around 17-20 hours.

# Data preparation

In [None]:
# Keep only the columns needed for modelling: identifiers, the rule-based
# flag, the exploratory aggregates and the calendar fields are removed.
# DataFrame.drop returns a new frame, so data_f itself is not modified.
columns_not_modelled = [
    'step', 'nameOrig', 'nameDest', 'isFlaggedFraud',
    'New_TotalOrig', 'New_TotalDest', 'New_TotalMeanOrig',
    'New_TotalMeanDest', 'New_TotalSumOrig', 'New_TotalSumDest',
    'New_Delta_Time_Tr_Orig', 'New_Delta_Last_Tr_Orig',
    'New_Delta_Time_Tr_Dest', 'New_Delta_Last_Tr_Dest',
    'res_data', 'date', 'day_of_week', 'hour', 'month',
]
data_test = data_f.drop(columns=columns_not_modelled)

In [None]:
data_test

**Categorical Features**

In [None]:
#printing categorical features
print(data_test.dtypes)

In [None]:
# The categorical columns have only a few distinct values, so plain one-hot
# encoding is enough. The first level of each column is dropped as redundant.
# Each dummy column is prefixed with its source column name (the default).
categorical_columns = ['type', 'New_TypeOrig', 'New_TypeDest']
data_test = pd.get_dummies(data_test, columns=categorical_columns, drop_first=True)

In [None]:
data_test

# Testing

let's try to run a test model (LogisticRegression) without special data preparation, only after pre-processing categorical features

In [None]:
X = data_test.drop(columns=['isFraud'])
y = data_test['isFraud']

# Split into training and test sets. Fraud is a tiny minority of the rows, so
# the split is stratified on the target to keep the fraud share identical in
# both parts; a purely random split can leave the test set with noticeably
# more or fewer positives.
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    shuffle=True,
                                                    stratify=y,
                                                    random_state=42)

In [None]:
# Baseline: logistic regression on the raw (unscaled, imbalanced) training split.
LR_model = LogisticRegression(random_state=42).fit(X_train, y_train)
LR_prediction = LR_model.predict(X_test)
# Confusion matrix on the held-out split, labelled with the model's own class order.
cm_LR = confusion_matrix(y_test, LR_prediction, labels=LR_model.classes_)
disp = ConfusionMatrixDisplay(cm_LR, display_labels=LR_model.classes_).plot()
plt.show()

In [None]:
# Quick CatBoost baseline (20 boosting iterations) on the raw training split.
cbc_model = CatBoostClassifier(
    iterations=20,
    loss_function='Logloss',
    verbose=True,
).fit(X_train, y_train)
cbc_prediction = cbc_model.predict(X_test)

# Confusion matrix on the held-out split.
cm_cbc = confusion_matrix(y_test, cbc_prediction, labels=cbc_model.classes_)
disp = ConfusionMatrixDisplay(cm_cbc, display_labels=cbc_model.classes_).plot()
plt.show()

In [None]:
# Look at the ratio of classes in the target. Series.sum() counts the matches
# in vectorised code; the builtin sum() walks every row in Python.
print((data_test['isFraud'] == 1).sum(), '- positive class')
print((data_test['isFraud'] == 0).sum(), '- negative class')

# Oversampling and Undersampling

In [None]:
# Oversample the minority class with SMOTE. Only the training split is
# resampled; X_test / y_test are not touched by this cell.
over_sample = SMOTE(random_state=42)
X_train_sm, y_train_sm = over_sample.fit_resample(X_train,y_train)

In [None]:
# Class counts after SMOTE (vectorised count instead of the builtin sum()).
print((y_train_sm == 1).sum(), '- positive class')
print((y_train_sm == 0).sum(), '- negative class')

let's try to run LogisticRegression with smote

In [None]:
# Logistic regression again, now trained on the SMOTE-resampled training split.
LR_model = LogisticRegression(random_state=42).fit(X_train_sm, y_train_sm)
LR_prediction = LR_model.predict(X_test)
# Confusion matrix on the untouched test split.
cm_LR = confusion_matrix(y_test, LR_prediction, labels=LR_model.classes_)
disp = ConfusionMatrixDisplay(cm_LR, display_labels=LR_model.classes_).plot()
plt.show()

# Model Evaluation Metrics

**F1 score**
The F1 score combines precision and recall into one metric, with 1 being the best score and 0 being the worst. It gives equal weight to precision and recall.

**Logarithmic Loss**
Logarithmic Loss, also known as logloss, measures how far the predicted probabilities are from the true labels. It starts at 0 for perfect predictions and has no upper bound; lower is better. Logloss increases sharply when the classifier is very confident but predicts incorrectly.

**Brier Score**
The Brier Score is a measure of the accuracy of probabilistic predictions, equivalent to mean squared error for predicted probabilities.

**PR curve**
The precision-recall curve shows the balance between precision and recall for different thresholds. A larger area under the curve means both high precision and high recall, indicating low false positive and false negative rates.

**Confusion Matrix**
A confusion matrix is a table that displays a classifier's accuracy across different classes. Correct predictions appear diagonally from top left to bottom right.


In [None]:
def metrics_estimation(model, X_train_sm, X_test, y_train_sm, y_test, prediction, prob,
                       min_precision=0.6):
    """Print and return the evaluation metrics for one fitted classifier.

    Parameters
    ----------
    model, X_train_sm, X_test, y_train_sm
        Not used by the computation; kept so existing calls keep working.
    y_test : array-like
        True binary labels of the evaluation set.
    prediction : array-like
        Hard class predictions for the evaluation set (used for F1).
    prob : ndarray of shape (n_samples, 2)
        Predicted class probabilities; column 1 is taken as the positive class.
    min_precision : float, default 0.6
        Precision floor for the operating point: among thresholds whose
        precision exceeds this value, the one with maximum recall is reported.

    Returns
    -------
    tuple
        ``(precision, recall, f1, logloss, brier, threshold)``. When no real
        threshold reaches ``min_precision``, the curve's terminal point
        (precision 1, recall 0) is reported with ``threshold`` set to NaN
        instead of raising ``IndexError``.
    """
    # retrieve the probabilities for the positive class
    prob_positive = prob[:, 1]

    # All (precision, recall, threshold) combinations along the PR curve.
    precision, recall, thres = precision_recall_curve(y_test, prob_positive)

    # precision/recall hold one extra trailing point (precision=1, recall=0)
    # with no matching threshold; leave it out when choosing the operating
    # point so that indexing `thres` is always valid.
    precision_t, recall_t = precision[:-1], recall[:-1]
    eligible = precision_t > min_precision
    if eligible.any():
        # Maximum recall among eligible thresholds. The last index with that
        # recall is the highest threshold, hence the best precision for it.
        ind = np.flatnonzero(recall_t == recall_t[eligible].max())[-1]
        best_precision, best_recall, best_thres = precision_t[ind], recall_t[ind], thres[ind]
    else:
        print(f"No threshold reaches precision > {min_precision}; reporting the curve end point")
        best_precision, best_recall, best_thres = precision[-1], recall[-1], np.nan

    print(f"Precison score:\t {best_precision}")
    print(f"Recall score: \t {best_recall}")
    print(f"Threshold:\t {best_thres}")

    # calculate f1_score for binary classification problem
    f1 = f1_score(y_test, prediction, average='binary')
    print('F1 score: %f' % (f1))

    ###Log Loss Score
    logloss = log_loss(y_test, prob)
    print('Log Loss score: %f' % (logloss))

    ###Brier Score
    brier = brier_score_loss(y_test, prob_positive)
    print('Brier score: %f' % (brier))

    return (best_precision, best_recall, f1, logloss, brier, best_thres)


# Test different models

**K-Nearest Neighbors**

The k-nearest neighbors (KNN) method works by measuring the distances between the new feature and all features in the sample. It then selects the k nearest neighbors and predicts the target variable based on either voting (for classification) or averaging (for regression) among these neighbors.

In [None]:
# Standardise the features. The scaler is fitted on the SMOTE-resampled
# training data only, and the same fitted transform is then applied to the
# test data.
scaler = StandardScaler()
Xsc_train = scaler.fit_transform(X_train_sm)
Xsc_test = scaler.transform(X_test)

In [None]:
# k-NN (k=5) fitted on the scaled training data, scored on the scaled test data.
knn_model = KNeighborsClassifier(n_neighbors=5).fit(Xsc_train, y_train_sm)
knn_prob = knn_model.predict_proba(Xsc_test)
knn_prediction = knn_model.predict(Xsc_test)

In [None]:
def metrics_estimation_for_knn(model, X_train_sm, X_test, y_train_sm, y_test, prediction, prob,
                               min_precision=0.6):
    """Print and return evaluation metrics without reporting a threshold.

    Parameters
    ----------
    model, X_train_sm, X_test, y_train_sm
        Not used by the computation; kept so existing calls keep working.
    y_test : array-like
        True binary labels of the evaluation set.
    prediction : array-like
        Hard class predictions for the evaluation set (used for F1).
    prob : ndarray of shape (n_samples, 2)
        Predicted class probabilities; column 1 is taken as the positive class.
    min_precision : float, default 0.6
        Precision floor for the reported operating point. Must be below 1,
        otherwise no curve point qualifies.

    Returns
    -------
    tuple
        ``(precision, recall, f1, logloss, brier)``. If no real threshold
        exceeds ``min_precision``, the selected point is the curve's terminal
        point, so precision 1 with recall 0 means "floor not reached".
    """
    # retrieve the probabilities for the positive class
    prob_positive = prob[:, 1]

    # All (precision, recall, threshold) combinations along the PR curve.
    precision, recall, thres = precision_recall_curve(y_test, prob_positive)

    # Highest recall among the points whose precision exceeds the floor; the
    # last index with that recall is the one that gets reported.
    target_recall = recall[precision > min_precision].max()
    ind = np.flatnonzero(recall == target_recall)[-1]
    print(f"Precison score:\t {precision[ind]}")
    print(f"Recall score: \t {recall[ind]}")

    # calculate f1_score for binary classification problem
    f1 = f1_score(y_test, prediction, average='binary')
    print('F1 score: %f' % (f1))

    ###Log Loss Score
    logloss = log_loss(y_test, prob)
    print('Log Loss score: %f' % (logloss))

    ###Brier Score
    brier = brier_score_loss(y_test, prob_positive)
    print('Brier score: %f' % (brier))

    return (precision[ind], recall[ind], f1, logloss, brier)

list_metrics_knn = metrics_estimation_for_knn(knn_model, Xsc_train, Xsc_test, y_train_sm, y_test, knn_prediction, knn_prob)

Area Under the Precision-Recall curve

In [None]:
# retrieve the probabilities for the positive class
knn_prob_positive = knn_prob[:, 1]

# The no-skill baseline of a PR curve is the share of positives in the data
# being scored, i.e. the test set. (The old expression added index-aligned
# Series and divided their lengths, which is not a class proportion.)
no_skill = (y_test == 1).mean()
# plot the no skill precision-recall curve
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')

# calculate inputs for the PR curve
precision, recall, thresholds = precision_recall_curve(y_test, knn_prob_positive)

# plot PR curve, labelled with the model it belongs to
plt.plot(recall, precision, marker='.', label='K-Nearest Neighbors')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
plt.show()

# calculate and print PR AUC
auc_pr = auc(recall, precision)
print('AUC PR: %.3f' % auc_pr)

In [None]:
# Confusion matrix of the k-NN predictions on the test set.
cm = confusion_matrix(y_test, knn_prediction, labels=knn_model.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=knn_model.classes_).plot()
plt.show()

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns trade places.
print(classification_report(y_test, knn_prediction))

**Random Forests**



In [None]:
# Random forest of 100 shallow trees (depth 2), fitted on the scaled training data.
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=2,
    random_state=0,
).fit(Xsc_train, y_train_sm)
rf_prob = rf_model.predict_proba(Xsc_test)
rf_prediction = rf_model.predict(Xsc_test)

In [None]:
list_metrics_rf = metrics_estimation(rf_model, Xsc_train, Xsc_test, y_train_sm, y_test, rf_prediction, rf_prob)

In [None]:
# retrieve the probabilities for the positive class
rf_prob_positive = rf_prob[:, 1]

# The no-skill baseline of a PR curve is the share of positives in the data
# being scored, i.e. the test set. (The old expression added index-aligned
# Series and divided their lengths, which is not a class proportion.)
no_skill = (y_test == 1).mean()
# plot the no skill precision-recall curve
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')

# calculate inputs for the PR curve
precision, recall, thresholds = precision_recall_curve(y_test, rf_prob_positive)

# plot PR curve, labelled with the model it belongs to
plt.plot(recall, precision, marker='.', label='Random Forest')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
plt.show()

# calculate and print PR AUC
auc_pr = auc(recall, precision)
print('AUC PR: %.3f' % auc_pr)

In [None]:
# Confusion matrix of the random-forest predictions on the test set.
cm_rf = confusion_matrix(y_test, rf_prediction, labels=rf_model.classes_)
disp = ConfusionMatrixDisplay(cm_rf, display_labels=rf_model.classes_).plot()
plt.show()

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns trade places.
print(classification_report(y_test, rf_prediction))

**Decision Tree Classifier**

In [None]:
# Single decision tree with default depth settings, fitted on the scaled training data.
dtc_model = DecisionTreeClassifier(random_state=42).fit(Xsc_train, y_train_sm)
dtc_prob = dtc_model.predict_proba(Xsc_test)
dtc_prediction = dtc_model.predict(Xsc_test)

In [None]:
list_metrics_dtc = metrics_estimation(dtc_model, Xsc_train, Xsc_test, y_train_sm, y_test, dtc_prediction, dtc_prob)

In [None]:
# retrieve the probabilities for the positive class
dtc_prob_positive = dtc_prob[:, 1]

# The no-skill baseline of a PR curve is the share of positives in the data
# being scored, i.e. the test set. (The old expression added index-aligned
# Series and divided their lengths, which is not a class proportion.)
no_skill = (y_test == 1).mean()
# plot the no skill precision-recall curve
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')

# calculate inputs for the PR curve
precision, recall, thresholds = precision_recall_curve(y_test, dtc_prob_positive)

# plot PR curve, labelled with the model it belongs to
plt.plot(recall, precision, marker='.', label='Decision Tree')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
plt.show()

# calculate and print PR AUC
auc_pr = auc(recall, precision)
print('AUC PR: %.3f' % auc_pr)

In [None]:
# Confusion matrix of the decision-tree predictions on the test set.
cm_dtc = confusion_matrix(y_test, dtc_prediction, labels=dtc_model.classes_)
disp = ConfusionMatrixDisplay(cm_dtc, display_labels=dtc_model.classes_).plot()
plt.show()

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns trade places.
print(classification_report(y_test, dtc_prediction))

**Linear Discriminant Analysis**

This approach operates by decreasing the data set's dimensionality through projecting all data points onto a line. It then groups these points into classes according to their distance from the central point.

In [None]:
# Linear discriminant analysis with default settings, fitted on the scaled training data.
lda_model = LinearDiscriminantAnalysis().fit(Xsc_train, y_train_sm)
lda_prob = lda_model.predict_proba(Xsc_test)
lda_prediction = lda_model.predict(Xsc_test)

In [None]:
list_metrics_lda = metrics_estimation(lda_model, Xsc_train, Xsc_test, y_train_sm, y_test, lda_prediction, lda_prob)

In [None]:
# retrieve the probabilities for the positive class
lda_prob_positive = lda_prob[:, 1]

# The no-skill baseline of a PR curve is the share of positives in the data
# being scored, i.e. the test set. (The old expression added index-aligned
# Series and divided their lengths, which is not a class proportion.)
no_skill = (y_test == 1).mean()
# plot the no skill precision-recall curve
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')

# calculate inputs for the PR curve
precision, recall, thresholds = precision_recall_curve(y_test, lda_prob_positive)

# plot PR curve, labelled with the model it belongs to
plt.plot(recall, precision, marker='.', label='LDA')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
plt.show()

# calculate and print PR AUC
auc_pr = auc(recall, precision)
print('AUC PR: %.3f' % auc_pr)

In [None]:
# Confusion matrix of the LDA predictions on the test set.
cm_lda = confusion_matrix(y_test, lda_prediction, labels=lda_model.classes_)
disp = ConfusionMatrixDisplay(cm_lda, display_labels=lda_model.classes_).plot()
plt.show()

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns trade places.
print(classification_report(y_test, lda_prediction))

**Logistic Regression**

Logistic regression models the probability that a sample belongs to the positive class. When the predicted probability is 0.5 or higher, the sample is classified as one, and when it is below 0.5, it is classified as zero. Each training sample is associated with a label of either 0 or 1. The decision boundary is linear in the features, which makes the method a useful tool for linear classification tasks.

In [None]:
# Logistic regression fitted on the scaled training data.
lr_model = LogisticRegression(random_state=42).fit(Xsc_train, y_train_sm)
lr_prob = lr_model.predict_proba(Xsc_test)
lr_prediction = lr_model.predict(Xsc_test)

In [None]:
list_metrics_lr = metrics_estimation(lr_model, Xsc_train, Xsc_test, y_train_sm, y_test, lr_prediction, lr_prob)

In [None]:
# retrieve the probabilities for the positive class
lr_prob_positive = lr_prob[:, 1]

# The no-skill baseline of a PR curve is the share of positives in the data
# being scored, i.e. the test set. (The old expression added index-aligned
# Series and divided their lengths, which is not a class proportion.)
no_skill = (y_test == 1).mean()
# plot the no skill precision-recall curve
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')

# calculate inputs for the PR curve
precision, recall, thresholds = precision_recall_curve(y_test, lr_prob_positive)

# plot PR curve
plt.plot(recall, precision, marker='.', label='Logistic')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
plt.show()

# calculate and print PR AUC
auc_pr = auc(recall, precision)
print('AUC PR: %.3f' % auc_pr)

In [None]:
# Confusion matrix of the logistic-regression predictions on the test set.
cm_lr = confusion_matrix(y_test, lr_prediction, labels=lr_model.classes_)
disp = ConfusionMatrixDisplay(cm_lr, display_labels=lr_model.classes_).plot()
plt.show()

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns trade places.
print(classification_report(y_test, lr_prediction))

**Neural Networks**

Neural Networks are a type of machine learning algorithm that utilizes multiple hidden layers consisting of interconnected neurons with activation functions. This approach mimics a simplified model of the brain's functionality to analyze and make predictions based on data.

In [None]:
# Small multilayer perceptron (two hidden layers of 5 and 2 units, L-BFGS
# solver), fitted on the scaled training data.
nn_model = MLPClassifier(
    solver='lbfgs',
    alpha=1e-5,
    hidden_layer_sizes=(5, 2),
    random_state=1,
).fit(Xsc_train, y_train_sm)
nn_prob = nn_model.predict_proba(Xsc_test)
nn_prediction = nn_model.predict(Xsc_test)

In [None]:
list_metrics_nn = metrics_estimation(nn_model, Xsc_train, Xsc_test, y_train_sm, y_test, nn_prediction, nn_prob)

In [None]:
# retrieve the probabilities for the positive class
nn_prob_positive = nn_prob[:, 1]

# The no-skill baseline of a PR curve is the share of positives in the data
# being scored, i.e. the test set. (The old expression added index-aligned
# Series and divided their lengths, which is not a class proportion.)
no_skill = (y_test == 1).mean()
# plot the no skill precision-recall curve
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')

# calculate inputs for the PR curve
precision, recall, thresholds = precision_recall_curve(y_test, nn_prob_positive)

# plot PR curve, labelled with the model it belongs to
plt.plot(recall, precision, marker='.', label='Neural Network')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
plt.show()

# calculate and print PR AUC
auc_pr = auc(recall, precision)
print('AUC PR: %.3f' % auc_pr)

In [None]:
# Confusion matrix of the neural-network predictions on the test set.
cm_nn = confusion_matrix(y_test, nn_prediction, labels=nn_model.classes_)
disp = ConfusionMatrixDisplay(cm_nn, display_labels=nn_model.classes_).plot()
plt.show()

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns trade places.
print(classification_report(y_test, nn_prediction))

**XGBoost**

XGBoost stands for "Extreme Gradient Boosting," which is a powerful gradient boosting algorithm that employs decision trees as its base model. It is renowned for its robustness and accuracy in making predictions, making it one of the strongest machine learning algorithms available.

In [None]:
# XGBoost with learning rate 0.1, fitted on the scaled training data.
xgb_model = XGBClassifier(learning_rate=0.1, random_state=42)
xgb_model.fit(Xsc_train, y_train_sm)
xgb_prob = xgb_model.predict_proba(Xsc_test)
xgb_prediction = xgb_model.predict(Xsc_test)

In [None]:
list_metrics_xgb = metrics_estimation(xgb_model, Xsc_train, Xsc_test, y_train_sm, y_test, xgb_prediction, xgb_prob)

In [None]:
# retrieve the probabilities for the positive class
xgb_prob_positive = xgb_prob[:, 1]

# The no-skill baseline of a PR curve is the share of positives in the data
# being scored, i.e. the test set. (The old expression added index-aligned
# Series and divided their lengths, which is not a class proportion.)
no_skill = (y_test == 1).mean()
# plot the no skill precision-recall curve
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')

# calculate inputs for the PR curve
precision, recall, thresholds = precision_recall_curve(y_test, xgb_prob_positive)

# plot PR curve, labelled with the model it belongs to
plt.plot(recall, precision, marker='.', label='XGBoost')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
plt.show()

# calculate and print PR AUC
auc_pr = auc(recall, precision)
print('AUC PR: %.3f' % auc_pr)

In [None]:
# Confusion matrix of the XGBoost predictions on the test set.
cm = confusion_matrix(y_test, xgb_prediction, labels=xgb_model.classes_)
disp = ConfusionMatrixDisplay(cm, display_labels=xgb_model.classes_).plot()
plt.show()

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped
# the precision and recall columns trade places.
print(classification_report(y_test, xgb_prediction))

In [None]:
# The XGBoost library provides a built-in function to plot features ordered by
# their importance. plot_importance is already imported in the notebook's
# import cell, so it is not imported again here.
# NOTE(review): the model was fitted on the scaler's output; if that is a plain
# NumPy array, the features will presumably be labelled f0, f1, ... - confirm.
plot_importance(xgb_model)
plt.show()

**LightGBM**

LightGBM is an advanced gradient boosting algorithm that is specifically engineered to be quicker and more efficient compared to other boosting algorithms. It has earned a reputation for being the fastest gradient boosting method while requiring minimal RAM usage. LightGBM achieves its speed and efficiency by constructing trees in a leaf-wise manner rather than level-wise, which is the approach taken by algorithms like XGBoost. This means that LightGBM splits one node at a time instead of an entire level of nodes at the same depth, resulting in a significant speed boost.

In [None]:
# LightGBM with default hyper-parameters, fitted on the scaled training data.
lgbm_model = LGBMClassifier().fit(Xsc_train, y_train_sm)
lgbm_prob = lgbm_model.predict_proba(Xsc_test)
lgbm_prediction = lgbm_model.predict(Xsc_test)

In [None]:
list_metrics_lgbm = metrics_estimation(lgbm_model, Xsc_train, Xsc_test, y_train_sm, y_test, lgbm_prediction, lgbm_prob)

In [None]:
# retrieve the probabilities for the positive class
lgbm_prob_positive = lgbm_prob[:, 1]

# The no-skill baseline of a PR curve is the share of positives in the data
# being scored, i.e. the test set. (The old expression added index-aligned
# Series and divided their lengths, which is not a class proportion.)
no_skill = (y_test == 1).mean()
# plot the no skill precision-recall curve
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')

# calculate inputs for the PR curve
precision, recall, thresholds = precision_recall_curve(y_test, lgbm_prob_positive)

# plot PR curve, labelled with the model it belongs to
plt.plot(recall, precision, marker='.', label='LightGBM')
# axis labels
plt.xlabel('Recall')
plt.ylabel('Precision')
# show the legend
plt.legend()
plt.show()

# calculate and print PR AUC
auc_pr = auc(recall, precision)
print('AUC PR: %.3f' % auc_pr)

In [None]:
# Confusion matrix of true vs. predicted test labels, with rows/columns ordered
# by the classes the LightGBM model was fitted on.
lgbm_labels = lgbm_model.classes_
cm = confusion_matrix(y_test, lgbm_prediction, labels=lgbm_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=lgbm_labels)
disp.plot()
plt.show()

In [None]:
# classification_report expects (y_true, y_pred). Passing the ground truth first
# keeps precision and recall attributed to the correct quantity.
print(classification_report(y_test, lgbm_prediction))

**7.9. CatBoost**

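CatBoost is a gradient boosting algorithm developed by Yandex. It builds symmetric (oblivious) decision trees, handles categorical features natively, and uses ordered boosting to reduce overfitting.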


In [None]:
# Train CatBoost with training output silenced (verbose=0); fit() hands back the
# fitted estimator, so construction and training are chained.
cboost_model = CatBoostClassifier(verbose=0).fit(Xsc_train, y_train_sm)

# Per-class probabilities and hard class labels for the test features.
cboost_prob = cboost_model.predict_proba(Xsc_test)
cboost_prediction = cboost_model.predict(Xsc_test)

In [None]:
# Collect the evaluation metrics for CatBoost. The comparison cells later read
# entries 0-4 of the result as precision, recall, F1, log loss and Brier score.
list_metrics_cboost = metrics_estimation(
    cboost_model,
    Xsc_train, Xsc_test,
    y_train_sm, y_test,
    cboost_prediction, cboost_prob,
)

In [None]:
# Probability assigned to the positive class (second column of predict_proba).
cboost_prob_positive = cboost_prob[:, 1]

# No-skill baseline: a classifier that guesses at random has a precision equal to
# the positive-class prevalence of the data being evaluated, i.e. y_test.
# The training labels (y_train_sm, presumably resampled) are deliberately left
# out, and the share is computed with a mean instead of `+`, which would add the
# label containers element-wise rather than concatenate them.
no_skill = float(np.mean(np.asarray(y_test).ravel() == 1))
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')

# Precision/recall pairs over all decision thresholds.
precision, recall, thresholds = precision_recall_curve(y_test, cboost_prob_positive)

# PR curve for this model, labelled with the model actually being plotted.
plt.plot(recall, precision, marker='.', label='CatBoost')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('CatBoost precision-recall curve')
plt.legend()
plt.show()

# Area under the precision-recall curve.
auc_pr = auc(recall, precision)
print('AUC PR: %.3f' % auc_pr)

In [None]:
# Confusion matrix of true vs. predicted test labels, with rows/columns ordered
# by the classes the CatBoost model was fitted on.
cboost_labels = cboost_model.classes_
cm = confusion_matrix(y_test, cboost_prediction, labels=cboost_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=cboost_labels)
disp.plot()
plt.show()

In [None]:
# classification_report expects (y_true, y_pred). Passing the ground truth first
# keeps precision and recall attributed to the correct quantity.
print(classification_report(y_test, cboost_prediction))

# Choosing Best Model

Let's compare all of the classifiers and see which one performs best.

In [None]:
# Fitted estimators, listed in the same order as the per-model metric lists
# gathered in the next cell.
classifiers = [
    knn_model,     # K-Nearest Neighbors
    rf_model,      # Random Forests
    dtc_model,     # Decision Tree Classifier
    lda_model,     # Linear Discriminant Analysis
    lr_model,      # Logistic Regression
    nn_model,      # Neural Networks
    xgb_model,     # XGBoost
    lgbm_model,    # LightGBM
    cboost_model,  # CatBoost
]

In [None]:
# One metrics sequence per classifier, in the same order as `classifiers`.
list_metrics = [
    list_metrics_knn, list_metrics_rf, list_metrics_dtc,
    list_metrics_lda, list_metrics_lr, list_metrics_nn,
    list_metrics_xgb, list_metrics_lgbm, list_metrics_cboost,
]

# Split the per-model sequences into one list per metric (positions 0-4).
precisions = [model_metrics[0] for model_metrics in list_metrics]
recalls = [model_metrics[1] for model_metrics in list_metrics]
f1s = [model_metrics[2] for model_metrics in list_metrics]
logloss = [model_metrics[3] for model_metrics in list_metrics]
briers = [model_metrics[4] for model_metrics in list_metrics]

In [None]:
# Display names, in the same order as the per-model metric lists.
classifier_names = [
    'K-Nearest Neighbors', 'Random Forests', 'Decision Tree Classifier',
    'Linear Discriminant Analysis', 'Logistic Regression',
    'Neural Networks', 'XGBoost', 'LightGBM', 'CatBoost',
]

# Fail loudly if a metric list does not line up with the names, instead of
# relying on a hard-coded model count.
for metric_values in (precisions, recalls, f1s, logloss, briers):
    assert len(metric_values) == len(classifier_names), \
        'metric list length does not match the number of classifier names'

# Map classifier name -> score, one dictionary per metric.
precisions_dict = dict(zip(classifier_names, precisions))
recalls_dict = dict(zip(classifier_names, recalls))
f1s_dict = dict(zip(classifier_names, f1s))
logloss_dict = dict(zip(classifier_names, logloss))
briers_dict = dict(zip(classifier_names, briers))


def _sorted_by_value(scores):
    """Return a copy of ``scores`` with its items ordered by ascending value."""
    return dict(sorted(scores.items(), key=lambda item: item[1]))


# Ascending order: for precision/recall/F1 the best model comes last,
# for log loss and Brier score (lower is better) the best model comes first.
precisions_dict_sorted = _sorted_by_value(precisions_dict)
recalls_dict_sorted = _sorted_by_value(recalls_dict)
f1s_dict_sorted = _sorted_by_value(f1s_dict)
logloss_dict_sorted = _sorted_by_value(logloss_dict)
briers_dict_sorted = _sorted_by_value(briers_dict)

In [None]:
# Overlay the precision-recall curve of every classifier on one figure.
# Each legend entry shows the model's short name followed by its PR AUC.
plt.figure(figsize=(10, 8))

# (legend name, positive-class probabilities on the test set) for each model.
pr_curve_inputs = [
    ('KNN', knn_prob_positive),
    ('RF', rf_prob_positive),
    ('DTC', dtc_prob_positive),
    ('LDA', lda_prob_positive),
    ('LR', lr_prob_positive),
    ('NN', nn_prob_positive),
    ('XGB', xgb_prob_positive),
    ('LGBM', lgbm_prob_positive),
    ('cboost', cboost_prob_positive),
]

for curve_name, prob_positive in pr_curve_inputs:
    precision, recall, thresholds = precision_recall_curve(y_test, prob_positive)
    auc_pr = auc(recall, precision)
    plt.plot(recall, precision, marker='.', label='%s  %.3f' % (curve_name, auc_pr))

# axis labels and title
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Precision-recall curves (legend shows PR AUC)')
# place the legend outside the plotting area
plt.legend(bbox_to_anchor=(1,1))
plt.show()

In [None]:
# helper that annotates a bar chart with its values
def addlabels(x,y):
    """Write each value of ``y`` as a text label on the current plot.

    The label for position ``i`` is drawn at coordinates ``(i, y[i])`` and shows
    the value formatted with three decimals. ``x`` is used only for its length,
    which determines how many labels are written.
    """
    for position in range(len(x)):
        value = y[position]
        plt.text(position, value, f'{value:.3f}')

In [None]:
plt.rcParams["figure.figsize"] = (20,5)

# Bars are ordered by ascending precision. Only x and height are passed to
# plt.bar: its third positional argument is the bar *width*, so the rounded
# scores must not be supplied there.
precision_names = list(precisions_dict_sorted.keys())
precision_values = list(precisions_dict_sorted.values())
plt.bar(precision_names, precision_values, align='center')

# calling the function to add value labels
addlabels(precision_names, precision_values)

# A title is used instead of a legend because no plotted artist carries a label.
plt.title('Precision score of each classifiers')
plt.xticks(rotation=90)
plt.show()

In [None]:
plt.rcParams["figure.figsize"] = (20,5)

# Bars are ordered by ascending recall. Only x and height are passed to
# plt.bar: its third positional argument is the bar *width*, so the rounded
# scores must not be supplied there.
recall_names = list(recalls_dict_sorted.keys())
recall_values = list(recalls_dict_sorted.values())
plt.bar(recall_names, recall_values)

# calling the function to add value labels
addlabels(recall_names, recall_values)

# A title is used instead of a legend because no plotted artist carries a label.
plt.title('Recall score of each classifiers')
plt.xticks(rotation=90)
plt.show()

In [None]:
plt.rcParams["figure.figsize"] = (20,5)

# Bars are ordered by ascending F1 score. Only x and height are passed to
# plt.bar: its third positional argument is the bar *width*, so the rounded
# scores must not be supplied there.
f1_names = list(f1s_dict_sorted.keys())
f1_values = list(f1s_dict_sorted.values())
plt.bar(f1_names, f1_values)

# calling the function to add value labels
addlabels(f1_names, f1_values)

# A title is used instead of a legend because no plotted artist carries a label.
plt.title('F1 score of each classifiers')
plt.xticks(rotation=90)
plt.show()

In [None]:
plt.rcParams["figure.figsize"] = (20,5)

# Bars are ordered by ascending log loss (lower is better, so the best model is
# leftmost). Only x and height are passed to plt.bar: its third positional
# argument is the bar *width*, so the rounded scores must not be supplied there.
logloss_names = list(logloss_dict_sorted.keys())
logloss_values = list(logloss_dict_sorted.values())
plt.bar(logloss_names, logloss_values)

# calling the function to add value labels
addlabels(logloss_names, logloss_values)

# A title is used instead of a legend because no plotted artist carries a label.
plt.title('LogLoss score of each classifiers')
plt.xticks(rotation=90)
plt.show()

In [None]:
plt.rcParams["figure.figsize"] = (20,5)

# Bars are ordered by ascending Brier score (lower is better, so the best model
# is leftmost). Only x and height are passed to plt.bar: its third positional
# argument is the bar *width*, so the rounded scores must not be supplied there.
brier_names = list(briers_dict_sorted.keys())
brier_values = list(briers_dict_sorted.values())
plt.bar(brier_names, brier_values)

# calling the function to add value labels
addlabels(brier_names, brier_values)

# A title is used instead of a legend because no plotted artist carries a label.
plt.title('Brier score of each classifiers')
plt.xticks(rotation=90)
plt.show()

Several machine learning models, including CatBoost, LightGBM, and XGBoost, demonstrated strong performance in handling their respective tasks.