In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, precision_score, recall_score,f1_score, roc_auc_score


In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below, so behaviour changes can go unnoticed.
warnings.filterwarnings('ignore')

In [None]:
# Read the CSV into the working frame and preview its first rows.
# NOTE(review): the path is absolute and machine-specific; the notebook will
# not run elsewhere until it is pointed at a local copy of the file.
csv_path = 'C:\\Users\\MSI\\Downloads\\archive (2)\\exam.csv'
df = pd.read_csv(csv_path)

df.head()

In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

In [None]:
# Summary statistics from describe(), transposed so each described column is a row.
df.describe().T

In [None]:
# Data type of each column.
df.dtypes


In [None]:
# Report whether the frame contains any missing value; if it does, drop the
# affected rows in place and show the resulting shape.
if df.isnull().values.any():
    print('There are some missing values in this dataset\n')
    df.dropna(inplace=True)
    # Fixed: the original printed `data.shape`, but no `data` variable exists
    # in this notebook, so this branch raised NameError.
    print('Shape : ', df.shape)
else:
    # Fixed: the message was cut off mid-word ("... in th").
    print('GREAT, There is no missing values in this dataset')

In [None]:
# Horizontal box plots of the frame's columns to eyeball outliers.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, orient='h', ax=ax)
ax.set_title('Outliers')
ax.grid(axis='y')
plt.show()

In [None]:
# Distinct values present in the 'type' column.
df['type'].unique()

In [None]:
# Number of rows per 'type' value.
df['type'].value_counts()

In [None]:
# Bar chart of how many rows fall into each 'type' category.
fig, ax = plt.subplots(figsize=(10, 10))
sns.countplot(data=df, x='type', ax=ax)
ax.set(title='type vs counts', xlabel='Type', ylabel='Counts')
ax.grid(axis='y', alpha=1)
plt.show()

In [None]:
# Encode the 'type' labels as integer codes.
# The result is assigned back to the column instead of calling
# `df['type'].replace(..., inplace=True)`: that form mutates a temporary
# selection, is deprecated as chained assignment, and under pandas
# Copy-on-Write leaves `df` unchanged (the warning is silenced in this notebook).
type_codes = {'CASH_OUT': 0, 'PAYMENT': 1, 'CASH_IN': 2, 'TRANSFER': 3, 'DEBIT': 4}
df['type'] = df['type'].replace(type_codes)

In [None]:
# Row counts per 'type' value, re-checked after the encoding step above.
df['type'].value_counts()


In [None]:
# Preview the first 15 rows.
df.head(15)

In [None]:
# Distinct values of the "isFraud" column (used further down as the prediction target).
df['isFraud'].unique()

In [None]:
# Number of rows per "isFraud" value.
df['isFraud'].value_counts()


In [None]:
# Bar chart of the class balance of the target column.
Target_counts = df['isFraud'].value_counts()

fig, ax = plt.subplots()
sns.barplot(x=Target_counts.index, y=Target_counts.values, palette='flare', ax=ax)
ax.set(
    xlabel='Target',
    ylabel='Count',
    title='Counts of Target\n NOT Fraud = 0 || IS Fraud = 1)',
)
plt.xticks()
plt.show()

In [None]:
# Remove the two name columns from the frame (in place).
# NOTE(review): in-place drop makes this cell fail with KeyError if re-run.
df.drop(columns=['nameOrig', 'nameDest'], inplace=True)


In [None]:
# Column labels remaining after dropping 'nameOrig' and 'nameDest'.
df.columns


In [None]:
# Pairwise correlation between the columns of df (DataFrame.corr with its default method).
df.corr()

In [None]:
# Annotated heatmap of the correlation matrix, colour scale fixed to [-1, 1].
corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".1f",
    vmin=-1,
    vmax=1,
    cmap='coolwarm',
    linewidths=0.9,
    ax=ax,
)
plt.show()

In [None]:
# Remove two of the balance columns from the frame (in place).
# NOTE(review): in-place drop makes this cell fail with KeyError if re-run.
df.drop(columns=['newbalanceOrig', 'oldbalanceDest'], inplace=True)

In [None]:
# Column labels remaining after dropping 'newbalanceOrig' and 'oldbalanceDest'.
df.columns


In [None]:
# Column dtypes, non-null counts and memory usage of the frame.
df.info()


In [None]:
# Separate the target column ('isFraud') from the remaining feature columns
# and report the shape of each part.
y = df['isFraud']
x = df.drop(columns=['isFraud'])

print('X shape is : ' , x.shape)
print()
print('Y shape is : ' , y.shape)

In [None]:
# Shuffled 70/30 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

# Report the shape of every part of the split.
for label, part in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(f'{label} shape is ', part.shape)

In [None]:
# Z-score standardisation of the features.
# The scaler learns its statistics from the training split only; the test
# split is transformed with those same statistics.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Fit a logistic-regression classifier (default settings) on the scaled
# training data and predict labels for the scaled test data.
Model_LR = LogisticRegression().fit(X_train_scaled, y_train)
y_pred_LR = Model_LR.predict(X_test_scaled)

# Accuracy on each split, reported as a percentage.
Train_Accuracy, Test_Accuracy = (
    Model_LR.score(X_train_scaled, y_train),
    Model_LR.score(X_test_scaled, y_test),
)
print(f'Training accuracy: {Train_Accuracy*100:.2f} %')
print(f'Testing accuracy: {Test_Accuracy*100:.2f} %')

In [None]:
# Confusion matrix (raw counts) for the logistic-regression predictions.
# The class order is fixed explicitly and shared by the matrix and the axis
# labels: `df['isFraud'].unique()` returns values in order of first
# appearance, which need not match the sorted order confusion_matrix uses by
# default, so the original could label the rows/columns the wrong way round.
class_labels = sorted(df['isFraud'].unique())
CM = confusion_matrix(y_true=y_test, y_pred=y_pred_LR, labels=class_labels)
ConfusionMatrixDisplay(CM, display_labels=class_labels).plot()
plt.title('Confusion Matrix Without Normalization')
plt.show()

In [None]:
# Per-class precision, recall, F1 and support for the logistic-regression predictions.
print(classification_report(y_test, y_pred_LR))


In [None]:
# Test-set metrics for the logistic-regression model.

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
Accuracy_LR = accuracy_score(y_test, y_pred_LR)

# Precision = TP / (TP + FP)
Precision_LR = precision_score(y_test, y_pred_LR)

# Recall = TP / (TP + FN)
Recall_LR = recall_score(y_test, y_pred_LR)

# F1 Score = 2 × ((Precision * Recall) / (Precision + Recall))
F1_Score_LR = f1_score(y_test, y_pred_LR)

# ROC AUC ranks samples by score, so it is computed from the predicted
# probability of the positive class (second column of predict_proba) rather
# than from the already-thresholded 0/1 predictions used originally.
ROC_AUC_LR = roc_auc_score(y_test, Model_LR.predict_proba(X_test_scaled)[:, 1])

# Printed in the same order as the other models' metric cells.
# The final print was an unterminated f-string in the original (SyntaxError).
print(f'Accuracy Score : {Accuracy_LR * 100 : .2f} %\n')
print(f'Precision Score : {Precision_LR * 100 : .2f} %\n')
print(f'Recall Score : {Recall_LR * 100 : .2f} %\n')
print(f'F1 Score : {F1_Score_LR * 100 : .2f} %\n')
print(f'AUC_ROC : {ROC_AUC_LR * 100 : .2f} %\n')

In [None]:
# Pie chart of the logistic-regression metrics.
# NOTE(review): autopct prints each wedge's share of the summed scores, not
# the metric value itself.
Score_Names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC-ROC']
Scores = [Accuracy_LR, Precision_LR, Recall_LR, F1_Score_LR, ROC_AUC_LR]

fig, ax = plt.subplots()
ax.pie(
    Scores,
    labels=Score_Names,
    autopct='%1.2f%%',
    startangle=140,
    labeldistance=1.15,
    wedgeprops={'linewidth': .5, 'edgecolor': 'white'},
)
ax.axis('equal')
plt.show()

In [None]:
# Fit a decision-tree classifier on the scaled training data and predict
# labels for the scaled test data.
# random_state is pinned (same seed as the train/test split) so the fitted
# tree, and every metric derived from it, is reproducible across runs.
Model_DT = DecisionTreeClassifier(random_state=42)
Model_DT.fit(X_train_scaled, y_train)
y_pred_DT = Model_DT.predict(X_test_scaled)

# Quick evaluation: accuracy on each split, as a percentage.
Train_Accuracy = Model_DT.score(X_train_scaled, y_train)
Test_Accuracy = Model_DT.score(X_test_scaled, y_test)
print(f'Training accuracy: {Train_Accuracy*100:.2f} %')
# The original line was an unterminated f-string (SyntaxError).
print(f'Testing accuracy: {Test_Accuracy*100:.2f} %')

In [None]:

# Confusion matrix (raw counts) for the decision-tree predictions.
# The class order is fixed explicitly and shared by the matrix and the axis
# labels: `df['isFraud'].unique()` returns values in order of first
# appearance, which need not match the sorted order confusion_matrix uses by
# default, so the original could label the rows/columns the wrong way round.
class_labels = sorted(df['isFraud'].unique())
CM = confusion_matrix(y_true=y_test, y_pred=y_pred_DT, labels=class_labels)
ConfusionMatrixDisplay(CM, display_labels=class_labels).plot()
plt.title('Confusion Matrix Without Normalization')
plt.show()

In [None]:
# Per-class precision, recall, F1 and support for the decision-tree predictions.
print(classification_report(y_test, y_pred_DT))


In [None]:
# Test-set metrics for the decision-tree model.

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
Accuracy_DT = accuracy_score(y_test, y_pred_DT)

# Precision = TP / (TP + FP)
Precision_DT = precision_score(y_test, y_pred_DT)

# Recall = TP / (TP + FN)
Recall_DT = recall_score(y_test, y_pred_DT)

# F1 Score = 2 × ((Precision * Recall) / (Precision + Recall))
F1_Score_DT = f1_score(y_test, y_pred_DT)

# ROC AUC ranks samples by score, so it is computed from the predicted
# probability of the positive class (second column of predict_proba) rather
# than from the already-thresholded 0/1 predictions used originally.
ROC_AUC_DT = roc_auc_score(y_test, Model_DT.predict_proba(X_test_scaled)[:, 1])

# The final print was an unterminated f-string in the original (SyntaxError).
print(f'Accuracy Score : {Accuracy_DT * 100 : .2f} %\n')
print(f'Precision Score : {Precision_DT * 100 : .2f} %\n')
print(f'Recall Score : {Recall_DT * 100 : .2f} %\n')
print(f'F1 Score : {F1_Score_DT * 100 : .2f} %\n')
print(f'AUC_ROC : {ROC_AUC_DT * 100 : .2f} %\n')

In [None]:
# Pie chart of the decision-tree metrics.
# Fixed: the original never rebuilt `Scores` in this cell, so the chart
# silently re-plotted the logistic-regression scores left over from earlier.
# NOTE(review): autopct prints each wedge's share of the summed scores, not
# the metric value itself.
Scores = [Accuracy_DT, Precision_DT, Recall_DT, F1_Score_DT, ROC_AUC_DT]
Score_Names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC-ROC']

# Plot
plt.figure(figsize=(7, 5))
plt.pie(Scores, labels=Score_Names, autopct='%1.2f%%', startangle=140)
plt.axis('equal')
plt.show()

In [None]:
# Fit a Gaussian naive-Bayes classifier on the scaled training data and
# predict labels for the scaled test data.
Model_NB = GaussianNB()
Model_NB.fit(X_train_scaled, y_train)
y_pred_NB = Model_NB.predict(X_test_scaled)

# Quick evaluation: accuracy on each split, as a percentage.
# The original computed both values but never displayed them; they are now
# printed the same way as for the other two models.
Train_Accuracy = Model_NB.score(X_train_scaled, y_train)
Test_Accuracy = Model_NB.score(X_test_scaled, y_test)
print(f'Training accuracy: {Train_Accuracy*100:.2f} %')
print(f'Testing accuracy: {Test_Accuracy*100:.2f} %')


In [None]:

# Confusion matrix (raw counts) for the naive-Bayes predictions.
# The class order is fixed explicitly and shared by the matrix and the axis
# labels: `df['isFraud'].unique()` returns values in order of first
# appearance, which need not match the sorted order confusion_matrix uses by
# default, so the original could label the rows/columns the wrong way round.
class_labels = sorted(df['isFraud'].unique())
CM = confusion_matrix(y_true=y_test, y_pred=y_pred_NB, labels=class_labels)
ConfusionMatrixDisplay(CM, display_labels=class_labels).plot()
plt.title('Confusion Matrix Without Normalization')
plt.show()

In [None]:
# Per-class precision, recall, F1 and support for the naive-Bayes predictions.
print(classification_report(y_test, y_pred_NB))


In [None]:
# Test-set metrics for the naive-Bayes model.

# Accuracy = (TP + TN) / (TP + TN + FP + FN)
Accuracy_NB = accuracy_score(y_test, y_pred_NB)

# Precision = TP / (TP + FP)
Precision_NB = precision_score(y_test, y_pred_NB)

# Recall = TP / (TP + FN)
Recall_NB = recall_score(y_test, y_pred_NB)

# F1 Score = 2 × ((Precision * Recall) / (Precision + Recall))
F1_Score_NB = f1_score(y_test, y_pred_NB)

# ROC AUC ranks samples by score, so it is computed from the predicted
# probability of the positive class (second column of predict_proba) rather
# than from the already-thresholded 0/1 predictions used originally.
ROC_AUC_NB = roc_auc_score(y_test, Model_NB.predict_proba(X_test_scaled)[:, 1])

# The final print was an unterminated f-string in the original (SyntaxError).
print(f'Accuracy Score : {Accuracy_NB * 100 : .2f} %\n')
print(f'Precision Score : {Precision_NB * 100 : .2f} %\n')
print(f'Recall Score : {Recall_NB * 100 : .2f} %\n')
print(f'F1 Score : {F1_Score_NB * 100 : .2f} %\n')
print(f'AUC_ROC : {ROC_AUC_NB * 100 : .2f} %\n')

In [None]:
# Pie chart of the naive-Bayes metrics.
# NOTE(review): autopct prints each wedge's share of the summed scores, not
# the metric value itself.
Score_Names = ['Accuracy', 'Precision', 'Recall', 'F1 Score', 'AUC-ROC']
Scores = [Accuracy_NB, Precision_NB, Recall_NB, F1_Score_NB, ROC_AUC_NB]

fig, ax = plt.subplots(figsize=(7, 5))
ax.pie(Scores, labels=Score_Names, autopct='%1.2f%%', startangle=140)
ax.axis('equal')
plt.show()

In [None]:
# Side-by-side comparison of the three models' test accuracy (in percent).
# The built-in round() is used instead of the `.round()` method: that method
# exists only on NumPy scalars, while accuracy_score is documented to return
# a plain float, on which `(acc * 100).round(2)` raises AttributeError.
evaluation = pd.DataFrame({
    'Classification Model': ['Logistic Regression', 'Decision Tree', 'Naive Bayes'],
    'Accuracy Rate': [
        round(acc * 100, 2)
        for acc in (Accuracy_LR, Accuracy_DT, Accuracy_NB)
    ],
})

evaluation