In [1]:
import pandas as pd

df=pd.read_csv('https://raw.githubusercontent.com/saintsaintsan/Supervised-Machine-Learning/refs/heads/main/data/fraud.csv', index_col = 0) # index_col = 0 → use the first column as the index of the DataFrame

# Extract 'Class' column as target variable (y) in NumPy array format
y = df['Class'].values
df = df.iloc[:,1:]

# Drop 'Class' column from DataFrame
X = df.drop(columns = 'Class').values
df[['Class']].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,21337
1,356


In [2]:
from sklearn.model_selection import train_test_split

# Split data into train and test sets
# test_size=0.40 → 40% for testing, 60% for training
# random_state=1 → ensures reproducibility

X_train, X_test, y_train, y_test = train_test_split(X,y,
                                    test_size = 0.40,
                                    random_state=1)

In [3]:
# Import data preprocessing and pipeline tools

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

#--------------------------------------------------
## ------------Logistic Regresion----------------##
#--------------------------------------------------

# Import Logistic Regression model

from sklearn.linear_model import LogisticRegression

   # L1 regularization - can set some coefficients to zero (feature selection)
   # C1 Regularization strength (medium level)
   # solver = "liblinear"  Algorithm for optimization, works with L1 and small datasets


steps = [('scaler', StandardScaler()),
         ('logReg', LogisticRegression(penalty = "l1", C =1, solver='liblinear'))]

LR_pipeline = Pipeline(steps)
LR_pipeline.fit(X_train, y_train)

In [4]:
#--------------------------------------------------
## Model Evaluation ##
#--------------------------------------------------

# Import evaluation metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score

# Predict class labels for test set
ypred_test = LR_pipeline.predict(X_test)

# Create confusion matrix for test set predictions
mat_clf = confusion_matrix(y_test, ypred_test)

# Create classification report (precision, recall, f1-score) for test set
report_clf = classification_report(y_test, ypred_test)

# Display confusion matrix and classification report for test set
print(mat_clf)
print(report_clf)

# Predict class probabilities for test set
ypred_testP = LR_pipeline.predict_proba(X_test)

# Calculate and print ROC AUC score for testing set
auc = roc_auc_score(y_test, ypred_testP[:,1])
print(auc)

[[8535    6]
 [  27  110]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      8541
           1       0.95      0.80      0.87       137

    accuracy                           1.00      8678
   macro avg       0.97      0.90      0.93      8678
weighted avg       1.00      1.00      1.00      8678

0.9828726529056497


In [5]:
ypred_train = LR_pipeline.predict(X_train)
mat_clf = confusion_matrix(y_train, ypred_train)
report_clf = classification_report(y_train, ypred_train)

print(mat_clf)
print(report_clf)


# Predict class probabilities for training set

ypred_trainP = LR_pipeline.predict_proba(X_train)

# Calculate and print ROC AUC score for training set
auc = roc_auc_score(y_train, ypred_trainP[:,1])
print(auc)

[[12793     3]
 [   46   173]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12796
           1       0.98      0.79      0.88       219

    accuracy                           1.00     13015
   macro avg       0.99      0.89      0.94     13015
weighted avg       1.00      1.00      1.00     13015

0.9796711586526041
