In [12]:
!pip install imbalanced-learn xgboost



In [13]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier

In [14]:
from google.colab import files
uploaded = files.upload()

Saving fraudTest.csv to fraudTest.csv


In [16]:
data = pd.read_csv("fraudTest.csv")

print("Dataset Shape:", data.shape)
print("\nFirst 5 Rows:")
print(data.head())

Dataset Shape: (555719, 23)

First 5 Rows:
   Unnamed: 0 trans_date_trans_time            cc_num  \
0           0   2020-06-21 12:14:25  2291163933867244   
1           1   2020-06-21 12:14:33  3573030041201292   
2           2   2020-06-21 12:14:53  3598215285024754   
3           3   2020-06-21 12:15:15  3591919803438423   
4           4   2020-06-21 12:15:17  3526826139003047   

                               merchant        category    amt   first  \
0                 fraud_Kirlin and Sons   personal_care   2.86    Jeff   
1                  fraud_Sporer-Keebler   personal_care  29.84  Joanne   
2  fraud_Swaniawski, Nitzsche and Welch  health_fitness  41.28  Ashley   
3                     fraud_Haley Group        misc_pos  60.05   Brian   
4                 fraud_Johnston-Casper          travel   3.19  Nathan   

       last gender                       street  ...      lat      long  \
0   Elliott      M            351 Darlene Green  ...  33.9659  -80.9355   
1  Williams      F 

In [27]:
print(data["Class"].value_counts())

KeyError: 'Class'

In [18]:
print(data.columns)

Index(['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category',
       'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time',
       'merch_lat', 'merch_long', 'is_fraud'],
      dtype='object')


In [19]:
# Remove unnecessary ID/text columns
data = data.drop(columns=[
    'Unnamed: 0',
    'trans_date_trans_time',
    'cc_num',
    'merchant',
    'first',
    'last',
    'street',
    'city',
    'state',
    'zip',
    'job',
    'dob',
    'trans_num'
])

In [20]:
data['gender'] = data['gender'].map({'M': 0, 'F': 1})

In [21]:
data = pd.get_dummies(data, columns=['category'], drop_first=True)

In [22]:
X = data.drop("is_fraud", axis=1)
y = data["is_fraud"]

In [23]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [24]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Before SMOTE:")
print(y_train.value_counts())

print("\nAfter SMOTE:")
print(y_train_resampled.value_counts())

Before SMOTE:
is_fraud
0    442856
1      1719
Name: count, dtype: int64

After SMOTE:
is_fraud
0    442856
1    442856
Name: count, dtype: int64


In [25]:
from xgboost import XGBClassifier

model = XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    eval_metric='logloss'
)

model.fit(X_train_resampled, y_train_resampled)

In [26]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9932430000719787

Classification Report:

              precision    recall  f1-score   support

           0       1.00      0.99      1.00    110718
           1       0.34      0.84      0.49       426

    accuracy                           0.99    111144
   macro avg       0.67      0.92      0.74    111144
weighted avg       1.00      0.99      0.99    111144


Confusion Matrix:

[[110036    682]
 [    69    357]]


In [29]:
# Predict first 5 transactions from test set
sample = X_test.iloc[0:5]

predictions = model.predict(sample)

for i, pred in enumerate(predictions):
    if pred == 1:
        print(f"Transaction {i+1}: Fraudulent ðŸš¨")
    else:
        print(f"Transaction {i+1}: Legitimate âœ…")

Transaction 1: Legitimate âœ…
Transaction 2: Legitimate âœ…
Transaction 3: Legitimate âœ…
Transaction 4: Legitimate âœ…
Transaction 5: Legitimate âœ…


In [30]:
# Predict one transaction
single_transaction = X_test.iloc[0].values.reshape(1, -1)

prediction = model.predict(single_transaction)

if prediction[0] == 1:
    print("Fraudulent Transaction ðŸš¨")
else:
    print("Legitimate Transaction âœ…")

Legitimate Transaction âœ…


In [31]:
sample = X_test.iloc[0:10]
actual = y_test.iloc[0:10].values
predicted = model.predict(sample)

for i in range(10):
    print(f"Transaction {i+1}")
    print("Actual:     ", "Fraud ðŸš¨" if actual[i]==1 else "Legit âœ…")
    print("Predicted:  ", "Fraud ðŸš¨" if predicted[i]==1 else "Legit âœ…")
    print("-"*30)

Transaction 1
Actual:      Legit âœ…
Predicted:   Legit âœ…
------------------------------
Transaction 2
Actual:      Legit âœ…
Predicted:   Legit âœ…
------------------------------
Transaction 3
Actual:      Legit âœ…
Predicted:   Legit âœ…
------------------------------
Transaction 4
Actual:      Legit âœ…
Predicted:   Legit âœ…
------------------------------
Transaction 5
Actual:      Legit âœ…
Predicted:   Legit âœ…
------------------------------
Transaction 6
Actual:      Legit âœ…
Predicted:   Legit âœ…
------------------------------
Transaction 7
Actual:      Legit âœ…
Predicted:   Legit âœ…
------------------------------
Transaction 8
Actual:      Legit âœ…
Predicted:   Legit âœ…
------------------------------
Transaction 9
Actual:      Legit âœ…
Predicted:   Legit âœ…
------------------------------
Transaction 10
Actual:      Legit âœ…
Predicted:   Legit âœ…
------------------------------


In [32]:
import joblib

In [33]:
joblib.dump(model, "fraud_detection_model.pkl")

['fraud_detection_model.pkl']

In [34]:
from google.colab import files
files.download("fraud_detection_model.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [35]:
joblib.dump(X.columns, "model_features.pkl")
files.download("model_features.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>