<a href="https://colab.research.google.com/github/samipn/crisp-dm_semma_and_kdd/blob/main/KDD_Credit_Fraud.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# KDD: Credit Card Fraud
We walk Dataset Selection → Data Cleaning → Transformation → Data Mining → Evaluation/Interpretation.

In [1]:
!pip install kaggle



In [2]:
from google.colab import files

uploaded = files.upload()

print("kaggle.json file uploaded.")

Saving kaggle.json to kaggle.json
kaggle.json file uploaded.


In [3]:
import os
import shutil

# Create the .kaggle directory in the user's home directory
home_dir = os.path.expanduser("~")
kaggle_dir = os.path.join(home_dir, ".kaggle")
os.makedirs(kaggle_dir, exist_ok=True)

# Copy the uploaded kaggle.json file to the .kaggle directory
# Assuming the uploaded file is in the current directory
uploaded_file_path = 'kaggle.json'
destination_path = os.path.join(kaggle_dir, 'kaggle.json')
shutil.copy(uploaded_file_path, destination_path)

# Set the appropriate file permissions for kaggle.json
os.chmod(destination_path, 0o600)

print("Kaggle API setup complete.")

Kaggle API setup complete.


## Selection & Understanding
Load the dataset. It is highly imbalanced; `Class=1` indicates fraud.

In [4]:
#@title Setup
import pandas as pd, numpy as np, matplotlib.pyplot as plt, seaborn as sns, joblib, os, plotly.express as px
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, classification_report, f1_score, precision_recall_curve, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline

RANDOM_STATE = 42
os.makedirs('data', exist_ok=True)

# Download the dataset using Kaggle API
!kaggle datasets download -d mlg-ulb/creditcardfraud -p data/ --unzip

path = 'data/creditcard.csv'
if not os.path.exists(path):
    print("Download Kaggle creditcard.csv from mlg-ulb/creditcardfraud")
df = pd.read_csv(path)
df['Class'].value_counts(normalize=True)

Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
License(s): DbCL-1.0
Downloading creditcardfraud.zip to data
  0% 0.00/66.0M [00:00<?, ?B/s]
100% 66.0M/66.0M [00:00<00:00, 2.69GB/s]


Unnamed: 0_level_0,proportion
Class,Unnamed: 1_level_1
0,0.998273
1,0.001727


In [5]:
df.describe().T.head(10)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Time,284807.0,94813.86,47488.145955,0.0,54201.5,84692.0,139320.5,172792.0
V1,284807.0,1.168375e-15,1.958696,-56.40751,-0.920373,0.018109,1.315642,2.45493
V2,284807.0,3.416908e-16,1.651309,-72.715728,-0.59855,0.065486,0.803724,22.057729
V3,284807.0,-1.379537e-15,1.516255,-48.325589,-0.890365,0.179846,1.027196,9.382558
V4,284807.0,2.074095e-15,1.415869,-5.683171,-0.84864,-0.019847,0.743341,16.875344
V5,284807.0,9.604066e-16,1.380247,-113.743307,-0.691597,-0.054336,0.611926,34.801666
V6,284807.0,1.487313e-15,1.332271,-26.160506,-0.768296,-0.274187,0.398565,73.301626
V7,284807.0,-5.556467e-16,1.237094,-43.557242,-0.554076,0.040103,0.570436,120.589494
V8,284807.0,1.213481e-16,1.194353,-73.216718,-0.20863,0.022358,0.327346,20.007208
V9,284807.0,-2.406331e-15,1.098632,-13.434066,-0.643098,-0.051429,0.597139,15.594995


## Transformation
Scale `Amount` and optionally `Time`. The rest are PCA components already.

In [6]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

y = df['Class']
X = df.drop(columns=['Class'])
pre = ColumnTransformer([
    ('amt', Pipeline([('imp', SimpleImputer(strategy='median')), ('sc', StandardScaler())]), ['Amount','Time'])
], remainder='passthrough')

## Data Mining (Modeling)
Compare Logistic Regression vs RandomForest with **SMOTE** inside the pipeline.

In [7]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
models = {
    "log_reg": LogisticRegression(max_iter=200, class_weight='balanced'),
    "rf": RandomForestClassifier(n_estimators=300, random_state=RANDOM_STATE, class_weight='balanced')
}
for name, clf in models.items():
    pipe = ImbPipeline([('pre', pre), ('smote', SMOTE(random_state=RANDOM_STATE)), ('clf', clf)])
    aucs, aps = [], []
    for tr, te in skf.split(X, y):
        pipe.fit(X.iloc[tr], y.iloc[tr])
        proba = pipe.predict_proba(X.iloc[te])[:,1]
        aucs.append(roc_auc_score(y.iloc[te], proba))
        aps.append(average_precision_score(y.iloc[te], proba))
    print(name, "CV AUC:", np.mean(aucs).round(4), "AP:", np.mean(aps).round(4))

log_reg CV AUC: 0.9778 AP: 0.7284
rf CV AUC: 0.9816 AP: 0.8576


## Evaluation & Interpretation
Select threshold by maximizing F1 or business cost.

In [9]:
# Fit and threshold-tune
from sklearn.metrics import precision_recall_curve, f1_score
pipe = ImbPipeline([('pre', pre), ('smote', SMOTE(random_state=RANDOM_STATE)), ('clf', LogisticRegression(max_iter=300, class_weight='balanced'))])
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE)
pipe.fit(X_tr, y_tr)
probs = pipe.predict_proba(X_te)[:,1]
prec, rec, th = precision_recall_curve(y_te, probs)
f1s = 2*prec*rec/(prec+rec+1e-9)
best_idx = f1s[:-1].argmax()
best_th = th[best_idx]
pred = (probs >= best_th).astype(int)
print("Best threshold:", best_th, "F1:", f1_score(y_te, pred))
os.makedirs('deployment', exist_ok=True) # Create the directory if it doesn't exist
joblib.dump(pipe, 'deployment/model.joblib'); print("Saved to deployment/model.joblib")

Best threshold: 0.9999999906914586 F1: 0.8247422680412371
Saved to deployment/model.joblib
