In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# **1.INTRODUCTION: CREDIT CARD FRAUD DETECTION**

The goal of this project is to build an end-to-end machine learning pipeline that can automatically detect fraudulent credit card transactions. I chose the Credit Card Fraud Detection dataset from Kaggle, which contains real-world transaction data collected by a European bank.

The task is a binary classification problem, where the model must predict whether a transaction is:

0 → Legitimate

1 → Fraudulent

The objective is to build a model that can detect fraud with the highest possible recall and F1-score for the minority class (fraud), while following machine learning theory and justifying each step in the pipeline.

In this project, I followed the same pipeline we learned from the Titanic example.

# 2. DATA LOADING <a id="data-loading-eda"></a>
## 2.1 Importing the Dataset

In [None]:
# 2. DATA LOADING
# 2.1 Importing the Dataset
import pandas as pd

# Read the credit-card transactions CSV into a DataFrame and preview the first rows.
DATA_PATH = '/kaggle/input/creditcardfraud/creditcard.csv'
df = pd.read_csv(DATA_PATH)
df.head()

# 3.DATA CLEANING

## 3.1 Quick overview : verifying the dataset and that the target column (Class) is present

In [None]:
# 3.DATA CLEANING

# 3.1 Quick overview: verify the dataset and that the target column (Class) is present

import pandas as pd

# `df` is the frame loaded in the data-loading cell and has not been modified since,
# so it is reused here instead of reading the same CSV from disk a second time.
print("Shape:", df.shape)
print("\nColumns:\n", df.columns.tolist())

# Actually perform the check this section promises: every later step needs the label column.
assert 'Class' in df.columns, "Target column 'Class' not found in the dataset"

print("\nInfo:")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line after the report.
df.info()
print("\nFirst 5 rows:")
display(df.head())


## 3.2 Summary and missing values

In [None]:
# 3.2 Summary and missing values

# Count the missing entries in each column and review summary statistics to get a
# feel for distributions, ranges and skewness.
# If values were missing, an imputation strategy (mean/median/mode/model-based) would have
# to be chosen and justified (e.g. median for skewed data, mean for symmetric data).
# Here every column reports 0 missing values, so the data is complete.

missing_per_column = df.isnull().sum()
print("Missing values per column:")
print(missing_per_column)

numeric_summary = df.describe().T
print("\nDescriptive statistics (numeric):")
display(numeric_summary)


## 3.3 Checking Duplicates

In [None]:
# 3.3 Checking Duplicates
# Mark every row that exactly repeats an earlier row, report how many there are,
# and preview a few of them when any exist.
duplicate_mask = df.duplicated()
n_dup = duplicate_mask.sum()
print(f"Number of duplicate rows: {n_dup}")

if n_dup > 0:
    display(df[duplicate_mask].head())


## 3.4 Dropping duplicates

In [None]:
# 3.4 Dropping duplicates
# Duplicated rows make the model "see" the same record several times, which can bias the learned patterns.

# Keep only the first occurrence of each row (the same rows DataFrame.drop_duplicates() keeps by default).
df = df[~df.duplicated()]
df.shape


# 4 EXPLORATORY DATA ANALYSIS (EDA)

## 4.1 Class distribution

In [None]:
# 4 EXPLORATORY DATA ANALYSIS (EDA)

# 4.1 Class distribution
# Fraud datasets are heavily imbalanced; the degree of imbalance drives the choice of
# evaluation metrics and sampling strategy.

labels = df['Class']
class_counts = labels.value_counts()
class_props = labels.value_counts(normalize=True)

print("Counts:\n", class_counts)
print("\nProportions:\n", class_props)


## 4.2 Simple plots

In [None]:
# 4.2 Simple plots
# Spot skew (and potential log transformations) and look at class separation by amount/time.

import matplotlib.pyplot as plt

# Side-by-side histograms of Amount and Time.
fig, (ax_amount, ax_time) = plt.subplots(1, 2, figsize=(12, 4))

ax_amount.hist(df['Amount'], bins=80)
ax_amount.set_title('Transaction Amount distribution')
ax_amount.set_xlabel('Amount')

ax_time.hist(df['Time'], bins=80)
ax_time.set_title('Transaction Time distribution (seconds since start)')
ax_time.set_xlabel('Time')

fig.tight_layout()
plt.show()

# Boxplot of Amount by Class (fraud vs non-fraud).
# DataFrame.boxplot(by=...) opens its own new figure unless it is handed an Axes, so
# creating a figure beforehand with plt.figure() left that figure empty and its figsize
# unused. Passing ax= makes pandas draw into the figure we size here.
fig, ax = plt.subplots(figsize=(6, 4))
df.boxplot(column='Amount', by='Class', showfliers=False, ax=ax)
ax.set_title('Amount by Class (without outliers displayed)')
fig.suptitle('')  # remove the automatic "Boxplot grouped by Class" super-title
plt.show()


## 4.3 Checking correlations (quick heatmap or top correlated features with target)

In [None]:
# 4.3 Checking correlations (quick heatmap or top correlated features with target)
# Shows which features move together with the target. V1..V28 are anonymized PCA
# components, so interpret individual features with caution.
import seaborn as sns

# Correlation of every column with Class, ordered from most positive to most negative.
target_corr = df.corr()['Class']
corr_with_target = target_corr.sort_values(ascending=False)
print("Top correlations with Class:\n", corr_with_target.head(10))
print("\nBottom correlations with Class:\n", corr_with_target.tail(10))

# Heatmap restricted to the ten columns with the largest |correlation| with Class
# (Class itself is part of this list).
abs_ranked = corr_with_target.abs().sort_values(ascending=False)
top_feats = abs_ranked.head(10).index.tolist()

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df[top_feats].corr(), annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Correlation heatmap (top features by |corr| with Class)')
plt.show()


# 5. FEATURE ENGINEERING

## 5.1 Train/test split

In [None]:
# 5. FEATURE ENGINEERING

# 5.1 Train/test split
# Hold back unseen data for the final evaluation. The split is stratified on the
# label because the classes are imbalanced.

from sklearn.model_selection import train_test_split

# Separate the label from the predictors.
y = df['Class']
X = df.drop(columns='Class')

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

print("Train shape:", X_train.shape, "Test shape:", X_test.shape)
for split_name, split_labels in (("Train", y_train), ("Test", y_test)):
    print(f"{split_name} class distribution:\n", split_labels.value_counts(normalize=True))


## 5.2 Feature Scaling - Time and Amount

In [None]:
# 5.2 Feature Scaling - Time and Amount

# The PCA features (V1–V28) are already scaled, but Amount and Time are not;
# scaling them helps most ML algorithms.

from sklearn.preprocessing import StandardScaler

scale_cols = ['Time', 'Amount']

# Learn the scaling statistics from the training split only.
scaler = StandardScaler()
scaler.fit(X_train[scale_cols])

# Work on copies so the unscaled frames stay available.
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()

# Apply the training-set scaling to both splits.
X_train_scaled[scale_cols] = scaler.transform(X_train[scale_cols])
X_test_scaled[scale_cols] = scaler.transform(X_test[scale_cols])

X_train_scaled.head()



# 6. DATA PREPROCESSING FOR MODELING
## 6.1 Train Logistic Regression

In [None]:
# 6. DATA PREPROCESSING FOR MODELING
# 6.1 Train Logistic Regression
# Fraud is extremely rare, so class_weight='balanced' is used: it puts a higher
# penalty on misclassifying the minority (fraud) class.

from sklearn.linear_model import LogisticRegression

logreg_params = {'class_weight': 'balanced', 'max_iter': 2000}
log_reg = LogisticRegression(**logreg_params)

# Fit on the scaled training split.
log_reg.fit(X_train_scaled, y_train)


# 7.MODEL TRAINING, HYPERPARAMETER TUNING AND MODEL VALIDATION / EVALUATION

## 7.1 confusion matrix, precision, recall, F1-score and ROC-AUC

In [None]:
# 7.MODEL TRAINING, HYPERPARAMETER TUNING AND MODEL VALIDATION / EVALUATION

# 7.1 confusion matrix, precision, recall, F1-score and ROC-AUC

# These metrics suit imbalanced data better than accuracy, which is misleading when
# non-fraud dominates. Recall matters because we want to catch as many frauds as
# possible; precision matters to limit false alarms; AUC measures ranking ability
# across thresholds.

from sklearn.metrics import (
    confusion_matrix, classification_report, roc_auc_score
)

# Fraud-class probabilities and hard class predictions on the held-out split.
y_prob = log_reg.predict_proba(X_test_scaled)[:, 1]
y_pred = log_reg.predict(X_test_scaled)

baseline_cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(baseline_cm)

baseline_report = classification_report(y_test, y_pred)
print("\nClassification Report:")
print(baseline_report)

# ROC-AUC is computed from the probabilities, not the hard labels.
auc = roc_auc_score(y_test, y_prob)
print("\nROC-AUC Score:", auc)


In [None]:
# Explanation:
# TN = 55,478 → model correctly said "not fraud" 
# TP = 90 → model correctly caught 90 fraud cases
# FN = 8 → model missed 8 frauds → VERY GOOD (low FN)
# FP = 1,386 → model wrongly flagged 1,386 normal transactions as fraud
# The model catches fraud cases extremely well (high recall). But it produces many false alarms (high false positives)

# In summary, our baseline Logistic Regression with class_weight='balanced' performs very well in terms of fraud detection (Recall = 0.92). This is crucial because in fraud detection missing a fraud is much more costly than flagging a normal transaction incorrectly. However, the model has low precision (0.06), meaning it produces many false positives. This is expected in highly imbalanced problems. The high ROC-AUC (0.97) indicates that the model separates fraud from non-fraud effectively, and we can further improve precision using more advanced models or threshold tuning.

## 7.2 Decision Threshold Tuning

In [None]:
# 7.2 Decision Threshold Tuning
# Predicted class probabilities for the test split; column 1 is kept as the fraud probability.

test_proba = log_reg.predict_proba(X_test_scaled)
y_prob = test_proba[:, 1]


## 7.3 Precision recall Curve

In [None]:
# 7.3 Precision recall Curve

# Plot precision against recall over all candidate thresholds.
# On imbalanced data this curve is the main tool for choosing a threshold.

from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

pr_curve = precision_recall_curve(y_test, y_prob)
precision, recall, thresholds = pr_curve

fig, ax = plt.subplots(figsize=(7, 5))
ax.plot(recall, precision)
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
ax.grid(True)
plt.show()


## 7.4 Finding the Best Threshold

In [None]:
# 7.4 Finding the Best Threshold
# The default cut-off of 0.5 is rarely the best operating point on heavily imbalanced
# data, so we look for the threshold that maximises the F1-score of the fraud class.
#
# The threshold is a tuned parameter of the pipeline, so it must not be picked on the
# test set: choosing it on y_test and then reporting metrics on the same y_test gives
# optimistically biased results. It is selected here on out-of-fold predictions from the
# training data, which keeps the test set untouched for the final evaluation.

import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict

# Out-of-fold fraud probabilities: each training row is scored by a model fitted on the
# other folds. cross_val_predict fits clones, so the already-fitted log_reg is unchanged.
threshold_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
train_oof_prob = cross_val_predict(
    log_reg, X_train_scaled, y_train, cv=threshold_cv, method='predict_proba'
)[:, 1]

# Separate name so the `thresholds` array from the precision-recall cell is not overwritten.
candidate_thresholds = np.linspace(0, 1, 101)  # 0.00, 0.01, ..., 1.00

# zero_division=0 scores thresholds that flag no transaction at all as F1 = 0
# instead of emitting an undefined-metric warning.
f1_by_threshold = np.array([
    f1_score(y_train, (train_oof_prob >= t).astype(int), zero_division=0)
    for t in candidate_thresholds
])

# argmax returns the first maximum, i.e. the lowest threshold among ties.
best_idx = int(np.argmax(f1_by_threshold))
best_threshold = candidate_thresholds[best_idx]
best_f1 = f1_by_threshold[best_idx]

print("Best Threshold:", best_threshold)
print("Best F1 Score (out-of-fold, training data):", best_f1)


## 7.5 Re-Evaluation of the Model with the New Threshold

In [None]:
# 7.5 Re-Evaluation of the Model with the New Threshold
# Goal: higher fraud precision and F-score with fewer false positives, accepting that
# fraud recall might drop slightly.

# Flag a transaction as fraud when its predicted probability reaches the tuned threshold.
is_flagged = y_prob >= best_threshold
optimal_pred = is_flagged.astype(int)

tuned_cm = confusion_matrix(y_test, optimal_pred)
print("Confusion Matrix (optimal threshold):")
print(tuned_cm)

tuned_report = classification_report(y_test, optimal_pred)
print("\nClassification Report:")
print(tuned_report)

# ROC-AUC depends only on the probabilities, so changing the threshold does not affect it.
print("\nROC-AUC Score (unchanged):", roc_auc_score(y_test, y_prob))


In [None]:
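# We tuned the decision threshold by searching a grid of candidate values for the one that maximises the fraud-class F1-score (the Precision–Recall curve above illustrates the trade-off). The aim is to improve the model’s precision while maintaining a high recall, which reduces false positives and improves the model’s usefulness in real-world fraud detection.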

## 7.6 Checking feature importance

In [None]:
# 7.6 Checking feature importance
# Shows which features influence the predictions, which is worth knowing even when the model performs well.

# One row per feature with its logistic-regression coefficient and the coefficient's
# absolute value, ordered from largest to smallest magnitude.
importance = (
    pd.DataFrame({
        'feature': X_train.columns,
        'coefficient': log_reg.coef_[0],
    })
    .assign(abs_value=lambda frame: frame['coefficient'].abs())
    .sort_values(by='abs_value', ascending=False)
)

importance.head(15)




## 7.7 Plotting the feature importances

In [None]:
# 7.7 Plotting the feature importances

# Horizontal bar chart of the 15 largest absolute coefficients, biggest at the top.
top_15 = importance.head(15)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_15['feature'], top_15['abs_value'])
ax.invert_yaxis()
ax.set_title("Top 15 Most Important Features (Logistic Regression)")
ax.set_xlabel("Absolute Coefficient Value")
plt.show()


**Explanation:**
Linear models don’t build decision trees and don’t compute “importance scores”. Instead, they learn weights (coefficients) that show:

Positive → increases chance of fraud

Negative → decreases chance of fraud

Larger magnitude → more important

In summary, what do positive and negative coefficients mean?

POSITIVE: A positive coefficient increases the probability of fraud. Amount (+1.63): higher transaction amounts strongly increase the predicted likelihood of fraud. This makes sense, since fraudulent transactions often involve larger amounts. V22, V4, V1, V5, V28 and V11: these PCA-transformed features (from the original anonymized dataset) also contribute positively to the fraud probability.

NEGATIVE: A negative coefficient decreases the probability of fraud. V14 (–1.57), V12 (–1.27), V10 (–1.14), V17 and V20: higher values of these features make a transaction less likely to be classified as fraud.

So, Logistic Regression identifies the most influential variables by the magnitude of their coefficients: larger absolute values indicate a stronger impact on the model’s prediction.

# 8. CONCLUSION
In this project, I built an end-to-end machine learning pipeline to detect fraudulent credit card transactions. Using Logistic Regression, proper preprocessing, scaling, class-imbalance handling, and threshold tuning, the model achieved strong performance with high recall and F1-score for the fraud class. Feature analysis showed that Amount, V14, V12, and V10 were the most influential predictors. Overall, the pipeline follows the theory learned in class and demonstrates how data preparation, model training, and evaluation techniques work together to improve fraud detection.