In [21]:
# ==========================================
# COGNITIVE-SHIELD -- FINTECH FRAUD MODEL TRAINING
# JUPYTER / ANACONDA VERSION
# ==========================================

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [22]:
# ------------------------------
# LOAD DATASET
# ------------------------------

# Read the synthetic transaction table and cast the `hour` column to an
# integer dtype as part of the same expression.
df = pd.read_csv("master_synthetic_fraud_dataset.csv").astype({"hour": int})

print("Dataset Shape:", df.shape)

Dataset Shape: (284807, 14)


In [23]:
# ------------------------------
# DEFINE FEATURES AND TARGET
# ------------------------------

# Target is the binary fraud flag; every remaining column becomes a feature.
# NOTE(review): identifier-like columns are still part of the feature set
# (the coefficient table printed further down lists `user_id`) -- confirm
# they are meant to be model inputs rather than dropped here.
y = df["fraud_label"]
X = df.drop(columns=["fraud_label"])

# One-hot encode the categorical columns. Requesting float dummies keeps the
# whole frame numeric: recent pandas releases emit bool dummies by default,
# and a frame mixing bool with numeric columns turns `X.values` into an
# object-dtype array, which is slow and memory-hungry downstream.
X = pd.get_dummies(X, dtype=float)

print("Encoded Shape:", X.shape)

Encoded Shape: (284807, 24)


In [24]:

# ------------------------------
# TRAIN TEST SPLIT
# ------------------------------

# Hold out 25% of the rows for evaluation. Stratifying on the label keeps the
# fraud ratio the same in both partitions; the fixed seed makes the split repeatable.
split = train_test_split(X, y, test_size=0.25, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = split


In [25]:
# ------------------------------
# OVERSAMPLE FRAUD CLASS
# ------------------------------

def oversample_minority(X, y, random_state=None):
    """Balance a binary dataset by duplicating rows of the smaller class.

    Rows of whichever class (0 or 1) has fewer samples are drawn with
    replacement until both classes have the same count, then the combined
    data is shuffled.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        Feature matrix. Converted with ``np.asarray``.
    y : array-like, shape (n_samples,)
        Binary labels (0 / 1). Converted with ``np.asarray``.
    random_state : None, int, np.random.RandomState or np.random.Generator
        Source of randomness. ``None`` (the default) uses NumPy's global
        ``np.random`` state, so a prior ``np.random.seed(...)`` call still
        controls the result. An int seeds a private ``RandomState``.

    Returns
    -------
    (X_res, y_res) : tuple of np.ndarray
        The balanced, shuffled features and labels. If either class is
        absent there is nothing to balance and the inputs are returned
        as-is (after array conversion).
    """
    X = np.asarray(X)
    y = np.asarray(y)

    pos_idx = np.flatnonzero(y == 1)
    neg_idx = np.flatnonzero(y == 0)

    # With one class missing there is no minority to draw from.
    if len(pos_idx) == 0 or len(neg_idx) == 0:
        return X, y

    if random_state is None:
        rng = np.random  # module-level functions share the global state
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Oversample whichever class is actually smaller; assuming class 1 is the
    # minority would request a negative sample size when it is not.
    if len(pos_idx) <= len(neg_idx):
        minority_idx, majority_idx = pos_idx, neg_idx
    else:
        minority_idx, majority_idx = neg_idx, pos_idx

    n_sample = len(majority_idx) - len(minority_idx)
    dup_idx = rng.choice(minority_idx, size=n_sample, replace=True)

    X_res = np.vstack([X, X[dup_idx]])
    y_res = np.concatenate([y, y[dup_idx]])

    # Shuffle so the duplicated rows are not clustered at the end.
    shuffle_idx = rng.permutation(len(y_res))
    return X_res[shuffle_idx], y_res[shuffle_idx]


# Seed NumPy's global generator so the random duplication and shuffle done by
# oversample_minority select the same rows on every run of the notebook.
np.random.seed(42)

# Only the training split is balanced; the test split keeps its natural ratio.
X_train_res, y_train_res = oversample_minority(X_train.values, y_train.values)

print("Balanced class distribution:", np.bincount(y_train_res))


Balanced class distribution: [202991 202991]


In [26]:
# ------------------------------
# FEATURE SCALING
# ------------------------------

# Learn the scaling statistics from the (oversampled) training matrix only,
# then apply the same transformation to the held-out split.
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train_res)
# X_train_res is a plain ndarray, so hand the test split over as an ndarray
# too; passing the DataFrame triggers scikit-learn's warning about feature
# names that were not seen during fit.
X_test_scaled = scaler.transform(X_test.values)



**LOGISTIC REGRESSION MODEL**

In [30]:
# Fit the classifier on the scaled, oversampled training data.
log_reg = LogisticRegression(class_weight="balanced", max_iter=300)
log_reg.fit(X_train_scaled, y_train_res)

# Score the held-out split. Column 1 of predict_proba belongs to the second
# entry of log_reg.classes_ (label 1 here). A cut-off above the default 0.5
# is used to favour precision.
threshold = 0.65
lr_probs = log_reg.predict_proba(X_test_scaled)[:, 1]
lr_preds = np.where(lr_probs >= threshold, 1, 0)


In [31]:

# ------------------------------
# MODEL METRICS
# ------------------------------

print("\n===== LOGISTIC REGRESSION RESULTS =====")

# Report the scalar metrics through a single loop; each one compares the
# thresholded predictions against the held-out labels.
scalar_metrics = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
)
for label, metric_fn in scalar_metrics:
    print(f"{label}:", metric_fn(y_test, lr_preds))

print("Confusion Matrix:\n", confusion_matrix(y_test, lr_preds))

# ------------------------------
# FEATURE IMPORTANCE (INTERPRETABILITY)
# ------------------------------

# Pair every learned coefficient with its column name and rank them from the
# most positive downwards.
coefficients = pd.Series(log_reg.coef_[0], index=X.columns)
feature_importance = coefficients.sort_values(ascending=False)

print("\nTop Fraud Indicators:")
print(feature_importance.head(10))



===== LOGISTIC REGRESSION RESULTS =====
Accuracy: 0.8268587961012331
Precision: 0.1973974111814927
Recall: 0.8103448275862069
F1 Score: 0.31746207507474256
Confusion Matrix:
 [[56007 11657]
 [  671  2867]]

Top Fraud Indicators:
is_new_device              1.476775
is_night                   1.396027
amount_deviation           1.040776
amount                     1.030162
device_id_Android_A        0.010022
merchant_id_phonepe@upi    0.008949
merchant_id_amazon@upi     0.008450
location_Kolkata           0.006977
location_Lucknow           0.006689
user_id                    0.005317
dtype: float64


In [32]:
# ------------------------------
# SAVE MODEL BUNDLE
# ------------------------------

# Persist everything inference needs in one file: the fitted scaler, the
# classifier, the decision cut-off and the exact training column order.
model_bundle = dict(
    scaler=scaler,
    model=log_reg,
    threshold=threshold,
    feature_columns=X.columns.tolist(),
)

dump(model_bundle, "fraud_detection_model.joblib")

print("\n Model saved as fraud_detection_model.joblib")


 Model saved as fraud_detection_model.joblib


In [33]:
# Read the bundle back from disk to confirm it round-trips. `load` comes from
# the import cell at the top of the notebook.
# NOTE: joblib files are pickle-based -- only load bundles from a trusted source.
bundle = load("fraud_detection_model.joblib")

print("Model loaded successfully!")
print("Threshold:", bundle["threshold"])

Model loaded successfully!
Threshold: 0.65


In [34]:
def predict_transaction(input_df, bundle=None):
    """Score one transaction with the saved fraud model.

    Parameters
    ----------
    input_df : pandas.DataFrame
        At least one row of features, already one-hot encoded the same way
        as the training data. Columns are aligned to the bundle's
        ``feature_columns``: unknown columns are dropped and missing ones
        are filled with 0. Only the first row is scored.
    bundle : dict, optional
        A model bundle with the keys ``"scaler"``, ``"model"``,
        ``"threshold"`` and ``"feature_columns"``. When omitted, the bundle
        is read from ``fraud_detection_model.joblib``; pass an already
        loaded bundle to avoid re-reading the file on every call.

    Returns
    -------
    (prob, risk) : tuple of (float, str)
        The predicted probability taken from column 1 of ``predict_proba``
        and ``"High Risk"`` when it reaches the bundle threshold, otherwise
        ``"Low Risk"``.

    Raises
    ------
    ValueError
        If ``input_df`` has no rows.
    """
    if len(input_df) == 0:
        raise ValueError("input_df must contain at least one transaction row")

    if bundle is None:
        # joblib files are pickle-based; only load bundles from a trusted source.
        bundle = load("fraud_detection_model.joblib")

    # Align to the training layout (same columns, same order).
    X_new = input_df.reindex(columns=bundle["feature_columns"], fill_value=0)

    # The scaler in this notebook is fitted on a plain ndarray, so feed it one
    # here as well instead of a DataFrame with column names.
    X_scaled = bundle["scaler"].transform(X_new.to_numpy(dtype=float))

    # Row 0, column 1 of predict_proba.
    prob = float(bundle["model"].predict_proba(X_scaled)[0, 1])

    risk = "High Risk" if prob >= bundle["threshold"] else "Low Risk"

    return prob, risk

    

In [None]:
# Smoke-test the inference helper on the first encoded row; the double
# brackets keep the selection a one-row DataFrame instead of a Series.
sample = X.iloc[[0]]
prob, risk = predict_transaction(sample)

print(f"Fraud Probability: {prob}")
print(f"Risk Level: {risk}")