In [1]:
# === 1. Import Required Libraries ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, precision_recall_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from imblearn.over_sampling import SMOTE  # ✅ Changed here
import joblib

# === 2. Load Dataset ===
df = pd.read_csv("D:/MTech/Mini project/archive (9)/fraudTrain.csv")

# === 3. Feature Engineering ===
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'])
df['hour'] = df['trans_date_trans_time'].dt.hour
df['is_night'] = df['hour'].apply(lambda x: 1 if x <= 6 else 0)
df['dob'] = pd.to_datetime(df['dob'])
df['age'] = df['trans_date_trans_time'].dt.year - df['dob'].dt.year
df['log_amt'] = np.clip(np.log1p(df['amt']), a_min=None, a_max=7)
df['amt_to_pop_ratio'] = df['amt'] / (df['city_pop'] + 1)
df['amt_hour_ratio'] = df['log_amt'] / (df['hour'] + 1)
df['pop_amt_ratio'] = df['log_amt'] / (df['city_pop'] + 1)
df['merchant_avg_amt'] = df.groupby('merchant')['amt'].transform('mean')
df['amt_to_merchant_avg'] = df['amt'] / (df['merchant_avg_amt'] + 1)

# === 4. Encode Categorical Features ===
categorical_cols = ['category', 'merchant', 'job']
encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

# === 5. Select Features ===
selected_features = [
    'log_amt', 'category', 'merchant', 'hour', 'is_night',
    'city_pop', 'lat', 'long', 'merch_lat', 'merch_long',
    'job', 'amt_to_pop_ratio', 'amt_hour_ratio', 'pop_amt_ratio',
    'amt_to_merchant_avg'
]

X = df[selected_features]
y = df['is_fraud']

# === 6. Train-Test Split ===
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# === 7. Oversampling using SMOTE ===
smote = SMOTE(random_state=42)
X_train_bal, y_train_bal = smote.fit_resample(X_train, y_train)

# === 8. Define Models ===
models = {
    "DecisionTree_SMOTE": DecisionTreeClassifier(random_state=42),
    "RandomForest_SMOTE": RandomForestClassifier(n_estimators=100, random_state=42),
    "LogisticRegression_SMOTE": LogisticRegression(max_iter=1000, random_state=42),
    "ExtraTrees_SMOTE": ExtraTreesClassifier(n_estimators=100, random_state=42)
}

# === 9. Train and Evaluate Each Model ===
for name, model in models.items():
    model.fit(X_train_bal, y_train_bal)
    y_proba = model.predict_proba(X_test)[:, 1]
    precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
    best_index = np.argmax(f1_scores)
    best_threshold = thresholds[best_index]
    y_pred = (y_proba >= best_threshold).astype(int)

    print(f"\n=== {name} Evaluation ===")
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()
    accuracy = (TP + TN) / (TP + TN + FP + FN)
    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    specificity = TN / (TN + FP)
    roc_auc = roc_auc_score(y_test, y_proba)

    print(f"Best Threshold: {best_threshold:.4f}")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"Specificity: {specificity:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(classification_report(y_test, y_pred))

    # Optional: Save each model separately
    joblib.dump(model, f"{name}.pkl")



=== DecisionTree_SMOTE Evaluation ===
Best Threshold: 1.0000
Accuracy: 0.9921
Precision: 0.4050
Recall: 0.7615
Specificity: 0.9935
ROC AUC: 0.8775
              precision    recall  f1-score   support

           0       1.00      0.99      1.00    257834
           1       0.41      0.76      0.53      1501

    accuracy                           0.99    259335
   macro avg       0.70      0.88      0.76    259335
weighted avg       1.00      0.99      0.99    259335


=== RandomForest_SMOTE Evaluation ===
Best Threshold: 0.7400
Accuracy: 0.9971
Precision: 0.7812
Recall: 0.6995
Specificity: 0.9989
ROC AUC: 0.9874
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    257834
           1       0.78      0.70      0.74      1501

    accuracy                           1.00    259335
   macro avg       0.89      0.85      0.87    259335
weighted avg       1.00      1.00      1.00    259335


=== LogisticRegression_SMOTE Evaluation ===
Best 

In [3]:
import joblib
import numpy as np
from sklearn.metrics import confusion_matrix
from math import sqrt

# === Load test set ===
# If needed, reload test data from earlier steps
# Assuming X_test and y_test already available in memory

# === List of model names (same as saved filenames) ===
model_names = [
    "DecisionTree_SMOTE",
    "RandomForest_SMOTE",
    "LogisticRegression_SMOTE",
    "ExtraTrees_SMOTE"
]

for name in model_names:
    model = joblib.load(f"{name}.pkl")
    y_proba = model.predict_proba(X_test)[:, 1]

    # Find threshold by F1 or reuse a standard value like 0.5
    # Or reuse threshold value stored previously, e.g., best_threshold = 0.9648
    from sklearn.metrics import precision_recall_curve
    precisions, recalls, thresholds = precision_recall_curve(y_test, y_proba)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls + 1e-10)
    best_index = np.argmax(f1_scores)
    best_threshold = thresholds[best_index]

    y_pred = (y_proba >= best_threshold).astype(int)

    # === Confusion matrix ===
    cm = confusion_matrix(y_test, y_pred)
    TN, FP, FN, TP = cm.ravel()

    # === MCC Calculation ===
    numerator = (TP * TN) - (FP * FN)
    denominator = sqrt((TP + FP) * (TP + FN) * (TN + FP) * (TN + FN))
    mcc = numerator / denominator if denominator != 0 else 0

    print(f"\n📊 {name} — MCC: {mcc:.4f}")



📊 DecisionTree_SMOTE — MCC: 0.5520

📊 RandomForest_SMOTE — MCC: 0.7378

📊 LogisticRegression_SMOTE — MCC: 0.0037

📊 ExtraTrees_SMOTE — MCC: 0.7350
