In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

train_transaction = pd.read_csv(r"C:\Users\ADMIN\ml projects\fraud detection project\fraud detection dataset\train_transaction.csv")
train_identity = pd.read_csv(r"C:\Users\ADMIN\ml projects\fraud detection project\fraud detection dataset\train_identity.csv")
test_transaction = pd.read_csv(r"C:\Users\ADMIN\ml projects\fraud detection project\fraud detection dataset\test_transaction.csv")
test_identity = pd.read_csv(r"C:\Users\ADMIN\ml projects\fraud detection project\fraud detection dataset\test_identity.csv")

In [2]:
train_df = train_transaction.merge(train_identity, on='TransactionID', how='left')
print('Merged train data shape: ', train_df.shape)

Merged train data shape:  (590540, 434)


In [3]:
test_df = test_transaction.merge(test_identity, on='TransactionID', how='left')
print('Merged test data shape: ', test_df.shape)

Merged test data shape:  (506691, 433)


In [4]:
missing_cols = train_df.columns[train_df.isnull().any()]

results = []

for col in missing_cols:
    stats = train_df.groupby(train_df[col].isnull())['isFraud'].mean()
    
    if len(stats) == 2:
        missing_rate = stats[True]
        present_rate = stats[False]
        diff = missing_rate - present_rate
        
        results.append({
            'column': col,
            'fraud_rate_missing': missing_rate,
            'fraud_rate_present': present_rate,
            'absolute_diff': diff
        })

signal_df = pd.DataFrame(results).sort_values(by='absolute_diff', ascending=False)

print(signal_df.head(10))

    column  fraud_rate_missing  fraud_rate_present  absolute_diff
355   V321            0.166667            0.034987       0.131679
354   V320            0.166667            0.034987       0.131679
353   V319            0.166667            0.034987       0.131679
352   V318            0.166667            0.034987       0.131679
343   V309            0.166667            0.034987       0.131679
342   V308            0.166667            0.034987       0.131679
341   V307            0.166667            0.034987       0.131679
344   V310            0.166667            0.034987       0.131679
346   V312            0.166667            0.034987       0.131679
345   V311            0.166667            0.034987       0.131679


In [5]:
to_drop = []
for col in missing_cols:
    missing_pct = train_df[col].isnull().mean()
    sig = signal_df[signal_df['column'] == col]['absolute_diff'].values
    delta = sig[0] if len(sig) > 0 else 0
    
    if missing_pct > 0.90 and abs(delta) < 0.01:
        to_drop.append(col)

train_df.drop(columns=to_drop, inplace=True)
test_df.drop(columns=to_drop, inplace=True)
print(f"Dropped {len(to_drop)} columns with no predictive signal.")

Dropped 0 columns with no predictive signal.


In [6]:
from sklearn.model_selection import train_test_split

x = train_df.drop(columns=['isFraud'])
y = train_df['isFraud']

x_train, x_eval, y_train, y_eval = train_test_split(x, y, test_size=0.2, random_state=42)

In [7]:
import joblib
import os


BASE_PATH = r"C:\Users\ADMIN\ml projects\fraud detection project\API"

METADATA = joblib.load(os.path.join(BASE_PATH, "lgb_feature_metadata_v3.joblib"))
FEATURE_COLUMNS = METADATA["feature_names"]
CATEGORICAL_FEATURES = METADATA["categorical_features"]
CATEGORICAL_MAPPINGS = METADATA["categorical_mappings"]

MEDIANS = joblib.load(os.path.join(BASE_PATH, "training_medians.joblib"))
CARD1_AMT_MEANS = joblib.load(os.path.join(BASE_PATH, "card1_amt_means.joblib"))
UID_AMT_MEANS = joblib.load(os.path.join(BASE_PATH, "uid_amt_means.joblib"))
UID_D1_MEANS = joblib.load(os.path.join(BASE_PATH, "uid_d1_means.joblib"))
UID_D15_MEANS = joblib.load(os.path.join(BASE_PATH, "uid_d15_means.joblib"))
C_FEATS_MEANS = joblib.load(os.path.join(BASE_PATH, "c_feats_card1_means.joblib"))
CARD1_ADDR_NUNIQUE = joblib.load(os.path.join(BASE_PATH, "card1_addr_nunique.joblib"))

def preprocess_transaction(raw_df: pd.DataFrame) -> pd.DataFrame:
    df = raw_df.copy()

    card1 = df["card1"].iloc[0]
    addr1 = df["addr1"].iloc[0] if "addr1" in df.columns else np.nan
    uid = (card1, addr1)

    df["card1_amt_mean"] = CARD1_AMT_MEANS.get(card1, np.nan)
    df["card1_amt_ratio"] = df["TransactionAmt"] / (df["card1_amt_mean"] + 0.01)
    df["uid_amt_mean"] = UID_AMT_MEANS.get(uid, np.nan)
    df["uid_amt_ratio"] = df["TransactionAmt"] / (df["uid_amt_mean"] + 0.01)
    df["card1_addr1_count"] = CARD1_ADDR_NUNIQUE.get(card1, 1)
    df["uid_D1_mean"] = UID_D1_MEANS.get(uid, np.nan)
    df["uid_D15_mean"] = UID_D15_MEANS.get(uid, np.nan)

    # --- Ensure all categorical columns exist ---
    for col in CATEGORICAL_FEATURES:
        if col not in df.columns:
            df[col] = 'missing'  # create missing column if absent
        allowed = CATEGORICAL_MAPPINGS[col]
        if 'missing' not in allowed:
            allowed = allowed + ['missing']
        df[col] = df[col].astype(str).replace('nan', 'missing')
        df.loc[~df[col].isin(allowed), col] = 'missing'
        df[col] = pd.Categorical(df[col], categories=allowed)

    # --- Final schema alignment ---
    for col in FEATURE_COLUMNS:
        if col not in df.columns:
            df[col] = 0  # numeric fill for missing engineered columns

    df = df[FEATURE_COLUMNS]

    # Fill remaining NaNs with training medians
    for col, median in MEDIANS.items():
        if col in df.columns:
            df[col] = df[col].fillna(median)

    # Ensure numeric types
    num_cols = df.select_dtypes(include=['number']).columns
    df[num_cols] = df[num_cols].astype('float32')

    return df


In [8]:
x_eval_processed = preprocess_transaction(x_eval)
print(x_eval_processed.shape)

  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric fill for missing engineered columns
  df[col] = 0  # numeric

(118108, 555)


In [9]:
import joblib

model = joblib.load(r"C:\Users\ADMIN\ml projects\fraud detection project\models\fraud_model_v3.pkl")

In [10]:
for col in CATEGORICAL_FEATURES:
    if col in x_eval_processed.columns:
        training_categories = CATEGORICAL_MAPPINGS[col]

        x_eval_processed[col] = pd.Categorical(
            x_eval_processed[col].astype(str).replace('nan', 'missing'), 
            categories=training_categories
        )

In [11]:
y_proba = model.predict(x_eval_processed)

y_pred = (y_proba >= 0.30).astype(int)

In [12]:
error_df = x_eval_processed.copy()
error_df['actual'] = y_eval.values
error_df['prediction'] = y_pred
error_df['proba'] = y_proba

# Identify the type of mistake
def get_error_type(row):
    if row['actual'] == 1 and row['prediction'] == 1: return 'TP (Caught)'
    if row['actual'] == 0 and row['prediction'] == 0: return 'TN (Safe)'
    if row['actual'] == 0 and row['prediction'] == 1: return 'FP (Insult)'
    if row['actual'] == 1 and row['prediction'] == 0: return 'FN (Leak)'

error_df['error_category'] = error_df.apply(get_error_type, axis=1)

In [13]:
category_counts = error_df['error_category'].value_counts()
category_pct = error_df['error_category'].value_counts(normalize=True) * 100

print("--- Error Category Distribution ---")
for cat in ['TP (Caught)', 'TN (Safe)', 'FP (Insult)', 'FN (Leak)']:
    count = category_counts.get(cat, 0)
    pct = category_pct.get(cat, 0)
    print(f"{cat}: {count} transactions ({pct:.2f}%)")


--- Error Category Distribution ---
TP (Caught): 2185 transactions (1.85%)
TN (Safe): 112670 transactions (95.40%)
FP (Insult): 1196 transactions (1.01%)
FN (Leak): 2057 transactions (1.74%)


In [14]:
tp = category_counts.get('TP (Caught)', 0)
fp = category_counts.get('FP (Insult)', 0)
fn = category_counts.get('FN (Leak)', 0)

precision = tp / (tp + fp) if (tp + fp) > 0 else 0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print("\n--- Model Performance Metrics ---")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1_score:.4f}")


--- Model Performance Metrics ---
Precision: 0.6463
Recall:    0.5151
F1-Score:  0.5733
