In [26]:
# Memory-Efficient Fraud Detection Pipeline
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [28]:
# -----------------------------
# 1. Load Data
# -----------------------------
# Both CSV paths are relative, so they are resolved against the kernel's
# current working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("fraudTrain.csv", "fraudTest.csv")
)

In [29]:
# -----------------------------
# 2. Drop Unnecessary Columns
# -----------------------------
# Columns that are excluded from the feature matrices.
drop_cols = ['Unnamed: 0', 'cc_num', 'trans_num', 'first', 'last', 'street', 'dob']

# The target column is removed from the features as well, so it is appended
# to the exclusion list once and reused for both splits.
non_feature_cols = drop_cols + ['is_fraud']

# Targets
y_train = train_data['is_fraud']
y_test = test_data['is_fraud']

# Features (DataFrame.drop returns a new frame; the loaded data is untouched)
X_train = train_data.drop(columns=non_feature_cols)
X_test = test_data.drop(columns=non_feature_cols)

In [32]:
# -----------------------------
# 3. Feature Engineering: datetime
# -----------------------------
def add_datetime_features(frame: pd.DataFrame,
                          source_col: str = 'trans_date_trans_time') -> pd.DataFrame:
    """Replace a timestamp column with ``hour``, ``day`` and ``weekday`` columns.

    The input frame is never modified; a new frame is returned with
    ``source_col`` removed and the three derived columns appended at the end
    (in that order).

    Parameters
    ----------
    frame : pd.DataFrame
        Feature frame that contains ``source_col``, or one that has already
        been processed by this function.
    source_col : str
        Name of the column parsed with ``pd.to_datetime``.

    Returns
    -------
    pd.DataFrame
        Frame without ``source_col`` and with ``hour``, ``day``, ``weekday``.
        If ``source_col`` is absent but all derived columns are present, the
        frame is returned unchanged.

    Raises
    ------
    KeyError
        If ``source_col`` is absent and the derived columns are missing too.
    """
    derived_cols = ['hour', 'day', 'weekday']

    if source_col not in frame.columns:
        # Re-running this cell on a live kernel used to raise KeyError because
        # the source column had already been dropped. Treat an already
        # processed frame as a no-op, but still fail loudly on a frame that
        # has neither the source nor the derived columns.
        if all(col in frame.columns for col in derived_cols):
            return frame
        raise KeyError(
            f"'{source_col}' not found and derived columns {derived_cols} are missing"
        )

    timestamps = pd.to_datetime(frame[source_col])
    return frame.drop(columns=[source_col]).assign(
        hour=timestamps.dt.hour,
        day=timestamps.dt.day,
        weekday=timestamps.dt.weekday,
    )


X_train = add_datetime_features(X_train)
X_test = add_datetime_features(X_test)

In [34]:
# -----------------------------
# 4. Encode Categorical Variables
# -----------------------------
low_card_cols = ['category', 'gender', 'state']   # few unique values
high_card_cols = ['merchant', 'city', 'job']       # many unique values

# High-cardinality columns -> integer codes.
# The encoder learns its categories from the training split only; values that
# appear only in the test split are mapped to -1.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoder.fit(X_train[high_card_cols])
X_train[high_card_cols] = encoder.transform(X_train[high_card_cols])
X_test[high_card_cols] = encoder.transform(X_test[high_card_cols])

# Low-cardinality columns -> one-hot indicator columns.
X_train = pd.get_dummies(X_train, columns=low_card_cols)

# The test frame is aligned to the training layout: columns missing from the
# test split are added and filled with 0, and columns absent from the
# training frame are discarded.
X_test = (
    pd.get_dummies(X_test, columns=low_card_cols)
    .reindex(columns=X_train.columns, fill_value=0)
)

In [36]:
# -----------------------------
# 5. Train Models
# -----------------------------
# Candidate classifiers, all using balanced class weights.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'Decision Tree': DecisionTreeClassifier(max_depth=10, class_weight='balanced', random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
}

# Display name of each reported metric -> key inside the classification
# report entry for class '1'.
fraud_metric_keys = {
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-score': 'f1-score',
}

# Per-model metrics, keyed by the model's display name.
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)

    # Hard predictions feed the classification report; the probability of
    # class 1 (second predict_proba column) feeds ROC AUC.
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    report = classification_report(y_test, y_pred, output_dict=True)
    auc = roc_auc_score(y_test, y_prob)

    fraud_scores = report['1']
    model_summary = {
        label: fraud_scores[key] for label, key in fraud_metric_keys.items()
    }
    model_summary['ROC AUC'] = auc
    results[name] = model_summary



Training Logistic Regression...

Training Decision Tree...

Training Random Forest...


In [38]:
# -----------------------------
# 6. Display Results
# -----------------------------
# `results` maps model name -> {metric: value}; transposing puts one model on
# each row and one metric in each column.
results_df = pd.DataFrame(results).transpose()

# A single print call emits the heading, a blank line, and then the table.
print("\nModel Comparison (Fraud Class Metrics):\n", results_df, sep="\n")


Model Comparison (Fraud Class Metrics):

                     Precision    Recall  F1-score   ROC AUC
Logistic Regression   0.062834  0.751981  0.115976  0.830892
Decision Tree         0.087368  0.950583  0.160028  0.970540
Random Forest         0.982199  0.437296  0.605161  0.970332
