In [1]:
from google.colab import drive
import pandas as pd
import os

# 1. Mount your Google Drive
# When you run this, click the link that pops up and "Allow" access.
drive.mount('/content/drive')

# 2. Point to your specific folder
# Based on your screenshot: My Drive -> midterm -> Fraud Transaction
folder_path = '/content/drive/MyDrive/midterm/Fraud Transaction'

# 3. Load the data using Pandas
# We use Pandas instead of Polars because it works better with the Machine Learning code I gave you.
print("Loading train_transaction.csv... (This might take 1-2 minutes)")
df_train = pd.read_csv(os.path.join(folder_path, 'train_transaction.csv'))

print("Loading test_transaction.csv...")
df_test = pd.read_csv(os.path.join(folder_path, 'test_transaction.csv'))

print("SUCCESS! Data loaded.")
print(f"Train shape: {df_train.shape}")

Mounted at /content/drive
Loading train_transaction.csv... (This might take 1-2 minutes)
Loading test_transaction.csv...
SUCCESS! Data loaded.
Train shape: (590540, 394)


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

print("Starting Preprocessing...")

# 1. Memory Saving: Drop columns with > 70% missing values
missing_percent = df_train.isnull().mean()
drop_cols = missing_percent[missing_percent > 0.7].index
df_train = df_train.drop(columns=drop_cols)
df_test = df_test.drop(columns=drop_cols)

# 2. Fill Missing Values
for col in df_train.columns:
    if df_train[col].dtype == 'object':
        df_train[col] = df_train[col].fillna("Unknown")
        if col in df_test.columns:
            df_test[col] = df_test[col].fillna("Unknown")
    else:
        df_train[col] = df_train[col].fillna(0)
        if col in df_test.columns:
            df_test[col] = df_test[col].fillna(0)

# 3. Encode Text to Numbers
cat_cols = df_train.select_dtypes(include=['object']).columns
for col in cat_cols:
    le = LabelEncoder()
    # Combine to

Starting Preprocessing...


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
import pandas as pd
import numpy as np

print("Starting Preprocessing... (Please wait)")

# 1. Memory Saving: Drop columns with > 70% missing values
# We use the dataframes df_train and df_test loaded in Step 1
missing_percent = df_train.isnull().mean()
drop_cols = missing_percent[missing_percent > 0.7].index
df_train = df_train.drop(columns=drop_cols)
df_test = df_test.drop(columns=drop_cols)

# 2. Fill Missing Values
for col in df_train.columns:
    # Fill text columns with "Unknown"
    if df_train[col].dtype == 'object':
        df_train[col] = df_train[col].fillna("Unknown")
        if col in df_test.columns:
            df_test[col] = df_test[col].fillna("Unknown")
    # Fill number columns with 0
    else:
        df_train[col] = df_train[col].fillna(0)
        if col in df_test.columns:
            df_test[col] = df_test[col].fillna(0)

# 3. Encode Text to Numbers
cat_cols = df_train.select_dtypes(include=['object']).columns
for col in cat_cols:
    le = LabelEncoder()
    # Combine to ensure all categories are known
    combined = pd.concat([df_train[col], df_test[col]], axis=0).astype(str)
    le.fit(combined)
    df_train[col] = le.transform(df_train[col].astype(str))
    df_test[col] = le.transform(df_test[col].astype(str))

# 4. Define Features (X) and Target (y)
X = df_train.drop(columns=['isFraud', 'TransactionID', 'TransactionDT'])
y = df_train['isFraud']

# Prepare Test Data for final submission
test_ids = df_test['TransactionID']
X_final_test = df_test.drop(columns=['TransactionID', 'TransactionDT'])
# Ensure columns match exactly
X_final_test = X_final_test[X.columns]

# 5. Scale Data (Required for Deep Learning)
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_final_test = scaler.transform(X_final_test)

# 6. Split Train/Validation
# THIS is where X_train is created!
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

print("Preprocessing Complete. NOW you can run Step 3.")

Starting Preprocessing... (Please wait)
Preprocessing Complete. NOW you can run Step 3.


In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score

# --- Model 1: Traditional ML (Logistic Regression) ---
# class_weight='balanced' fixes the fraud imbalance issue
print("Training Logistic Regression...")
model_lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
model_lr.fit(X_train, y_train)

# Evaluate LR
prob_lr = model_lr.predict_proba(X_val)[:, 1]
print(f"Logistic Regression AUC Score: {roc_auc_score(y_val, prob_lr):.4f}")

# --- Model 2: Deep Learning (Neural Network) ---
print("Training Neural Network (MLP)...")
# Input -> 64 neurons -> 32 neurons -> Output
model_dl = MLPClassifier(hidden_layer_sizes=(64, 32), activation='relu', max_iter=200, random_state=42)
model_dl.fit(X_train, y_train)

# Evaluate DL
prob_dl = model_dl.predict_proba(X_val)[:, 1]
print(f"Neural Network AUC Score: {roc_auc_score(y_val, prob_dl):.4f}")

Training Logistic Regression...
Logistic Regression AUC Score: 0.8373
Training Neural Network (MLP)...
Neural Network AUC Score: 0.9110


In [8]:
print("Generating submission.csv...")

# We use the Logistic Regression model for the final file
# (It handles the imbalance explicitly and is safer for this dataset)
final_probs = model_lr.predict_proba(X_final_test)[:, 1]

submission = pd.DataFrame({
    'TransactionID': test_ids,
    'isFraud': final_probs
})

submission.to_csv('submission.csv', index=False)
print("DONE! 'submission.csv' has been created.")
print(submission.head())

Generating submission.csv...
DONE! 'submission.csv' has been created.
   TransactionID   isFraud
0        3663549  0.124887
1        3663550  0.357019
2        3663551  0.459140
3        3663552  0.091072
4        3663553  0.119322
