In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, roc_auc_score

# --- 1. CONFIGURATION (LOCAL PATH) ---
# We use r"..." to handle Windows backslashes correctly
dataset_path = r"C:\Tugas\tubes\ML\finalterm\Transaction\Fraud Transaction"

print(f"ðŸ“‚ Loading data from: {dataset_path}")

# --- 2. LOAD DATA ---
# This might take 10-20 seconds because the files are large
train_df = pd.read_csv(os.path.join(dataset_path, 'train_transaction.csv'))
test_df = pd.read_csv(os.path.join(dataset_path, 'test_transaction.csv'))

print(f"âœ… Data Loaded!")
print(f"Train Shape: {train_df.shape}")
print(f"Test Shape: {test_df.shape}")

# Separate Target and IDs
target = 'isFraud'
train_ids = train_df['TransactionID']
test_ids = test_df['TransactionID']

# Drop TransactionID (Index only)
train_df = train_df.drop('TransactionID', axis=1)
test_df = test_df.drop('TransactionID', axis=1)

# --- 3. PREPROCESSING (Handle Missing & Encode) ---
print("\n--- Starting Preprocessing ---")

# A. Drop columns with >70% missing values
missing_percent = train_df.isnull().sum() / len(train_df)
cols_to_drop = missing_percent[missing_percent > 0.7].index
print(f"Dropping {len(cols_to_drop)} columns with >70% missing data...")

train_df = train_df.drop(cols_to_drop, axis=1)
test_df = test_df.drop(cols_to_drop, axis=1)

# B. Fill remaining missing values
# Numbers -> 0, Text -> "Unknown"
print("Filling missing values...")
for col in train_df.columns:
    if col == target: continue
    
    if train_df[col].dtype == 'object':
        train_df[col] = train_df[col].fillna("Unknown")
        test_df[col] = test_df[col].fillna("Unknown")
    else:
        train_df[col] = train_df[col].fillna(0)
        test_df[col] = test_df[col].fillna(0)

# C. Encode Text to Numbers
print("Encoding categorical features...")
cat_cols = train_df.select_dtypes(include=['object']).columns

for col in cat_cols:
    le = LabelEncoder()
    # Combine to fit ensuring all categories are known
    combined = pd.concat([train_df[col], test_df[col]], axis=0).astype(str)
    le.fit(combined)
    train_df[col] = le.transform(train_df[col].astype(str))
    test_df[col] = le.transform(test_df[col].astype(str))

# --- 4. PREPARE FOR TRAINING ---
print("\n--- Splitting & Scaling Data ---")
X = train_df.drop(target, axis=1)
y = train_df[target]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_df_scaled = scaler.transform(test_df)

# Split 80/20
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)

# --- 5. MODEL 1: LOGISTIC REGRESSION ---
print("\n--- Training Model 1: Logistic Regression ---")
lr = LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)
lr.fit(X_train, y_train)

# Evaluate
y_prob_lr = lr.predict_proba(X_val)[:, 1]
print(f"Logistic Regression ROC-AUC: {roc_auc_score(y_val, y_prob_lr):.4f}")

# --- 6. MODEL 2: NEURAL NETWORK (MLP) ---
print("\n--- Training Model 2: Neural Network ---")
# Using a simple architecture for speed
mlp = MLPClassifier(hidden_layer_sizes=(64, 32), 
                    activation='relu', 
                    max_iter=50, 
                    early_stopping=True,
                    random_state=42)
mlp.fit(X_train, y_train)

# Evaluate
y_prob_mlp = mlp.predict_proba(X_val)[:, 1]
print(f"Neural Network ROC-AUC: {roc_auc_score(y_val, y_prob_mlp):.4f}")

# --- 7. SAVE SUBMISSION ---
print("\n--- Generating Submission File ---")
final_preds = mlp.predict_proba(test_df_scaled)[:, 1]

submission = pd.DataFrame({
    'TransactionID': test_ids,
    'isFraud': final_preds
})

save_path = os.path.join(dataset_path, 'submission.csv')
submission.to_csv(save_path, index=False)
print(f"âœ… Done! Submission saved to:\n{save_path}")

ðŸ“‚ Loading data from: C:\Tugas\tubes\ML\finalterm\Transaction\Fraud Transaction
âœ… Data Loaded!
Train Shape: (590540, 394)
Test Shape: (506691, 393)

--- Starting Preprocessing ---
Dropping 168 columns with >70% missing data...
Filling missing values...
Encoding categorical features...

--- Splitting & Scaling Data ---

--- Training Model 1: Logistic Regression ---
Logistic Regression ROC-AUC: 0.8398

--- Training Model 2: Neural Network ---
Neural Network ROC-AUC: 0.9050

--- Generating Submission File ---
âœ… Done! Submission saved to:
C:\Tugas\tubes\ML\finalterm\Transaction\Fraud Transaction\submission.csv
