In [2]:
!pip install pandas scikit-learn matplotlib seaborn




In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# -----------------------------
# LOAD DATA
# -----------------------------
# Both CSV files are read from the notebook's working directory.
train_df = pd.read_csv("fraudTrain.csv")
test_df = pd.read_csv("fraudTest.csv")

# -----------------------------
# BASIC CLEANUP
# -----------------------------
# Columns removed from both frames before modelling
# ("Unnamed: 0" is presumably a saved row index -- confirm against the CSVs).
DROP_COLS = ["Unnamed: 0", "trans_num"]

# errors="ignore" skips any listed column that a frame does not contain,
# so the same call is safe for both train and test.
train_df = train_df.drop(columns=DROP_COLS, errors="ignore")
test_df = test_df.drop(columns=DROP_COLS, errors="ignore")

# -----------------------------
# TARGET / FEATURES
# -----------------------------
TARGET = "is_fraud"
assert TARGET in train_df.columns, "Target column missing"

# Rows without a label cannot be used for supervised training, so discard them.
train_df = train_df.dropna(subset=[TARGET])

# Separate the label vector from the feature matrix (everything except the label).
y = train_df[TARGET]
X = train_df.drop(columns=[TARGET])

# -----------------------------
# COLUMN TYPES
# -----------------------------
# Object-dtype columns are treated as categorical; int64/float64 columns as numeric.
# Columns of any other dtype end up in neither list.
# NOTE(review): every object column gets one-hot encoded by the preprocessor --
# confirm none of them are high-cardinality identifiers or free text.
categorical_cols = X.select_dtypes(include="object").columns.to_list()
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.to_list()

# -----------------------------
# PREPROCESSOR (SPARSE!)
# -----------------------------
# Encoder for the categorical columns:
#   * handle_unknown="ignore" -- categories not seen during fit do not raise at transform time
#   * sparse_output=True      -- emit a sparse matrix (the keyword is `sparse_output`,
#                                which replaced the older `sparse`)
#   * dtype=np.float32        -- smaller than the default float64, to save memory
categorical_encoder = OneHotEncoder(
    dtype=np.float32,
    sparse_output=True,
    handle_unknown="ignore",
)

# Numeric columns are standardised; categorical columns are one-hot encoded.
preprocessor = ColumnTransformer(
    [
        ("num", StandardScaler(), numerical_cols),
        ("cat", categorical_encoder, categorical_cols),
    ]
)

# -----------------------------
# MODEL (SPARSE-SAFE)
# -----------------------------
# Logistic regression fitted on the preprocessed (potentially sparse) feature matrix.
# n_jobs is intentionally not set: it only parallelises across classes in
# one-vs-rest multiclass fits, and this target has two classes.
model = LogisticRegression(
    max_iter=1000,            # upper bound on solver iterations
    class_weight="balanced",  # weight classes inversely to their frequency (fraud is the rare class)
    solver="saga",            # stochastic solver that accepts sparse input and scales to many rows
    random_state=42,          # saga shuffles the data; fix the seed (same value as the split) so fits are reproducible
)

# -----------------------------
# PIPELINE
# -----------------------------
# Chain preprocessing and the classifier so they are fitted together and the
# same transformation is applied automatically at prediction time.
pipeline = Pipeline([("preprocess", preprocessor), ("model", model)])

# -----------------------------
# TRAIN / VALIDATION SPLIT
# -----------------------------
# Hold out 20% of the labelled rows for validation. Stratifying on y keeps the
# class proportions approximately equal in both parts; the fixed seed makes the
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# -----------------------------
# TRAIN
# -----------------------------
# Fits the preprocessor and the classifier on the training part only.
pipeline.fit(X_train, y_train)

# -----------------------------
# EVALUATION
# -----------------------------
# Class predictions for the held-out validation rows.
y_pred = pipeline.predict(X_val)

# Per-class precision / recall / F1, printed to four decimals.
val_report = classification_report(y_val, y_pred, digits=4)
print("\nCLASSIFICATION REPORT\n", val_report, sep="\n")

# Rows are true classes, columns are predicted classes.
val_confusion = confusion_matrix(y_val, y_pred)
print("\nCONFUSION MATRIX\n", val_confusion, sep="\n")

# -----------------------------
# TEST SET PREDICTIONS
# -----------------------------
# Score only rows whose model inputs are complete. Restricting the NaN check to
# the training feature columns avoids discarding rows merely because a column
# the model never reads (e.g. the label) is missing.
feature_cols = X.columns.tolist()
test_df = test_df.dropna(subset=feature_cols)

# Hand the pipeline exactly the columns it was fitted on, so the label column
# is never passed to the model at prediction time.
test_preds = pipeline.predict(test_df[feature_cols])

# assign() returns a new frame rather than mutating the loaded one in place.
test_df = test_df.assign(predicted_fraud=test_preds)
test_df.to_csv("fraud_predictions.csv", index=False)

print("\nSaved: fraud_predictions.csv")




CLASSIFICATION REPORT

              precision    recall  f1-score   support

         0.0     0.9986    0.9812    0.9898     14644
         1.0     0.3056    0.8582    0.4507       141

    accuracy                         0.9800     14785
   macro avg     0.6521    0.9197    0.7202     14785
weighted avg     0.9920    0.9800    0.9847     14785


CONFUSION MATRIX

[[14369   275]
 [   20   121]]

Saved: fraud_predictions.csv
