In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
# Location of the raw IEEE-CIS CSV files (relative to the working directory).
DATA_PATH = "data/raw/ieee-cis/"

# Load the transaction table and the identity table.
tx = pd.read_csv(DATA_PATH + "train_transaction.csv")
idn = pd.read_csv(DATA_PATH + "train_identity.csv")

# Left join keeps every transaction, with or without identity information.
# validate="many_to_one" makes pandas raise a MergeError if TransactionID is
# duplicated in the identity table, which would otherwise silently multiply
# transaction rows (and their labels).
df = tx.merge(idn, on="TransactionID", how="left", validate="many_to_one")

print(df.shape)

(590540, 434)


In [3]:
# Name of the label column.
TARGET = "isFraud"

# Identifier / leakage-prone columns that must not be used as features.
id_cols = ["TransactionID", "TransactionDT"]

# Label vector, copied so it is independent of `df`.
y = df[TARGET].copy()

# Feature matrix: everything except the label and the identifier columns.
X = df.drop(columns=[TARGET, *id_cols])

print(X.shape, y.shape)

(590540, 431) (590540,)


In [4]:
# Partition the feature columns by dtype: numeric columns on one side,
# every other column on the other.
numeric_cols = list(X.select_dtypes(include="number").columns)
categorical_cols = list(X.select_dtypes(exclude="number").columns)

print(f"Numeric features: {len(numeric_cols)}")
print(f"Categorical features: {len(categorical_cols)}")

Numeric features: 400
Categorical features: 31


In [5]:
# Categorical features: missing values become their own explicit "missing" level.
X_cat = X.loc[:, categorical_cols].fillna("missing")

# Numeric features: fill each column's missing values with that column's median.
X_num = X.loc[:, numeric_cols]
X_num = X_num.fillna(X_num.median())

In [6]:
from sklearn.preprocessing import OrdinalEncoder

# Integer-encode the categorical features. Categories not seen during fit are
# mapped to -1 when the fitted encoder is later used to transform new data.
if len(categorical_cols) > 0:
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    # fit_transform returns a bare ndarray, so the row labels must be restored
    # explicitly. Without index=X_cat.index the frame gets a fresh RangeIndex,
    # and the index-aligned concat with X_num would misalign rows whenever X
    # does not carry a default 0..n-1 index.
    X_cat_encoded = pd.DataFrame(
        encoder.fit_transform(X_cat),
        columns=categorical_cols,
        index=X_cat.index,
    )
else:
    # No categorical features: empty frame that still shares the row index.
    X_cat_encoded = pd.DataFrame(index=X_num.index)

In [7]:
# Stitch the imputed numeric block and the encoded categorical block together
# side by side (rows are aligned on the index).
feature_blocks = [X_num, X_cat_encoded]
X_processed = pd.concat(feature_blocks, axis="columns")

print(X_processed.shape)

(590540, 431)


In [8]:
# Standardise every feature column using statistics fitted on X_processed.
scaler = StandardScaler().fit(X_processed)

# The scaler returns a plain array; wrap it back into a DataFrame carrying the
# original column names and row index.
X_scaled = pd.DataFrame(
    scaler.transform(X_processed),
    columns=X_processed.columns,
    index=X_processed.index,
)

# Quick check: show the mean and std of the first few scaled columns.
X_scaled.describe().loc[["mean", "std"]].T.head()

Unnamed: 0,mean,std
TransactionAmt,5.914973000000001e-17,1.000001
card1,1.379238e-16,1.000001
card2,3.7864970000000005e-17,1.000001
card3,-8.991336e-16,1.000001
card5,2.326644e-16,1.000001


In [9]:
# Invariants of the processed data: no missing values remain, and the feature
# matrix has exactly one row per label.
assert X_scaled.notna().to_numpy().all(), "NaNs still present!"
assert len(X_scaled) == len(y), "Row mismatch!"

print("Preprocessing complete and consistent.")

Preprocessing complete and consistent.


In [11]:
# Optional: save processed data as Parquet
from pathlib import Path

# Make sure the output directory exists; writing into a missing directory
# fails on a fresh checkout.
Path("data/processed").mkdir(parents=True, exist_ok=True)

X_scaled.to_parquet("data/processed/X_train.parquet")
y.to_frame("isFraud").to_parquet("data/processed/y_train.parquet")

In [12]:
# Optional: save processed data as CSV (the row index is written as the first column)
from pathlib import Path

# Make sure the output directory exists; writing into a missing directory
# fails on a fresh checkout.
Path("data/processed").mkdir(parents=True, exist_ok=True)

X_scaled.to_csv("data/processed/X_train_1.csv")
y.to_frame("isFraud").to_csv("data/processed/y_train_1.csv")