In [1]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer

# WoE library
from xverse.transformer import WOE





In [2]:
# Location of the raw transactions CSV.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot run on another machine until this points at a local copy of the file.
RAW_DATA_PATH = r"C:\Users\ZAK-TECH\Desktop\KAIM_week4\data\raw\data (1).csv"

# Raw transaction-level frame used by every cell below.
df = pd.read_csv(RAW_DATA_PATH)


2.1 Aggregate Features per Customer

In [3]:
class CustomerAggregateFeatures(BaseEstimator, TransformerMixin):
    """Append per-customer transaction aggregates to every row.

    For each value of ``customer_col`` the sum, mean, count and standard
    deviation of ``amount_col`` are computed and joined back onto the rows of
    that customer as ``TotalTransactionAmount``, ``AverageTransactionAmount``,
    ``TransactionCount`` and ``StdTransactionAmount``.

    The transformer is stateless: ``fit`` learns nothing and the aggregates are
    recomputed from whatever frame is passed to ``transform``.

    Parameters
    ----------
    customer_col : str, default "CustomerId"
        Column identifying the customer (the group key).
    amount_col : str, default "Amount"
        Numeric column that is aggregated.
    """

    def __init__(self, customer_col="CustomerId", amount_col="Amount"):
        self.customer_col = customer_col
        self.amount_col = amount_col

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (required by the sklearn API)."""
        return self

    def transform(self, X):
        """Return a new frame: ``X`` plus the four aggregate columns.

        The input frame is not modified. Row order and the original index of
        ``X`` are preserved so the result stays aligned with any target series
        indexed like ``X``.
        """
        agg = (
            X.groupby(self.customer_col)[self.amount_col]
            .agg(
                TotalTransactionAmount="sum",
                AverageTransactionAmount="mean",
                TransactionCount="count",
                # pandas' default (sample) std: NaN for customers with a
                # single transaction.
                StdTransactionAmount="std",
            )
            .reset_index()
        )
        merged = X.merge(agg, on=self.customer_col, how="left")
        # A column-on-column merge discards the left index and returns a fresh
        # RangeIndex. `agg` has one row per customer, so the left join keeps
        # exactly one output row per input row, in input order; restoring the
        # caller's index is therefore safe.
        merged.index = X.index
        return merged


2.2 Date-Time Feature Extraction

In [4]:
class DateTimeFeatureExtractor(BaseEstimator, TransformerMixin):
    """Replace a timestamp column with hour / day / month / year columns.

    Parameters
    ----------
    datetime_col : str, default "TransactionStartTime"
        Column parsed with ``pd.to_datetime``; it is dropped from the output
        once the calendar parts have been extracted.
    """

    def __init__(self, datetime_col="TransactionStartTime"):
        self.datetime_col = datetime_col

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn, returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the four calendar columns added.

        The caller's frame is left untouched; the source timestamp column is
        absent from the returned frame.
        """
        out = X.copy()
        out[self.datetime_col] = pd.to_datetime(out[self.datetime_col])

        # (output column, attribute of the `.dt` accessor), in output order.
        calendar_parts = (
            ("TransactionHour", "hour"),
            ("TransactionDay", "day"),
            ("TransactionMonth", "month"),
            ("TransactionYear", "year"),
        )
        for new_col, part in calendar_parts:
            out[new_col] = getattr(out[self.datetime_col].dt, part)

        return out.drop(columns=[self.datetime_col])


Feature Lists

In [5]:
# Columns routed through the numeric branch (median impute + standardise).
numeric_features = [
    # raw transaction columns
    "Amount",
    "Value",
    # produced by DateTimeFeatureExtractor
    "TransactionHour",
    "TransactionDay",
    "TransactionMonth",
    "TransactionYear",
    # produced by CustomerAggregateFeatures
    "TotalTransactionAmount",
    "AverageTransactionAmount",
    "TransactionCount",
    "StdTransactionAmount",
]

# Columns routed through the categorical branch (mode impute + one-hot).
categorical_features = [
    "CurrencyCode",
    "CountryCode",
    "ProviderId",
    "ProductId",
    "ProductCategory",
    "ChannelId",
    "PricingStrategy",
]


Numeric & Categorical Pipelines

In [6]:
# Numeric branch: fill gaps with the column median, then standardise
# (zero mean, unit variance).
numeric_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent level, then one-hot
# encode into a dense array. Levels unseen during fit encode as all zeros
# instead of raising.
categorical_pipeline = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
])


Column Transformer

In [7]:
# Route each feature list through its own sub-pipeline. Columns named in
# neither list are dropped from the output (`remainder="drop"` is the
# ColumnTransformer default, spelled out here for the reader).
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_pipeline, numeric_features),
        ("cat", categorical_pipeline, categorical_features),
    ],
    remainder="drop",
)


Full End-to-End Pipeline

In [8]:
# Order matters: the two feature builders must run first because the
# preprocessor selects columns that only exist after they have run.
pipeline_steps = [
    ("aggregate_features", CustomerAggregateFeatures()),
    ("datetime_features", DateTimeFeatureExtractor()),
    ("preprocessor", preprocessor),
]
full_pipeline = Pipeline(steps=pipeline_steps)


Apply the Pipeline to the Dataset

In [9]:
# Split the raw frame into the label and everything else.
target_col = "FraudResult"
y = df[target_col]
X = df.drop(columns=[target_col])

# Fit every pipeline step on X and return the encoded feature matrix.
# The target is not passed: none of the steps above uses it.
X_transformed = full_pipeline.fit_transform(X)


Weight of Evidence (WoE) and Information Value (IV)

In [10]:
from feature_engine.encoding import WoEEncoder

In [None]:

# DROP ID COLUMNS
# NOTE(review): ProviderId and ProductId are listed in `categorical_features`
# for the sklearn pipeline but are dropped here as identifiers -- confirm that
# the two analyses are meant to differ.

id_cols = [
    "TransactionId", "BatchId", "AccountId",
    "SubscriptionId", "CustomerId",
    "ProviderId", "ProductId"
]

df_model = df.drop(columns=id_cols)


# HANDLE TIME FEATURE

df_model["TransactionStartTime"] = pd.to_datetime(
    df_model["TransactionStartTime"]
)

df_model["TxnHour"] = df_model["TransactionStartTime"].dt.hour
# pandas convention: Monday=0 ... Sunday=6, so 5 and 6 are Saturday/Sunday.
df_model["TxnDayOfWeek"] = df_model["TransactionStartTime"].dt.dayofweek
df_model["IsWeekend"] = df_model["TxnDayOfWeek"].isin([5, 6]).astype(int)

# Right-closed bins: 0-5 Night, 6-11 Morning, 12-17 Afternoon, 18-23 Evening.
# The first edge is -1 so that hour 0 falls inside the first bin.
df_model["TxnHour_bin"] = pd.cut(
    df_model["TxnHour"],
    bins=[-1, 5, 11, 17, 23],
    labels=["Night", "Morning", "Afternoon", "Evening"]
)


# BIN NUMERIC FEATURES (SAFE)
# q=2 splits each column at its median; duplicates="drop" collapses identical
# bin edges instead of raising when many rows share the same value.

df_model["Amount_bin"] = pd.qcut(
    df_model["Amount"], q=2, duplicates="drop"
)
df_model["Value_bin"] = pd.qcut(
    df_model["Value"], q=2, duplicates="drop"
)


# CLEAN CATEGORICAL FEATURES

cat_cols = [
    "CurrencyCode",
    "CountryCode",
    "ProductCategory",
    "ChannelId",
    "PricingStrategy"
]

df_model[cat_cols] = df_model[cat_cols].fillna("Missing")

# Merge rare categories: every level that covers fewer than 3% of the rows is
# replaced by the single level "Other".
RARE_CATEGORY_SHARE = 0.03
threshold = int(RARE_CATEGORY_SHARE * len(df_model))
for col in cat_cols:
    freq = df_model[col].value_counts()
    rare = freq[freq < threshold].index
    df_model[col] = df_model[col].replace(rare, "Other")


# FINAL FEATURES

features = [
    "Amount_bin",
    "Value_bin",
    "TxnHour_bin",
    "TxnDayOfWeek",
    "IsWeekend",
    "CurrencyCode",
    "CountryCode",
    "ProductCategory",
    "ChannelId",
    "PricingStrategy"
]

# Take an explicit copy: assigning a column into a bare `df_model[features]`
# selection triggers pandas' SettingWithCopyWarning on pre-3.0 pandas.
X = df_model[features].copy()
y = df_model["FraudResult"]

# Treat numeric features that are categorical
X["TxnDayOfWeek"] = X["TxnDayOfWeek"].astype(str)
# NOTE(review): IsWeekend, and any of `cat_cols` that are stored as numbers in
# the CSV, stay numeric here. `manual_woe` only encodes object/category
# columns, so such columns get an IV but no WoE column -- confirm whether they
# should be cast to str as well.


# MANUAL WoE CALCULATION (VERSION-SAFE)

def manual_woe(X, y, epsilon=1e-6):
    """Encode every object/category column of ``X`` as Weight of Evidence.

    For each level of a column, with "bad" meaning ``y == 1`` and "good"
    meaning ``y == 0``::

        WoE = ln( share of all bads in the level / share of all goods in the level )

    Both shares are smoothed with ``epsilon`` so that a level containing only
    goods or only bads does not produce ``log(0)`` or a division by zero.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame. Only columns of dtype ``object`` or ``category`` are
        encoded; all other columns are skipped and absent from the result.
    y : array-like aligned with ``X``
        Binary target (1 = bad, 0 = good).
    epsilon : float, default 1e-6
        Additive smoothing applied to every level's good and bad count.

    Returns
    -------
    pandas.DataFrame
        Float WoE values, indexed like ``X``, one column per encoded feature.
        Rows whose feature value is missing get NaN.
    """
    X_woe = pd.DataFrame(index=X.index)
    for col in X.select_dtypes(include=["object", "category"]).columns:
        df_tmp = pd.DataFrame({"feature": X[col], "target": y})
        levels = df_tmp["feature"]

        # Vectorised per-level counts. observed=True restricts categorical
        # columns to levels that actually occur, so empty categories neither
        # receive a WoE value nor inflate the smoothing term (and pandas'
        # FutureWarning about the implicit `observed` default is avoided).
        good = df_tmp["target"].eq(0).groupby(levels, observed=True).sum()
        bad = df_tmp["target"].eq(1).groupby(levels, observed=True).sum()

        good_dist = (good + epsilon) / (good.sum() + epsilon * len(good))
        bad_dist = (bad + epsilon) / (bad.sum() + epsilon * len(bad))

        woe_map = np.log(bad_dist / good_dist)
        # Mapping a categorical column can hand back another categorical
        # (whose categories are the WoE numbers); force plain floats so the
        # result is usable as a numeric model input.
        X_woe[col] = X[col].map(woe_map).astype(float)

    return X_woe

# WoE-encoded copy of the categorical features (default smoothing).
X_woe = manual_woe(X=X, y=y)


# IV CALCULATION (SAFE)

def calculate_iv(X, y, epsilon=1e-6):
    """Compute the Information Value (IV) of every column of ``X``.

    Each distinct value of a column is treated as one level. With "bad"
    meaning ``y == 1`` and "good" meaning ``y == 0``::

        IV = sum over levels of (bad_share - good_share) * ln(bad_share / good_share)

    where the shares are the level's fraction of all bads / all goods,
    smoothed with ``epsilon`` exactly as in ``manual_woe``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature frame; every column is evaluated regardless of dtype.
    y : array-like aligned with ``X``
        Binary target (1 = bad, 0 = good).
    epsilon : float, default 1e-6
        Additive smoothing applied to every level's good and bad count.

    Returns
    -------
    pandas.DataFrame
        Columns ``variable`` and ``IV``, sorted by IV descending. The index
        holds each variable's position in ``X.columns``.
    """
    records = []
    for col in X.columns:
        df_tmp = pd.DataFrame({"feature": X[col], "target": y})
        levels = df_tmp["feature"]

        # Vectorised per-level counts. observed=True keeps unused categories
        # of a categorical column out of the sum and avoids pandas'
        # FutureWarning about the implicit `observed` default.
        good = df_tmp["target"].eq(0).groupby(levels, observed=True).sum()
        bad = df_tmp["target"].eq(1).groupby(levels, observed=True).sum()

        good_dist = (good + epsilon) / (good.sum() + epsilon * len(good))
        bad_dist = (bad + epsilon) / (bad.sum() + epsilon * len(bad))

        woe_vals = np.log(bad_dist / good_dist)
        iv = ((bad_dist - good_dist) * woe_vals).sum()

        records.append((col, iv))

    return pd.DataFrame(
        records,
        columns=["variable", "IV"]
    ).sort_values("IV", ascending=False)

# IV of every candidate feature, strongest first.
iv_df = calculate_iv(X, y)


# FILTER LOW-IV FEATURES

# Features whose IV falls below this cut-off are left out of the report.
iv_threshold = 0.02
iv_df_filtered = iv_df[iv_df["IV"] >= iv_threshold]

# The header is built from `iv_threshold` so it cannot drift from the filter.
print(f"\nInformation Value (IV) (Filtered, IV >= {iv_threshold}):")
print(iv_df_filtered)




  grouped = df_tmp.groupby("feature")["target"]
  grouped = df_tmp.groupby("feature")["target"]
  grouped = df_tmp.groupby("feature")["target"]
  grouped = df_tmp.groupby("feature")["target"]



Information Value (IV) (Filtered, IV >= 0.02):
          variable        IV
1        Value_bin  2.920896
0       Amount_bin  2.516343
8        ChannelId  1.154068
7  ProductCategory  0.821602
9  PricingStrategy  0.546647
3     TxnDayOfWeek  0.140917
2      TxnHour_bin  0.073065
