In [1]:
%%writefile feature_engineering.py
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer


class AggregateFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Creates aggregate features per customer."""

    def __init__(self, customer_id_col="CustomerId", amount_col="Amount"):
        self.customer_id_col = customer_id_col
        self.amount_col = amount_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        agg = (
            X.groupby(self.customer_id_col)[self.amount_col]
            .agg(
                total_amount="sum",
                avg_amount="mean",
                txn_count="count",
                std_amount="std",
            )
            .fillna(0.0)
            .reset_index()
        )
        X = X.merge(agg, on=self.customer_id_col, how="left")
        return X


class TemporalFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Extracts hour, day, month, year from a datetime column."""

    def __init__(self, datetime_col="TransactionStartTime"):
        self.datetime_col = datetime_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        X[self.datetime_col] = pd.to_datetime(X[self.datetime_col], errors="coerce")
        X["transaction_hour"] = X[self.datetime_col].dt.hour
        X["transaction_day"] = X[self.datetime_col].dt.day
        X["transaction_month"] = X[self.datetime_col].dt.month
        X["transaction_year"] = X[self.datetime_col].dt.year
        return X


class WoETransformer(BaseEstimator, TransformerMixin):
    """Minimal WoE transformer for categorical variables."""

    def __init__(self, cat_cols=None, target_col="is_high_risk"):
        self.cat_cols = cat_cols or []
        self.target_col = target_col
        self.woe_maps_ = {}

    def fit(self, X, y=None):
        if y is None and self.target_col not in X.columns:
            raise ValueError("WoETransformer requires y or target_col in X")

        if y is not None:
            target = pd.Series(y, name=self.target_col)
            df = pd.concat(
                [X[self.cat_cols].reset_index(drop=True),
                 target.reset_index(drop=True)],
                axis=1
            )
        else:
            df = X[self.cat_cols + [self.target_col]]

        for col in self.cat_cols:
            temp = df.groupby(col)[self.target_col].agg(["sum", "count"])
            temp["good"] = temp["count"] - temp["sum"]
            temp["good"] = temp["good"].replace(0, 0.5)
            temp["sum"] = temp["sum"].replace(0, 0.5)
            temp["dist_good"] = temp["good"] / temp["good"].sum()
            temp["dist_bad"] = temp["sum"] / temp["sum"].sum()
            temp["woe"] = np.log(temp["dist_good"] / temp["dist_bad"])
            self.woe_maps_[col] = temp["woe"].to_dict()
        return self

    def transform(self, X):
        X = X.copy()
        for col, mapping in self.woe_maps_.items():
            X[col + "_woe"] = X[col].map(mapping).fillna(0.0)
        return X


def build_preprocessing_pipeline(
    numeric_cols,
    categorical_cols,
    woe_categorical_cols=None,
    customer_id_col="CustomerId",
    amount_col="Amount",
    datetime_col="TransactionStartTime",
    use_woe=False,
):
    """
    1) Aggregate features
    2) Temporal features
    3) Encoding
    4) Imputation
    5) Scaling
    6) Optional WoE
    """
    base_steps = [
        ("agg_features", AggregateFeaturesTransformer(customer_id_col, amount_col)),
        ("temporal_features", TemporalFeaturesTransformer(datetime_col)),
    ]
    if use_woe and woe_categorical_cols:
        base_steps.append(("woe_transformer", WoETransformer(cat_cols=woe_categorical_cols)))
    base_pipeline = Pipeline(steps=base_steps)

    numeric_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])

    categorical_pipeline = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_pipeline, numeric_cols),
            ("cat", categorical_pipeline, categorical_cols),
        ],
        remainder="drop",
    )

    full_pipeline = Pipeline(steps=[
        ("base", base_pipeline),
        ("preprocessor", preprocessor),
    ])
    return full_pipeline


Writing feature_engineering.py


In [5]:
# Path setup for Colab

import pandas as pd
from feature_engineering import build_preprocessing_pipeline

# Load raw data (adjust path as needed)
df = pd.read_csv('/content/data.csv')

# Define columns
numeric_cols = [
    "Amount", "Value"
]
# 'total_amount', 'avg_amount', 'txn_count', 'std_amount', 'transaction_hour',
# 'transaction_day', 'transaction_month', 'transaction_year' will be generated by the pipeline.
categorical_cols = ["ProductCategory", "ChannelId", "ProviderId"]
woe_categorical_cols = ["ProductCategory", "ChannelId"]  # optional

# Build pipeline
pipeline = build_preprocessing_pipeline(
    numeric_cols=numeric_cols,
    categorical_cols=categorical_cols,
    woe_categorical_cols=woe_categorical_cols,
    customer_id_col="CustomerId",
    amount_col="Amount",
    datetime_col="TransactionStartTime",
    use_woe=True
)

# Fit & transform (requires df["is_high_risk"] to exist)
X_processed = pipeline.fit_transform(df, df["is_high_risk"])

# Save processed data to repo
processed_df = pd.DataFrame(X_processed)
processed_path = '/content/processed_task3.csv'
processed_df.to_csv(processed_path, index=False)

print("Saved:", processed_path)
processed_df.head()

Saved: /content/processed_task3.csv


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,-0.652919,-0.642294,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
1,0.63999,0.642294,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-1.299373,-1.284587,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,-0.006465,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,1.932898,1.926881,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
