In [11]:
import os
from getpass import getpass

# SECURITY: never paste a personal access token into a notebook cell -- the
# notebook (and its saved outputs) end up in version control. Any token that
# was previously written here in plain text must be revoked on GitHub.
os.environ["GITHUB_USER"] = "redecon"

# Re-use a token already exported in the environment; otherwise prompt for it
# without echoing, so the value is never stored in the notebook source.
if not os.environ.get("GITHUB_TOKEN"):
    os.environ["GITHUB_TOKEN"] = getpass("GitHub personal access token: ")


In [2]:
# Clone the project repository and make it the working directory, so the git
# commands in later cells run inside the checkout.
!git clone https://github.com/redecon/Credit-Risk-Probability-Model-for-Alternative-Data.git
%cd Credit-Risk-Probability-Model-for-Alternative-Data


Cloning into 'Credit-Risk-Probability-Model-for-Alternative-Data'...
remote: Enumerating objects: 30, done.[K
remote: Counting objects: 100% (30/30), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 30 (delta 8), reused 8 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (30/30), 200.75 KiB | 13.38 MiB/s, done.
Resolving deltas: 100% (8/8), done.
/content/Credit-Risk-Probability-Model-for-Alternative-Data


In [12]:
!git remote set-url origin https://redecon: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@github.com/redecon/Credit-Risk-Probability-Model-for-Alternative-Data.git


In [4]:
# Set the author identity that git records on commits made from this runtime.
!git config --global user.email "redietbekele02@outlook.com"
!git config --global user.name "redecon"

In [6]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer

# ============================================================
# Custom Transformers
# ============================================================

class AggregateFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Append per-customer aggregates of the amount column to every row.

    For each distinct value of ``customer_id_col`` the sum, mean, count and
    standard deviation of ``amount_col`` are computed over the frame passed to
    ``transform`` and joined back onto that customer's rows as the columns
    ``total_amount``, ``avg_amount``, ``txn_count`` and ``std_amount``.

    The transformer is stateless: ``fit`` learns nothing and the aggregates
    are recomputed from whatever frame is handed to ``transform``.

    Parameters
    ----------
    customer_id_col : str
        Column holding the grouping key.
    amount_col : str
        Column whose values are aggregated.
    """
    def __init__(self, customer_id_col="CustomerId", amount_col="Amount"):
        self.customer_id_col = customer_id_col
        self.amount_col = amount_col

    def fit(self, X, y=None):
        """No-op; present for scikit-learn API compatibility. Returns self."""
        return self

    def transform(self, X):
        """Return a new frame with the four aggregate columns added.

        The input frame is not modified. Row order, row count and the index
        of ``X`` are preserved in the result.
        """
        agg = (
            X.groupby(self.customer_id_col)[self.amount_col]
            .agg(
                total_amount="sum",
                avg_amount="mean",
                txn_count="count",
                std_amount="std",
            )
            .reset_index()
        )
        # std is NaN for customers with a single transaction; treat as no spread.
        agg["std_amount"] = agg["std_amount"].fillna(0.0)

        merged = X.merge(agg, on=self.customer_id_col, how="left")
        # A column-on-column merge discards the left index and returns a fresh
        # RangeIndex. Restore the caller's index so that later index-aligned
        # operations (e.g. attaching a target Series) still line up. This is
        # safe because `agg` has one row per key, so the left join keeps
        # exactly one output row per input row, in the original order.
        merged.index = X.index
        return merged


class TemporalFeaturesTransformer(BaseEstimator, TransformerMixin):
    """Derive hour/day/month/year columns from a timestamp column.

    The timestamp column is parsed with ``pd.to_datetime(errors="coerce")``
    and replaced by the parsed values. Four integer columns are added:
    ``transaction_hour``, ``transaction_day``, ``transaction_month`` and
    ``transaction_year``. Rows whose timestamp cannot be parsed get ``-1``
    in each of them.
    """
    def __init__(self, datetime_col="TransactionStartTime"):
        self.datetime_col = datetime_col

    def fit(self, X, y=None):
        """Stateless transformer: nothing to learn. Returns self."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the parsed timestamp and its parts."""
        out = X.copy()
        stamps = pd.to_datetime(out[self.datetime_col], errors="coerce")
        out[self.datetime_col] = stamps
        # Same four components, in the same column order, via one loop.
        for part in ("hour", "day", "month", "year"):
            component = getattr(stamps.dt, part)
            out[f"transaction_{part}"] = component.fillna(-1).astype(int)
        return out


class WoETransformer(BaseEstimator, TransformerMixin):
    """Weight-of-Evidence encoder for selected categorical columns.

    For every column in ``cat_cols`` and every category value ``v`` seen
    during ``fit``::

        bad(v)  = (# rows with target == 1) + alpha
        good(v) = (# rows with target == 0) + alpha
        woe(v)  = ln( (good(v) / sum(good)) / (bad(v) / sum(bad)) )

    ``transform`` adds a ``<col>_woe`` column for each encoded column; the
    original columns are kept. Categories not seen during ``fit`` (and missing
    values) are encoded as ``0.0``.

    Parameters
    ----------
    cat_cols : list of str or None
        Columns to encode. ``None`` means no columns.
    alpha : float
        Additive smoothing applied to the per-category good and bad counts,
        which keeps the logarithm finite for categories with zero events or
        zero non-events.

    Attributes
    ----------
    woe_maps_ : dict
        ``{column: {category: woe}}``, populated by ``fit``.
    """
    def __init__(self, cat_cols=None, alpha=0.5):
        # Constructor arguments are stored untouched: scikit-learn's
        # get_params()/clone() require __init__ not to rewrite them, and
        # learned state (woe_maps_) belongs in fit(), not here.
        self.cat_cols = cat_cols
        self.alpha = alpha

    def fit(self, X, y):
        """Learn the WoE value of every category from a binary (0/1) target.

        ``y`` is matched to the rows of ``X`` by position, not by index label.

        Raises
        ------
        ValueError
            If ``y`` is None or its length differs from the number of rows.
        """
        if y is None:
            raise ValueError("WoETransformer requires a binary target y in fit().")

        target = pd.Series(np.asarray(y).ravel())
        if len(target) != len(X):
            raise ValueError(
                f"X and y have inconsistent lengths: {len(X)} != {len(target)}"
            )

        # Start from a clean slate so a refit never keeps stale mappings.
        self.woe_maps_ = {}
        for col in (self.cat_cols or []):
            # Positional keys: immune to index mismatches between X and y and
            # to X already containing a column named "target".
            keys = X[col].reset_index(drop=True)
            stats = target.groupby(keys).agg(["sum", "count"])
            good = stats["count"] - stats["sum"] + self.alpha
            bad = stats["sum"] + self.alpha
            woe = np.log((good / good.sum()) / (bad / bad.sum()))
            self.woe_maps_[col] = woe.to_dict()

        return self

    def transform(self, X):
        """Return a copy of ``X`` with a ``<col>_woe`` column per encoded column."""
        X = X.copy()
        woe_maps = getattr(self, "woe_maps_", {})
        for col in (self.cat_cols or []):
            woe_col = f"{col}_woe"
            # Cast to float before filling so categorical-dtype inputs do not
            # reject 0.0 as an unknown category.
            X[woe_col] = X[col].map(woe_maps[col]).astype(float).fillna(0.0)
        return X


# ============================================================
# Pipeline Builder
# ============================================================

class DynamicPreprocessor(BaseEstimator, TransformerMixin):
    """Impute/scale numeric columns and impute/one-hot categorical columns.

    Defined at module level, with its configuration passed through
    ``__init__``, so that pipelines containing it can be cloned, inspected
    with ``get_params`` and pickled (a class nested inside a function cannot
    be pickled).

    Parameters
    ----------
    numeric_cols : list of str or None
        Columns sent through median imputation followed by ``StandardScaler``.
    categorical_cols : list of str or None
        Columns sent through most-frequent imputation followed by
        ``OneHotEncoder(handle_unknown="ignore", sparse_output=False)``.

    Attributes
    ----------
    numeric_cols_, categorical_cols_ : list of str
        The column lists actually used, set by ``fit``.
    preprocessor_ : ColumnTransformer
        The fitted transformer; all other columns are dropped.
    """
    def __init__(self, numeric_cols=None, categorical_cols=None):
        self.numeric_cols = numeric_cols
        self.categorical_cols = categorical_cols

    def fit(self, X, y=None):
        """Build and fit the underlying ``ColumnTransformer`` on ``X``."""
        self.numeric_cols_ = list(self.numeric_cols or [])
        self.categorical_cols_ = list(self.categorical_cols or [])

        transformers = []

        if self.numeric_cols_:
            num_pipe = Pipeline([
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", StandardScaler()),
            ])
            transformers.append(("num", num_pipe, self.numeric_cols_))

        if self.categorical_cols_:
            cat_pipe = Pipeline([
                ("imputer", SimpleImputer(strategy="most_frequent")),
                ("onehot", OneHotEncoder(handle_unknown="ignore", sparse_output=False)),
            ])
            transformers.append(("cat", cat_pipe, self.categorical_cols_))

        self.preprocessor_ = ColumnTransformer(
            transformers=transformers,
            remainder="drop"
        )
        self.preprocessor_.fit(X)
        return self

    def transform(self, X):
        """Apply the fitted ``ColumnTransformer``; raises if ``fit`` was not called."""
        if not hasattr(self, 'preprocessor_'):
            raise RuntimeError("Must fit first!")
        return self.preprocessor_.transform(X)


def build_preprocessing_pipeline(
    numeric_cols,
    categorical_cols,
    woe_categorical_cols=None,
    customer_id_col="CustomerId",
    amount_col="Amount",
    datetime_col="TransactionStartTime",
    target_col="FraudResult",  # Updated to match your data
    use_woe=False,
):
    """Assemble the feature-engineering + preprocessing pipeline.

    The returned ``Pipeline`` has two steps:

    * ``"base"`` -- aggregate features, temporal features and, when
      ``use_woe`` is true, Weight-of-Evidence encoding.
    * ``"dynamic_preprocess"`` -- imputation/scaling of numeric columns and
      imputation/one-hot encoding of categorical columns.

    Parameters
    ----------
    numeric_cols : list of str
        Numeric columns to impute and scale (may name columns created by the
        base step).
    categorical_cols : list of str
        Categorical columns to one-hot encode.
    woe_categorical_cols : list of str or None
        Categorical columns to WoE-encode when ``use_woe`` is true. Their
        ``<col>_woe`` outputs are treated as numeric and the columns are then
        excluded from one-hot encoding.
    customer_id_col, amount_col, datetime_col : str
        Column names forwarded to the base transformers.
    target_col : str
        Currently unused; accepted for backward compatibility.
    use_woe : bool
        Whether to add the WoE step (which requires ``y`` when fitting).

    Returns
    -------
    sklearn.pipeline.Pipeline
    """
    numeric_cols = list(numeric_cols or [])
    categorical_cols = list(categorical_cols or [])
    woe_categorical_cols = list(woe_categorical_cols or [])

    base_steps = [
        ("agg_features", AggregateFeaturesTransformer(customer_id_col, amount_col)),
        ("temporal_features", TemporalFeaturesTransformer(datetime_col)),
    ]

    if use_woe:
        base_steps.append(("woe", WoETransformer(cat_cols=woe_categorical_cols)))
        woe_feature_cols = [c + "_woe" for c in woe_categorical_cols]
        onehot_cols = [c for c in categorical_cols if c not in woe_categorical_cols]
    else:
        # Without the WoE step the WoE candidates must stay in the one-hot
        # group; removing them here would silently drop those features.
        woe_feature_cols = []
        onehot_cols = categorical_cols

    full_pipeline = Pipeline([
        ("base", Pipeline(base_steps)),
        (
            "dynamic_preprocess",
            DynamicPreprocessor(
                numeric_cols=numeric_cols + woe_feature_cols,
                categorical_cols=onehot_cols,
            ),
        ),
    ])

    return full_pipeline


# ============================================================
# Usage - Adjusted for your actual data
# ============================================================

# Load your data (update the paths if needed)
RAW_DATA_PATH = "/content/data.csv"  # or the actual filename
PROCESSED_DATA_PATH = "/content/processed_task3.csv"

df_raw = pd.read_csv(RAW_DATA_PATH)

print("Columns:", df_raw.columns.tolist())
print("Shape:", df_raw.shape)

# Numeric inputs: raw columns plus the aggregate and temporal columns that the
# base transformers add before the preprocessing step runs.
numeric_cols = [
    "Amount", "Value",
    "total_amount", "avg_amount", "txn_count", "std_amount",
    "transaction_hour", "transaction_day", "transaction_month", "transaction_year"
]

categorical_cols = ["ProductCategory", "ChannelId", "ProviderId", "PricingStrategy"]

# Categoricals that are WoE-encoded (instead of one-hot) when a target exists.
woe_categorical_cols = ["ProductCategory", "ChannelId"]

# Target column is FraudResult (0/1)
has_target = "FraudResult" in df_raw.columns

pipeline = build_preprocessing_pipeline(
    numeric_cols=numeric_cols,
    categorical_cols=categorical_cols,
    woe_categorical_cols=woe_categorical_cols,
    customer_id_col="CustomerId",
    amount_col="Amount",
    datetime_col="TransactionStartTime",
    use_woe=has_target  # WoE needs the target, so it is only enabled when present
)

if has_target:
    X_processed = pipeline.fit_transform(df_raw, df_raw["FraudResult"])
else:
    X_processed = pipeline.fit_transform(df_raw)

print("Processed shape:", X_processed.shape)

# Save the processed features. The output has one row per transaction (same
# row count as the input), with per-customer aggregates repeated on each row.
# Keep the generated feature names as the CSV header instead of 0..n-1.
feature_names = (
    pipeline.named_steps["dynamic_preprocess"].preprocessor_.get_feature_names_out()
)
pd.DataFrame(X_processed, columns=feature_names).to_csv(PROCESSED_DATA_PATH, index=False)
print(f"Saved to {PROCESSED_DATA_PATH}")

Columns: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult']
Shape: (95662, 16)
Processed shape: (95662, 22)
Saved to /content/processed_task3.csv


In [13]:
# 1. Stage all your changes (new files, modified files, etc.)
!git add .

# 2. Commit your work for Task 3
!git commit -m "feat: complete task 3 - robust feature engineering pipeline with aggregates, temporal features, WoE, and sklearn Pipeline"

# 3. Make sure main is up to date (safe to do)
!git pull origin main

# 4. Create and switch to a new branch called task-3
!git checkout -b task-3

# 5. Push the new task-3 branch to GitHub
!git push origin task-3

On branch task-3
nothing to commit, working tree clean
From https://github.com/redecon/Credit-Risk-Probability-Model-for-Alternative-Data
 * branch            main       -> FETCH_HEAD
Already up to date.
fatal: A branch named 'task-3' already exists.
Total 0 (delta 0), reused 0 (delta 0), pack-reused 0
remote: 
remote: Create a pull request for 'task-3' on GitHub by visiting:[K
remote:      https://github.com/redecon/Credit-Risk-Probability-Model-for-Alternative-Data/pull/new/task-3[K
remote: 
To https://github.com/redecon/Credit-Risk-Probability-Model-for-Alternative-Data.git
 * [new branch]      task-3 -> task-3
