In [1]:
import pandas as pd
import sys
# Make the project-local `data_processing` module importable. The relative entry
# is resolved against the kernel's working directory, so this presumably expects
# the notebook to be run from its own folder — TODO confirm.
sys.path.append("../src")
# NOTE(review): WoETransformer is not used by the cells visible in this section;
# the closing markdown describes WoE/IV as a future step.
from data_processing import DataLoader, FeatureEngineer, WoETransformer

### Load Raw Data

In [2]:
# Location of the raw transactions file, relative to the notebook.
RAW_DATA_PATH = "../data/raw/data.csv"

# The loader object is kept around because it is reused later to save the result.
data_loader = DataLoader(RAW_DATA_PATH)
data = data_loader.load_data()

# Preview the first rows to confirm the load and inspect the available columns.
data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


### Feature Engineering

#### Extract Datetime Features

Transaction timestamps are transformed into multiple time-based features to capture customer behavioral patterns.

In [3]:
fe = FeatureEngineer()
data = fe.extract_datetime_features(data)

# Show the source timestamp side by side with the parts derived from it.
datetime_preview_cols = [
    "TransactionStartTime",
    "transaction_hour",
    "transaction_day",
    "transaction_month",
    "transaction_year",
]
data[datetime_preview_cols].head()

Unnamed: 0,TransactionStartTime,transaction_hour,transaction_day,transaction_month,transaction_year
0,2018-11-15 02:18:49+00:00,2,15,11,2018
1,2018-11-15 02:19:08+00:00,2,15,11,2018
2,2018-11-15 02:44:21+00:00,2,15,11,2018
3,2018-11-15 03:32:55+00:00,3,15,11,2018
4,2018-11-15 03:34:21+00:00,3,15,11,2018


#### Aggregate Customer-Level Features

Transaction-level data is aggregated to the customer level to create behavioral features used for credit risk modeling.

In [4]:
# Collapse the transaction-level frame into one summary row per customer.
agg_data = fe.aggregate_customer_features(data)

# The aggregates are later joined back onto the transactions by CustomerId, so
# there must be exactly one row per customer. Fail here, next to the cause,
# rather than silently duplicating transactions in the join.
assert agg_data["CustomerId"].is_unique, "expected one aggregate row per CustomerId"

agg_data.head()


Unnamed: 0,CustomerId,total_transaction_amount,avg_transaction_amount,transaction_count,std_transaction_amount
0,CustomerId_1,-10000.0,-10000.0,1,0.0
1,CustomerId_10,-10000.0,-10000.0,1,0.0
2,CustomerId_1001,20000.0,4000.0,5,6558.963333
3,CustomerId_1002,4225.0,384.090909,11,560.498966
4,CustomerId_1003,20000.0,3333.333333,6,6030.478146


#### Merge Aggregate Features Back to Transaction Data

Customer-level aggregate features are merged back into the main dataset.
This allows the model to leverage both:

- Transaction-level temporal features
- Customer-level behavioral summaries

In [5]:
# Columns contributed by the customer-level aggregates (everything but the join key).
agg_feature_cols = [col for col in agg_data.columns if col != "CustomerId"]

n_rows_before = len(data)
data = (
    data
    # Drop aggregates left over from a previous run of this cell, so a re-run
    # replaces them instead of producing `_x` / `_y` suffixed duplicates.
    .drop(columns=agg_feature_cols, errors="ignore")
    # validate= raises pandas.errors.MergeError if agg_data holds more than one
    # row for a CustomerId, which would otherwise multiply transactions.
    .merge(agg_data, on="CustomerId", how="left", validate="many_to_one")
)
assert len(data) == n_rows_before, "left merge must not change the number of transactions"

data[['CustomerId','total_transaction_amount','avg_transaction_amount']].head()

Unnamed: 0,CustomerId,total_transaction_amount,avg_transaction_amount
0,CustomerId_4406,109921.75,923.712185
1,CustomerId_4406,109921.75,923.712185
2,CustomerId_4683,1000.0,500.0
3,CustomerId_988,228727.2,6019.136842
4,CustomerId_988,228727.2,6019.136842


### Preprocessing Pipeline: Categorical Encoding, Missing Value Handling & Feature Scaling

We transform features into a model-ready format using a sklearn pipeline:

* Encode Categorical Variables: One-Hot Encoding converts categorical features into binary vectors. (Label Encoding is not used to avoid artificial ordering.)

* Handle Missing Values: Imputation fills missing values — median for numerical features, mode for categorical features. No rows or columns are removed.

* Normalize/Standardize Numerical Features: Numerical features are standardized with StandardScaler (mean = 0, std = 1). Min-max normalization to [0, 1] is not applied; standardization is used for every numerical feature so that all of them are scaled the same way.

In [6]:
# Build the project's preprocessing transformer and fit it on the enriched frame.
preprocessor = FeatureEngineer.build_preprocessing_pipeline()

X = preprocessor.fit_transform(data)

# Numerical features
num_features = FeatureEngineer.NUMERICAL_FEATURES

# Categorical features after One-Hot Encoding
cat_features = (
    preprocessor
    .named_transformers_["cat"]
    .named_steps["encoder"]
    .get_feature_names_out(FeatureEngineer.CATEGORICAL_FEATURES)
)

# NOTE(review): this concatenation assumes the transformer emits the numerical
# block first and the one-hot block second, with no other output columns —
# confirm against build_preprocessing_pipeline.
all_features = list(num_features) + list(cat_features)

# Catch a mismatch between the hand-built names and the fitted output here,
# instead of as an opaque DataFrame constructor error in a later cell.
assert X.shape[1] == len(all_features), (
    f"feature-name count ({len(all_features)}) does not match "
    f"transformed matrix width ({X.shape[1]})"
)

#### Convert the Transformed Matrix to a DataFrame

In [7]:
# The transformer may return an object exposing `.toarray()` (e.g. a sparse
# matrix); densify it in that case, otherwise use the result as-is.
if hasattr(X, "toarray"):
    X_dense = X.toarray()
else:
    X_dense = X

# Attach the feature names so the transformed matrix can be inspected as a table.
X_df = pd.DataFrame(X_dense, columns=all_features)

X_df.head()


Unnamed: 0,total_transaction_amount,avg_transaction_amount,transaction_count,std_transaction_amount,transaction_hour,transaction_day,transaction_month,transaction_year,CurrencyCode_UGX,CountryCode_256,...,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,PricingStrategy_0,PricingStrategy_1,PricingStrategy_2,PricingStrategy_4
0,0.170118,-0.067623,-0.311831,-0.167016,-2.15553,-0.100739,0.848684,-0.994246,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.170118,-0.067623,-0.311831,-0.167016,-2.15553,-0.100739,0.848684,-0.994246,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.165122,-0.072568,-0.444993,-0.201209,-2.15553,-0.100739,0.848684,-0.994246,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,0.175567,-0.008155,-0.40402,-0.008243,-1.949214,-0.100739,0.848684,-0.994246,1.0,1.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.175567,-0.008155,-0.40402,-0.008243,-1.949214,-0.100739,0.848684,-0.994246,1.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0


In [8]:
# Destination for the feature-engineered dataset, relative to the notebook.
PROCESSED_DATA_PATH = "../data/processed/feature_engineered_data.csv"

# Hand the enriched frame to the loader and let it persist the result.
# NOTE(review): this stores `data` (transactions plus datetime and customer-level
# features), not the encoded/scaled `X_df` — confirm that is the intended artifact.
data_loader.df = data
data_loader.save_data(PROCESSED_DATA_PATH)


### Feature Engineering with WoE and IV (*Future Step*)
Once the binary target `is_high_risk` is created in Task 4, we will:
- Apply WoE transformation to categorical variables.
- Calculate Information Value (IV) to select predictive features.
