In [113]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler


In [116]:
# Load the raw transaction data from CSV.
data = pd.read_csv('../data/data.csv')

# Work on an explicit copy of the raw frame. The feature-engineering cells below
# add columns to `transactions` in place; `pd.DataFrame(data)` does not copy the
# underlying data, so it gave no guarantee that `data` stayed untouched.
transactions = data.copy()

# Preview the first rows of the raw data.
data.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [117]:
# 1. Create Aggregate Features
# Per-customer summary statistics of the transaction `Amount` column.
# Each entry maps an output column name to a (source column, aggregation) pair.
amount_aggregations = {
    'total_transaction_amount': ('Amount', 'sum'),
    'average_transaction_amount': ('Amount', 'mean'),
    # 'count' counts non-null Amount values within each customer group.
    'transaction_count': ('Amount', 'count'),
    # pandas' default 'std' is the sample standard deviation, so a customer
    # with a single transaction gets NaN here.
    'std_dev_transaction_amount': ('Amount', 'std'),
}

transactions_by_customer = transactions.groupby('CustomerId')
# reset_index() turns the CustomerId group key back into a regular column.
agg_features = transactions_by_customer.agg(**amount_aggregations).reset_index()


In [122]:
# 2. Extract Features
# Parse the raw timestamp column once; with errors='coerce' any value that
# cannot be parsed becomes NaT instead of raising.
start_times = pd.to_datetime(transactions['TransactionStartTime'], errors='coerce')
transactions['TransactionStartTime'] = start_times

# Derive one column per calendar component of the parsed timestamp,
# added in the order hour, day, month, year.
for feature_name, component in (
    ('transaction_hour', 'hour'),
    ('transaction_day', 'day'),
    ('transaction_month', 'month'),
    ('transaction_year', 'year'),
):
    transactions[feature_name] = getattr(start_times.dt, component)


In [125]:
# One-Hot Encoding
# Expand ProductCategory into one indicator column per category value.
one_hot_encoder = OneHotEncoder(sparse_output=False)  # Use sparse_output instead of sparse
category_encoded = one_hot_encoder.fit_transform(transactions[['ProductCategory']])
category_encoded_df = pd.DataFrame(
    category_encoded,
    columns=one_hot_encoder.get_feature_names_out(['ProductCategory']),
    # Reuse the source index: concat(axis=1) aligns on index labels, so a fresh
    # RangeIndex would misalign rows whenever `transactions` is filtered,
    # re-sorted, or otherwise does not carry a default index.
    index=transactions.index,
)

# Concatenate the original DataFrame with the encoded DataFrame.
# Drop any one-hot columns left over from a previous run of this cell first,
# so re-running the cell does not append duplicate ProductCategory_* columns.
transactions = pd.concat(
    [
        transactions.drop(columns=category_encoded_df.columns, errors='ignore'),
        category_encoded_df,
    ],
    axis=1,
)

# Label Encoding
# Also store a single integer code per ProductCategory value.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; the integer codes imply an ordering between categories — confirm
# that downstream models should consume this column.
label_encoder = LabelEncoder()
transactions['category_encoded'] = label_encoder.fit_transform(transactions['ProductCategory'])

# Display the first few rows of the processed DataFrame
print(transactions.head())



         TransactionId         BatchId       AccountId       SubscriptionId  \
0  TransactionId_76871   BatchId_36123  AccountId_3957   SubscriptionId_887   
1  TransactionId_73770   BatchId_15642  AccountId_4841  SubscriptionId_3829   
2  TransactionId_26203   BatchId_53941  AccountId_4229   SubscriptionId_222   
3    TransactionId_380  BatchId_102363   AccountId_648  SubscriptionId_2185   
4  TransactionId_28195   BatchId_38780  AccountId_4841  SubscriptionId_3829   

        CustomerId CurrencyCode  CountryCode    ProviderId     ProductId  \
0  CustomerId_4406          UGX          256  ProviderId_6  ProductId_10   
1  CustomerId_4406          UGX          256  ProviderId_4   ProductId_6   
2  CustomerId_4683          UGX          256  ProviderId_6   ProductId_1   
3   CustomerId_988          UGX          256  ProviderId_1  ProductId_21   
4   CustomerId_988          UGX          256  ProviderId_4   ProductId_6   

      ProductCategory  ... ProductCategory_airtime  \
0             