In [1]:
import sys
sys.path.append('../src')
import pandas as pd
import numpy as np
from data_processing import DataLoader
from feature_engineering import FeatureEngineer
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

#### Load Cleaned Fraud Data

In [2]:
# Read the merged fraud dataset through the project's DataLoader helper.
merged_csv_path = "../data/processed/fraud_data_merged.csv"
loader = DataLoader(merged_csv_path)
fraud_data = loader.load_data()

# Quick schema check: list each column's dtype before any conversion happens.
column_dtypes = fraud_data.dtypes
print("Data types:\n", column_dtypes)

Data types:
 user_id             int64
signup_time        object
purchase_time      object
purchase_value      int64
device_id          object
source             object
browser            object
sex                object
age                 int64
ip_address        float64
class               int64
ip_int              int64
country            object
dtype: object


In [3]:
# Parse both timestamp columns (loaded as `object`) into datetime64.
# errors='coerce' turns unparseable values into NaT instead of raising, so the
# non-null counts in the summary below are the check that nothing was lost.
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'], errors='coerce')
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'], errors='coerce')

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly; wrapping it in print() would emit a stray "None" after the summary.
fraud_data[['purchase_time', 'signup_time']].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 2 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   purchase_time  151112 non-null  datetime64[ns]
 1   signup_time    151112 non-null  datetime64[ns]
dtypes: datetime64[ns](2)
memory usage: 2.3 MB
None


### Feature Engineering - Fraud_Data
Initialize the `FeatureEngineer` with the loaded fraud data.

In [4]:
# Build the feature-engineering helper around the loaded frame. The cells below
# call its methods without passing the frame again and rebind `fraud_data` to
# each return value.
# NOTE(review): presumably the helper keeps its own reference to the frame
# between calls — confirm in feature_engineering.
fe_fraud = FeatureEngineer(fraud_data)

### Time-based features

In [5]:
# Derive the time-based columns from the purchase and signup timestamps.
time_feature_kwargs = {"purchase_col": "purchase_time", "signup_col": "signup_time"}
fraud_data = fe_fraud.add_time_features_fraud(**time_feature_kwargs)

# Preview the columns produced by this step.
time_feature_cols = ['hour_of_day', 'day_of_week', 'time_since_signup']
print("\nSample time features:")
print(fraud_data[time_feature_cols].head())


Sample time features:
   hour_of_day  day_of_week  time_since_signup
0            2            5        1251.856111
1            1            0           4.984444
2           18            3           0.000278
3           13            0         136.690278
4           18            2        1211.516944


### Transaction frequency / velocity

In [6]:
# Add one "<column>_tx_count" feature per grouping key.
tx_group_cols = ["user_id", "device_id", "country"]
fraud_data = fe_fraud.add_transaction_counts(group_cols=tx_group_cols)

# Preview the count columns that correspond to the grouping keys above.
tx_count_cols = [f"{key}_tx_count" for key in tx_group_cols]
print("\nSample transaction counts:")
print(fraud_data[tx_count_cols].head())


Sample transaction counts:
   user_id_tx_count  device_id_tx_count  country_tx_count
0                 1                   1              7306
1                 1                   1             58049
2                 1                  12             58049
3                 1                   1             21966
4                 1                   1             58049


### Scale numerical features

In [7]:
# Continuous columns to rescale with the helper's "standard" method.
# NOTE(review): this runs on the full frame before the train/test split further
# down, so the scaling presumably uses statistics that include the test rows —
# confirm whether fitting on the training split only is wanted.
numerical_cols = [
    "purchase_value",
    "age",
    "time_since_signup",
]
fraud_data = fe_fraud.scale_numerical(numerical_cols, method="standard")

# Preview the rescaled columns.
scaled_preview = fraud_data[numerical_cols].head()
print("\nScaled numerical features:")
print(scaled_preview)


Scaled numerical features:
   purchase_value       age  time_since_signup
0       -0.160204  0.679914          -0.136057
1       -1.142592  2.304476          -1.571877
2       -1.197169  2.304476          -1.577617
3        0.385567  0.911994          -1.420213
4        0.112681  1.376155          -0.182509


### Encode categorical features

In [8]:
# Categorical columns to encode. The recorded output shows them replaced by one
# column per category value (e.g. `country_*`).
categorical_cols = [
    "sex",
    "browser",
    "source",
    "country",
]
fraud_data = fe_fraud.encode_categorical(categorical_cols)

# List the resulting column index to confirm the expansion.
encoded_columns = fraud_data.columns
print("\nColumns after encoding:")
print(encoded_columns)


Columns after encoding:
Index(['user_id', 'signup_time', 'purchase_time', 'purchase_value',
       'device_id', 'age', 'ip_address', 'class', 'ip_int', 'hour_of_day',
       ...
       'country_Unknown', 'country_Uruguay', 'country_Uzbekistan',
       'country_Vanuatu', 'country_Venezuela', 'country_Viet Nam',
       'country_Virgin Islands (U.S.)', 'country_Yemen', 'country_Zambia',
       'country_Zimbabwe'],
      dtype='object', length=203)


### Train-test Split

In [9]:
# Identifier and raw-timestamp columns are excluded from the feature matrix.
# NOTE(review): `ip_int` stays in X although `ip_address` is dropped — confirm
# that keeping the integer form of the address as a feature is intended.
drop_cols = ['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address']
non_feature_cols = [*drop_cols, 'class']

y = fraud_data['class']  # target variable
X = fraud_data.drop(columns=non_feature_cols)  # everything except identifiers + target

# Hold out 20% of the rows; stratifying on y keeps the class ratio the same in
# both splits, and the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Report the class balance of each split before any resampling.
train_class_counts = y_train.value_counts()
test_class_counts = y_test.value_counts()
print("Training class distribution BEFORE SMOTE:\n", train_class_counts)
print("Test class distribution (unchanged):\n", test_class_counts)

Training class distribution BEFORE SMOTE:
 class
0    109568
1     11321
Name: count, dtype: int64
Test class distribution (unchanged):
 class
0    27393
1     2830
Name: count, dtype: int64


### Handle Class Imbalance with SMOTE

In [10]:
# Oversample on the training split only; the test split is left untouched.
# NOTE(review): X_train still contains the encoded category columns, so plain
# SMOTE may synthesize non-binary values for them — confirm whether a
# categorical-aware variant (e.g. SMOTENC) is needed here.
smote = SMOTE(random_state=42)
balanced_split = smote.fit_resample(X_train, y_train)
X_train_bal, y_train_bal = balanced_split

balanced_counts = y_train_bal.value_counts()
print("\nTraining class distribution AFTER SMOTE:\n", balanced_counts)

# X_train_bal / y_train_bal now hold the class-balanced training data for modeling


Training class distribution AFTER SMOTE:
 class
0    109568
1    109568
Name: count, dtype: int64


In [11]:
# Save processed Fraud_Data
# The full engineered frame (`fraud_data`, all rows) is handed to the loader
# and saved; the train/test splits and the SMOTE-balanced training set created
# in earlier cells are not persisted by this cell.
# NOTE(review): presumably save_data() writes `loader.df` to the given path —
# confirm in data_processing.
loader.df = fraud_data
loader.save_data("../data/processed/fraud_data_features.csv")