In [1]:
import pandas as pd
import numpy as np
import pickle
import os

from sklearn.preprocessing import LabelEncoder, StandardScaler

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

In [2]:

folder_path = "simulated-data-raw"

# List all pickle files in the folder
pickle_files = [f for f in os.listdir(folder_path) if f.endswith('.pkl')]

# Load all pickle files and merge them
dataframes = []
for file in pickle_files:
    file_path = os.path.join(folder_path, file)
    with open(file_path, "rb") as f:
        df = pickle.load(f)
        df = pd.DataFrame(df)  # Ensure it's a DataFrame
        dataframes.append(df)

# Concatenate all DataFrames
final_df = pd.concat(dataframes, ignore_index=True)

# Display dataset info
print(final_df.info())
print(final_df.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754155 entries, 0 to 1754154
Data columns (total 9 columns):
 #   Column             Dtype         
---  ------             -----         
 0   TRANSACTION_ID     int64         
 1   TX_DATETIME        datetime64[ns]
 2   CUSTOMER_ID        object        
 3   TERMINAL_ID        object        
 4   TX_AMOUNT          float64       
 5   TX_TIME_SECONDS    object        
 6   TX_TIME_DAYS       object        
 7   TX_FRAUD           int64         
 8   TX_FRAUD_SCENARIO  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 120.4+ MB
None
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0               0 2018-04-01 00:00:31         596        3156      57.16   
1               1 2018-04-01 00:02:10        4961        3412      81.51   
2               2 2018-04-01 00:07:56           2        1365     146.00   
3               3 2018-04-01 00:09:29        4128        8737      

In [3]:
# Check for Missing Values
missing_values = final_df.isnull().sum()
missing_values[missing_values > 0]

Series([], dtype: int64)

 ### Timestamps

In [5]:
df['TX_DATETIME'] = pd.to_datetime(df['TX_DATETIME'])

df[['TX_DATETIME']].head()


Unnamed: 0,TX_DATETIME
1744506,2018-09-30 00:00:01
1744507,2018-09-30 00:00:27
1744508,2018-09-30 00:00:38
1744509,2018-09-30 00:01:49
1744510,2018-09-30 00:02:09


In [6]:
# Extract Hour, Day of the Week, and Week of the Year
df['TX_HOUR'] = df['TX_DATETIME'].dt.hour
df['TX_DAY_OF_WEEK'] = df['TX_DATETIME'].dt.dayofweek  # 0 = Monday, 6 = Sunday
df['TX_WEEK_OF_YEAR'] = df['TX_DATETIME'].dt.isocalendar().week  # Week number

# Create a feature: Is Weekend? (1 if Saturday/Sunday, else 0)
df['TX_IS_WEEKEND'] = df['TX_DAY_OF_WEEK'].apply(lambda x: 1 if x >= 5 else 0)

# Drop TX_DATETIME (not needed for modeling)
df.drop(columns=['TX_DATETIME'], inplace=True)

# Display the new features
df[['TX_HOUR', 'TX_DAY_OF_WEEK', 'TX_WEEK_OF_YEAR', 'TX_IS_WEEKEND']].head()


Unnamed: 0,TX_HOUR,TX_DAY_OF_WEEK,TX_WEEK_OF_YEAR,TX_IS_WEEKEND
1744506,0,6,39,1
1744507,0,6,39,1
1744508,0,6,39,1
1744509,0,6,39,1
1744510,0,6,39,1


In [7]:
# 📌 5️⃣ Encode Categorical Features (CUSTOMER_ID & TERMINAL_ID)

# Encode CUSTOMER_ID
customer_encoder = LabelEncoder()
df['CUSTOMER_ID'] = customer_encoder.fit_transform(df['CUSTOMER_ID'])

# Encode TERMINAL_ID
terminal_encoder = LabelEncoder()
df['TERMINAL_ID'] = terminal_encoder.fit_transform(df['TERMINAL_ID'])

# Display encoded columns
df[['CUSTOMER_ID', 'TERMINAL_ID']].head()


Unnamed: 0,CUSTOMER_ID,TERMINAL_ID
1744506,3279,4640
1744507,2731,3283
1744508,2430,733
1744509,1005,5104
1744510,915,818


In [8]:
# 📌 6️⃣ Normalize Numeric Features using StandardScaler

scaler = StandardScaler()

# Scale TX_AMOUNT
df['TX_AMOUNT'] = scaler.fit_transform(df[['TX_AMOUNT']])

# Display scaled feature
df[['TX_AMOUNT']].head()


Unnamed: 0,TX_AMOUNT
1744506,-0.96881
1744507,3.971312
1744508,-1.051134
1744509,-0.463579
1744510,0.173606


In [9]:
# 📌 7️⃣ Save the Processed Data as Pickle (for Modeling)
df.to_csv("processed_data.csv", index=False)  
processed_path = "processed_data.pkl"
with open(processed_path, "wb") as f:
    pickle.dump(df, f)

print(f"Processed data saved to {processed_path} ✅")


Processed data saved to processed_data.pkl ✅
