In [7]:
import pandas as pd

# Read the raw fraud-detection CSV into a DataFrame
df = pd.read_csv('../data/raw/enhanced_fraud_detection_dataset.csv')

# Count nulls per column, then display only the columns that actually have
# gaps, ordered from most to fewest missing values
missing = df.isna().sum()
missing.loc[missing.gt(0)].sort_values(ascending=False)


geo_location              4991
day_of_week               2536
payment_method            2512
device_type               2496
mouse_movement_score      2467
ip_risk_score             2451
avg_transaction_amount    2428
dtype: int64

In [8]:
# Impute missing values column by column. `device_type` still contains NaNs
# when this cell runs, which is why step 2 must not rely on a groupby
# transform to rebuild the whole column.

# 1. Fill numeric score columns with their column-wide median
for score_col in ['mouse_movement_score', 'ip_risk_score']:
    df[score_col] = df[score_col].fillna(df[score_col].median())

# 2. Group-based fill for avg_transaction_amount (by device_type).
#    Look up each row's per-device median and use it only where the amount is
#    missing. The earlier groupby(...).transform(lambda x: x.fillna(...)) form
#    left rows whose device_type is NaN without a value (groupby drops NaN keys
#    by default), so amounts that were present on those rows were overwritten
#    with NaN. Rows missing both the amount and the device_type stay NaN here.
device_median = df.groupby('device_type')['avg_transaction_amount'].median()
df['avg_transaction_amount'] = df['avg_transaction_amount'].fillna(
    df['device_type'].map(device_median)
)

# 3. Fill geo_location with its mode (most frequent value)
df['geo_location'] = df['geo_location'].fillna(df['geo_location'].mode()[0])


In [9]:
# Categorical gaps: substitute each column's most frequent value
for name in ('day_of_week', 'payment_method', 'device_type'):
    most_frequent = df[name].mode()[0]
    df[name] = df[name].fillna(most_frequent)

# Fallback for avg_transaction_amount: anything still missing gets the
# overall (global) median
overall_median = df['avg_transaction_amount'].median()
df['avg_transaction_amount'] = df['avg_transaction_amount'].fillna(overall_median)

# Display the five largest per-column null counts as a sanity check
null_counts = df.isna().sum()
null_counts.sort_values(ascending=False).head()


transaction_id    0
user_id           0
amount            0
payment_method    0
device_type       0
dtype: int64

In [10]:
from sklearn.preprocessing import LabelEncoder

# Identify categorical (object-dtype) columns, excluding the transaction ID
cat_cols = [col for col in df.select_dtypes(include='object').columns
            if col != 'transaction_id']

# Apply label encoding with a separate encoder per column. Refitting one
# shared LabelEncoder on every iteration keeps only the last column's
# category-to-integer mapping; storing a fitted encoder per column lets each
# mapping be inspected or inverted later through `encoders[col]`.
# `le` still ends up bound to the last column's fitted encoder, as before.
encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    # astype(str) converts every value to a string before fitting
    df[col] = le.fit_transform(df[col].astype(str))
    encoders[col] = le

# NOTE(review): the recorded output shows 'timestamp' among the encoded
# columns, so it is treated as a categorical label rather than parsed as a
# datetime — confirm this is intended.
print("Encoded columns:", cat_cols)
df[cat_cols].head()


Encoded columns: ['payment_method', 'device_type', 'geo_location', 'day_of_week', 'region', 'country', 'timestamp', 'device_os', 'browser']


Unnamed: 0,payment_method,device_type,geo_location,day_of_week,region,country,timestamp,device_os,browser
0,1,1,1,0,0,3,1388,0,2
1,4,2,0,6,2,3,1148,0,1
2,0,2,2,3,1,3,633,4,1
3,2,0,6,0,3,3,83,1,2
4,1,1,4,4,4,3,695,4,3


In [11]:
from sklearn.model_selection import train_test_split

# Remove the ID columns before modelling
id_columns = ['transaction_id', 'user_id']
df_model = df.drop(columns=id_columns)

# Target is fraud_flag; every remaining column is a feature
y = df_model['fraud_flag']
X = df_model.drop(columns='fraud_flag')

# Hold out 20% of the rows, stratified on the target, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Report the resulting feature-matrix shapes
for label, part in (("Train shape:", X_train), ("Test shape:", X_test)):
    print(label, part.shape)


Train shape: (40000, 22)
Test shape: (10000, 22)


In [12]:
from pathlib import Path

# Save the train/test splits as CSV files (row index omitted).
# The output directory is created first so the writes do not fail with an
# OSError when '../data/processed' does not exist yet (e.g. a fresh checkout).
processed_dir = Path('../data/processed')
processed_dir.mkdir(parents=True, exist_ok=True)

splits = {
    'X_train.csv': X_train,
    'X_test.csv': X_test,
    'y_train.csv': y_train,
    'y_test.csv': y_test,
}
for filename, split in splits.items():
    split.to_csv(processed_dir / filename, index=False)

print("✅ All processed files saved.")


✅ All processed files saved.
