In [None]:
import pandas as pd
import numpy as np

import datetime as dt

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from imblearn.over_sampling import SMOTE

# Optional but recommended
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the raw transaction log and the IP-range -> country lookup table.
import ipaddress

fraud_df = pd.read_csv('../data/raw/Fraud_Data.csv')
ip_country_df = pd.read_csv('../data/raw/IpAddress_to_Country.csv')


def _ip_to_int(value):
    """Convert one ``ip_address`` cell to an integer.

    Strings containing a dot are parsed as dotted-quad IPv4 addresses;
    anything else is passed straight to ``int()``.
    """
    if isinstance(value, str) and '.' in value:
        return int(ipaddress.IPv4Address(value))
    return int(value)


# Build ``ip_int``, the key used later for the range lookup against the
# country table. Object-dtype columns are converted element by element;
# a numeric column is reused unchanged (its dtype is NOT cast here).
if fraud_df['ip_address'].dtype != 'O':
    fraud_df['ip_int'] = fraud_df['ip_address']
else:
    fraud_df['ip_int'] = fraud_df['ip_address'].apply(_ip_to_int)

# Parse both timestamp columns into datetime64 so the .dt accessor works later.
for time_col in ('purchase_time', 'signup_time'):
    fraud_df[time_col] = pd.to_datetime(fraud_df[time_col])

### Geolocation Integration (IP Mapping)

In [None]:
# Attach a country to every transaction with a range lookup on the IP.
# merge_asof needs the right frame sorted on its key and both keys to share a
# dtype, so the keys are cast to float64 on temporary copies before merging.
ip_country_df = ip_country_df.sort_values('lower_bound_ip_address')
ip_lookup = ip_country_df.astype({'lower_bound_ip_address': 'float64'})

fraud_df = pd.merge_asof(
    fraud_df.astype({'ip_int': 'float64'}).sort_values('ip_int'),
    ip_lookup,
    left_on='ip_int',
    right_on='lower_bound_ip_address',
    direction='backward',
)

# The backward match only guarantees ip_int >= lower bound, so the upper bound
# still has to be checked. Transactions whose IP falls outside every known
# range are kept and labelled 'Unknown' instead of being dropped: removing
# them would silently shrink the dataset before modelling.
ip_in_range = fraud_df['ip_int'].between(
    fraud_df['lower_bound_ip_address'],
    fraud_df['upper_bound_ip_address'],
)
fraud_df['country'] = fraud_df['country'].where(ip_in_range, 'Unknown')
fraud_df = fraud_df.drop(
    columns=['lower_bound_ip_address', 'upper_bound_ip_address', 'ip_int']
)

# Analyze fraud by country ('Unknown' appears as its own group)
fraud_by_country = fraud_df.groupby('country')['class'].mean().sort_values(ascending=False)
print(fraud_by_country.head(10))
# Insight (illustrative - confirm against the printed output): fraud rates differ
# by country (e.g., hypothetical: Vietnam, Turkey >10%), justifying 'country' as a feature.

### Transaction Frequency and Velocity

In [None]:
# Order each user's purchases chronologically; the time-based rolling window
# below requires purchase_time to be increasing within every user.
fraud_df = fraud_df.sort_values(['user_id', 'purchase_time'])

# Seconds since the same user's previous purchase (NaN for a user's first purchase)
fraud_df['time_diff'] = fraud_df.groupby('user_id')['purchase_time'].diff().dt.total_seconds()

# Velocity: number of purchases by the same user in the 24 hours up to and
# including the current purchase. The window must be a time offset ('24h')
# evaluated on purchase_time; an integer window such as 24*3600 means
# "the last 86,400 rows", not "the last 24 hours".
txn_in_window = (
    fraud_df.assign(_txn=1)          # helper column: one unit per transaction
    .groupby('user_id')
    .rolling('24h', on='purchase_time')['_txn']
    .sum()
    .droplevel(0)                    # drop the user_id level -> original row index
)
fraud_df['velocity_24h'] = txn_in_window

# Frequency: Total transactions per user
fraud_df['freq_per_user'] = fraud_df.groupby('user_id')['user_id'].transform('count')

### Time-Based Features

In [None]:
# Calendar features taken from the purchase timestamp
purchase_ts = fraud_df['purchase_time']
fraud_df['hour_of_day'] = purchase_ts.dt.hour
fraud_df['day_of_week'] = purchase_ts.dt.dayofweek

# Elapsed time between signup and purchase, converted from seconds to hours
signup_to_purchase = purchase_ts - fraud_df['signup_time']
fraud_df['time_since_signup'] = signup_to_purchase.dt.total_seconds() / 3600
# Hypothesis to verify on the data: fraud tends to happen shortly after signup
# (e.g., median ~10 hours for class=1 vs. 100+ for class=0).

### Data Transformation (Normalization/Scaling and Encoding)

In [None]:
# Standardize the numerical features (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full frame before the train/test
# split performed later in this notebook, so test-set statistics leak into the
# scaling. Consider fitting on the training partition only.
numerical_cols = ['purchase_value', 'age', 'time_since_signup', 'velocity_24h', 'freq_per_user']
scaler = StandardScaler()
fraud_df[numerical_cols] = scaler.fit_transform(fraud_df[numerical_cols])

# One-hot encode the categorical features.
cat_cols = ['source', 'browser', 'sex', 'country']
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = pd.DataFrame(
    encoder.fit_transform(fraud_df[cat_cols]),
    columns=encoder.get_feature_names_out(),
    # Reuse fraud_df's index: pd.concat(axis=1) aligns on index labels, and
    # fraud_df no longer has a 0..n-1 index after the earlier sort/filter
    # steps. A default RangeIndex here would pair one-hot rows with the wrong
    # transactions and create NaN rows for unmatched labels.
    index=fraud_df.index,
)
fraud_df = pd.concat([fraud_df.drop(columns=cat_cols), encoded], axis=1)

# Sanity check: encoding must neither add rows nor leave gaps.
assert len(fraud_df) == len(encoded), "row count changed while attaching encoded columns"
assert fraud_df[encoded.columns].notna().all().all(), "encoded columns are misaligned"

### Analysis of Class Imbalance and Strategy for Handling It

In [None]:
# Class-imbalance handling: split first, then oversample the TRAINING set only,
# so synthetic samples never reach the test partition.
from sklearn.model_selection import train_test_split

# Build the feature matrix from model inputs only. SMOTE interpolates between
# rows, so it needs a purely numeric, NaN-free matrix:
#   * raw timestamps are dropped (already expressed as hour_of_day,
#     day_of_week and time_since_signup);
#   * identifiers are dropped (row keys, not generalizable signal);
#   * time_diff is dropped: it is NaN for each user's first purchase and is
#     not among the scaled numerical features.
# errors='ignore' tolerates columns that are absent from this dataset
# ('device_id' is assumed to exist in the raw file - confirm).
non_feature_cols = [
    'class', 'user_id', 'device_id', 'ip_address',
    'signup_time', 'purchase_time', 'time_diff',
]
X = (
    fraud_df
    .drop(columns=non_feature_cols, errors='ignore')
    .select_dtypes(include=['number', 'bool'])
)
y = fraud_df['class']
assert not X.isna().any().any(), "feature matrix contains NaN; SMOTE cannot resample it"

# Split the data into train and test sets (stratified to keep the class ratio)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(X_train, y_train)

# Document the class distribution before and after resampling (absolute counts)
print("Before: ", y_train.value_counts())
# Illustrative expectation: heavily imbalanced, e.g. roughly 90% / 10%
print("After: ", pd.Series(y_res).value_counts())
# Expected: both classes have the same count (50% / 50%)

In [None]:
# NOTE(review): this cell repeats the train/test split already performed in the
# class-imbalance cell above; consider keeping only one of the two.
from sklearn.model_selection import train_test_split

# Build the feature matrix from numeric model inputs only: raw timestamps,
# identifiers and the NaN-bearing time_diff intermediate are excluded so that
# downstream estimators receive a purely numeric, NaN-free matrix.
# errors='ignore' tolerates columns that are absent from this dataset
# ('device_id' is assumed to exist in the raw file - confirm).
non_feature_cols = [
    'class', 'user_id', 'device_id', 'ip_address',
    'signup_time', 'purchase_time', 'time_diff',
]
X = (
    fraud_df
    .drop(columns=non_feature_cols, errors='ignore')
    .select_dtypes(include=['number', 'bool'])
)
y = fraud_df['class']

# Split the data into train and test sets (stratified to keep the class ratio;
# the fixed random_state makes the partition reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)