In [1]:
# Load the lookup table that maps IP-address ranges to countries.
ip_df = pd.read_csv('data/raw/IpAddress_to_Country.csv')

# Cast IP values to int64 with vectorized astype instead of a per-row Python lambda.
# ip_address goes through float64 first, mirroring int(float(x)): any fractional
# part is truncated toward zero.
fraud_df['ip_int'] = fraud_df['ip_address'].astype('float64').astype('int64')
for bound_col in ('lower_bound_ip_address', 'upper_bound_ip_address'):
    ip_df[bound_col] = ip_df[bound_col].astype('int64')

# merge_asof requires both frames to be sorted on their join keys.
fraud_df = fraud_df.sort_values('ip_int')
ip_df = ip_df.sort_values('lower_bound_ip_address')

# Range-based merge: attach to each transaction the range with the greatest
# lower bound that is <= ip_int (merge_asof's default backward search).
merged_df = pd.merge_asof(fraud_df, ip_df, left_on='ip_int', right_on='lower_bound_ip_address')

# Keep only transactions whose IP really falls inside the matched range. Rows with
# no matched range (NaN upper bound) compare False and are dropped as well.
# .copy() makes the result an independent frame, so later cells can add columns
# to merged_df without triggering SettingWithCopyWarning.
in_range = merged_df['ip_int'] <= merged_df['upper_bound_ip_address']
merged_df = merged_df[in_range].copy()

# Fraud rate ('mean' of class) and transaction volume ('count') per country,
# ordered from highest to lowest fraud rate.
fraud_by_country = (
    merged_df.groupby('country')['class']
    .agg(['mean', 'count'])
    .sort_values('mean', ascending=False)
)
print(fraud_by_country.head(10))  # Top countries with highest fraud rate

# Visualize the ten highest fraud rates on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
fraud_by_country['mean'].head(10).plot(kind='bar', ax=ax)
ax.set_title('Fraud Rate by Top 10 Countries')
ax.set_ylabel('Fraud rate')
fig.tight_layout()  # keep the rotated country labels inside the saved image
fig.savefig('figures/fraud_by_country.png')

# Save processed with country
merged_df.to_csv('data/processed/fraud_with_country.csv', index=False)

NameError: name 'pd' is not defined

In [None]:
# Time-based features.
# assign() builds a new frame instead of writing columns into merged_df in place:
# merged_df can be a boolean-filtered slice of another frame, and in-place column
# writes on such a slice raise SettingWithCopyWarning. Re-running this cell simply
# overwrites the same three columns.
# NOTE(review): assumes purchase_time and signup_time are already datetime64 - confirm
# where fraud_df is loaded.
merged_df = merged_df.assign(
    # Time_since_signup (in hours)
    time_since_signup=lambda d: (d['purchase_time'] - d['signup_time']).dt.total_seconds() / 3600,
    hour_of_day=lambda d: d['purchase_time'].dt.hour,
    # .dt.weekday numbers the days Monday=0 ... Sunday=6
    day_of_week=lambda d: d['purchase_time'].dt.weekday,
)

# Transaction frequency per user: total number of rows recorded for each user_id.
user_trans_count = merged_df.groupby('user_id').size().reset_index(name='trans_count')

# Drop any trans_count left over from an earlier run of this cell before merging;
# otherwise a re-run would produce trans_count_x / trans_count_y instead of a
# single trans_count column.
merged_df = (
    merged_df
    .drop(columns='trans_count', errors='ignore')
    .merge(user_trans_count, on='user_id')
)

# More advanced velocity: transactions in last 24 hours (per user)
def count_in_window(group, window_hours=24):
    """Count, for each row, the other transactions in the preceding time window.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to process together (one user's transactions when used per user).
        Must contain a datetime ``purchase_time`` column.
    window_hours : int or float, default 24
        Length of the look-back window in hours.

    Returns
    -------
    pandas.DataFrame
        A copy of ``group`` sorted by ``purchase_time`` with a new
        ``trans_in_24h`` column (the column keeps this name whatever
        ``window_hours`` is). The input frame is not modified.
    """
    group = group.sort_values('purchase_time')
    # A time-based rolling window needs a datetime-like index, and rolling cannot
    # aggregate datetime values themselves. So count a column of ones that is
    # indexed by purchase_time instead of rolling over purchase_time directly.
    ones = pd.Series(1.0, index=pd.DatetimeIndex(group['purchase_time']))
    in_window = ones.rolling(window=pd.Timedelta(hours=window_hours), min_periods=1).count()
    # The window always contains the row itself, so subtract it. Assign by position
    # (to_numpy) because `ones` is indexed by time, not by the frame's own index.
    group['trans_in_24h'] = in_window.to_numpy() - 1  # Exclude self
    return group

# Run the per-user velocity count and stitch the pieces back into one frame.
# Iterating over the groups hands count_in_window each user's complete rows,
# user_id included. groupby(...).apply() is avoided on purpose: pandas 2.2
# deprecated passing the grouping column to the applied function and newer
# releases exclude it, which together with reset_index(drop=True) would drop
# user_id from the result.
user_frames = [count_in_window(user_rows) for _, user_rows in merged_df.groupby('user_id')]
merged_df = (
    pd.concat(user_frames, ignore_index=True)
    if user_frames
    # No groups (empty input): still add the column so downstream code finds it.
    else count_in_window(merged_df).reset_index(drop=True)
)

# Save engineered
merged_df.to_csv('data/processed/engineered_fraud.csv', index=False)

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Columns that get standardized.
numerical_cols = [
    'purchase_value',
    'age',
    'time_since_signup',
    'hour_of_day',
    'trans_count',
    'trans_in_24h',
]

# Columns that get one-hot encoded.
categorical_cols = [
    'source',
    'browser',
    'sex',
    'country',
]

# One transformer per column group. handle_unknown='ignore' makes the encoder
# emit all-zero indicators for categories it did not see while fitting.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

# The preprocessor is deliberately left unfitted here; fit it after the
# train/test split in the modeling step. Demo call, kept disabled:
# transformed = preprocessor.fit_transform(merged_df)

In [None]:
from imblearn.over_sampling import SMOTE

# Class balance before resampling.
print(merged_df['class'].value_counts(normalize=True))

# SMOTE demonstration only - in practice resample the training split, never the
# full dataset.
X_demo = merged_df.drop(columns='class')  # everything except the target
y_demo = merged_df['class']

smote = SMOTE(random_state=42)
# For this demo only the float/int feature columns are passed to SMOTE.
X_res, y_res = smote.fit_resample(
    X_demo.select_dtypes(include=['float', 'int']),
    y_demo,
)

# Class balance after resampling.
print(y_res.value_counts(normalize=True))  # Should be ~50/50