In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for plots
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 8)

ImportError: cannot import name '_is_pandas_df' from 'sklearn.utils.validation' (/home/rahel/Desktop/Improved detection of frauds/venv/lib/python3.12/site-packages/sklearn/utils/validation.py)

In [1]:
# Load processed data
fraud_df = pd.read_csv('../data/processed/fraud_data_processed.csv')
credit_df = pd.read_csv('../data/processed/creditcard_processed.csv')

print("Fraud data shape:", fraud_df.shape)
print("Credit card data shape:", credit_df.shape)

NameError: name 'pd' is not defined

## Fraud_Data.csv Feature Engineering

In [5]:
# Convert timestamps back to datetime if needed
fraud_df['signup_time'] = pd.to_datetime(fraud_df['signup_time'])
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'])

# Additional time features
fraud_df['signup_dayofweek'] = fraud_df['signup_time'].dt.dayofweek
fraud_df['purchase_dayofweek'] = fraud_df['purchase_time'].dt.dayofweek
fraud_df['is_weekend_signup'] = fraud_df['signup_dayofweek'].isin([5, 6]).astype(int)
fraud_df['is_weekend_purchase'] = fraud_df['purchase_dayofweek'].isin([5, 6]).astype(int)

# Transaction velocity features (simplified - in real scenario would need user history)
# For demo, we'll create proxy features
fraud_df['hourly_transaction_density'] = fraud_df.groupby('purchase_hour')['purchase_value'].transform('count')
fraud_df['device_transaction_count'] = fraud_df.groupby('device_id')['device_id'].transform('count')

print("Additional features added to fraud data")

Additional features added to fraud data


In [6]:
# Prepare features for modeling
# Drop unnecessary columns
cols_to_drop = ['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address', 'ip_address_int']
fraud_features = fraud_df.drop(cols_to_drop + ['class'], axis=1)
fraud_target = fraud_df['class']

# Identify column types
numerical_cols = ['purchase_value', 'age', 'time_since_signup', 'signup_hour', 'signup_day', 
                  'signup_month', 'purchase_hour', 'purchase_day', 'purchase_month',
                  'signup_dayofweek', 'purchase_dayofweek', 'hourly_transaction_density', 
                  'device_transaction_count']
categorical_cols = ['source', 'browser', 'sex', 'country']

print("Numerical columns:", numerical_cols)
print("Categorical columns:", categorical_cols)

Numerical columns: ['purchase_value', 'age', 'time_since_signup', 'signup_hour', 'signup_day', 'signup_month', 'purchase_hour', 'purchase_day', 'purchase_month', 'signup_dayofweek', 'purchase_dayofweek', 'hourly_transaction_density', 'device_transaction_count']
Categorical columns: ['source', 'browser', 'sex', 'country']


In [7]:
# Create preprocessing pipeline
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Fit and transform
X_fraud_processed = preprocessor.fit_transform(fraud_features)

# Get feature names
num_feature_names = numerical_cols
cat_feature_names = preprocessor.named_transformers_['cat'].get_feature_names_out(categorical_cols)
feature_names = np.concatenate([num_feature_names, cat_feature_names])

print("Processed fraud data shape:", X_fraud_processed.shape)
print("Number of features:", len(feature_names))

Processed fraud data shape: (151112, 24)
Number of features: 24


In [8]:
# Apply SMOTE for class imbalance
smote = SMOTE(random_state=42)
X_fraud_smote, y_fraud_smote = smote.fit_resample(X_fraud_processed, fraud_target)

print("Original class distribution:")
print(fraud_target.value_counts())
print("\nAfter SMOTE:")
print(pd.Series(y_fraud_smote).value_counts())

NameError: name 'SMOTE' is not defined

## Credit Card Data Feature Engineering

In [None]:
# Credit card data preprocessing (minimal since already PCA transformed)
X_credit = credit_df.drop('Class', axis=1)
y_credit = credit_df['Class']

# Scale Amount and Time
scaler = StandardScaler()
X_credit_scaled = scaler.fit_transform(X_credit)

print("Credit card data shape:", X_credit_scaled.shape)

In [None]:
# Apply SMOTE to credit card data
X_credit_smote, y_credit_smote = smote.fit_resample(X_credit_scaled, y_credit)

print("Original credit card class distribution:")
print(y_credit.value_counts())
print("\nAfter SMOTE:")
print(pd.Series(y_credit_smote).value_counts())

In [None]:
# Save processed datasets
import joblib

# Save fraud data
joblib.dump(X_fraud_smote, '../data/processed/X_fraud_smote.pkl')
joblib.dump(y_fraud_smote, '../data/processed/y_fraud_smote.pkl')
joblib.dump(feature_names, '../data/processed/fraud_feature_names.pkl')

# Save credit card data
joblib.dump(X_credit_smote, '../data/processed/X_credit_smote.pkl')
joblib.dump(y_credit_smote, '../data/processed/y_credit_smote.pkl')

# Save preprocessors
joblib.dump(preprocessor, '../models/fraud_preprocessor.pkl')
joblib.dump(scaler, '../models/credit_scaler.pkl')

print("All processed data and models saved!")