In [61]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, LSTM, GRU, Dropout
import mlflow
import mlflow.tensorflow

In [50]:
# Load preprocessed data
pre_fraud_data_df = pd.read_csv('../data/preprocessed_fraud_data.csv')

In [51]:
pre_fraud_data_df

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,transaction_frequency,transaction_velocity,hour_of_day,day_of_week
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,89215,2,0,1,39,732758368,0,84,1,0.000008,2,5
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,24078,0,0,0,53,350311387,0,171,1,0.000892,1,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,131216,2,3,1,53,2621473820,1,171,1,15.000000,18,3
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,3977,2,4,1,41,3840542443,0,181,1,0.000089,13,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,68757,0,4,1,45,415583117,0,171,1,0.000009,18,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,2015-03-29 00:30:47,43,125335,2,0,1,28,3451154526,1,171,1,0.000008,0,6
151108,274471,2015-05-15 17:43:29,2015-05-26 12:24:39,35,63001,2,4,1,32,2439047221,0,118,1,0.000038,12,1
151109,368416,2015-03-03 23:07:31,2015-05-20 07:07:47,40,64204,2,2,0,26,2748470523,0,84,1,0.000006,7,2
151110,207709,2015-07-09 20:06:07,2015-09-07 09:34:46,46,13118,2,0,1,37,3601174708,0,171,1,0.000009,9,0


In [52]:
#Function to extract datetime features month and year 
def extract_datetime_features(df, datetime_column):
    df_copy = df.copy()
    df_copy[datetime_column] = pd.to_datetime(df_copy[datetime_column])
    df_copy['year'] = df_copy[datetime_column].dt.year
    df_copy['month'] = df_copy[datetime_column].dt.month
    df_copy = df_copy.drop(columns=[datetime_column])
    return df_copy
datetime_column = 'purchase_time' 
fraud_data_df = extract_datetime_features(pre_fraud_data_df, datetime_column)

In [53]:
fraud_data_df 

Unnamed: 0,user_id,signup_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,transaction_frequency,transaction_velocity,hour_of_day,day_of_week,year,month
0,22058,2015-02-24 22:55:49,34,89215,2,0,1,39,732758368,0,84,1,0.000008,2,5,2015,4
1,333320,2015-06-07 20:39:50,16,24078,0,0,0,53,350311387,0,171,1,0.000892,1,0,2015,6
2,1359,2015-01-01 18:52:44,15,131216,2,3,1,53,2621473820,1,171,1,15.000000,18,3,2015,1
3,150084,2015-04-28 21:13:25,44,3977,2,4,1,41,3840542443,0,181,1,0.000089,13,0,2015,5
4,221365,2015-07-21 07:09:52,39,68757,0,4,1,45,415583117,0,171,1,0.000009,18,2,2015,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,43,125335,2,0,1,28,3451154526,1,171,1,0.000008,0,6,2015,3
151108,274471,2015-05-15 17:43:29,35,63001,2,4,1,32,2439047221,0,118,1,0.000038,12,1,2015,5
151109,368416,2015-03-03 23:07:31,40,64204,2,2,0,26,2748470523,0,84,1,0.000006,7,2,2015,5
151110,207709,2015-07-09 20:06:07,46,13118,2,0,1,37,3601174708,0,171,1,0.000009,9,0,2015,9


In [46]:
fraud_data_df .info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 17 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   user_id                151112 non-null  int64  
 1   signup_time            151112 non-null  object 
 2   purchase_value         151112 non-null  int64  
 3   device_id              151112 non-null  int64  
 4   source                 151112 non-null  int64  
 5   browser                151112 non-null  int64  
 6   sex                    151112 non-null  int64  
 7   age                    151112 non-null  int64  
 8   ip_address             151112 non-null  int64  
 9   class                  151112 non-null  int64  
 10  country                151112 non-null  int64  
 11  transaction_frequency  151112 non-null  int64  
 12  transaction_velocity   151112 non-null  float64
 13  hour_of_day            151112 non-null  int64  
 14  day_of_week            151112 non-nu

In [59]:

# Convert 'year' and 'month' columns to int64
fraud_data_df['year'] = fraud_data_df['year'].astype('int64')
fraud_data_df['month'] = fraud_data_df['month'].astype('int64')
fraud_data_df['transaction_velocity'] = fraud_data_df['transaction_velocity'].astype('int64')

In [58]:
fraud_data_df

Unnamed: 0,user_id,signup_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,transaction_frequency,transaction_velocity,hour_of_day,day_of_week,year,month
0,22058,2015-02-24 22:55:49,34,89215,2,0,1,39,732758368,0,84,1,0,2,5,2015,4
1,333320,2015-06-07 20:39:50,16,24078,0,0,0,53,350311387,0,171,1,0,1,0,2015,6
2,1359,2015-01-01 18:52:44,15,131216,2,3,1,53,2621473820,1,171,1,15,18,3,2015,1
3,150084,2015-04-28 21:13:25,44,3977,2,4,1,41,3840542443,0,181,1,0,13,0,2015,5
4,221365,2015-07-21 07:09:52,39,68757,0,4,1,45,415583117,0,171,1,0,18,2,2015,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,43,125335,2,0,1,28,3451154526,1,171,1,0,0,6,2015,3
151108,274471,2015-05-15 17:43:29,35,63001,2,4,1,32,2439047221,0,118,1,0,12,1,2015,5
151109,368416,2015-03-03 23:07:31,40,64204,2,2,0,26,2748470523,0,84,1,0,7,2,2015,5
151110,207709,2015-07-09 20:06:07,46,13118,2,0,1,37,3601174708,0,171,1,0,9,0,2015,9


In [62]:
# Encode Categorical Features
def encode_categorical_features(df):
    le = LabelEncoder()
    for column in df.select_dtypes(include=['object']).columns:
        df[column] = le.fit_transform(df[column])
    return df
fraud_data_df = encode_categorical_features(fraud_data_df)

In [63]:
fraud_data_df

Unnamed: 0,user_id,signup_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,transaction_frequency,transaction_velocity,hour_of_day,day_of_week,year,month
0,22058,41956,34,89215,2,0,1,39,732758368,0,84,1,0,2,5,2015,4
1,333320,106645,16,24078,0,0,0,53,350311387,0,171,1,0,1,0,2015,6
2,1359,971,15,131216,2,3,1,53,2621473820,1,171,1,15,18,3,2015,1
3,150084,81672,44,3977,2,4,1,41,3840542443,0,181,1,0,13,0,2015,5
4,221365,133741,39,68757,0,4,1,45,415583117,0,171,1,0,18,2,2015,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,23950,43,125335,2,0,1,28,3451154526,1,171,1,0,0,6,2015,3
151108,274471,92134,35,63001,2,4,1,32,2439047221,0,118,1,0,12,1,2015,5
151109,368416,46366,40,64204,2,2,0,26,2748470523,0,84,1,0,7,2,2015,5
151110,207709,126488,46,13118,2,0,1,37,3601174708,0,171,1,0,9,0,2015,9


In [64]:
# Save preprocessed datasets
fraud_data_df.to_csv('cleaned_fraud_data.csv', index=False)

In [66]:
cleaned_fraud_data = pd.read_csv('../data/cleaned_fraud_data.csv')

In [67]:
 #Define functions for model building and evaluation

def build_cnn_model(input_shape):
    model = Sequential()
    model.add(Conv1D(filters=32, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    return model

In [68]:
def build_rnn_model(input_shape):
    model = Sequential()
    model.add(GRU(units=64, return_sequences=True, input_shape=input_shape))
    model.add(GRU(units=32))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [69]:
def build_lstm_model(input_shape):
    model = Sequential()
    model.add(LSTM(units=64, return_sequences=True, input_shape=input_shape))
    model.add(LSTM(units=32))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [70]:

def train_and_evaluate_model(model, X_train, y_train, X_test, y_test, model_name, experiment_name):
    # MLflow tracking
    mlflow.set_tracking_uri('http://localhost:5000')  # Set your MLflow tracking server
    mlflow.set_experiment(experiment_name)
    with mlflow.start_run():
        mlflow.log_param('model', model_name)

        # Compile the model
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

        history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

        # Evaluate the model
        y_pred = model.predict(X_test)
        y_pred_binary = (y_pred > 0.5).astype(int)

        accuracy = accuracy_score(y_test, y_pred_binary)
        precision = precision_score(y_test, y_pred_binary)
        recall = recall_score(y_test, y_pred_binary)
        f1 = f1_score(y_test, y_pred_binary)
        roc_auc = roc_auc_score(y_test, y_pred)

        # Log metrics to MLflow
        mlflow.log_metric('accuracy', accuracy)
        mlflow.log_metric('precision', precision)
        mlflow.log_metric('recall', recall)
        mlflow.log_metric('f1_score', f1)
        mlflow.log_metric('roc_auc', roc_auc)

        # Log model to MLflow
        mlflow.tensorflow.log_model(model, artifact_path='model')

        print(f'Model: {model_name}')
        print(f'Accuracy: {accuracy:.4f}')
        print(f'Precision: {precision:.4f}')
        print(f'Recall: {recall:.4f}')
        print(f'F1 Score: {f1:.4f}')
        print(f'ROC AUC: {roc_auc:.4f}')


In [71]:
# Data Preparation
# Fraud Data
X_fraud = cleaned_fraud_data.drop('class', axis=1)
y_fraud = cleaned_fraud_data['class']
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)

In [72]:
X_fraud_train = X_fraud_train.values.reshape(X_fraud_train.shape[0], X_fraud_train.shape[1], 1)
X_fraud_test = X_fraud_test.values.reshape(X_fraud_test.shape[0], X_fraud_test.shape[1], 1)

In [73]:
# Fraud Data
print("Fraud Data - CNN")
cnn_model_fraud = build_cnn_model(X_fraud_train.shape[1:])
train_and_evaluate_model(cnn_model_fraud, X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test, 'CNN', 'Fraud Detection')

Fraud Data - CNN


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2ms/step - accuracy: 0.8183 - loss: 4831446.0000 - val_accuracy: 0.9057 - val_loss: 3260006.7500
Epoch 2/5
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 1ms/step - accuracy: 0.8367 - loss: 978372.8125 - val_accuracy: 0.9056 - val_loss: 292810.8750
Epoch 3/5
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.8348 - loss: 898857.0000 - val_accuracy: 0.9055 - val_loss: 971917.1875
Epoch 4/5
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.8353 - loss: 675151.3125 - val_accuracy: 0.9055 - val_loss: 1528416.0000
Epoch 5/5
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step - accuracy: 0.8370 - loss: 792270.1250 - val_accuracy: 0.9040 - val_loss: 285545.3750
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 755us/step




Model: CNN
Accuracy: 0.9040
Precision: 0.3571
Recall: 0.0228
F1 Score: 0.0429
ROC AUC: 0.5093




In [74]:
print("Fraud Data - LSTM")
lstm_model_fraud = build_lstm_model(X_fraud_train.shape[1:])
train_and_evaluate_model(lstm_model_fraud, X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test, 'LSTM', 'Fraud Detection')

Fraud Data - LSTM
Epoch 1/5


  super().__init__(**kwargs)


[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 9ms/step - accuracy: 0.9483 - loss: 0.1982 - val_accuracy: 0.9564 - val_loss: 0.1771
Epoch 2/5
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 9ms/step - accuracy: 0.9587 - loss: 0.1701 - val_accuracy: 0.9564 - val_loss: 0.1769
Epoch 3/5
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 9ms/step - accuracy: 0.9567 - loss: 0.1765 - val_accuracy: 0.9564 - val_loss: 0.1770
Epoch 4/5
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 9ms/step - accuracy: 0.9561 - loss: 0.1780 - val_accuracy: 0.9564 - val_loss: 0.1768
Epoch 5/5
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 9ms/step - accuracy: 0.9571 - loss: 0.1748 - val_accuracy: 0.9564 - val_loss: 0.1768
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step




Model: LSTM
Accuracy: 0.9564
Precision: 1.0000
Recall: 0.5379
F1 Score: 0.6995
ROC AUC: 0.7717


In [75]:

print("Fraud Data - RNN")
rnn_model_fraud = build_rnn_model(X_fraud_train.shape[1:])
train_and_evaluate_model(rnn_model_fraud, X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test, 'RNN', 'Fraud Detection')

Fraud Data - RNN
Epoch 1/5


  super().__init__(**kwargs)


[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 12ms/step - accuracy: 0.9471 - loss: 0.1967 - val_accuracy: 0.9564 - val_loss: 0.1772
Epoch 2/5
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 11ms/step - accuracy: 0.9569 - loss: 0.1759 - val_accuracy: 0.9564 - val_loss: 0.1771
Epoch 3/5
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 11ms/step - accuracy: 0.9563 - loss: 0.1774 - val_accuracy: 0.9564 - val_loss: 0.1770
Epoch 4/5
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 10ms/step - accuracy: 0.9552 - loss: 0.1809 - val_accuracy: 0.9564 - val_loss: 0.1783
Epoch 5/5
[1m3778/3778[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 11ms/step - accuracy: 0.9562 - loss: 0.1780 - val_accuracy: 0.9564 - val_loss: 0.1770
[1m945/945[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step




Model: RNN
Accuracy: 0.9564
Precision: 1.0000
Recall: 0.5379
F1 Score: 0.6995
ROC AUC: 0.7589
