In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import mlflow
import mlflow.sklearn


In [22]:
# Load preprocessed data
pre_fraud_data_df = pd.read_csv('../data/preprocessed_fraud_data.csv')

In [23]:
pre_fraud_data_df

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,transaction_frequency,transaction_velocity,hour_of_day,day_of_week
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,89215,2,0,1,39,732758368,0,84,1,0.000008,2,5
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,24078,0,0,0,53,350311387,0,171,1,0.000892,1,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,131216,2,3,1,53,2621473820,1,171,1,15.000000,18,3
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,3977,2,4,1,41,3840542443,0,181,1,0.000089,13,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,68757,0,4,1,45,415583117,0,171,1,0.000009,18,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,2015-03-29 00:30:47,43,125335,2,0,1,28,3451154526,1,171,1,0.000008,0,6
151108,274471,2015-05-15 17:43:29,2015-05-26 12:24:39,35,63001,2,4,1,32,2439047221,0,118,1,0.000038,12,1
151109,368416,2015-03-03 23:07:31,2015-05-20 07:07:47,40,64204,2,2,0,26,2748470523,0,84,1,0.000006,7,2
151110,207709,2015-07-09 20:06:07,2015-09-07 09:34:46,46,13118,2,0,1,37,3601174708,0,171,1,0.000009,9,0


In [24]:
# Load the fraud dataset
X_fraud = pre_fraud_data_df.drop(columns=['class'])
y_fraud = pre_fraud_data_df['class']

In [25]:
#Function to extract datetime features month and year 
def extract_datetime_features(df, datetime_column):
    df_copy = df.copy()
    df_copy[datetime_column] = pd.to_datetime(df_copy[datetime_column])
    df_copy['year'] = df_copy[datetime_column].dt.year
    df_copy['month'] = df_copy[datetime_column].dt.month
    df_copy = df_copy.drop(columns=[datetime_column])
    return df_copy
datetime_column = 'purchase_time' 
X_fraud = extract_datetime_features(X_fraud, datetime_column)

In [26]:
X_fraud

Unnamed: 0,user_id,signup_time,purchase_value,device_id,source,browser,sex,age,ip_address,country,transaction_frequency,transaction_velocity,hour_of_day,day_of_week,year,month
0,22058,2015-02-24 22:55:49,34,89215,2,0,1,39,732758368,84,1,0.000008,2,5,2015,4
1,333320,2015-06-07 20:39:50,16,24078,0,0,0,53,350311387,171,1,0.000892,1,0,2015,6
2,1359,2015-01-01 18:52:44,15,131216,2,3,1,53,2621473820,171,1,15.000000,18,3,2015,1
3,150084,2015-04-28 21:13:25,44,3977,2,4,1,41,3840542443,181,1,0.000089,13,0,2015,5
4,221365,2015-07-21 07:09:52,39,68757,0,4,1,45,415583117,171,1,0.000009,18,2,2015,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,43,125335,2,0,1,28,3451154526,171,1,0.000008,0,6,2015,3
151108,274471,2015-05-15 17:43:29,35,63001,2,4,1,32,2439047221,118,1,0.000038,12,1,2015,5
151109,368416,2015-03-03 23:07:31,40,64204,2,2,0,26,2748470523,84,1,0.000006,7,2,2015,5
151110,207709,2015-07-09 20:06:07,46,13118,2,0,1,37,3601174708,171,1,0.000009,9,0,2015,9


In [27]:
# Identify numeric and categorical columns
numeric_features = X_fraud.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_fraud.select_dtypes(include=['object']).columns.tolist()

In [33]:
# Preprocessing pipeline for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])
# Preprocess the data
X_fraud = preprocessor.fit_transform(pre_fraud_data_df)

In [34]:

 # Split the data into training and test sets
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)

# Define models
models = {
    #"Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "MLP": MLPClassifier(max_iter=1000)
}







In [35]:
# Helper function to train and evaluate models
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred
    
    print(classification_report(y_test, y_pred))
    print("AUC-ROC:", roc_auc_score(y_test, y_prob))
    
    return model

In [39]:
# Function to log experiments using MLflow
def log_experiment(model_name, model, X_train, y_train, X_test, y_test):
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_prob)
        
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("auc", auc)
        
        mlflow.sklearn.log_model(model, "model")
        
        print(f"{model_name} logged successfully.")

In [40]:
# Evaluate and log models for fraud data
print("\nEvaluating models for fraud data:")
for name, model in models.items():
    print(f"\n{name}")
    trained_model = train_and_evaluate(model, X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test)
    log_experiment(name, trained_model, X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test)


Evaluating models for fraud data:

Gradient Boosting
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223

AUC-ROC: 0.7688141471202575
Gradient Boosting logged successfully.

MLP




              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223

AUC-ROC: 0.7709365292088951




MLP logged successfully.
