import libraries

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import mlflow
import mlflow.sklearn

In [2]:
# Load preprocessed data
pre_credit_data = pd.read_csv('../data/preprocessed_creditcard_data.csv')
pre_fraud_data_df = pd.read_csv('../data/preprocessed_fraud_data.csv')

additional Data Preparation

In [3]:
# Feature and target separation
X_credit = pre_credit_data.drop(columns=['Class'])
y_credit = pre_credit_data['Class']

X_fraud = pre_fraud_data_df.drop(columns=['class'])
y_fraud = pre_fraud_data_df['class']

In [4]:
#Function to extract datetime features month and year 
def extract_datetime_features(df, datetime_column):
    df_copy = df.copy()
    df_copy[datetime_column] = pd.to_datetime(df_copy[datetime_column])
    df_copy['year'] = df_copy[datetime_column].dt.year
    df_copy['month'] = df_copy[datetime_column].dt.month
    df_copy = df_copy.drop(columns=[datetime_column])
    return df_copy

In [5]:
datetime_column = 'purchase_time' 
X_fraud = extract_datetime_features(X_fraud, datetime_column)

In [6]:
# Identify numeric and categorical columns
numeric_features = X_fraud.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_features = X_fraud.select_dtypes(include=['object']).columns.tolist()

In [7]:
# Preprocessing pipeline for numeric and categorical data
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])


In [8]:
# Preprocess the data
X_fraud = preprocessor.fit_transform(X_fraud)

# Train-test split
X_credit_train, X_credit_test, y_credit_train, y_credit_test = train_test_split(X_credit, y_credit, test_size=0.2, random_state=42)
X_fraud_train, X_fraud_test, y_fraud_train, y_fraud_test = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)

Model Selection

In [10]:
# Define models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "MLP": MLPClassifier(max_iter=1000)
}

Model Training and Evaluation

In [11]:
# Helper function to train and evaluate models
def train_and_evaluate(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred
    
    print(classification_report(y_test, y_pred))
    print("AUC-ROC:", roc_auc_score(y_test, y_prob))
    
    return model

# MLOps Steps
Versioning and Experiment Tracking

In [13]:
# Function to log experiments using MLflow
def log_experiment(model_name, model, X_train, y_train, X_test, y_test):
    with mlflow.start_run(run_name=model_name):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1] if hasattr(model, "predict_proba") else y_pred
        
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        auc = roc_auc_score(y_test, y_prob)
        
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_param("model_name", model_name)
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("auc", auc)
        
        mlflow.sklearn.log_model(model, "model")
        
        print(f"{model_name} logged successfully.")



In [15]:
# Evaluate and log models for fraud data
print("\nEvaluating models for fraud data:")
for name, model in models.items():
    print(f"\n{name}")
    trained_model = train_and_evaluate(model, X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test)
    log_experiment(name, trained_model, X_fraud_train, y_fraud_train, X_fraud_test, y_fraud_test)


Evaluating models for fraud data:

Logistic Regression
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223

AUC-ROC: 0.7643414915837798




Logistic Regression logged successfully.

Decision Tree
              precision    recall  f1-score   support

           0       0.95      1.00      0.98     27373
           1       1.00      0.54      0.70      2850

    accuracy                           0.96     30223
   macro avg       0.98      0.77      0.84     30223
weighted avg       0.96      0.96      0.95     30223

AUC-ROC: 0.7689473684210526




Decision Tree logged successfully.

Random Forest


In [14]:
# Evaluate and log models for credit card data
print("Evaluating models for credit card data:")
for name, model in models.items():
    print(f"\n{name}")
    trained_model = train_and_evaluate(model, X_credit_train, y_credit_train, X_credit_test, y_credit_test)
    log_experiment(name, trained_model, X_credit_train, y_credit_train, X_credit_test, y_credit_test)



Evaluating models for credit card data:

Logistic Regression


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.85      0.59      0.70        90

    accuracy                           1.00     56746
   macro avg       0.93      0.79      0.85     56746
weighted avg       1.00      1.00      1.00     56746

AUC-ROC: 0.9430624195927076


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression logged successfully.

Decision Tree
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.66      0.72      0.69        90

    accuracy                           1.00     56746
   macro avg       0.83      0.86      0.84     56746
weighted avg       1.00      1.00      1.00     56746

AUC-ROC: 0.8608110546298912




Decision Tree logged successfully.

Random Forest
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.97      0.72      0.83        90

    accuracy                           1.00     56746
   macro avg       0.98      0.86      0.91     56746
weighted avg       1.00      1.00      1.00     56746

AUC-ROC: 0.9369927476544605




Random Forest logged successfully.

Gradient Boosting
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.90      0.63      0.75        90

    accuracy                           1.00     56746
   macro avg       0.95      0.82      0.87     56746
weighted avg       1.00      1.00      1.00     56746

AUC-ROC: 0.7665101666195988




Gradient Boosting logged successfully.

MLP
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56656
           1       0.52      0.52      0.52        90

    accuracy                           1.00     56746
   macro avg       0.76      0.76      0.76     56746
weighted avg       1.00      1.00      1.00     56746

AUC-ROC: 0.8558014842009476
MLP logged successfully.




In [None]:
from sklearn.neural_network import MLPClassifier
from keras.models import Sequential
from keras.layers import Dense, Conv1D, LSTM
# CNN
cnn = Sequential()
cnn.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_credit_train.shape[1], 1)))
# Add more layers as needed
cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# LSTM
lstm = Sequential()
lstm.add(LSTM(units=64, input_shape=(X_fraud_train.shape[1], 1)))
# Add more layers as needed
lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])