In [3]:
# Import necessary libraries
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

In [4]:
fraud_data = pd.read_csv('/home/semre/fraud_detection_data_analysis/data/Fraud_Data.csv')
ip_data = pd.read_csv('/home/semre/fraud_detection_data_analysis/data/IpAddress_to_Country.csv')
creditcard_data = pd.read_csv('/home/semre/fraud_detection_data_analysis/data/creditcard.csv')

#### 1. Data Preparation

##### a. Feature and Target Separation

In [5]:
# Separate features and targets for both datasets
# Credit card dataset
X_creditcard = creditcard_data.drop(columns=['Class'])  # Features
y_creditcard = creditcard_data['Class']  # Target

# Fraud data dataset
X_fraud = fraud_data.drop(columns=['class'])  # Features
y_fraud = fraud_data['class']  # Target

##### b. Train-Test Split

In [6]:
from sklearn.model_selection import train_test_split

# Train-test split for credit card data
X_train_cc, X_test_cc, y_train_cc, y_test_cc = train_test_split(X_creditcard, y_creditcard, test_size=0.2, random_state=42)

# Train-test split for fraud data
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(X_fraud, y_fraud, test_size=0.2, random_state=42)

#### 2. Model Selection

##### a. Logistic Regression, Decision Tree, Random Forest, and Gradient Boosting

In [11]:
from scipy.sparse import hstack

def preprocess_data(X_train, X_test, date_columns=None, cat_columns=None):
    # Convert date columns if they exist
    if date_columns:
        for col in date_columns:
            X_train[col] = pd.to_datetime(X_train[col])
            X_test[col] = pd.to_datetime(X_test[col])
            X_train[f'{col}_hour'] = X_train[col].dt.hour
            X_train[f'{col}_day'] = X_train[col].dt.day
            X_test[f'{col}_hour'] = X_test[col].dt.hour
            X_test[f'{col}_day'] = X_test[col].dt.day
            X_train.drop(columns=[col], inplace=True)
            X_test.drop(columns=[col], inplace=True)

    # Handle categorical features using OneHotEncoder
    if cat_columns:
        encoder = OneHotEncoder(sparse=True, handle_unknown='ignore')  # Keep sparse
        X_train_cat = encoder.fit_transform(X_train[cat_columns])
        X_test_cat = encoder.transform(X_test[cat_columns])
        
        # Drop the original categorical columns
        X_train.drop(columns=cat_columns, inplace=True)
        X_test.drop(columns=cat_columns, inplace=True)
        
        # Concatenate the sparse matrices with the original DataFrames
        X_train = hstack([X_train, X_train_cat])
        X_test = hstack([X_test, X_test_cat])
    
    # Scale the numerical features
    scaler = StandardScaler(with_mean=False)  # Use with_mean=False to avoid issues with sparse data
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    return X_train_scaled, X_test_scaled

##### b. Multi-Layer Perceptron (MLP)

In [14]:
from sklearn.preprocessing import OneHotEncoder

def preprocess_data(X_train, X_test, date_columns=None, cat_columns=None):
    # Convert date columns if they exist
    if date_columns:
        for col in date_columns:
            X_train[col] = pd.to_datetime(X_train[col])
            X_test[col] = pd.to_datetime(X_test[col])

            # Extract useful datetime features (e.g., hour, day)
            X_train[f'{col}_hour'] = X_train[col].dt.hour
            X_train[f'{col}_day'] = X_train[col].dt.day
            X_test[f'{col}_hour'] = X_test[col].dt.hour
            X_test[f'{col}_day'] = X_test[col].dt.day

            # Drop original date columns if not needed
            X_train.drop(columns=[col], inplace=True)
            X_test.drop(columns=[col], inplace=True)
    
    # Handle categorical features using OneHotEncoder
    if cat_columns:
        # Apply one-hot encoding to categorical columns
        encoder = OneHotEncoder(sparse_output=True, handle_unknown='ignore')  # Change made here
        X_train_cat = encoder.fit_transform(X_train[cat_columns])
        X_test_cat = encoder.transform(X_test[cat_columns])
        
        # Convert to DataFrame and reset index to merge
        X_train_cat_df = pd.DataFrame(X_train_cat.toarray(), columns=encoder.get_feature_names_out(cat_columns), index=X_train.index)
        X_test_cat_df = pd.DataFrame(X_test_cat.toarray(), columns=encoder.get_feature_names_out(cat_columns), index=X_test.index)
        
        # Drop the original categorical columns and merge the encoded data
        X_train = X_train.drop(columns=cat_columns).join(X_train_cat_df)
        X_test = X_test.drop(columns=cat_columns).join(X_test_cat_df)
    
    # Scale the data using StandardScaler
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    return X_train_scaled, X_test_scaled

#### 4. MLOps Steps

In [16]:
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Set experiment name
mlflow.set_experiment("fraud-detection")

# Start MLflow run
with mlflow.start_run():
    # Example: Assuming log_reg is your trained logistic regression model
    log_reg = LogisticRegression(max_iter=1000)  # Your model initialization and training goes here

    # Create an example input DataFrame (adjust the columns as per your model)
    example_input = pd.DataFrame({
        'feature1': [0.5],  # Replace 'feature1', 'feature2', etc., with actual feature names
        'feature2': [1.5],
        # Add more features as needed
    })

    # Log the model with input example
    mlflow.sklearn.log_model(log_reg, "logistic_regression", input_example=example_input)

    # Log parameters
    mlflow.log_param("max_iter", 1000)
    
    # Log metrics
    mlflow.log_metric("accuracy", accuracy_score(y_test_cc, y_pred_cc))
    

  "dataframe_split": {
    "columns": [
      "feature1",
      "feature2"
    ],
    "data": [
      [
        0.5,
        1.5
      ]
    ]
  }
}. Alternatively, you can avoid passing input example and pass model signature instead when logging the model. To ensure the input example is valid prior to serving, please try calling `mlflow.models.validate_serving_input` on the model uri and serving input example. A serving input example can be generated from model input example using `mlflow.models.convert_input_example_to_serving_input` function.
Got error: This LogisticRegression instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
