In [None]:

! pip install tensorflow

In [9]:
# Cell 1: Import Necessary Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from keras.models import Sequential
from keras.layers import Dense, LSTM, Conv2D, Flatten, Dropout
from sklearn.metrics import classification_report, accuracy_score
import mlflow
import mlflow.sklearn


In [12]:
# Load the credit card and fraud datasets
creditcard_data = pd.read_csv('../data/creditcard.csv')
fraud_data = pd.read_csv('../data/Fraud_Data.csv')

# Display the first few rows of each dataset
print("Credit Card Data:")
print(creditcard_data.head())
print("\nFraud Data:")
print(fraud_data.head())


Credit Card Data:
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647376   
4 -0.270533  0.817739  ... -0.009431  0.798278 -0.137458  0.141267 -0.206010   

        V26   

In [13]:
# Feature and target separation for credit card data
X_creditcard = creditcard_data.drop(columns=['Class'])
y_creditcard = creditcard_data['Class']

# Feature and target separation for fraud data
X_fraud = fraud_data.drop(columns=['class'])
y_fraud = fraud_data['class']


In [14]:
# Train-test split for credit card data
X_train_creditcard, X_test_creditcard, y_train_creditcard, y_test_creditcard = train_test_split(
    X_creditcard, y_creditcard, test_size=0.2, random_state=42, stratify=y_creditcard
)

# Train-test split for fraud data
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)

In [15]:
# Logistic Regression for credit card data
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train_creditcard, y_train_creditcard)
y_pred_creditcard_log_reg = log_reg.predict(X_test_creditcard)

# Evaluation
print("Logistic Regression on Credit Card Data:")
print(classification_report(y_test_creditcard, y_pred_creditcard_log_reg))


Logistic Regression on Credit Card Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.83      0.66      0.74        98

    accuracy                           1.00     56962
   macro avg       0.92      0.83      0.87     56962
weighted avg       1.00      1.00      1.00     56962



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# Decision Tree for credit card data
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train_creditcard, y_train_creditcard)
y_pred_creditcard_tree = decision_tree.predict(X_test_creditcard)

# Evaluation
print("Decision Tree on Credit Card Data:")
print(classification_report(y_test_creditcard, y_pred_creditcard_tree))


Decision Tree on Credit Card Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.72      0.73      0.73        98

    accuracy                           1.00     56962
   macro avg       0.86      0.87      0.86     56962
weighted avg       1.00      1.00      1.00     56962



In [17]:
# Random Forest for credit card data
random_forest = RandomForestClassifier()
random_forest.fit(X_train_creditcard, y_train_creditcard)
y_pred_creditcard_rf = random_forest.predict(X_test_creditcard)

# Evaluation
print("Random Forest on Credit Card Data:")
print(classification_report(y_test_creditcard, y_pred_creditcard_rf))


Random Forest on Credit Card Data:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56864
           1       0.94      0.83      0.88        98

    accuracy                           1.00     56962
   macro avg       0.97      0.91      0.94     56962
weighted avg       1.00      1.00      1.00     56962



In [18]:
# Gradient Boosting for credit card data
gradient_boosting = GradientBoostingClassifier()
gradient_boosting.fit(X_train_creditcard, y_train_creditcard)
y_pred_creditcard_gb = gradient_boosting.predict(X_test_creditcard)

# Evaluation
print("Gradient Boosting on Credit Card Data:")
print(classification_report(y_test_creditcard, y_pred_creditcard_gb))


In [None]:
# MLP for credit card data
mlp = MLPClassifier(max_iter=500)
mlp.fit(X_train_creditcard, y_train_creditcard)
y_pred_creditcard_mlp = mlp.predict(X_test_creditcard)

# Evaluation
print("MLP on Credit Card Data:")
print(classification_report(y_test_creditcard, y_pred_creditcard_mlp))


In [None]:
# Reshape data for CNN (assuming 2D input for simplicity)
X_train_cnn = X_train_creditcard.values.reshape(-1, 28, 28, 1)  
X_test_cnn = X_test_creditcard.values.reshape(-1, 28, 28, 1)

# CNN Model
cnn_model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(28, 28, 1)),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')
])

cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_train_cnn, y_train_creditcard, epochs=10, batch_size=32)

# Evaluation
cnn_loss, cnn_accuracy = cnn_model.evaluate(X_test_cnn, y_test_creditcard)
print(f"CNN Accuracy on Credit Card Data: {cnn_accuracy}")


In [None]:
# Reshape data for LSTM
X_train_lstm = X_train_creditcard.values.reshape((X_train_creditcard.shape[0], 1, X_train_creditcard.shape[1]))
X_test_lstm = X_test_creditcard.values.reshape((X_test_creditcard.shape[0], 1, X_test_creditcard.shape[1]))

# LSTM Model
lstm_model = Sequential([
    LSTM(50, return_sequences=True, input_shape=(1, X_train_creditcard.shape[1])),
    LSTM(50),
    Dense(1, activation='sigmoid')
])

lstm_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model.fit(X_train_lstm, y_train_creditcard, epochs=10, batch_size=32)

# Evaluation
lstm_loss, lstm_accuracy = lstm_model.evaluate(X_test_lstm, y_test_creditcard)
print(f"LSTM Accuracy on Credit Card Data: {lstm_accuracy}")


In [None]:
# Set the experiment name
experiment_name = "Fraud_Detection_Experiment" 
mlflow.set_experiment(experiment_name)

# Start an MLflow run to log metrics and models
with mlflow.start_run():
    mlflow.log_param("model_type", "Logistic Regression")
    mlflow.log_param("test_size", 0.2)
    mlflow.log_metric("accuracy", accuracy_score(y_test_creditcard, y_pred_creditcard_log_reg))
    
    # Log the Logistic Regression model
    mlflow.sklearn.log_model(log_reg, "logistic_regression_model")

