In [1]:


# Import dependencies
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder


In [2]:
# Load the cleaned training data from the project's Resources folder
# (path is relative to the notebook's working directory).
clean_train_data = pd.read_csv("Resources/clean_train_data.csv")
# The matching test file is not loaded in this notebook:
# clean_test_data = pd.read_csv("Resources/clean_test_data.csv")

In [3]:
# Optional: draw a reproducible 10% random sample of the training data
# (e.g. to keep hyperparameter tuning tractable when the dataset is large).
# NOTE(review): the cells visible below build X and y from the full
# `clean_train_data`, not from `train_data_sample` — confirm whether the
# sample is meant to be used downstream.

train_data_sample = clean_train_data.sample(frac=0.1, random_state=42)

In [4]:
# Separate the target column from the predictors.
from sklearn.model_selection import train_test_split

# y: the "isFraud" label column; X: every remaining column.
y = clean_train_data.loc[:, "isFraud"]
X = clean_train_data.drop(columns=["isFraud"])

In [5]:
# Hold out 20% of the rows for evaluation (seeded for reproducibility).
# `stratify=y` keeps the isFraud class ratio the same in both splits, which
# matters when the positive class is rare (presumably the case for fraud
# labels — confirm against the data).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [6]:
# Encode object-dtype (categorical) columns as integer codes.
#
# The encoder is fitted on the training split only. Categories that appear
# only in the test split are mapped to -1 instead of raising ValueError
# (which is what LabelEncoder.transform does on unseen labels).
from sklearn.preprocessing import OrdinalEncoder

categorical_features = X_train.select_dtypes(include=['object']).columns

# Work on explicit copies so the column assignments below never write into
# a slice of X (avoids SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Guard: OrdinalEncoder cannot be fitted on zero columns.
if len(categorical_features) > 0:
    categorical_columns = list(categorical_features)
    ordinal_encoder = OrdinalEncoder(
        dtype=np.int64,
        handle_unknown="use_encoded_value",
        unknown_value=-1,
    )
    # Cast to str first so missing values become the literal category "nan",
    # matching the previous per-column LabelEncoder behaviour.
    X_train[categorical_columns] = ordinal_encoder.fit_transform(
        X_train[categorical_columns].astype(str)
    )
    X_test[categorical_columns] = ordinal_encoder.transform(
        X_test[categorical_columns].astype(str)
    )

In [7]:
# Standardize the features.
# The scaler learns its statistics from the training split only and the same
# fitted transformation is then applied to both splits.
# Note: after this cell X_train / X_test are NumPy arrays, not DataFrames.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Save the processed train/test splits to a local SQLite database.
import sqlite3
from contextlib import closing

# Rebuild labelled DataFrames: X_train / X_test are plain arrays after
# scaling, so the original feature names are re-attached from X.
X_train_df = pd.DataFrame(X_train, columns=X.columns)
y_train_df = pd.DataFrame(y_train, columns=["isFraud"])
X_test_df = pd.DataFrame(X_test, columns=X.columns)
y_test_df = pd.DataFrame(y_test, columns=["isFraud"])

processed_tables = {
    "X_train": X_train_df,
    "y_train": y_train_df,
    "X_test": X_test_df,
    "y_test": y_test_df,
}

# `closing` guarantees the connection is closed even if a write fails
# (a bare `with sqlite3.connect(...)` only manages the transaction, it does
# not close the connection).
with closing(sqlite3.connect("fraud_detection.db")) as conn:
    for table_name, table_df in processed_tables.items():
        # Existing tables are overwritten; the DataFrame index is not stored.
        table_df.to_sql(table_name, conn, if_exists="replace", index=False)

In [None]:
# Fit a RandomForest classifier on the training split.
# (XGBClassifier and the metric helpers are imported here; only the
# RandomForest model is trained in this cell.)
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

# 100 trees, fixed seed so the fitted forest is reproducible.
rf_params = {"n_estimators": 100, "random_state": 42}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)

In [None]:
# Predict class labels for the held-out test split with the fitted
# RandomForest model; the metrics are reported in the next cell.
y_pred_rf = rf_model.predict(X_test)

In [None]:
# Report RandomForest performance on the held-out test split.
print("Random Forest:")
print(classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))

# ROC AUC measures how well the model *ranks* positives above negatives, so
# it must be computed from continuous scores. Passing hard 0/1 predictions
# collapses the ROC curve to a single operating point and misreports the AUC.
# Column 1 of predict_proba is the probability of the second entry of
# rf_model.classes_ (assumed to be the positive isFraud label — confirm the
# target is encoded 0/1).
y_score_rf = rf_model.predict_proba(X_test)[:, 1]
print("ROC AUC:", roc_auc_score(y_test, y_score_rf))