#### Necessary Libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from random import seed,sample
import datetime as dt
from datetime import datetime
import seaborn as sns

import scipy.stats as stats

from sklearn.model_selection import train_test_split

# pipeline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

# feature scaling & OHE
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MinMaxScaler # data encoders

# moodels 
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score, confusion_matrix

# resampling techniques for imbalanced data
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE #
from imblearn.pipeline import Pipeline as ImbPipeline

# formatting 
%matplotlib inline
pd.set_option('display.max_columns', 500)

In [2]:
X_train = pd.read_csv('../X_train.csv')
X_test = pd.read_csv('../X_test.csv')
y_train = pd.read_csv('../y_train.csv')
y_test = pd.read_csv('../y_test.csv')

In [3]:
# Define categorical and numerical features
categorical_cols = ['category', 'gender', 'state', 'job', 'day_of_week']
numerical_cols = ['amount(usd)', 'lat', 'long', 'merch_lat', 'merch_long', 'hour_of_day', 'age']

# Create the preprocessing for numeric and categorical features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

# Create a pipeline with preprocessing and the model
model = ImbPipeline(steps=[('preprocessor', preprocessor),
                           ('smote', SMOTE(random_state=42)),
                           ('classifier', LogisticRegression(random_state=42))])

# Train the model
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_test)

# Calculate evaluation metrics on the validation set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Output the metrics
print(f'Validation Metrics: \n Accuracy: {accuracy}\n Precision: {precision}\n Recall: {recall}\n F1-Score: {f1}\n ROC AUC: {roc_auc}')

# If the performance is satisfactory, you can proceed to evaluate the model on the test set in a similar fashion

Validation Metrics: 
 Accuracy: 0.8842688481048876
 Precision: 0.03471159527885941
 Recall: 0.7913644214162349
 F1-Score: 0.06650603808639108
 ROC AUC: 0.8380598932978747


## 2. Random Forest 
#### 2.1 Baseline Model

In [4]:
from sklearn.ensemble import RandomForestClassifier

# Assuming preprocessor is already defined as before
# Update the model in the pipeline to RandomForestClassifier
model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('classifier', RandomForestClassifier(random_state=42))])

# Fit the model on the training data
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_test)

# Calculate the probabilities for ROC AUC
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Calculate evaluation metrics on the validation set
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba) # Use probabilities for ROC AUC

# Print the metrics
print(f'Validation Metrics with Random Forest: \n Accuracy: {accuracy:.4f}\n Precision: {precision:.4f}\n Recall: {recall:.4f}\n F1-Score: {f1:.4f}\n ROC AUC: {roc_auc:.4f}')


Validation Metrics with Random Forest: 
 Accuracy: 0.9984
 Precision: 0.9864
 Recall: 0.7016
 F1-Score: 0.8199
 ROC AUC: 0.9815


#### 2.2 SMOTE

#### 2.3 Random Under-Sampling

#### Hyperparameter Tuning