# Basic set up

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import os
import warnings
import joblib

from joblib import parallel_backend

import sklearn.metrics as skm

from sklearn.exceptions import ConvergenceWarning
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from imblearn.combine import SMOTETomek

In [2]:
# Silence scikit-learn ConvergenceWarning messages for the rest of the notebook.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [3]:
# Notebook-wide seed value, used for random_state arguments.
seed = 11

In [4]:
# Load the processed training and validation sets; the first CSV column is used as the index.
df_train = pd.read_csv("../data/processed/train_data.csv", index_col = 0)
df_val = pd.read_csv("../data/processed/validation_data.csv", index_col = 0)

# Classes 

## Data Prep Class

In [5]:
class PrepDataset():
    """
    Prepares a dataframe for modeling.

    On construction the class, in this order: drops the unwanted columns,
    encodes the categorical columns, optionally caps outliers, splits the
    data into predictors (x) and target (y), optionally resamples the
    training data and optionally normalizes the predictors.

    Statistics that must be learned on the training data (outlier bounds and
    scalers) are saved to disk when df_name is "train" and loaded from disk
    for any other df_name, so the training dataset has to be prepared first.

    NOTE(review): the categorical encoding is computed on each dataframe
    independently, so label codes / dummy columns are derived from the
    categories present in that dataframe only and may differ between the
    train and validation sets.
    """

    # Columns dropped when cols_to_remove is not provided.
    _DEFAULT_COLS_TO_REMOVE = ("step", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest")

    def __init__(
            self, df : pd.DataFrame, df_name : str, ohe_cutoff : int = 20, outliers : str = None, 
            target : str = "isFraud", imbalance : str = None, seed : int = 11, normalize : str = None,
            cols_to_remove : list = None, mappings_save_path : str = "../data/other/"):
        """
        Parameters:
            df: raw dataframe, it is copied and never modified.
            df_name: one of "train", "validation", "test".
            ohe_cutoff: see encode_categoriacals.
            outliers: None, or a method accepted by replace_outliers.
            target: name of the target column.
            imbalance: None, or a method accepted by resampling. It is only
                applied when df_name is "train".
            seed: random state used by the resamplers.
            normalize: None, or a method accepted by normalize_dataset.
            cols_to_remove: columns to drop. When None the default list of
                step / balance columns is used.
            mappings_save_path: prefix of the label encoding mappings file,
                see encode_categoriacals.

        Raises:
            ValueError: if df_name is not a valid name.
        """
        
        valid_names = ["train", "validation", "test"]

        if df_name not in valid_names:
            raise ValueError(f"invalid df_name : {df_name}, use one of {valid_names}")

        # None instead of a list literal as default, so that the default
        # is not a mutable object shared between calls.
        if cols_to_remove is None:
            cols_to_remove = list(self._DEFAULT_COLS_TO_REMOVE)
        
        self.df_name = df_name
        self.seed = seed

        self.df = df.copy()
        self.original_df = df.copy()
        # kept for backward compatibility with the previous (misspelled) attribute name
        self.df_orginal = self.original_df

        self.remove_columns(col_list = cols_to_remove)

        self.encode_categoriacals(ohe_cutoff = ohe_cutoff, mappings_save_path = mappings_save_path)

        if outliers:
            self.replace_outliers(method = outliers, df_name = df_name)

        self.x, self.y = None, None
        self.split_x_y(target = target)

        if imbalance and df_name == "train":
            self.resampling(method = imbalance)

        if normalize:
            self.normalize_dataset(df_name = df_name, method = normalize)

    @staticmethod
    def _ensure_parent_dir(path : str):
        """
        This function creates the directory that will contain path, 
        if it does not exist yet.
        """

        parent = os.path.dirname(path)

        if parent:
            os.makedirs(parent, exist_ok = True)
    
    def remove_columns(self, col_list : list):
        """
        This function drops a list of columns from 
        the dataframe.
        """

        self.df = self.df.drop(columns = list(col_list))
    
    def encode_categoriacals(self, ohe_cutoff : int = 20, mappings_save_path : str = "../data/other/"):
        """
        This function encodes the categorical (object dtype) columns so they can be 
        fed into a model. The parameter ohe_cutoff divides the columns into two lists. 
        The columns with fewer unique values than ohe_cutoff are one hot encoded. The 
        columns with a number of unique values greater than or equal to ohe_cutoff 
        are label encoded.

        The label encoding mappings (code -> original value) are saved as json in
        mappings_save_path + "mappings_label_encoding_<df_name>.json". Note that 
        mappings_save_path is used as a plain string prefix.
        """
        mappings_save_path += f"mappings_label_encoding_{self.df_name}.json"

        n_unique = self.df.select_dtypes(include = "object").nunique()
        ohe_list = n_unique[n_unique < ohe_cutoff].index.tolist()
        label_enc_list = n_unique[n_unique >= ohe_cutoff].index.tolist()

        # one hot encoding 
        self.df = pd.get_dummies(data = self.df, columns = ohe_list, dtype = int)
        
        # label encoding
        mappings_label_encoding = {}

        for col in label_enc_list:
            as_category = self.df[col].astype("category")
            mappings_label_encoding[col] = dict(enumerate(as_category.cat.categories))

            self.df[col] = as_category.cat.codes

        self._ensure_parent_dir(mappings_save_path)

        with open(mappings_save_path, 'w') as f:
            json.dump(mappings_label_encoding, f)
    
    def restore_data(self):
        """
        This function restores self.df to a copy of the dataframe that was 
        passed to the constructor (before any column removal, encoding or 
        outlier replacement). self.x and self.y are left untouched.
        """

        self.df = self.original_df.copy()
    
    def replace_outliers(
            self, method : str = "z_score", df_name : str = "train", 
            bounds_json : str = "../data/other/outliers_bounds.json"):
        """
        This function substitutes outliers in the float columns with the border 
        value according to the method chosen ("z_score", "iqr" or "both", where 
        "both" uses the tighter of the two bounds).

        When df_name is "train" the bounds are computed on the data and stored 
        in bounds_json. Otherwise the bounds are read from bounds_json.

        Raises:
            ValueError: if the method is not valid, or if df_name is not "train" 
                and there are no stored bounds for a column.
        """
        
        valid_methods = ["z_score", "iqr", "both"]

        if method not in valid_methods:
            raise ValueError(f"invalid method : {method}, use one of {valid_methods}")
        
        if not os.path.exists(bounds_json):
            outlier_bounds = {}
        else:
            with open(bounds_json, "r") as file:
                outlier_bounds = json.load(file)
        
        col_outliers = self.df.select_dtypes(include = "float").columns.tolist()

        for col in col_outliers:

            if df_name == "train":
                outlier_bounds[col] = self._compute_bounds(self.df[col])

            bounds = outlier_bounds.get(col)

            if bounds is None:
                raise ValueError(f"No stored bounds for column {col}.")

            if method == "both":
                upper_bound = min(bounds["z_upper"], bounds["iqr_upper"])
                lower_bound = max(bounds["z_lower"], bounds["iqr_lower"])
            
            elif method == "z_score":
                upper_bound = bounds["z_upper"]
                lower_bound = bounds["z_lower"]
            
            else:
                upper_bound = bounds["iqr_upper"]
                lower_bound = bounds["iqr_lower"]
        
            # assigning through an all-False mask is a no-op, so no emptiness check is needed
            self.df.loc[self.df[col] > upper_bound, col] = upper_bound
            self.df.loc[self.df[col] < lower_bound, col] = lower_bound

        # the bounds file is written once, after all the columns have been processed
        if df_name == "train" and col_outliers:
            self._ensure_parent_dir(bounds_json)

            with open(bounds_json, "w") as file:
                json.dump(outlier_bounds, file, indent=4)

    @staticmethod
    def _compute_bounds(values : pd.Series) -> dict:
        """
        This function computes the z score and IQR bounds of a column. 
        The z score bounds are the largest / smallest observed values whose 
        z score is within [-3, 3].
        """

        z_scores = (values - values.mean()) / values.std()

        below_upper = values[z_scores <= 3]
        above_lower = values[z_scores >= -3]

        # z scores are all NaN when the std is 0 or undefined (e.g. constant column),
        # in that case no value is an outlier and the observed extremes are used
        z_upper = below_upper.max() if not below_upper.empty else values.max()
        z_lower = above_lower.min() if not above_lower.empty else values.min()

        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1 

        # built-in floats so the bounds can always be serialized to json
        return {
            "z_upper": float(z_upper), "z_lower": float(z_lower),
            "iqr_upper": float(q3 + 1.5 * iqr), "iqr_lower": float(q1 - 1.5 * iqr)
        }
    
    def split_x_y(self, target : str) -> None:
        """
        This function splits the dataframe in self.x (the dataframe containing 
        the predictors) and self.y (the series with the target).

        Raises:
            ValueError: if the target is not a column of the dataframe.
        """

        if target not in self.df.columns:
            raise ValueError(f"target column {target} not found in the dataframe")

        self.x = self.df.drop(columns = [target])
        self.y = self.df[target]
    
    def resampling(self, method : str = "smote", sampling_strategy : float = 0.05):
        """
        This function resamples self.x and self.y based on the strategy chosen 
        via the method parameter ("smote", "tomek" or "both"). The seed passed 
        to the constructor is used as random state.

        Raises:
            ValueError: if the method is not valid.
        """
        valid_methods = ["smote", "tomek", "both"]

        if method not in valid_methods:
            raise ValueError(f"invalid method : {method}, use one of {valid_methods}")
        
        if method == "smote":
            resampler = SMOTE(sampling_strategy = sampling_strategy, random_state = self.seed)
        
        elif method == "tomek":
            resampler = TomekLinks(sampling_strategy = "auto")
        
        else:
            resampler = SMOTETomek(random_state = self.seed, sampling_strategy = sampling_strategy)
        
        self.x, self.y = resampler.fit_resample(self.x, self.y)

    def normalize_dataset(self, df_name : str, method : str = "standard"):
        """
        This function normalizes the predictors ("standard" or "min_max"). 

        When df_name is "train" the scaler is fitted on the data and saved to 
        ../models/scalers/<method>_scaler.pkl. Otherwise the saved scaler is 
        loaded and applied.

        Raises:
            ValueError: if the method is not valid.
            FileNotFoundError: if df_name is not "train" and no scaler was saved.
        """

        valid_methods = ["standard", "min_max"]

        if method not in valid_methods:
            raise ValueError(f"invalid method : {method}, use one of {valid_methods}")

        scaler_path = f"../models/scalers/{method}_scaler.pkl"
        
        if df_name == "train":

            if method == "standard":
                scaler = StandardScaler()
            
            else:
                scaler = MinMaxScaler(feature_range = (0, 1))
            
            scaled = scaler.fit_transform(self.x)

            self._ensure_parent_dir(scaler_path)
            joblib.dump(scaler, scaler_path)
        
        else:
            if not os.path.exists(scaler_path):
                raise FileNotFoundError(f"No scaler found at {scaler_path}. Normalize training data first.")

            # NOTE: joblib.load unpickles the file, only load scalers saved by this class
            scaler = joblib.load(scaler_path)
            scaled = scaler.transform(self.x)

        # the index is kept so that x stays aligned with y
        self.x = pd.DataFrame(scaled, columns = scaler.feature_names_in_, index = self.x.index)

    def return_x_y(self) -> tuple[pd.DataFrame, pd.Series]:
        """
        This function returns the predictors (x) and the target (y).
        """  

        return self.x, self.y

## Classical Models Evaluation Class

In [6]:
class ClassicModelsEvaluation():
    """
    Trains a logistic regression, a random forest and an XGBoost classifier 
    on the training data and appends their train / validation metrics to df_eval.
    The three models are evaluated as soon as the class is instantiated.
    """

    def __init__(
            self, x_train : pd.DataFrame, y_train : pd.Series, x_val : pd.DataFrame, y_val : pd.Series, 
            df_eval : pd.DataFrame, imbalance : str, outliers : str, seed : int = 11):
        self.x_train, self.y_train = x_train, y_train
        self.x_val, self.y_val = x_val, y_val

        self.df_eval = df_eval

        # labels written in the "imbalance" and "outliers" columns of df_eval
        self.imbalance, self.outliers = imbalance, outliers

        with parallel_backend("threading"):
            for label, estimator_cls in (
                    ("logistic regression", LogisticRegression),
                    ("random forest", RandomForestClassifier),
                    ("xgb", XGBClassifier)):
                self.evaluate_model(model = estimator_cls(n_jobs = -1, random_state = seed), model_name = label)

    @staticmethod
    def _compute_metrics(y_true, y_pred) -> dict:
        """
        This function computes accuracy, recall, precision and f1 score 
        of the predictions against the true labels.
        """

        return {
            "accuracy" : skm.accuracy_score(y_pred = y_pred, y_true = y_true),
            "recall" : skm.recall_score(y_pred = y_pred, y_true = y_true, zero_division=0),
            "precision" : skm.precision_score(y_pred = y_pred, y_true = y_true, zero_division=0),
            "f1_score" : skm.f1_score(y_pred = y_pred, y_true = y_true, zero_division=0)}

    def _append_row(self, model_name : str, dataset : str, metrics : dict):
        """
        This function adds one row of results at the end of df_eval.
        """

        self.df_eval.loc[len(self.df_eval)] = {
            "imbalance" : self.imbalance,
            "outliers" : self.outliers,
            "model" : model_name, 
            "dataset" : dataset, 
            **metrics}
        
    def evaluate_model(self, model, model_name : str):
        """
        This function fits the model on the training data stored in the class, 
        predicts on both the training and the validation data, and stores one 
        row of metrics per dataset in df_eval (train first, validation second).
        """

        # training the model
        model.fit(self.x_train, self.y_train)

        # computing predictions
        train_predictions = model.predict(self.x_train)
        val_predictions = model.predict(self.x_val)

        # calculating metrics
        train_metrics = self._compute_metrics(y_true = self.y_train, y_pred = train_predictions)
        val_metrics = self._compute_metrics(y_true = self.y_val, y_pred = val_predictions)

        # saving results
        self._append_row(model_name = model_name, dataset = "train", metrics = train_metrics)
        self._append_row(model_name = model_name, dataset = "validation", metrics = val_metrics)
        
    def return_df_eval(self):
        """
        This function returns the dataframe with the evaluation results.
        """

        return self.df_eval

# Modeling

## Classical Models

### Model and data preparation selection

In [7]:
compute = False
# 43 min to compute

if not compute:
    # Reuse the results saved by a previous run of the grid below.
    df_eval = pd.read_csv("../data/evaluation/outliers_resampling.csv", index_col = 0)

else:
    df_eval = pd.DataFrame(
        columns = ["imbalance", "outliers", "model", "dataset", "accuracy", "recall", "precision", "f1_score"])

    outliers_methods = [None, "z_score", "iqr", "both"]
    resampling_methods = [None, "smote", "tomek", "both"]

    # Every outlier strategy is combined with every resampling strategy.
    combinations = [(o, r) for o in outliers_methods for r in resampling_methods]

    for outlier_method, resampling_method in combinations:
        print("outlier_method :", outlier_method, "resampling_method :", resampling_method)

        outliers, imbalance = outlier_method, resampling_method

        # Data Prep
        data_prep_train = PrepDataset(df = df_train, df_name = "train", outliers= outliers, imbalance = imbalance)
        data_prep_val = PrepDataset(df = df_val, df_name = "validation", outliers= outliers, imbalance = imbalance)

        x_train, y_train = data_prep_train.return_x_y()
        x_val, y_val = data_prep_val.return_x_y()

        # None is stored as the string "none" in the results
        outliers = outliers or "none"
        imbalance = imbalance or "none"

        # Model Evaluation (the rows are appended to df_eval in place)
        classsic_model_eval = ClassicModelsEvaluation(
            x_train = x_train, y_train = y_train, 
            x_val = x_val, y_val = y_val,
            df_eval = df_eval , imbalance = imbalance, outliers = outliers)

    df_eval.to_csv("../data/evaluation/outliers_resampling.csv")

In [8]:
# Ten validation rows with the highest f1 score.
df_top_10 = (
    df_eval.loc[df_eval["dataset"] == "validation"]
    .sort_values(by = "f1_score", ascending = False)
    .iloc[:10]
)
df_top_10

Unnamed: 0,imbalance,outliers,model,dataset,accuracy,recall,precision,f1_score
39,tomek,z_score,random forest,validation,0.998857,0.158247,0.783133,0.263291
15,tomek,none,random forest,validation,0.998853,0.157638,0.775449,0.262013
3,none,none,random forest,validation,0.998853,0.157638,0.773134,0.261881
27,none,z_score,random forest,validation,0.998849,0.155204,0.768072,0.258228
29,none,z_score,xgb,validation,0.998853,0.151552,0.790476,0.254341
17,tomek,none,xgb,validation,0.998846,0.152161,0.766871,0.253936
41,tomek,z_score,xgb,validation,0.998853,0.149117,0.800654,0.251411
35,smote,z_score,xgb,validation,0.998295,0.185027,0.267841,0.218862
11,smote,none,xgb,validation,0.998186,0.190505,0.242448,0.213361
47,both,z_score,xgb,validation,0.998156,0.186245,0.232523,0.206827


In [9]:
# For each top validation row also select the row with the preceding index label.
# NOTE(review): this relies on the train row sitting right before its validation row in df_eval.
paired_labels = []
for val_label in df_top_10.index.tolist():
    paired_labels.extend((val_label - 1, val_label))

df_top_10_all = df_eval.loc[paired_labels]
df_top_10_all

Unnamed: 0,imbalance,outliers,model,dataset,accuracy,recall,precision,f1_score
38,tomek,z_score,random forest,train,0.999995,0.996347,1.0,0.99817
39,tomek,z_score,random forest,validation,0.998857,0.158247,0.783133,0.263291
14,tomek,none,random forest,train,0.999996,0.996695,1.0,0.998345
15,tomek,none,random forest,validation,0.998853,0.157638,0.775449,0.262013
2,none,none,random forest,train,0.999997,0.997391,1.0,0.998694
3,none,none,random forest,validation,0.998853,0.157638,0.773134,0.261881
26,none,z_score,random forest,train,0.999996,0.997217,1.0,0.998607
27,none,z_score,random forest,validation,0.998849,0.155204,0.768072,0.258228
28,none,z_score,xgb,train,0.998897,0.165072,0.894439,0.278708
29,none,z_score,xgb,validation,0.998853,0.151552,0.790476,0.254341


Now that we have computed all the possibilities, we have a rough idea of which model, outlier strategy, and imbalance strategy to use going forward. 

While the random forest has the best results, it is very slow to train and the differences in results on the validation set are marginal. With this in mind, I am going to move forward with the XGBoost classifier for further analysis.

For dealing with outliers we can rule out IQR and both, while, depending on the case, none and Z-score have produced good results.

Regarding the imbalance in the target, I don't feel confident excluding any of the methodologies, as I believe they require more research before reaching a definitive conclusion.

### Further analysis of XGBoost