# Basic set up

In [None]:
import pandas as pd
import json

import sklearn.metrics as skm

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
# Random seed; passed as random_state to every model built in ClassicModelsEvaluation.
seed = 11

In [3]:
# Load the processed train and validation splits; the first CSV column is used as the index.
df_train = pd.read_csv("../data/processed/train_data.csv", index_col = 0)
df_val = pd.read_csv("../data/processed/validation_data.csv", index_col = 0)

# Classes 

## Data Prep Class

In [None]:
class PrepDataset():
    """
    Prepares a dataframe for modelling.

    On construction the class drops a list of columns, encodes the object
    (categorical) columns and, when `outliers` is given, caps the outliers of
    every float column. The caller's dataframe is never modified; the prepared
    data is available through return_dataset().

    Parameters
    ----------
    df : dataframe to prepare.
    df_name : one of "train", "validation", "test"; used in the name of the
        label-encoding mappings file.
    ohe_cutoff : columns with fewer unique values than this are one hot
        encoded, the others are label encoded.
    outliers : None to leave outliers untouched, otherwise the method passed
        to replace_outliers ("z_score", "iqr" or "both").
    cols_to_remove : columns to drop; None means DEFAULT_COLS_TO_REMOVE.

    Raises
    ------
    ValueError if df_name or outliers is not one of the accepted values.
    """

    # Columns dropped when the caller does not pass cols_to_remove.
    DEFAULT_COLS_TO_REMOVE = ("step", "oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest")

    def __init__(self, df : pd.DataFrame, df_name : str, ohe_cutoff : int = 20, outliers : str = None,
                 cols_to_remove : list = None):

        valid_names = ["train", "validation", "test"]

        if df_name not in valid_names:
            raise ValueError(f"invalid df_name : {df_name}, use one of {valid_names}")

        # None instead of a list literal avoids a default list shared by all calls
        if cols_to_remove is None:
            cols_to_remove = list(self.DEFAULT_COLS_TO_REMOVE)

        self.df_name = df_name

        self.df = df
        # untouched copy of the raw input
        self.df_orginal = df.copy()
        # snapshot of the encoded data, taken by replace_outliers right before
        # the first capping so restore_data can undo it
        self.original_df = None

        self.remove_columns(col_list = cols_to_remove)

        self.encode_categoriacals(ohe_cutoff = ohe_cutoff)

        if outliers:
            self.replace_outliers(method = outliers)

    def remove_columns(self, col_list : list):
        """
        This function drops a list of columns from
        the dataframe.
        """

        # drop returns a new dataframe, so the object passed in by the caller
        # keeps all of its columns
        self.df = self.df.drop(col_list, axis = 1)

    def encode_categoriacals(self, ohe_cutoff : int = 20, mappings_save_path : str = "../data/other/"):
        """
        This function encodes the categorical columns so they can be fed into
        a model. The parameter ohe_cutoff divides the object columns into two
        lists: columns with fewer unique values than ohe_cutoff are one hot
        encoded, columns with at least ohe_cutoff unique values are label
        encoded.

        The code -> category mappings of the label encoded columns are written
        to "mappings_label_encoding_<df_name>.json" inside the directory
        mappings_save_path, which is created when it does not exist.

        NOTE: the dummy columns and the integer codes are derived from the
        categories present in this dataframe only, so separately prepared
        datasets line up only when they contain the same category values.
        """
        import os  # local import so the class does not depend on a file-level import

        # os.path.join works with or without a trailing separator in the directory
        mappings_file = os.path.join(mappings_save_path, f"mappings_label_encoding_{self.df_name}.json")

        n_unique_values = self.df.select_dtypes(include = "object").nunique()
        ohe_list = n_unique_values[n_unique_values < ohe_cutoff].index.tolist()
        label_enc_list = n_unique_values[n_unique_values >= ohe_cutoff].index.tolist()

        # one hot encoding
        self.df = pd.get_dummies(data = self.df, columns = ohe_list, dtype = int)

        # label encoding
        mappings_label_encoding = {}

        for col in label_enc_list:
            as_category = self.df[col].astype("category")
            mappings_label_encoding[col] = dict(enumerate(as_category.cat.categories))

            self.df[col] = as_category.cat.codes

        target_dir = os.path.dirname(mappings_file)
        if target_dir:
            os.makedirs(target_dir, exist_ok = True)

        with open(mappings_file, 'w', encoding = "utf-8") as f:
            json.dump(mappings_label_encoding, f)

    def restore_data(self):
        """
        This function restores the data to the version it had before
        replacing outliers. When no outliers were replaced there is
        nothing to undo and the data is left as it is.
        """

        if self.original_df is not None:
            self.df = self.original_df.copy()

    def replace_outliers(self, method : str = "z_score"):
        """
        This function substitutes the outliers of every float column with
        the border value according to the chosen method:

        - "z_score": the smallest / largest observed value whose z-score
          lies within [-3, 3];
        - "iqr": q1 - 1.5 * iqr and q3 + 1.5 * iqr;
        - "both": the tighter of the two borders on each side.

        Raises ValueError when method is not one of the above.
        """

        valid_methods = ["z_score", "iqr", "both"]

        if method not in valid_methods:
            raise ValueError(f"invalid method : {method}, use one of {valid_methods}")

        # keep the pre-replacement data once, so restore_data can undo the capping
        if self.original_df is None:
            self.original_df = self.df.copy()

        col_outliers = self.df.select_dtypes(include = "float").columns.tolist()

        for col in col_outliers:
            lower_bound, upper_bound = self._outlier_bounds(values = self.df[col], method = method)

            # a bound of None leaves that side of the column untouched
            self.df[col] = self.df[col].clip(lower = lower_bound, upper = upper_bound)

    @staticmethod
    def _outlier_bounds(values : pd.Series, method : str) -> tuple:
        """
        This function computes the (lower, upper) border values of one
        column for the given method. A border is None when it cannot be
        computed (e.g. constant, empty or single-value column).
        """

        lower_candidates = []
        upper_candidates = []

        if method in ("z_score", "both"):
            std = values.std()

            # the z-score is undefined when the column has no spread
            if pd.notna(std) and std > 0:
                z_score = (values - values.mean()) / std
                upper_candidates.append(values[z_score <= 3].max())
                lower_candidates.append(values[z_score >= -3].min())

        if method in ("iqr", "both"):
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1

            if pd.notna(iqr):
                upper_candidates.append(q3 + 1.5 * iqr)
                lower_candidates.append(q1 - 1.5 * iqr)

        upper_bound = min(upper_candidates) if upper_candidates else None
        lower_bound = max(lower_candidates) if lower_candidates else None

        return lower_bound, upper_bound

    def return_dataset(self) -> pd.DataFrame:
        """
        This function returns the dataframe.
        """

        return self.df

## Classical Models Evaluation Class

In [42]:
class ClassicModelsEvaluation():
    """
    Fits a logistic regression, a random forest and an XGBoost classifier
    on the training data and appends their train / validation scores to
    the evaluation dataframe received in the constructor.
    """

    def __init__(self, df_train : pd.DataFrame, df_val : pd.DataFrame, df_eval : pd.DataFrame, imbalance : str,
                  outliers : str, target : str = "isFraud"):
        self.df_train, self.df_val = df_train, df_val
        self.df_eval = df_eval
        self.imbalance, self.outliers = imbalance, outliers

        self.x_train, self.y_train = self.split_x_y(df = self.df_train, target = target)
        self.x_val, self.y_val = self.split_x_y(df = self.df_val, target = target)

        # every estimator is built with the same settings, right before it is evaluated
        baselines = (
            (LogisticRegression, "logistic regression"),
            (RandomForestClassifier, "random forest"),
            (XGBClassifier, "xgb"),
        )

        for estimator_cls, label in baselines:
            self.evaluate_model(model = estimator_cls(n_jobs = -1, random_state = seed), model_name = label)

    def split_x_y(self, df : pd.DataFrame, target : str) -> tuple[pd.DataFrame, pd.Series]:
        """
        Separates a dataframe into x (every column except the target) and
        y (the target column).
        """

        feature_cols = df.columns.tolist()
        feature_cols.remove(target)

        return df[feature_cols], df[target]

    def evaluate_model(self, model, model_name : str):
        """
        Fits the model on the training data held by the class, scores its
        predictions on both the training and the validation data and appends
        one row per dataset to df_eval.
        """

        # training the model
        model.fit(self.x_train, self.y_train)

        # computing predictions
        splits = {
            "train" : (self.y_train, model.predict(self.x_train)),
            "validation" : (self.y_val, model.predict(self.x_val)),
        }

        # calculating metrics
        scorers = {
            "accuracy" : skm.accuracy_score,
            "recall" : skm.recall_score,
            "precision" : skm.precision_score,
            "f1_score" : skm.f1_score,
        }

        rows = []

        for dataset_name, (y_true, y_pred) in splits.items():
            row = {
                "imbalance" : self.imbalance,
                "outliers" : self.outliers,
                "model" : model_name,
                "dataset" : dataset_name,
            }

            for metric_name, scorer in scorers.items():
                row[metric_name] = scorer(y_pred = y_pred, y_true = y_true)

            rows.append(row)

        # saving results (only once every metric has been computed)
        for row in rows:
            self.df_eval.loc[len(self.df_eval)] = row

    def return_df_eval(self):
        """
        Gives back the evaluation dataframe.
        """

        return self.df_eval

# Modeling

In [43]:
# Empty results table; ClassicModelsEvaluation.evaluate_model appends one row per (model, dataset).
df_eval = pd.DataFrame(columns = ["imbalance", "outliers", "model", "dataset", "accuracy", "recall", "precision", "f1_score"])
df_eval

Unnamed: 0,imbalance,outliers,model,dataset,accuracy,recall,precision,f1_score


In [5]:
# Prepare the training split with PrepDataset's defaults (default column drop,
# ohe_cutoff = 20, no outlier replacement) and keep the resulting dataframe.
data_prep_train = PrepDataset(df = df_train, df_name = "train")
df_train = data_prep_train.return_dataset()

In [6]:
# Prepare the validation split with the same default settings as the training split.
data_prep_val = PrepDataset(df = df_val, df_name = "validation")
df_val = data_prep_val.return_dataset()

In [40]:
# The constructor fits and scores the three classifiers and appends their rows to df_eval.
classsic_model_eval = ClassicModelsEvaluation(df_train = df_train, df_val = df_val, df_eval = df_eval, imbalance = "none", outliers = "none")

In [41]:
# Fetch the results table held by the evaluation object and display it.
df_eval = classsic_model_eval.return_df_eval()
df_eval

Unnamed: 0,imbalance,outliers,model,dataset,accuracy,recall,precision,f1_score
0,none,none,logistic regression,train,0.998697,0.0,0.0,0.0
1,none,none,logistic regression,validation,0.998696,0.0,0.0,0.0
2,none,none,random forest,train,0.999997,0.997391,1.0,0.998694
3,none,none,random forest,validation,0.998853,0.157638,0.773134,0.261881
4,none,none,xgb,train,0.998775,0.103322,0.665174,0.178862
5,none,none,xgb,validation,0.998744,0.090079,0.589641,0.156283
