# Packages and Loading Data

In [2]:
import pandas as pd

In [3]:
# Training split; the first CSV column is used as the row index.
df_train = pd.read_csv(
    filepath_or_buffer = "../data/split/train_data.csv",
    index_col = 0,
)
df_train.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
# Validation split; the first CSV column is used as the row index.
df_val = pd.read_csv(
    filepath_or_buffer = "../data/split/validation_data.csv",
    index_col = 0,
)
df_val.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
4453834,323,PAYMENT,36803.44,C653758498,0.0,0.0,M2114359036,0.0,0.0,0,0
4453835,323,PAYMENT,41815.13,C1154913534,11111.0,0.0,M821726718,0.0,0.0,0,0
4453836,323,CASH_IN,229384.36,C714505539,95395.0,324779.36,C437999309,37059.08,0.0,0,0
4453837,323,CASH_IN,109600.23,C1613588355,324779.36,434379.59,C1202120651,189235.02,79634.79,0,0
4453838,323,CASH_IN,174450.23,C279735836,434379.59,608829.82,C1853554454,535059.8,360609.56,0,0


In [106]:
# Test split; the first CSV column is used as the row index.
df_test = pd.read_csv(
    filepath_or_buffer = "../data/split/test_data.csv",
    index_col = 0,
)
df_test.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
5726358,399,CASH_IN,134026.34,C993872822,602.0,134628.34,C786960629,0.0,0.0,0,0
5726359,399,CASH_IN,256289.51,C734624004,140.0,256429.51,C1844307683,11063.34,0.0,0,0
5726360,399,CASH_OUT,105813.06,C2112729029,29762.0,0.0,C137554848,35547.12,141360.19,0,0
5726361,399,PAYMENT,50199.15,C2016143960,30629.0,0.0,M689597968,0.0,0.0,0,0
5726362,399,CASH_OUT,104446.36,C1232806766,101215.0,0.0,C1117209015,62403.45,166849.81,0,0


# FeatureCreator Class

In [138]:
class FeatureCreator():
    """
    Builds per-account history features on the transaction data.

    The train, validation and test frames are concatenated (in that order)
    so that the history of an account can span the split boundaries, the
    features are added to the combined frame ``self.df``, and
    ``split_datasets`` / ``save_datasets`` cut the result back into the
    three original pieces.

    Every feature method keeps the number of rows, their order and the
    index of ``self.df`` unchanged and only appends (or overwrites) its own
    feature columns.
    """

    # grouping keys: every feature is computed once per origin and once per destination
    _TARGETS = ("orig", "dest")

    # anchor used to turn the integer `step` column (treated as hours) into timestamps
    _REFERENCE_TIME = "2025-01-01 00:00:00"

    def __init__(self, df_train : pd.DataFrame, df_val : pd.DataFrame, df_test : pd.DataFrame):
        """
        Concatenates the three splits, drops the leak-prone columns and
        renames the remaining ones.

        Parameters
        ----------
        df_train, df_val, df_test : pd.DataFrame
            The three splits, in chronological order. They must contain the
            columns dropped by ``remove_data_leak_cols`` and the columns
            renamed by ``rename_columns``. The inputs are not modified.
        """

        # initialize a copy of the df so the caller's frames are never mutated
        self.df = pd.concat([df_train, df_val, df_test], axis = 0).copy()

        # boundary index labels of the three inputs; only used by
        # split_datasets when the row count of self.df was changed from outside
        self.idx_sep_list = [df_train.index[-1], df_val.index[0], df_val.index[-1], df_test.index[0]]

        # row counts of the three inputs, used to re-split the frame by position
        self._split_sizes = [len(df_train), len(df_val), len(df_test)]

        # remove and rename columns
        self.remove_data_leak_cols()
        self.rename_columns()

    def remove_data_leak_cols(self):
        """
        Removes the columns determined to be possible causes of
        a data leakage in the eda.

        Raises
        ------
        KeyError
            If one of the balance columns is not present in ``self.df``.
        """
        col_to_remove = ["oldbalanceOrg", "newbalanceOrig", "oldbalanceDest", "newbalanceDest"]

        self.df = self.df.drop(columns = col_to_remove)

    def rename_columns(self):
        """
        Renames the columns for more readability.
        """

        self.df = self.df.rename(columns = {
            "nameOrig" : "orig",
            "nameDest" : "dest",
            "isFraud" : "fraud",
            "isFlaggedFraud" : "flag_fraud",
            })

    def _add_rolling_feature(self, value_col : str, agg : str, n : int, prefix : str) -> None:
        """
        Adds ``{prefix}_{n}h_orig`` and ``{prefix}_{n}h_dest`` to ``self.df``.

        For every row the rolling aggregation ``agg`` ("count" or "mean") of
        ``value_col`` is taken over the transactions of the same account in
        the ``n`` hours before the row's own timestamp (``closed = "left"``,
        so the row's own timestamp is excluded). Rows without
        any transaction in that window get 0.
        """

        # positional working copy: only the columns needed, with a 0..N-1 index
        keyed = self.df[[*self._TARGETS, value_col]].reset_index(drop = True)
        keyed["datetime"] = (
            pd.Timestamp(self._REFERENCE_TIME)
            + pd.to_timedelta(self.df["step"], unit = "h")
        ).to_numpy()

        for target in self._TARGETS:
            feature = f"{prefix}_{n}h_{target}"

            rolled = (
                keyed.sort_values([target, "datetime"])
                     .groupby(target)
                     .rolling(f"{n}h", on = "datetime", closed = "left")[value_col]
            )

            # One lookup row per (account, timestamp). De-duplicating on the
            # key (not on the whole row) guarantees the merge below can never
            # multiply rows; the first row of a timestamp is the one whose
            # window holds only strictly earlier transactions.
            lookup = (
                getattr(rolled, agg)()
                .reset_index()
                .rename(columns = {value_col: feature})
                .drop_duplicates(subset = [target, "datetime"], keep = "first")
            )

            # a left merge keeps the order of the left keys, so the result can
            # be written back by position without touching self.df's index
            merged = keyed[[target, "datetime"]].merge(
                lookup,
                on = [target, "datetime"],
                how = "left",
                validate = "many_to_one"
            )

            # only the new feature is zero-filled, other columns are left alone
            self.df[feature] = merged[feature].fillna(0).to_numpy()

    def count_txn_in_n_hours(self, n : int = 24):
        """
        Creates two columns, ``n_txn_{n}h_orig`` and ``n_txn_{n}h_dest``,
        that count how many transactions of the same origin (respectively
        destination) occurred in the ``n`` hours before each transaction.

        Parameters
        ----------
        n : int
            Width of the look-back window in hours.
        """

        self._add_rolling_feature(value_col = "step", agg = "count", n = n, prefix = "n_txn")

    def avg_amount_in_n_hours(self, n : int = 48):
        """
        Creates two columns, ``avg_amnt_{n}h_orig`` and ``avg_amnt_{n}h_dest``,
        containing the average transaction amount of the same origin
        (respectively destination) in the ``n`` hours before each
        transaction. The value is 0 when there is no earlier transaction
        in the window.

        Parameters
        ----------
        n : int
            Width of the look-back window in hours.
        """

        self._add_rolling_feature(value_col = "amount", agg = "mean", n = n, prefix = "avg_amnt")

    def ratio_amount_by_average(self):
        """
        Creates two columns, ``amt/avg_amt_orig`` and ``amt/avg_amt_dest``,
        with the ratio between the amount of a transaction and the average
        amount of all earlier transactions of the same origin (respectively
        destination).

        The ratio is 0 when there is no usable baseline, i.e. the account
        has no earlier transaction or its earlier amounts average to 0
        (which would otherwise produce an infinite ratio).
        """

        for target in self._TARGETS:
            # positional copy sorted per account; rows sharing a step keep
            # their original relative order
            ordered = (
                self.df[[target, "step", "amount"]]
                .reset_index(drop = True)
                .sort_values(by = [target, "step"])
            )

            per_account = ordered.groupby(by = target)["amount"]

            # running totals that exclude the current transaction
            prior_sum = per_account.cumsum() - ordered["amount"]
            prior_count = per_account.cumcount()

            prior_avg = prior_sum / prior_count
            # a zero average is no baseline: mask it so the division gives NaN, not inf
            prior_avg = prior_avg.where(prior_avg != 0)

            ratio = (ordered["amount"] / prior_avg).fillna(0)

            # sort_index restores the original row order before writing back
            self.df[f"amt/avg_amt_{target}"] = ratio.sort_index().to_numpy()

    def time_delta_last_transaction(self):
        """
        Creates two columns, ``t_delta_txn_orig`` and ``t_delta_txn_dest``,
        which report the difference in ``step`` to the previous transaction
        of the same origin (respectively destination). The first transaction
        of an account gets 0.
        """

        for target in self._TARGETS:
            ordered = (
                self.df[[target, "step"]]
                .reset_index(drop = True)
                .sort_values(by = [target, "step"])
            )

            delta = ordered.groupby(by = target)["step"].diff().fillna(0)

            # sort_index restores the original row order before writing back
            self.df[f"t_delta_txn_{target}"] = delta.sort_index().to_numpy()

    def split_datasets(self) -> list[pd.DataFrame]:
        """
        Returns a list with respectively the train, validation, and
        test datasets.

        The frame is cut by position using the row counts of the inputs,
        which does not depend on the index labels. If the number of rows of
        ``self.df`` was changed from outside the class, the split falls back
        to slicing by the boundary index labels recorded at construction.
        """

        n_train, n_val, n_test = self._split_sizes

        if len(self.df) == n_train + n_val + n_test:
            val_end = n_train + n_val
            return [self.df.iloc[:n_train], self.df.iloc[n_train:val_end], self.df.iloc[val_end:]]

        # row count no longer matches the inputs: slice by label instead
        train_last, val_first, val_last, test_first = self.idx_sep_list
        return [self.df.loc[:train_last], self.df.loc[val_first:val_last], self.df.loc[test_first:]]

    def save_datasets(self, output_dir : str = "../data/processed"):
        """
        Saves the three separate datasets as ``train_data.csv``,
        ``validation_data.csv`` and ``test_data.csv`` for further use.

        Parameters
        ----------
        output_dir : str
            Existing directory the files are written to. Defaults to the
            processed data folder.
        """
        from pathlib import Path

        target_dir = Path(output_dir)
        train_data, validation_data, test_data = self.split_datasets()

        train_data.to_csv(target_dir / "train_data.csv")
        validation_data.to_csv(target_dir / "validation_data.csv")
        test_data.to_csv(target_dir / "test_data.csv")

# Creating Features

In [139]:
# Combine the three splits into one frame on which the features are built.
feat_creator = FeatureCreator(
    df_train = df_train,
    df_val = df_val,
    df_test = df_test,
)

In [140]:
# Combined frame right after construction: the four balance columns are
# dropped and the name / fraud columns are renamed.
feat_creator.df

Unnamed: 0,step,type,amount,orig,dest,fraud,flag_fraud
0,1,PAYMENT,9839.64,C1231006815,M1979787155,0,0
1,1,PAYMENT,1864.28,C1666544295,M2044282225,0,0
2,1,TRANSFER,181.00,C1305486145,C553264065,1,0
3,1,CASH_OUT,181.00,C840083671,C38997010,1,0
4,1,PAYMENT,11668.14,C2048537720,M1230701703,0,0
...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,C776919290,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,C1881841831,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,C1365125890,1,0
6362618,743,TRANSFER,850002.52,C1685995037,C2080388513,1,0


## count_txn_in_n_hours

In [141]:
# Default window n = 24 hours; adds n_txn_24h_orig and n_txn_24h_dest.
feat_creator.count_txn_in_n_hours()

In [142]:
# Inspect the frame with the two transaction-count columns added.
feat_creator.df

Unnamed: 0,step,type,amount,orig,dest,fraud,flag_fraud,n_txn_24h_orig,n_txn_24h_dest
0,1,PAYMENT,9839.64,C1231006815,M1979787155,0,0,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,M2044282225,0,0,0.0,0.0
2,1,TRANSFER,181.00,C1305486145,C553264065,1,0,0.0,0.0
3,1,CASH_OUT,181.00,C840083671,C38997010,1,0,0.0,0.0
4,1,PAYMENT,11668.14,C2048537720,M1230701703,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,C776919290,1,0,0.0,0.0
6362616,743,TRANSFER,6311409.28,C1529008245,C1881841831,1,0,0.0,0.0
6362617,743,CASH_OUT,6311409.28,C1162922333,C1365125890,1,0,0.0,0.0
6362618,743,TRANSFER,850002.52,C1685995037,C2080388513,1,0,0.0,0.0


## avg_amount_in_n_hours

In [143]:
# Default window n = 48 hours; adds avg_amnt_48h_orig and avg_amnt_48h_dest.
feat_creator.avg_amount_in_n_hours()

In [144]:
# Inspect the frame with the two average-amount columns added.
feat_creator.df

Unnamed: 0,step,type,amount,orig,dest,fraud,flag_fraud,n_txn_24h_orig,n_txn_24h_dest,avg_amnt_48h_orig,avg_amnt_48h_dest
0,1,PAYMENT,9839.64,C1231006815,M1979787155,0,0,0.0,0.0,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,M2044282225,0,0,0.0,0.0,0.0,0.0
2,1,TRANSFER,181.00,C1305486145,C553264065,1,0,0.0,0.0,0.0,0.0
3,1,CASH_OUT,181.00,C840083671,C38997010,1,0,0.0,0.0,0.0,0.0
4,1,PAYMENT,11668.14,C2048537720,M1230701703,0,0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,C776919290,1,0,0.0,0.0,0.0,0.0
6362616,743,TRANSFER,6311409.28,C1529008245,C1881841831,1,0,0.0,0.0,0.0,0.0
6362617,743,CASH_OUT,6311409.28,C1162922333,C1365125890,1,0,0.0,0.0,0.0,0.0
6362618,743,TRANSFER,850002.52,C1685995037,C2080388513,1,0,0.0,0.0,0.0,0.0


## ratio_amount_by_average

In [145]:
# Adds amt/avg_amt_orig and amt/avg_amt_dest: amount divided by the average
# of the account's earlier amounts.
feat_creator.ratio_amount_by_average()

In [146]:
# Inspect the frame with the two amount-ratio columns added.
feat_creator.df

Unnamed: 0,step,type,amount,orig,dest,fraud,flag_fraud,n_txn_24h_orig,n_txn_24h_dest,avg_amnt_48h_orig,avg_amnt_48h_dest,amt/avg_amt_orig,amt/avg_amt_dest
0,1,PAYMENT,9839.64,C1231006815,M1979787155,0,0,0.0,0.0,0.0,0.0,0.0,0.000000
1,1,PAYMENT,1864.28,C1666544295,M2044282225,0,0,0.0,0.0,0.0,0.0,0.0,0.000000
2,1,TRANSFER,181.00,C1305486145,C553264065,1,0,0.0,0.0,0.0,0.0,0.0,0.000000
3,1,CASH_OUT,181.00,C840083671,C38997010,1,0,0.0,0.0,0.0,0.0,0.0,0.000000
4,1,PAYMENT,11668.14,C2048537720,M1230701703,0,0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,C776919290,1,0,0.0,0.0,0.0,0.0,0.0,1.044852
6362616,743,TRANSFER,6311409.28,C1529008245,C1881841831,1,0,0.0,0.0,0.0,0.0,0.0,0.000000
6362617,743,CASH_OUT,6311409.28,C1162922333,C1365125890,1,0,0.0,0.0,0.0,0.0,0.0,38.719870
6362618,743,TRANSFER,850002.52,C1685995037,C2080388513,1,0,0.0,0.0,0.0,0.0,0.0,0.000000


## time_delta_last_transaction

In [147]:
# Adds t_delta_txn_orig and t_delta_txn_dest: difference in `step` to the
# previous transaction of the same origin / destination.
feat_creator.time_delta_last_transaction()

In [148]:
# Inspect the frame with the two time-delta columns added.
feat_creator.df

Unnamed: 0,step,type,amount,orig,dest,fraud,flag_fraud,n_txn_24h_orig,n_txn_24h_dest,avg_amnt_48h_orig,avg_amnt_48h_dest,amt/avg_amt_orig,amt/avg_amt_dest,t_delta_txn_orig,t_delta_txn_dest
0,1,PAYMENT,9839.64,C1231006815,M1979787155,0,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,M2044282225,0,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,1,TRANSFER,181.00,C1305486145,C553264065,1,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,1,CASH_OUT,181.00,C840083671,C38997010,1,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,1,PAYMENT,11668.14,C2048537720,M1230701703,0,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,C776919290,1,0,0.0,0.0,0.0,0.0,0.0,1.044852,0.0,465.0
6362616,743,TRANSFER,6311409.28,C1529008245,C1881841831,1,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
6362617,743,CASH_OUT,6311409.28,C1162922333,C1365125890,1,0,0.0,0.0,0.0,0.0,0.0,38.719870,0.0,489.0
6362618,743,TRANSFER,850002.52,C1685995037,C2080388513,1,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


# Final Result

In [149]:
# Combined frame with all eight engineered feature columns.
feat_creator.df

Unnamed: 0,step,type,amount,orig,dest,fraud,flag_fraud,n_txn_24h_orig,n_txn_24h_dest,avg_amnt_48h_orig,avg_amnt_48h_dest,amt/avg_amt_orig,amt/avg_amt_dest,t_delta_txn_orig,t_delta_txn_dest
0,1,PAYMENT,9839.64,C1231006815,M1979787155,0,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
1,1,PAYMENT,1864.28,C1666544295,M2044282225,0,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,1,TRANSFER,181.00,C1305486145,C553264065,1,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,1,CASH_OUT,181.00,C840083671,C38997010,1,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,1,PAYMENT,11668.14,C2048537720,M1230701703,0,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,C776919290,1,0,0.0,0.0,0.0,0.0,0.0,1.044852,0.0,465.0
6362616,743,TRANSFER,6311409.28,C1529008245,C1881841831,1,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
6362617,743,CASH_OUT,6311409.28,C1162922333,C1365125890,1,0,0.0,0.0,0.0,0.0,0.0,38.719870,0.0,489.0
6362618,743,TRANSFER,850002.52,C1685995037,C2080388513,1,0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0


# Saving the Data

In [150]:
# Splits the frame back into train / validation / test and writes each one
# as a CSV file under ../data/processed/.
feat_creator.save_datasets()