In [1]:
import numpy as np
import pandas as pd

In [2]:
pathToData = '../DATA/'

df_banksim = pd.read_csv(pathToData + 'bs140513_032310.csv')
df_banksim.head()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount,fraud
0,0,'C1093826151','4','M','28007','M348934600','28007','es_transportation',4.55,0
1,0,'C352968107','2','M','28007','M348934600','28007','es_transportation',39.68,0
2,0,'C2054744914','4','F','28007','M1823072687','28007','es_transportation',26.89,0
3,0,'C1760612790','3','M','28007','M348934600','28007','es_transportation',17.25,0
4,0,'C757503768','5','M','28007','M348934600','28007','es_transportation',35.72,0


## Data Preprocessing
### Data Formatting

In [3]:
df_banksim = df_banksim.drop(['zipMerchant','zipcodeOri','step'],axis=1)

df_banksim['customer']= df_banksim['customer'].str.replace('\'','')
df_banksim['age']=(df_banksim['age'].str.replace('U','7').str.replace('\'','')).astype(int)
df_banksim['gender']=df_banksim['gender'].str.replace('\'','')
df_banksim['category']=df_banksim['category'].str.replace('\'','')
df_banksim['merchant']=df_banksim['merchant'].str.replace('\'','')

### Label Encoding

In [4]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, RocCurveDisplay, precision_recall_curve
import matplotlib.pyplot as plt
encoder = LabelEncoder()

encoder.fit(df_banksim['customer'])
df_banksim['customer_encoded'] = encoder.transform(df_banksim['customer'])

encoder.fit(df_banksim['gender'])
df_banksim['gender_encoded'] = encoder.transform(df_banksim['gender'])

encoder.fit(df_banksim['category'])
df_banksim['category_encoded'] = encoder.transform(df_banksim['category'])

encoder.fit(df_banksim['merchant'])
df_banksim['merchant_encoded'] = encoder.transform(df_banksim['merchant'])

In [5]:
df_banksim_encoded = df_banksim.drop(['gender','category'],axis=1)
X = df_banksim_encoded.drop('fraud', axis=1)
y = df_banksim_encoded['fraud']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

### Derived Features

#### Graph Generation

In [6]:
import networkx as nx

class Graph:
    bc = None
    dc = None

    def __init__(self, data : pd.DataFrame = None):
        self.G = nx.Graph()
        if data is not None:
            self.add_all(data)
        
    def add_nodes(self, data: pd.DataFrame):
        self.G.add_nodes_from(data['customer'].unique())
        self.G.add_nodes_from(data['merchant'].unique())

    def add_edges(self, data: pd.DataFrame):
        for i, row in data.iterrows():
            self.G.add_edge(row['customer'], row['merchant'])

    def add_dc(self, data: pd.DataFrame):
        if self.dc is None:
            self.dc = nx.degree_centrality(self.G)

        data['dc_customer'] = pd.Series()
        data['dc_merchant'] = pd.Series()
        
        data['dc_customer'] = data['customer'].apply(lambda x : self.dc[x] if x in self.dc.keys() else 0)
        data['dc_merchant'] = data['merchant'].apply(lambda x : self.dc[x] if x in self.dc.keys() else 0)
        
        return data

    def add_bc(self, data: pd.DataFrame):
        if self.bc is None:
            self.bc = nx.betweenness_centrality(self.G)

        data['bc_customer'] = pd.Series()
        data['bc_merchant'] = pd.Series()
        
        data['bc_customer'] = data['customer'].apply(lambda x : self.bc[x] if x in self.bc.keys() else 0)
        data['bc_merchant'] = data['merchant'].apply(lambda x : self.bc[x] if x in self.bc.keys() else 0)
        
        return data

    def add_all(self, data: pd.DataFrame):
        self.add_nodes(data)
        self.add_edges(data)
        self.add_dc(data)
        self.add_bc(data)

        return data

In [7]:
train_graph = Graph(X_train)

  data['dc_customer'] = pd.Series()
  data['dc_merchant'] = pd.Series()
  data['bc_customer'] = pd.Series()
  data['bc_merchant'] = pd.Series()


In [8]:
train_graph.add_dc(X_test)
train_graph.add_bc(X_test)

  data['dc_customer'] = pd.Series()
  data['dc_merchant'] = pd.Series()
  data['bc_customer'] = pd.Series()
  data['bc_merchant'] = pd.Series()


Unnamed: 0,customer,age,merchant,amount,customer_encoded,gender_encoded,category_encoded,merchant_encoded,dc_customer,dc_merchant,bc_customer,bc_merchant
70803,C746862122,3,M348934600,11.65,3559,2,12,30,0.001923,0.943523,1.340586e-05,0.237219
470791,C1760492708,1,M1823072687,1.60,1648,1,12,18,0.002403,0.857727,3.043174e-06,0.156899
568310,C1984083185,3,M1823072687,33.36,2150,1,12,18,0.001682,0.857727,1.191243e-06,0.156899
23709,C1530262146,2,M1823072687,8.01,1150,2,12,18,0.001923,0.857727,2.329173e-06,0.156899
49723,C1471216995,3,M348934600,38.11,1020,1,12,30,0.001682,0.943523,2.187481e-06,0.237219
...,...,...,...,...,...,...,...,...,...,...,...,...
60622,C1549367339,4,M348934600,13.00,1196,1,12,30,0.001923,0.943523,2.658918e-06,0.237219
386618,C1342118588,3,M348934600,25.13,752,1,12,30,0.002163,0.943523,1.365683e-05,0.237219
529143,C1505370540,3,M1823072687,46.25,1096,1,12,18,0.002644,0.857727,1.901750e-05,0.156899
446659,C991774315,3,M1823072687,64.05,4094,1,12,18,0.001682,0.857727,1.359217e-06,0.156899


In [9]:
def calc_std_dev(data: pd.DataFrame, key: str, ret_key: str):
    std_dev = np.std(data[key])
    mean_val = np.mean(data[key])
    data[ret_key] = (data[key] - mean_val) / std_dev
    return data.drop(columns = [key])

X_train = calc_std_dev(X_train, 'dc_customer', 'customer_dc_std_away')
X_train = calc_std_dev(X_train, 'dc_merchant', 'merchant_dc_std_away')
X_train = calc_std_dev(X_train, 'bc_customer', 'customer_bc_std_away')
X_train = calc_std_dev(X_train, 'bc_merchant', 'merchant_bc_std_away')
X_test = calc_std_dev(X_test, 'dc_customer', 'customer_dc_std_away')
X_test = calc_std_dev(X_test, 'dc_merchant', 'merchant_dc_std_away')
X_test = calc_std_dev(X_test, 'bc_customer', 'customer_bc_std_away')
X_test = calc_std_dev(X_test, 'bc_merchant', 'merchant_bc_std_away')

In [10]:
X_train = X_train.drop(columns=['customer', 'merchant'])
X_test = X_test.drop(columns=['customer', 'merchant'])

In [11]:
X_train.to_csv(pathToData + 'X_train.csv')
X_test.to_csv(pathToData + 'X_test.csv')
y_train.to_csv(pathToData + 'y_train.csv')
y_test.to_csv(pathToData + 'y_test.csv')