In [1]:
import os

In [4]:
computation = 'cloud'

In [5]:
if computation == 'cloud' :
  from google.colab import drive
  drive.mount('/content/drive')
  data_path = '/content/drive/MyDrive/uco_fraud_detector/data/transaction_fraud'
  model_save_path = '/content/drive/MyDrive/uco_fraud_detector/models'
  customer_fr_path = '/content/drive/MyDrive/uco_fraud_detector/models/customer_fr_database'
  merchant_fr_path = '/content/drive/MyDrive/uco_fraud_detector/models/merchant_fr_database'
  globalmodel_path = '/content/drive/MyDrive/uco_fraud_detector/models/globalmodel'
else :
  data_path = '../data/transaction_fraud'
  model_save_path = '../models'
  customer_fr_path = '../models/customer_fr_database'
  merchant_fr_path = '../models/merchant_fr_database'
  globalmodel_path = '../models/globalmodel'

print("searching for data path" , "found" if os.path.exists(data_path) else "not found")
print(os.listdir(data_path))

Mounted at /content/drive
searching for data path found
['bs140513_032310.csv', 'bsNET140513_032310.csv']


In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.preprocessing import LabelEncoder , StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder, StandardScaler
from collections import defaultdict
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Multiply
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
import pickle

In [7]:
np.random.seed(21)
tf.random.set_seed(21)

In [8]:
data_file = os.path.join(data_path, 'bs140513_032310.csv')

In [9]:
df = pd.read_csv(data_file)

In [10]:
data_preprocess_path = '/content/drive/MyDrive/uco_fraud_detector/data/data_preprocess'
with open(data_preprocess_path, 'rb') as f :
  data_preprocess = pickle.load(f)

In [11]:
df = df.replace("'" , "" , regex=True)
df = df.drop(['zipcodeOri' , 'zipMerchant'] , axis = 1)
df['account_frequency'] = df['customer'].apply(lambda x: data_preprocess['customer_frequency_rating_encoder'][x])
df['merchant_frequency'] = df['merchant'].apply(lambda x: data_preprocess['merchant_frequency_rating_encoder'][x])
df = df.rename(columns={'customer': 'account_id', 'merchant': 'merchant_id', 'step': 'time_delta'})
df = df.sort_values(['account_id', 'time_delta'])
df = df.reset_index(drop=True)
for c in data_preprocess['CATEGORICAL_COLS'] :
  df[c] = data_preprocess[c + '_label_encoder'].transform(df[c])
df[data_preprocess['NUMERICAL_COLS']] = data_preprocess['numerical_scaler'].transform(df[data_preprocess['NUMERICAL_COLS']])
df.head()

Unnamed: 0,time_delta,account_id,age,gender,merchant_id,category,amount,fraud,account_frequency,merchant_frequency
0,-1.272914,C1000148617,5,2,22,9,0.951322,0,-0.918953,-2.237926
1,-1.116216,C1000148617,5,2,15,10,-0.190302,0,-0.918953,-2.241786
2,-1.037867,C1000148617,5,2,22,9,0.164178,0,-0.918953,-2.237926
3,-1.01828,C1000148617,5,2,42,11,-0.207806,0,-0.918953,-2.233031
4,-0.998692,C1000148617,5,2,18,12,0.085544,0,-0.918953,0.765246


In [39]:
import pickle
embeddings_path = '/content/drive/MyDrive/uco_fraud_detector/data/account_embedding'
with open(embeddings_path, 'rb') as f :
  embeddings = pickle.load(f)

account_ids = df['account_id']
account_to_idx = {account_id: idx for idx, account_id in enumerate(account_ids)}
print("number of account in data set : " , df['account_id'].nunique())
print("number of account in embedding : " , len(embeddings.keys()))

number of account in data set :  4112
number of account in embedding :  4112


In [28]:
NUMERICAL_COLS = data_preprocess['NUMERICAL_COLS']
CATEGORICAL_COLS = data_preprocess['CATEGORICAL_COLS']
TARGET_COL = data_preprocess['TARGET_COL']

In [29]:
k = 60  # Historical transactions per account
f = 8   # Transactions per neighbor account
n_neighbors = 5  # Number of nearest neighbor accounts
seq_length = k + n_neighbors * f  # Total sequence length: 60 + 5 * 8 = 100

In [45]:

def create_relevant_neighbors_sequences(df, embeddings, seq_length, numerical_cols, categorical_cols, target_col, k=60, f=8, n_neighbors=5):
    num_features = len(numerical_cols) + len(categorical_cols)
    total_transactions = len(df)
    sequences = np.zeros((total_transactions, seq_length, num_features), dtype=np.float32)
    fraud_masks = np.zeros((total_transactions, seq_length), dtype=np.float32)
    neighbor_masks = np.zeros((total_transactions, seq_length), dtype=np.float32)
    relevance_scores = np.zeros((total_transactions, seq_length), dtype=np.float32)
    targets = df[target_col].values

    # Prepare embeddings array for NearestNeighbors
    embedding_array = np.array(list(embeddings.values()))
    nbrs = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine').fit(embedding_array)

    # Precompute account features
    account_features = df.groupby('account_id')[numerical_cols + categorical_cols].apply(lambda x: x.values)

    for idx in range(total_transactions):
        current_account_id = df.iloc[idx]['account_id']
        current_embedding = embeddings[current_account_id].reshape(1, -1)
        current_transaction = df.iloc[idx][numerical_cols + categorical_cols].values.reshape(1, -1)
        current_fraud = df.iloc[idx][target_col]

        # Get own account's historical data
        account_data = account_features[current_account_id]
        account_data = account_data[account_data[:, numerical_cols.index('time_delta')] <= df.iloc[idx]['time_delta']]
        if len(account_data) > k:
            account_data = account_data[-k:]  # Take last k transactions
        elif len(account_data) < k:
            account_data = np.pad(account_data, ((0, k - len(account_data)), (0, 0)), mode='constant')

        # Find nearest neighbor accounts
        distances, neighbor_indices = nbrs.kneighbors(current_embedding)
        neighbor_account_ids = [account_ids[i] for i in neighbor_indices[0] if account_ids[i] != current_account_id][:n_neighbors]

        # Collect neighbor transactions, limiting total to fit seq_length
        neighbor_transactions = []
        neighbor_frauds = []
        remaining_slots = seq_length - k  # Slots available for neighbor transactions
        max_neighbor_trans = min(f, remaining_slots // n_neighbors)  # Adjust f based on available slots

        for neighbor_id in neighbor_account_ids:
            neighbor_data = account_features[neighbor_id]
            neighbor_data = neighbor_data[neighbor_data[:, numerical_cols.index('time_delta')] <= df.iloc[idx]['time_delta']]
            if len(neighbor_data) > max_neighbor_trans:
                neighbor_data = neighbor_data[-max_neighbor_trans:]  # Take up to max_neighbor_trans
            neighbor_transactions.extend(neighbor_data)
            neighbor_frauds.extend(df[df['account_id'] == neighbor_id].iloc[-len(neighbor_data):][target_col].values)

        # Combine sequences
        seq = np.concatenate([account_data, np.array(neighbor_transactions)])
        if len(seq) > seq_length:
            seq = seq[:seq_length]  # Truncate if exceeds
        elif len(seq) < seq_length:
            padding_len = seq_length - len(seq)
            seq = np.pad(seq, ((0, padding_len), (0, 0)), mode='constant')

        # Masks and relevance
        total_frauds = [current_fraud] + neighbor_frauds
        fraud_mask = np.pad(total_frauds, (0, seq_length - len(total_frauds)), mode='constant')[:seq_length]
        neighbor_mask = np.zeros(seq_length)
        neighbor_mask[k:k + len(neighbor_transactions)] = 1  # Set 1 for neighbor transactions
        if k + len(neighbor_transactions) > seq_length:
            neighbor_mask = neighbor_mask[:seq_length]  # Truncate if necessary
        relevance = np.ones(seq_length)
        relevance[k:k + len(neighbor_transactions)] = 0.5  # Lower relevance for neighbors
        if k + len(neighbor_transactions) > seq_length:
            relevance = relevance[:seq_length]  # Truncate if necessary

        sequences[idx] = seq
        fraud_masks[idx] = fraud_mask
        neighbor_masks[idx] = neighbor_mask
        relevance_scores[idx] = relevance

        # Progress
        progress = (idx + 1) / total_transactions * 100
        if progress%5 == 0 :
          print(f"Processing transaction {idx + 1}/{total_transactions} ({progress:.2f}%)")

    return sequences, fraud_masks, neighbor_masks, relevance_scores, targets



In [46]:
sequences, fraud_masks, neighbor_masks, relevance_scores, targets = create_relevant_neighbors_sequences(
        df, embeddings, seq_length, NUMERICAL_COLS, CATEGORICAL_COLS, TARGET_COL
    )

ValueError: index can't contain negative values