In [1]:
import pandas as pd
from sklearn import preprocessing
import tensorflow as tf

In [2]:
# Read the transactions CSV into a DataFrame
csv_path = 'creditcard/fraudTrain.csv'  # replace with actual file path
df = pd.read_csv(csv_path)

In [3]:
# Label Encoding Function
def df_label_encoder(df, columns):
    """Overwrite each listed column of ``df`` with integer label codes.

    Values are cast to ``str`` before encoding, and the encoder is re-fitted
    on every column, so the codes of one column are independent of the others.
    The frame is modified in place and the same object is returned.

    Args:
        df: DataFrame that contains the columns to encode.
        columns: Iterable of column names to encode.

    Returns:
        ``df`` itself, with the listed columns replaced by their codes.
    """
    encoder = preprocessing.LabelEncoder()
    for name in columns:
        text_values = df[name].astype(str)
        df[name] = encoder.fit_transform(text_values)
    return df

In [4]:
# Preprocessing the data (Label Encoding and feature engineering)
def preprocess(df):
    """Label-encode the categorical columns and min-max scale the timestamp.

    The columns ``merchant``, ``category``, ``gender``, ``state`` and ``job``
    are replaced by integer codes via ``df_label_encoder``.  The
    ``trans_date_trans_time`` column is parsed as datetimes and rescaled to
    ``[0, 1]`` (0 = earliest timestamp, 1 = latest).

    The frame is modified in place (columns are overwritten) and returned.

    Args:
        df: DataFrame containing the columns named above.

    Returns:
        The processed DataFrame.
    """
    # List of columns to label encode
    label_columns = ['merchant', 'category', 'gender', 'state', 'job']

    # Perform label encoding on categorical columns
    df = df_label_encoder(df, label_columns)

    # Scale timestamps with vectorised timedelta arithmetic instead of a
    # per-row ``Timestamp.value`` lookup.  min()/max() skip NaT, so a missing
    # timestamp stays NaN rather than being turned into a huge negative
    # integer that would distort the whole scale.
    timestamps = pd.to_datetime(df['trans_date_trans_time'])
    earliest = timestamps.min()
    span = timestamps.max() - earliest

    if pd.notna(span) and span > pd.Timedelta(0):
        df['trans_date_trans_time'] = (timestamps - earliest) / span
    else:
        # Degenerate range (empty frame, one distinct timestamp, or all NaT):
        # dividing by the span would give 0/0, so map valid rows to 0.0 and
        # keep missing rows as NaN.
        df['trans_date_trans_time'] = pd.Series(0.0, index=df.index).where(timestamps.notna())

    # Now return the processed dataframe
    return df


In [5]:
# Node Embedding Functions
def paid_aggregate(paying_df):
    """Build a per-account table of aggregated payment features.

    Args:
        paying_df: DataFrame with at least the columns ``cc_num`` (account
            identifier) and ``amt`` (transaction amount), one row per
            transaction.

    Returns:
        DataFrame with one row per distinct ``cc_num`` (in order of first
        appearance) and the columns ``Account`` and ``avg_amt_paid``, the
        mean ``amt`` over that account's transactions.
    """
    accounts = pd.DataFrame({'Account': paying_df['cc_num'].unique()})

    # Average transaction amount per account.  Aggregate to one value per
    # account and look it up by account id; a groupby ``transform`` would
    # return one value per *transaction*, which pandas would then align to
    # ``accounts`` by row index and attach to the wrong accounts.
    mean_amt_by_account = paying_df.groupby('cc_num')['amt'].mean()
    accounts['avg_amt_paid'] = accounts['Account'].map(mean_amt_by_account)

    return accounts

In [6]:
def get_node_attr(paying_df):
    """Return per-account node features and matching per-account labels.

    Args:
        paying_df: DataFrame with the columns ``cc_num``, ``amt`` and
            ``is_fraud``, one row per transaction.

    Returns:
        A tuple ``(node_df, node_label)``:
          * ``node_df`` - the output of ``paid_aggregate`` without the
            ``Account`` column, one row per account.
          * ``node_label`` - ``tf.float32`` tensor of shape
            ``(num_accounts, 1)``; row ``i`` is the largest ``is_fraud``
            value seen on any transaction of the account in row ``i`` of
            ``node_df``.
    """
    node_df = paid_aggregate(paying_df)

    # Labels must line up with node rows.  Using the raw ``is_fraud`` column
    # would give one label per transaction, while ``node_df`` holds one row
    # per account, so reduce to one label per account and look it up in
    # ``node_df`` row order.
    fraud_by_account = paying_df.groupby('cc_num')['is_fraud'].max()
    node_label = node_df['Account'].map(fraud_by_account).values
    node_label = tf.convert_to_tensor(node_label, dtype=tf.float32)
    node_label = tf.reshape(node_label, (-1, 1))

    # Drop the identifier column so only feature columns remain
    node_df = node_df.drop(['Account'], axis=1)

    return node_df, node_label

In [None]:
# Apply preprocessing
# NOTE(review): preprocess() overwrites columns of the frame it receives and
# this cell rebinds `df` to the result, so running the cell a second time
# feeds already-processed data back in. Re-run the CSV-loading cell first.
df = preprocess(df)

In [None]:
# Select the columns needed to build node features and labels
node_columns = ['cc_num', 'amt', 'is_fraud']  # Adjust depending on available columns
paying_df = df[node_columns]
node_df, node_label = get_node_attr(paying_df)

In [None]:
# Show the node feature table followed by the label tensor
print(node_df, node_label, sep='\n')