In [43]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Load the transactions and discard the 'Time' column.
df = pd.read_csv('creditcard.csv').drop(columns=['Time'])

# Keep every Class == 1 row and draw ten times as many Class == 0 rows
# (seeded, so the same rows are drawn on each run).
fraud_data = df[df['Class'] == 1]
normal_pool = df[df['Class'] == 0]
normal_data = normal_pool.sample(n=10 * len(fraud_data), random_state=42)

# Sampled Class == 0 rows come first, followed by the Class == 1 rows.
selected_df = pd.concat([normal_data, fraud_data])

# Cosine similarity between every pair of selected rows, label column excluded.
features = selected_df.drop(columns='Class')
cos_sim = cosine_similarity(features)

In [49]:
# Two rows are connected when their cosine similarity is above 0.5.
# Self-pairs (i, i) are not filtered out, so they count as edges whenever
# cos_sim[i, i] passes the threshold.
mask = cos_sim > 0.5

# Row / column indices of the True entries give the endpoints of each edge.
src, dst = np.nonzero(mask)
edge_index = np.stack((src, dst), axis=0)

# Density is measured against all n * n ordered pairs of rows.
num_edges = src.shape[0]
num_nodes = features.shape[0]
density = num_edges / (num_nodes * num_nodes)

print(f'Number of edges:{num_edges}')
print(density)

Number of edges:18675162
0.6376007246768698


In [45]:
# Edge homophily: fraction of edges whose two endpoints share the same class label.
label = selected_df['Class'].values

# Compare the endpoint labels of all edges at once instead of looping over
# every (src, dst) pair in Python; the resulting count is the same.
cnt = int(np.count_nonzero(label[src] == label[dst]))

# With no edges the ratio is undefined: report NaN instead of raising
# ZeroDivisionError.
homo_ratio = cnt / len(src) if len(src) else float('nan')

print(f'homo ratio : {homo_ratio}')

homo ratio : 0.8904653143035653


In [46]:
# Build boolean train / validation masks from a stratified split (val ratio = 0.15).

from sklearn.model_selection import StratifiedShuffleSplit

X = selected_df.drop(columns=['Class']).values
y = selected_df['Class'].values

# Seed the splitter so the masks are identical on every run, consistent with
# the seeded sampling used to build selected_df.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42)

# n_splits=1, so the generator yields a single (train, val) pair of index arrays.
train, val = next(sss.split(X, y))

# Mark the validation rows, then take the complement for training so the two
# masks are disjoint and together cover every row.
val_mask = np.zeros(X.shape[0], dtype=bool)
val_mask[val] = True
train_mask = ~val_mask

In [48]:
import os

# Make sure the output directory exists before writing.
os.makedirs('./dataset/raw', exist_ok=True)

# Bundle features, labels, the similarity matrix and both split masks
# into a single .npz archive.
npz_path = './dataset/raw/creditcard.npz'
arrays = {
    'features': X,
    'label': y,
    'cosine_sim': cos_sim,
    'train_mask': train_mask,
    'val_mask': val_mask,
}
np.savez(npz_path, **arrays)

# Reload the archive and show the stored array names as a sanity check.
data = np.load(npz_path)

data.files

['features', 'label', 'cosine_sim', 'train_mask', 'val_mask']