In [9]:
import pandas as pd
import torch
from torch_geometric.data import Data, DataLoader
from torch.utils.data import Dataset




In [3]:
classesDF  = pd.read_csv(r"C:\Users\User\Desktop\UAB\3rd-year\2nd-semester\synthesis project II\elliptic_bitcoin_dataset\elliptic_txs_classes.csv")
edgesDF = pd.read_csv(r"C:\Users\User\Desktop\UAB\3rd-year\2nd-semester\synthesis project II\elliptic_bitcoin_dataset\elliptic_txs_edgelist.csv")
featuresDF = pd.read_csv(r"C:\Users\User\Desktop\UAB\3rd-year\2nd-semester\synthesis project II\elliptic_bitcoin_dataset\elliptic_txs_features.csv", header=None)
featuresDF.columns = ['txId', 'timestep'] + ['f' + str(i) for i in range(165)]

In [4]:
#class 2: LICIT // class 1: ILLICIT
classesDF['class'] = classesDF['class'].map({'2': 0, '1': 1, 'unknown': -1})

featuresDF = featuresDF.merge(classesDF, on='txId')

# Move features 'class' to first column
cols = list(featuresDF.columns)
cols = cols[:1] + [cols[-1]] + cols[1:-1]
featuresDF = featuresDF[cols]

featuresDF.head(5)

Unnamed: 0,txId,class,timestep,f0,f1,f2,f3,f4,f5,f6,...,f155,f156,f157,f158,f159,f160,f161,f162,f163,f164
0,230425980,-1,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,-1,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,-1,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,0,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,-1,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117


In [10]:


class EllipticDataset(Dataset):
    def __init__(self, features_df, edges_df, classes_df):
        self.features_df = features_df
        self.edges_df = edges_df
        self.classes_df = classes_df

        # Asegurarse que 'class' es numérico
        self.features_df['class'] = pd.Categorical(self.features_df['class']).codes

        # Mapear txId a índices continuos
        self.idx_map = {tx_id: idx for idx, tx_id in enumerate(self.features_df.index.unique())}
        self.features_df.reset_index(inplace=True)
        self.features_df['txId'] = self.features_df['txId'].map(self.idx_map)

        # Mapear txId en edges_df a los índices continuos
        self.edges_df['txId1'] = self.edges_df['txId1'].map(self.idx_map)
        self.edges_df['txId2'] = self.edges_df['txId2'].map(self.idx_map)

    def __len__(self):
        # Número de timesteps únicos
        return self.features_df['timestep'].nunique()

    def __getitem__(self, idx):
        # Seleccionar datos por timestep
        timestep = idx + 1
        mask = self.features_df['timestep'] == timestep
        timestep_data = self.features_df[mask]
        timestep_edges = self.edges_df[self.edges_df['txId1'].isin(timestep_data['txId']) & self.edges_df['txId2'].isin(timestep_data['txId'])]

        # Extraer características y etiquetas
        x = torch.tensor(timestep_data.drop(['class', 'timestep', 'txId'], axis=1).values, dtype=torch.float)
        y = torch.tensor(timestep_data['class'].values, dtype=torch.long)

        # Construir edge_index
        edge_index = torch.tensor(timestep_edges[['txId1', 'txId2']].values.T, dtype=torch.long)

        return Data(x=x, edge_index=edge_index, y=y)




In [12]:
# Ejemplo de creación y uso del dataset
dataset = EllipticDataset(featuresDF, edgesDF, classesDF)

batch_size = 32
loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

num_batches = 0  # Contador para los lotes
num_elements = 0  # Contador para los elementos totales procesados

for data in loader:
    num_batches += 1  # Incrementar el contador de lotes
    num_elements += data.num_graphs  # Sumar el número de grafos en el lote actual
    print(data)  # Imprime los datos del lote actual

print(f"Total de lotes procesados: {num_batches}")
print(f"Total de elementos procesados: {num_elements}")





DataBatch(x=[3519, 167], edge_index=[2, 232596], y=[3519], batch=[3519], ptr=[2])
DataBatch(x=[2975, 167], edge_index=[2, 232576], y=[2975], batch=[2975], ptr=[2])
DataBatch(x=[6803, 167], edge_index=[2, 232576], y=[6803], batch=[6803], ptr=[2])
DataBatch(x=[1089, 167], edge_index=[2, 232576], y=[1089], batch=[1089], ptr=[2])
DataBatch(x=[1976, 167], edge_index=[2, 232576], y=[1976], batch=[1976], ptr=[2])
DataBatch(x=[4296, 167], edge_index=[2, 232744], y=[4296], batch=[4296], ptr=[2])
DataBatch(x=[6727, 167], edge_index=[2, 232609], y=[6727], batch=[6727], ptr=[2])
DataBatch(x=[2486, 167], edge_index=[2, 232690], y=[2486], batch=[2486], ptr=[2])
DataBatch(x=[3639, 167], edge_index=[2, 232576], y=[3639], batch=[3639], ptr=[2])
DataBatch(x=[2760, 167], edge_index=[2, 232576], y=[2760], batch=[2760], ptr=[2])
DataBatch(x=[4592, 167], edge_index=[2, 232576], y=[4592], batch=[4592], ptr=[2])
DataBatch(x=[4975, 167], edge_index=[2, 232581], y=[4975], batch=[4975], ptr=[2])
DataBatch(x=[248