In [4]:
import dgl
import torch
import pandas as pd
from dgl.nn import GraphConv
import torch.nn.functional as F

In [5]:

#load datasets
df_classes = pd.read_csv(r"C:\Users\User\Desktop\UAB\3rd-year\2nd-semester\synthesis project II\elliptic_bitcoin_dataset\elliptic_txs_classes.csv")
df_edges = pd.read_csv(r"C:\Users\User\Desktop\UAB\3rd-year\2nd-semester\synthesis project II\elliptic_bitcoin_dataset\elliptic_txs_edgelist.csv")
df_features = pd.read_csv(r"C:\Users\User\Desktop\UAB\3rd-year\2nd-semester\synthesis project II\elliptic_bitcoin_dataset\elliptic_txs_features.csv", header=None)



In [6]:
df_classes = df_classes[df_classes['class'] != "unknown"]
# Change column names --> Column 1 is txId, Column 2 is timestep and the rest are unknown features
df_features.columns = ['txId', 'timestep'] + ['f' + str(i) for i in range(165)]

# Remove all edges that do not appear in classesDF
df_features = df_features[df_features['txId'].isin(df_classes['txId'])]

df_edges = df_edges[df_edges['txId1'].isin(df_classes['txId']) & df_edges['txId2'].isin(df_classes['txId'])]

In [7]:
# Identifica los nodos en df_classes y df_features
nodos_classes = set(df_classes['txId'])
nodos_features = set(df_features['txId'])

# Encuentra la intersección de nodos entre clases y características
nodos_comunes = nodos_classes.intersection(nodos_features)

# Filtrar df_edges para asegurar que ambos nodos de cada arista estén en nodos_comunes
df_edges = df_edges[df_edges['txId1'].isin(nodos_comunes) & df_edges['txId2'].isin(nodos_comunes)]

# Filtrar df_features y df_classes para incluir solo los nodos comunes
df_features = df_features[df_features['txId'].isin(nodos_comunes)]
df_classes = df_classes[df_classes['txId'].isin(nodos_comunes)]

In [8]:
# Asegurarte de que df_edges solo contiene nodos presentes en los otros DataFrames
assert all(df_edges['txId1'].isin(df_features['txId']))
assert all(df_edges['txId2'].isin(df_features['txId']))

# Verificar que df_features y df_classes contienen los mismos nodos
assert set(df_features['txId']) == set(df_classes['txId'])

In [9]:
#We assign 0 to ilicit transacions and 1 to licit ones
df_classes['class'] = df_classes['class'].replace({'1': 0, '2': 1})

In [10]:
unique_nodes_classes = df_classes['txId'].nunique()

unique_nodes_edges = pd.concat([df_edges['txId1'], df_edges['txId2']]).nunique()

unique_nodes_features = df_features['txId'].nunique()

# Imprimir el número de nodos únicos en cada dataset
print(f"Nodos únicos en df_classes: {unique_nodes_classes}")
print(f"Nodos únicos en df_edges: {unique_nodes_edges}")
print(f"Nodos únicos en df_features: {unique_nodes_features}")

Nodos únicos en df_classes: 46564
Nodos únicos en df_edges: 35874
Nodos únicos en df_features: 46564


In [11]:
#reindexar los nodos para que empicen desde 0
node_mapping = {old_id: new_id for new_id, old_id in enumerate(df_features['txId'].unique())}

# Aplicar este mapeo a df_edges para actualizar los identificadores de nodos a la nueva secuencia
df_edges['txId1'] = df_edges['txId1'].map(node_mapping)
df_edges['txId2'] = df_edges['txId2'].map(node_mapping)

# Ahora, cuando crees el grafo y asignes características y etiquetas, el tamaño debería coincidir
g = dgl.graph((df_edges['txId1'].values, df_edges['txId2'].values))
g.ndata['feat'] = torch.tensor(df_features.iloc[:, 2:].values, dtype=torch.float32)  # Asumiendo que las dos primeras columnas no son características
g.ndata['label'] = torch.tensor(df_classes['class'].astype(int).values, dtype=torch.long)  # Asegurándonos de que las clases están en formato numérico

# Definir el modelo GCN
class GCNModel(torch.nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCNModel, self).__init__()
        self.conv1 = GraphConv(in_feats, h_feats, allow_zero_in_degree=True)
        self.conv2 = GraphConv(h_feats, num_classes, allow_zero_in_degree=True)

    def forward(self, g, inputs):
        h = self.conv1(g, inputs)
        h = torch.relu(h)
        h = self.conv2(g, h)
        return h

# Instanciar y entrenar el modelo
model = GCNModel(g.ndata['feat'].shape[1], 16, 2)  # Asumiendo 2 clases (lícito, ilícito) y omitiendo 'unknown'


In [12]:
import torch
import torch.nn.functional as F
from dgl.data import DGLDataset
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

def train(model, graph, features, labels, train_mask, optimizer):
    model.train()
    optimizer.zero_grad()
    out = model(graph, features)
    loss = F.cross_entropy(out[train_mask], labels[train_mask])
    loss.backward()
    optimizer.step()
    return loss.item()

def evaluate(model, graph, features, labels, mask):
    model.eval()
    with torch.no_grad():
        logits = model(graph, features)
        logits = logits[mask]
        labels = labels[mask]
        _, indices = torch.max(logits, dim=1)
        correct = torch.sum(indices == labels)
        return correct.item() * 1.0 / len(labels)

# Suponiendo que ya tienes un train_mask y un test_mask para indicar qué nodos usar para entrenamiento y prueba
train_mask = torch.rand(len(g.ndata['label'])) < 0.8  # Ejemplo para generar una máscara; ajusta según tus datos
test_mask = ~train_mask

optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

best_test_acc = 0.0
best_model_path = 'best_model_GCN.pth'

# Entrenamiento
for epoch in range(1000):  # Número de épocas
    loss = train(model, g, g.ndata['feat'], g.ndata['label'], train_mask, optimizer)
    train_acc = evaluate(model, g, g.ndata['feat'], g.ndata['label'], train_mask)
    print(f'Epoch {epoch}, Loss: {loss:.4f}, Train Acc: {train_acc:.4f}')

    # Evaluación en los datos de prueba
    test_acc = evaluate(model, g, g.ndata['feat'], g.ndata['label'], test_mask)
    print(f'Test Accuracy: {test_acc:.4f}')
    
    # Guardar el modelo si la precisión actual es mejor que la mejor precisión hasta el momento
    if test_acc > best_test_acc:
        best_test_acc = test_acc
        torch.save(model.state_dict(), best_model_path)
        print("Model saved")

# Cargar el mejor modelo guardado
best_model = GCNModel(g.ndata['feat'].shape[1], 16, 2)
best_model.load_state_dict(torch.load(best_model_path))
best_model.eval()

# Evaluación del mejor modelo en los datos de prueba
test_acc = evaluate(best_model, g, g.ndata['feat'], g.ndata['label'], test_mask)
print(f'best acc: {test_acc:.4f}')


Epoch 0, Loss: 1.2235, Train Acc: 0.7140
Test Accuracy: 0.7106
Model saved
Epoch 1, Loss: 0.9021, Train Acc: 0.7677
Test Accuracy: 0.7650
Model saved
Epoch 2, Loss: 0.7476, Train Acc: 0.8057
Test Accuracy: 0.8042
Model saved
Epoch 3, Loss: 0.6703, Train Acc: 0.8363
Test Accuracy: 0.8306
Model saved
Epoch 4, Loss: 0.6261, Train Acc: 0.8521
Test Accuracy: 0.8478
Model saved
Epoch 5, Loss: 0.5965, Train Acc: 0.8619
Test Accuracy: 0.8569
Model saved
Epoch 6, Loss: 0.5737, Train Acc: 0.8681
Test Accuracy: 0.8637
Model saved
Epoch 7, Loss: 0.5545, Train Acc: 0.8745
Test Accuracy: 0.8696
Model saved
Epoch 8, Loss: 0.5380, Train Acc: 0.8799
Test Accuracy: 0.8739
Model saved
Epoch 9, Loss: 0.5236, Train Acc: 0.8839
Test Accuracy: 0.8788
Model saved
Epoch 10, Loss: 0.5108, Train Acc: 0.8873
Test Accuracy: 0.8827
Model saved
Epoch 11, Loss: 0.4995, Train Acc: 0.8900
Test Accuracy: 0.8866
Model saved
Epoch 12, Loss: 0.4894, Train Acc: 0.8928
Test Accuracy: 0.8885
Model saved
Epoch 13, Loss: 0.4801