In [1]:
import pandas as pd
import torch
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from tensorflow.keras.metrics import Accuracy

In [2]:
classesDF  = pd.read_csv(r"C:\Users\User\Desktop\UAB\3rd-year\2nd-semester\synthesis project II\elliptic_bitcoin_dataset\elliptic_txs_classes.csv")
edgesDF = pd.read_csv(r"C:\Users\User\Desktop\UAB\3rd-year\2nd-semester\synthesis project II\elliptic_bitcoin_dataset\elliptic_txs_edgelist.csv")
featuresDF = pd.read_csv(r"C:\Users\User\Desktop\UAB\3rd-year\2nd-semester\synthesis project II\elliptic_bitcoin_dataset\elliptic_txs_features.csv", header=None)
featuresDF.columns = ['txId', 'timestep'] + ['f' + str(i) for i in range(165)]

In [3]:
#class 2: LICIT // class 1: ILLICIT
classesDF['class'] = classesDF['class'].map({'2': 0, '1': 1, 'unknown': -1})

featuresDF = featuresDF.merge(classesDF, on='txId')

# Move features 'class' to first column
cols = list(featuresDF.columns)
cols = cols[:1] + [cols[-1]] + cols[1:-1]
featuresDF = featuresDF[cols]

featuresDF.head(5)

Unnamed: 0,txId,class,timestep,f0,f1,f2,f3,f4,f5,f6,...,f155,f156,f157,f158,f159,f160,f161,f162,f163,f164
0,230425980,-1,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,-1,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,-1,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,0,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,-1,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117


In [4]:
in_degree_dict = {}
out_degree_dict = {}

for _, row in edgesDF.iterrows():
    txId1, txId2 = row['txId1'], row['txId2']
    
    if txId2 not in in_degree_dict:
        in_degree_dict[txId2] = []
    in_degree_dict[txId2].append(txId1)
    
    if txId1 not in out_degree_dict:
        out_degree_dict[txId1] = []
    out_degree_dict[txId1].append(txId2)

# Filtrar los diccionarios para incluir solo IDs presentes en featuresDF_example
valid_ids = set(featuresDF['txId'])

in_degree_dict_filtered = {k: [v for v in vals if v in valid_ids] for k, vals in in_degree_dict.items() if k in valid_ids}
out_degree_dict_filtered = {k: [v for v in vals if v in valid_ids] for k, vals in out_degree_dict.items() if k in valid_ids}

# Funciones para obtener las listas de conexiones
def get_in_degree_connections(txId):
    return in_degree_dict_filtered.get(txId, [])

def get_out_degree_connections(txId):
    return out_degree_dict_filtered.get(txId, [])

# Aplicar las funciones para crear las nuevas columnas
featuresDF['in_degree_edges'] = featuresDF['txId'].apply(get_in_degree_connections)
featuresDF['out_degree_edges'] = featuresDF['txId'].apply(get_out_degree_connections)

featuresDF.head() 

Unnamed: 0,txId,class,timestep,f0,f1,f2,f3,f4,f5,f6,...,f157,f158,f159,f160,f161,f162,f163,f164,in_degree_edges,out_degree_edges
0,230425980,-1,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,[98374661],[5530458]
1,5530458,-1,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792,[230425980],[232403360]
2,232022460,-1,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792,[232000575],"[232438397, 232022462]"
3,232438397,0,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,...,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792,"[232022460, 232047899, 3877118, 230452718, 230...",[92491280]
4,230460314,-1,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,...,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117,"[3272536, 230724244]","[230459870, 230460307, 230459688, 230570333, 2..."


In [5]:
featuresDF.drop(['in_degree_edges', 'out_degree_edges'], axis=1, inplace=True)
featuresDF.head() 

Unnamed: 0,txId,class,timestep,f0,f1,f2,f3,f4,f5,f6,...,f155,f156,f157,f158,f159,f160,f161,f162,f163,f164
0,230425980,-1,1,-0.171469,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,-0.562153,-0.600999,1.46133,1.461369,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
1,5530458,-1,1,-0.171484,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,0.947382,0.673103,-0.979074,-0.978556,0.018279,-0.08749,-0.131155,-0.097524,-0.120613,-0.119792
2,232022460,-1,1,-0.172107,-0.184668,-1.201369,-0.12197,-0.043875,-0.113002,-0.061584,...,0.670883,0.439728,-0.979074,-0.978556,-0.098889,-0.106715,-0.131155,-0.183671,-0.120613,-0.119792
3,232438397,0,1,0.163054,1.96379,-0.646376,12.409294,-0.063725,9.782742,12.414558,...,-0.577099,-0.613614,0.241128,0.241406,1.072793,0.08553,-0.131155,0.677799,-0.120613,-0.119792
4,230460314,-1,1,1.011523,-0.081127,-1.201369,1.153668,0.333276,1.312656,-0.061584,...,-0.511871,-0.400422,0.517257,0.579382,0.018279,0.277775,0.326394,1.29375,0.178136,0.179117


In [6]:
"""
X = featuresDF.drop(['txId', 'class'], axis=1).values.astype(float)
y = featuresDF['class'].values.astype(float)

# Convertir -1 en 'unknown' a NaN para el filtrado
y[y == -1] = np.nan

# Dividir los datos en conjuntos etiquetados y no etiquetados
is_labeled = ~np.isnan(y)
X_labeled = X[is_labeled]
y_labeled = y[is_labeled]

# Codificación One-hot de las etiquetas
y_labeled_onehot = to_categorical(y_labeled, num_classes=2)

X_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled_onehot, test_size=0.2, random_state=42)
"""


"\nX = featuresDF.drop(['txId', 'class'], axis=1).values.astype(float)\ny = featuresDF['class'].values.astype(float)\n\n# Convertir -1 en 'unknown' a NaN para el filtrado\ny[y == -1] = np.nan\n\n# Dividir los datos en conjuntos etiquetados y no etiquetados\nis_labeled = ~np.isnan(y)\nX_labeled = X[is_labeled]\ny_labeled = y[is_labeled]\n\n# Codificación One-hot de las etiquetas\ny_labeled_onehot = to_categorical(y_labeled, num_classes=2)\n\nX_train, X_test, y_train, y_test = train_test_split(X_labeled, y_labeled_onehot, test_size=0.2, random_state=42)\n"

In [17]:

X = featuresDF.drop(['txId', 'class'], axis=1).values.astype(float)
y = featuresDF['class'].values.astype(float)

is_labeled = y >= 0  # Esto seleccionará clases 1 y 2 como etiquetadas
is_unlabeled = y == -1  # Esto seleccionará clases -1 como no etiquetadas

X_labeled = X[is_labeled]
y_labeled = y[is_labeled]
print("y_labeled:", y_labeled)
X_unlabeled = X[is_unlabeled]

# Codificación One-hot de las etiquetas para los datos etiquetados
y_labeled_onehot = to_categorical(y_labeled - 1, num_classes=2)  # Ajustar las clases a 0 y 1

# División de los datos etiquetados en lícitos e ilícitos
X_licitos = X_labeled[y_labeled == 0]
print(len(X_licitos))
y_licitos = y_labeled_onehot[y_labeled == 0]

X_ilicitos = X_labeled[y_labeled == 1]
print(len(X_ilicitos))
y_ilicitos = y_labeled_onehot[y_labeled == 1]

#LICITOS
X_licitos_balanced = X_licitos[:4000]
print("X_licitos_balanced:",len(X_licitos_balanced))
X_licitos_eval = X_licitos[4000:]
print("X_licitos_eval: ", len(X_licitos_eval))

y_licitos_balanced= y_licitos[:4000]
print("y_licitos_balanced:",len(y_licitos_balanced))
y_licitos_eval = y_licitos[4000:]
print("y_licitos_balanced:",len(y_licitos_eval))

#ILICITOS
X_ilicitos_balanced = X_ilicitos[:4000]
print("X_ilicitos_balanced:",len(X_ilicitos_balanced))
X_ilicitos_eval = X_ilicitos[4000:]
print("X_ilicitos_eval: ", len(X_ilicitos_eval))

y_ilicitos_balanced= y_ilicitos[:4000]
print("y_ilicitos_balanced:",len(y_ilicitos_balanced))
y_ilicitos_eval = y_ilicitos[4000:]
print("y_ilicitos_balanced:",len(y_ilicitos_eval))

# Combinar y mezclar los datos balanceados para el entrenamiento
X_train_balanced = np.concatenate([X_licitos_balanced, X_ilicitos_balanced])
y_train_balanced = np.concatenate([y_licitos_balanced, y_ilicitos_balanced])
X_train_balanced, y_train_balanced = shuffle(X_train_balanced, y_train_balanced, random_state=42)

# Combinar los datos reservados para evaluación
X_eval = np.concatenate([X_licitos_eval, X_ilicitos_eval])
y_eval = np.concatenate([y_licitos_eval, y_ilicitos_eval])

# Asignar los datos no etiquetados como conjunto de test (para el autoentrenamiento o predicciones futuras)
X_test = X_unlabeled

# Ahora tienes:
# X_train_balanced, y_train_balanced: Datos de entrenamiento balanceados.
# X_eval, y_eval: Datos reservados para la evaluación final del modelo.
# X_test: Datos no etiquetados para predicciones futuras o autoentrenamiento.


y_labeled: [0. 0. 0. ... 1. 0. 1.]
42019
4545
X_licitos_balanced: 4000
X_licitos_eval:  38019
y_licitos_balanced: 4000
y_licitos_balanced: 38019
X_ilicitos_balanced: 4000
X_ilicitos_eval:  545
y_ilicitos_balanced: 4000
y_ilicitos_balanced: 545


In [18]:
print(X_train_balanced.shape)
print(X_test.shape)
print(y_train_balanced.shape)



(8000, 166)
(157205, 166)
(8000, 2)


In [8]:
#print(X_train.shape)
#print(X_test.shape)
#print(y_train.shape)
#print(y_test.shape)
"""
(37251, 166)
(9313, 166)
(37251, 2)
(9313, 2)
"""


'\n(37251, 166)\n(9313, 166)\n(37251, 2)\n(9313, 2)\n'

In [21]:
# Definir el modelo
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X_train_balanced.shape[1],)))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='softmax'))  # 2 clases: lícitas e ilícitas

# Compilar el modelo
model.compile(optimizer=SGD(), loss='categorical_crossentropy', metrics=['accuracy'])

# Entrenar el modelo con datos etiquetados
model.fit(X_train_balanced, y_train_balanced, epochs=20, batch_size=100)

# Predecir y evaluar con datos de prueba
#y_pred = model.predict(X_test)

"""
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)
confusion_mtx = confusion_matrix(y_test_labels, y_pred_labels)
print(confusion_mtx)
"""


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


'\ny_pred_labels = np.argmax(y_pred, axis=1)\ny_test_labels = np.argmax(y_test, axis=1)\nconfusion_mtx = confusion_matrix(y_test_labels, y_pred_labels)\nprint(confusion_mtx)\n'

In [22]:

# Función de auto-entrenamiento adaptada para datos tabulares
def self_training(model, X_labeled, y_labeled, X_unlabeled, threshold):
    epoch = 0
    while len(X_unlabeled) > 0:
        # Entrenar el modelo
        model.fit(X_labeled, y_labeled, epochs=1, batch_size=100)

        # Predecir etiquetas para datos no etiquetados
        predictions = model.predict(X_unlabeled)
        confidences = np.max(predictions, axis=1)
        high_confidence_indices = confidences > threshold

        # Seleccionar datos con alta confianza
        X_high_confidence = X_unlabeled[high_confidence_indices]
        y_high_confidence = predictions[high_confidence_indices]

        # Actualizar conjuntos etiquetados
        X_labeled = np.concatenate([X_labeled, X_high_confidence])
        y_labeled = np.concatenate([y_labeled, y_high_confidence])

        # Eliminar los seleccionados de los datos no etiquetados
        X_unlabeled = np.delete(X_unlabeled, high_confidence_indices, axis=0)
        epoch += 1
        print(f"Iteración {epoch+1}, Datos añadidos: {len(X_high_confidence)}, Datos faltantes: {len(X_unlabeled)}")

    return model, X_labeled, y_labeled



In [12]:
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    y_pred_labels = np.argmax(y_pred, axis=1)
    y_test_labels = np.argmax(y_test, axis=1)
    
    # Calcular la precisión y la matriz de confusión
    accuracy = accuracy_score(y_test_labels, y_pred_labels)
    confusion_mtx = confusion_matrix(y_test_labels, y_pred_labels)
    
    return accuracy, confusion_mtx

In [23]:
model_self_trained, X_self_labeled, y_self_labeled = self_training(model, X_train_balanced, y_train_balanced, X_test, threshold=0.9)

"""
# Evaluar el modelo después del autoentrenamiento
final_accuracy, final_confusion_mtx = evaluate_model(model_self_trained, X_test, y_test)
print("Después del autoentrenamiento:")
print("Precisión:", final_accuracy)
print("Matriz de confusión:\n", final_confusion_mtx)
"""

 1/80 [..............................] - ETA: 0s - loss: 0.0065 - accuracy: 1.0000

Iteración 2, Datos añadidos: 142873, Datos faltantes: 14332
Iteración 3, Datos añadidos: 158, Datos faltantes: 14174
Iteración 4, Datos añadidos: 51, Datos faltantes: 14123
Iteración 5, Datos añadidos: 145, Datos faltantes: 13978
Iteración 6, Datos añadidos: 13, Datos faltantes: 13965
Iteración 7, Datos añadidos: 15, Datos faltantes: 13950
Iteración 8, Datos añadidos: 16, Datos faltantes: 13934
Iteración 9, Datos añadidos: 23, Datos faltantes: 13911
Iteración 10, Datos añadidos: 9, Datos faltantes: 13902
Iteración 11, Datos añadidos: 13, Datos faltantes: 13889
Iteración 12, Datos añadidos: 28, Datos faltantes: 13861
Iteración 13, Datos añadidos: 7, Datos faltantes: 13854
Iteración 14, Datos añadidos: 29, Datos faltantes: 13825
Iteración 15, Datos añadidos: 0, Datos faltantes: 13825
Iteración 16, Datos añadidos: 8, Datos faltantes: 13817
Iteración 17, Datos añadidos: 17, Datos faltantes: 13800
Iteración 18, Datos añadidos: 7, Datos faltantes: 13793
Iteración 19, Datos añadidos: 16, Dato

KeyboardInterrupt: 

In [None]:
X_self_labeled
#deberian ser 203769 creo

In [None]:
y_self_labeled

In [None]:
acc, conf_mtrx = evaluate_model(model_self_trained, X_eval, y_eval)

print("accuracy:", acc)
print("confusion matrix:", conf_mtrx)