In [214]:
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import hamming_loss
from scipy.sparse import lil_matrix
from tqdm import tqdm
import os

# -----------------------------
# Parameters
# -----------------------------
k = 10
m = 240
e = 30
n = 2000
nt = 501
p = 6000
d = 2456
GT_type = "sparse_rand"


In [215]:
# -----------------------------
# Dataset Loader
# -----------------------------
def load_dataset(path, n_samples, p, d):
    X = lil_matrix((n_samples, p), dtype=np.float32)
    Y = np.zeros((n_samples, d), dtype=np.int32)

    with open(path, 'r') as f:
        _ = f.readline()  # skip header
        for i, line in tqdm(enumerate(f), total=n_samples):
            if i >= n_samples:
                break
            parts = line.strip().split()
            labels = parts[0].split(',')
            for l in labels:
                if l.isdigit():
                    idx = int(l)
                    if 0 <= idx < d:
                        Y[i, idx] = 1
            for item in parts[1:]:
                if ':' in item:
                    idx, val = item.split(':')
                    idx = int(idx)
                    if idx < p:
                        X[i, idx] = float(val)
    return X.tocsr(), Y

In [216]:
# -----------------------------
# GT Matrix Builders
# -----------------------------
def build_gt_matrix(d, m, method="sparse_rand"):
    if method == "sparse_rand":
        A = np.zeros((m, d), dtype=int)
        s = m // (k + 1)
        for j in range(d):
            ones = np.random.choice(m, size=s, replace=False)
            A[ones, j] = 1
    # elif method == "expander":
    #     A = np.zeros((m, d), dtype=int)
    #     deg = m // (k + 1)
    #     for j in range(d):
    #         ones = np.random.choice(m, size=deg, replace=False)
    #         A[ones, j] = 1
    # elif method == "rs_code":
    #     np.random.seed(42)
    #     A = np.random.randint(0, 2, size=(m, d))
    # else:
    #     raise ValueError("Unknown GT matrix type")
    return A

# -----------------------------
# -----------------------------
# MLGT Training (Algorithm 1)
# -----------------------------
def train_classifiers(X, Y, A):
    Z = np.zeros((X.shape[0], A.shape[0]), dtype=int)
    for i in range(X.shape[0]):
        Z[i] = np.any(A[:, Y[i] == 1], axis=1).astype(int)

    classifiers = []
    for j in tqdm(range(A.shape[0])):
        clf = RandomForestClassifier(n_estimators=10, max_depth=5, random_state=42)
        clf.fit(X.toarray(), Z[:, j])
        classifiers.append(clf)
    return classifiers

# -----------------------------
# MLGT Prediction (Algorithm 2)
# -----------------------------
def decode_prediction(x, classifiers, A, e):
    m = len(classifiers)
    z_hat = np.zeros(m, dtype=int)
    for j, clf in enumerate(classifiers):
        z_hat[j] = clf.predict(x)[0]
        
    d = A.shape[1]
    y_hat = np.zeros(d, dtype=int)
    for l in range(d):
        if np.sum(np.logical_and(A[:, l], 1 - z_hat)) < e / 2:
            y_hat[l] = 1
    return y_hat

def predict_all(X, classifiers, A, e):
    preds = []
    for i in tqdm(range(X.shape[0])):
        preds.append(decode_prediction(X[i], classifiers, A, e))
    return np.array(preds)

# -----------------------------
# Evaluation
# -----------------------------
def precision_at_k(y_true, y_pred, k):
    precisions = []
    for yt, yp in zip(y_true, y_pred):
        topk = np.argsort(-yp)[:k]
        correct = yt[topk].sum()
        precisions.append(correct / k)
    return np.mean(precisions)


In [217]:
# -----------------------------
# Main
# -----------------------------
dir = 'RCV1-x/RCV1-x/'
file_train = os.path.join(dir, 'rcv1x_train.txt')
file_test  = os.path.join(dir, 'rcv1x_test.txt')

print("Loading training data...")
X_train, Y_train = load_dataset(file_train, n, p, d)
# d = Y_train.shape[1]  # update d

print("Loading test data...")
X_test, Y_test = load_dataset(file_test, nt, p, d)

print("Building GT matrix...")
A = build_gt_matrix(d, m, method=GT_type)



Loading training data...


100%|██████████| 2000/2000 [00:00<00:00, 19416.55it/s]


Loading test data...


100%|██████████| 501/501 [00:00<00:00, 19273.63it/s]

Building GT matrix...





In [218]:
print("Training classifiers...")
classifiers = train_classifiers(X_train, Y_train, A)

Training classifiers...


100%|██████████| 240/240 [00:22<00:00, 10.79it/s]


In [219]:
print("Predicting test set...")
Y_pred = predict_all(X_test, classifiers, A, e)

print("Evaluation")
k_eval = 10
hl = hamming_loss(Y_test, Y_pred)
p2 = precision_at_k(Y_test, Y_pred, k_eval)
print(f"Hamming Loss: {hl:.4f}")
print(f"Precision@{k_eval} : {p2:.4f}")

Predicting test set...


100%|██████████| 501/501 [01:44<00:00,  4.80it/s]

Evaluation
Hamming Loss: 0.0021
Precision@10 : 0.0174



