In [99]:
import numpy as np
import os
from scipy.sparse import lil_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import hamming_loss
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# -----------------------------
# Parameters
# -----------------------------
k = 10      # Max labels per sample
e = 30      # Error correction parameter
m = 240     # Number of classifiers
n = 2000    # Training samples
nt = 501    # Test samples
p = 6000    # Feature dimension
d = 2456   # Number of labels
GT_type = "sparse_rand"  # Choose from: sparse_rand, expander, rs_code

In [100]:
# -------------------------------
# DATA LOADER
# -------------------------------
def load_dataset(path, n_samples, p, d):
    X = lil_matrix((n_samples, p), dtype=np.float32)
    Y = np.zeros((n_samples, d), dtype=np.int32)

    with open(path, 'r') as f:
        header = f.readline()  # skip header
        for i, line in tqdm(enumerate(f), total=n_samples):
            if i >= n_samples:
                break
            parts = line.strip().split()
            if not parts:
                continue
            # Label parsing
            label_tokens = parts[0].split(',')
            for l in label_tokens:
                if l.isdigit():
                    idx = int(l)
                    if 0 <= idx < d:
                        Y[i, idx] = 1
            # Feature parsing
            for item in parts[1:]:
                if ':' in item:
                    try:
                        idx, val = item.split(':')
                        idx = int(idx)
                        if idx < p:
                            X[i, idx] = float(val)
                    except ValueError:
                        continue
    return X.tocsr(), Y

In [101]:
# -----------------------------
# GT Matrix Construction
# -----------------------------
def build_gt_matrix(d, m, method="sparse_rand"):
    A = np.zeros((m, d), dtype=int)
    if method == "sparse_rand":
        for i in range(m):
            indices = np.random.choice(d, size=(d // k), replace=False)
            A[i, indices] = 1
    elif method == "expander":
        degree = d // k
        for j in range(d):
            neighbors = np.random.choice(m, degree, replace=False)
            A[neighbors, j] = 1
    elif method == "rs_code":
        raise NotImplementedError("Reed-Solomon code based GT not implemented")
    else:
        raise ValueError("Unknown GT matrix type")
    return A

# -----------------------------
# Train Classifiers (Binary Relevance on m meta-labels)
# -----------------------------
def encode_meta_labels(Y, A):
    Z = (Y @ A.T) > 0
    return Z.astype(int)

def train_classifiers(X, Z, method="logistic"):
    classifiers = []
    for j in range(Z.shape[1]):
        if method == "logistic":
            clf = LogisticRegression(max_iter=1000, class_weight='balanced')
        elif method == "rf":
            clf = RandomForestClassifier(n_estimators=100, class_weight='balanced')
        else:
            raise ValueError("Unknown classifier")
        clf.fit(X, Z[:, j])
        classifiers.append(clf)
    return classifiers

def predict_meta_labels(classifiers, X):
    Z_pred = np.zeros((X.shape[0], len(classifiers)))
    for j, clf in enumerate(classifiers):
        Z_pred[:, j] = clf.predict(X)
    return Z_pred

def decode_labels(Z_pred, A, k):
    Y_pred_scores = Z_pred @ A
    topk = np.argsort(-Y_pred_scores, axis=1)[:, :k]
    result = np.zeros((Z_pred.shape[0], A.shape[1]), dtype=int)
    for i in range(Z_pred.shape[0]):
        result[i, topk[i]] = 1
    return result


# -----------------------------
# Evaluation
# -----------------------------
def precision_at_k(Y_true, Y_pred, k):
    num = 0
    denom = Y_true.shape[0]
    for i in range(Y_true.shape[0]):
        topk = np.argsort(-Y_pred[i])[:k]
        num += np.sum(Y_true[i, topk])
    return num / (k * denom)


In [102]:
# -----------------------------
# Main
# -----------------------------
dir = 'RCV1-x/RCV1-x/'
print("Loading training data...")
X_train, Y_train = load_dataset(os.path.join(dir, 'rcv1x_train.txt'), n, p, d)
print(f"Training labels: {d}")

print("Loading testing data...")
X_test, Y_test = load_dataset(os.path.join(dir, 'rcv1x_test.txt'), nt, p, d)

Loading training data...


100%|██████████| 2000/2000 [00:00<00:00, 18690.74it/s]


Training labels: 2456
Loading testing data...


100%|██████████| 501/501 [00:00<00:00, 18551.66it/s]


In [103]:
# # check how many labels are unique in the training set
# unique_labels = np.unique(Y_train, axis=0)
# print(f"Unique labels in training set: {unique_labels.shape[0]}")
# Y_train

In [104]:
print("Building GT matrix...")
A = build_gt_matrix(d, m, method=GT_type)

print("Encoding meta-labels...")
Z_train = encode_meta_labels(Y_train, A)

Building GT matrix...
Encoding meta-labels...


In [105]:
print("Training classifiers...")
classifiers = train_classifiers(X_train, Z_train, method="logistic")

Training classifiers...


In [106]:
print("Predicting meta-labels...")
Z_pred = predict_meta_labels(classifiers, X_test)

print("Decoding labels...")
Y_test_pred = decode_labels(Z_pred, A, k=k)

print("Evaluation:")
print("Hamming Loss     : {:.4f}".format(hamming_loss(Y_test, Y_test_pred)))
print("Precision@{}      : {:.4f}".format(k, precision_at_k(Y_test, Y_test_pred, 10)))

Predicting meta-labels...
Decoding labels...
Evaluation:
Hamming Loss     : 0.0054
Precision@10      : 0.0830
