In [3]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import hamming_loss
from scipy.sparse import lil_matrix
from tqdm import tqdm
import os

# -----------------------------
# Parameters
# -----------------------------
# Not read by the pipeline code in this cell; the evaluation section defines
# its own k_eval.
k = 10
# Number of rows of the group-testing matrix built by build_gt_matrix, and
# therefore the number of binary classifiers trained (one per row).
m = 240
# Decoding tolerance handed to threshold_predictions, which compares a
# per-label miss count against e / 2.
e = 30
# Forwarded to build_gt_matrix as its `method` argument.
GT_type = "sparse_rand"

# -----------------------------
# Dataset Loader for Delicious
# -----------------------------
def load_delicious_dataset(info_path, data_path, n_samples):
    """Load up to ``n_samples`` rows of a sparse multi-label text dataset.

    The first line of ``info_path`` must hold three whitespace-separated
    integers, from which the label and feature dimensions are taken.  Each
    line of ``data_path`` describes one sample as::

        label,label,... feat_idx:value feat_idx:value ...

    A sample without labels is written with the label field missing, i.e. the
    line starts directly with a ``feat_idx:value`` token.

    Parsing is deliberately best-effort: tokens that cannot be parsed, or
    whose index falls outside the declared dimensions, are ignored rather
    than raising.  Blank lines leave the corresponding row all-zero.

    Parameters
    ----------
    info_path : str
        Path of the file whose first line carries the dimensions.
    data_path : str
        Path of the file with one sample per line.
    n_samples : int
        Number of rows allocated; lines beyond this count are not read.

    Returns
    -------
    tuple
        ``(X, Y, num_features, num_labels)`` where ``X`` is a CSR float32
        matrix of shape ``(n_samples, num_features)`` and ``Y`` is a dense
        int32 0/1 array of shape ``(n_samples, num_labels)``.

    Raises
    ------
    ValueError
        If the header line does not contain exactly three integers.
    """
    with open(info_path, 'r') as f:
        header = f.readline()
        # NOTE(review): the header is read as (num_labels, num_features, _).
        # Extreme Classification Repository files are usually documented as
        # "num_points num_features num_labels" -- confirm that this order
        # matches the actual info file.
        num_labels, num_features, _ = map(int, header.strip().split())

    X = lil_matrix((n_samples, num_features), dtype=np.float32)
    Y = np.zeros((n_samples, num_labels), dtype=np.int32)

    with open(data_path, 'r') as f:
        for i, line in tqdm(enumerate(f), total=n_samples):
            if i >= n_samples:
                break
            parts = line.split()
            if not parts:
                continue

            if ':' in parts[0]:
                # No label field on this line: every token is a feature.
                # Treating parts[0] as labels would silently drop it.
                label_tokens, feature_tokens = [], parts
            else:
                label_tokens, feature_tokens = parts[0].split(','), parts[1:]

            for token in label_tokens:
                try:
                    idx = int(token)
                except ValueError:
                    continue  # best-effort: skip unparsable label tokens
                if 0 <= idx < num_labels:
                    Y[i, idx] = 1

            for token in feature_tokens:
                if ':' not in token:
                    continue
                try:
                    idx_text, val_text = token.split(':')
                    idx = int(idx_text)
                    val = float(val_text)
                except ValueError:
                    continue  # best-effort: skip malformed feature tokens
                if 0 <= idx < num_features:
                    X[i, idx] = val

    return X.tocsr(), Y, num_features, num_labels

# -----------------------------
# GT Matrix Builder
# -----------------------------
def build_gt_matrix(d, m, method="sparse_rand", seed=None):
    """Build a random binary group-testing matrix with constant column weight.

    Every column (one per label) gets exactly ``s`` ones placed in distinct,
    uniformly chosen rows (tests), where ``s = ceil(log2(d))`` clamped to the
    range ``[1, m]``.  The clamp keeps every label in at least one test
    (``d == 1`` would otherwise give weight 0) and prevents sampling more
    distinct rows than exist (``ceil(log2(d)) > m``).

    Parameters
    ----------
    d : int
        Number of columns (labels).
    m : int
        Number of rows (group tests).
    method : str, optional
        Accepted for interface compatibility; only the sparse random
        construction is implemented and the value is not inspected.
    seed : int or None, optional
        When given, a dedicated ``numpy.random.RandomState(seed)`` is used so
        the matrix is reproducible.  When ``None`` the global ``numpy.random``
        state is used, as before.

    Returns
    -------
    numpy.ndarray
        Integer 0/1 array of shape ``(m, d)``.

    Raises
    ------
    ValueError
        If ``d > 0`` and ``m < 1``, or if a dimension is negative.
    """
    if d > 0 and m < 1:
        raise ValueError("m must be at least 1 when d > 0")

    A = np.zeros((m, d), dtype=int)
    if d == 0:
        # log2(0) is -inf; there are no columns to fill anyway.
        return A

    s = min(m, max(1, int(np.ceil(np.log2(d)))))
    rng = np.random if seed is None else np.random.RandomState(seed)
    for j in range(d):
        ones = rng.choice(m, size=s, replace=False)
        A[ones, j] = 1
    return A

# -----------------------------
# MLGT Training (Algorithm 1)
# -----------------------------
class _ConstantClassifier:
    """Stand-in model for a group test whose training target has one class.

    Exposes the same ``predict_proba`` layout as a fitted binary classifier
    (column 0 = class 0, column 1 = class 1) so downstream code that reads
    ``predict_proba(X)[:, 1]`` keeps working.
    """

    def __init__(self, positive_prob):
        self.positive_prob = float(positive_prob)
        self.classes_ = np.array([0, 1])

    def predict_proba(self, X):
        proba = np.empty((X.shape[0], 2))
        proba[:, 1] = self.positive_prob
        proba[:, 0] = 1.0 - self.positive_prob
        return proba

    def predict(self, X):
        return np.full(X.shape[0], int(self.positive_prob >= 0.5))


def train_classifiers(X, Y, A):
    """Train one binary classifier per row (group test) of ``A``.

    The target for test ``j`` is 1 for a sample when at least one label that
    participates in test ``j`` (a nonzero entry in ``A[j]``) is set in that
    sample's row of ``Y``, i.e. ``Z = (Y @ A.T > 0)``.

    Parameters
    ----------
    X : scipy sparse matrix in CSR format, shape (n_samples, n_features)
    Y : array of shape (n_samples, n_labels), 0/1 label indicators
    A : array of shape (n_tests, n_labels), group-testing matrix

    Returns
    -------
    list
        ``A.shape[0]`` fitted models, in row order.  Each supports
        ``predict_proba``.  Tests whose target is constant over the training
        set get a ``_ConstantClassifier`` because ``LogisticRegression``
        refuses to fit single-class data.

    Raises
    ------
    TypeError
        If ``X`` is not a CSR sparse matrix.
    """
    from scipy.sparse import issparse

    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not (issparse(X) and X.format == "csr"):
        raise TypeError("X must be in CSR format")

    Z = (Y @ A.T > 0).astype(int)

    classifiers = []
    for j in tqdm(range(A.shape[0])):
        target = Z[:, j]
        if target.size and target.min() == target.max():
            # All-0 or all-1 target: fitting would raise, so predict the
            # observed class with certainty instead of aborting the run.
            classifiers.append(_ConstantClassifier(target[0]))
            continue
        clf = LogisticRegression(solver='saga', max_iter=300, n_jobs=-1, random_state=42)
        clf.fit(X, target)
        classifiers.append(clf)
    return classifiers

# -----------------------------
# MLGT Prediction with Probabilities
# -----------------------------
def predict_all_scores(X, classifiers, A):
    """Turn per-test classifier outputs into one score per label.

    For every classifier, column 1 of ``predict_proba(X)`` is taken as that
    test's score (the positive class, assuming the model's classes are
    ordered ``[0, 1]``).  A label's score is the sum of the scores of all
    tests it participates in, i.e. the rows of ``A`` that equal 1 in that
    label's column.

    Parameters
    ----------
    X : feature matrix with one row per sample
    classifiers : sequence of fitted models, one per row of ``A``
    A : array of shape (n_tests, n_labels)

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(X.shape[0], A.shape[1])``.
    """
    n_samples = X.shape[0]
    n_labels = A.shape[1]

    # One column of positive-class probabilities per group test.
    test_probs = np.zeros((n_samples, len(classifiers)))
    for col, model in enumerate(classifiers):
        test_probs[:, col] = model.predict_proba(X)[:, 1]

    # Accumulate, label by label, the probabilities of the tests containing it.
    label_scores = np.zeros((n_samples, n_labels))
    for label in range(n_labels):
        member_tests = np.flatnonzero(A[:, label] == 1)
        label_scores[:, label] = test_probs[:, member_tests].sum(axis=1)
    return label_scores

# -----------------------------
# Thresholding for binary prediction
# -----------------------------
def threshold_predictions(Y_scores, A, e):
    """Decode binary label predictions from per-label scores.

    ``Y_scores[i, l]`` is expected to be the sum, over the tests that contain
    label ``l``, of each test's positive score for sample ``i`` (the output of
    ``predict_all_scores``).  With ``w_l`` the number of tests containing
    label ``l`` (ones in column ``l`` of ``A``), the quantity
    ``w_l - Y_scores[i, l]`` is the (soft) number of those tests that came
    back negative.  A label is predicted when that miss count is below
    ``e / 2``.

    The previous implementation indexed the *label* columns of ``Y_scores``
    with *test* indices taken from ``A``, so it looked at other labels'
    scores and could raise ``IndexError`` when a test index was not a valid
    label index.  Only label-level scores are available here, so the miss
    count is derived from them directly.

    Parameters
    ----------
    Y_scores : array of shape (n_samples, n_labels)
    A : array of shape (n_tests, n_labels), 0/1 group-testing matrix
    e : number
        Error tolerance; the decision threshold is ``e / 2``.

    Returns
    -------
    numpy.ndarray
        Integer 0/1 array with the same shape as ``Y_scores``.

    Raises
    ------
    ValueError
        If ``Y_scores`` is not 2-D or its column count differs from ``A``'s.
    """
    scores = np.asarray(Y_scores, dtype=float)
    support_size = (np.asarray(A) == 1).sum(axis=0)
    if scores.ndim != 2 or scores.shape[1] != support_size.shape[0]:
        raise ValueError(
            "Y_scores must have shape (n_samples, n_labels) with n_labels == A.shape[1]"
        )

    soft_misses = support_size[np.newaxis, :] - scores
    return (soft_misses < e / 2).astype(int)

# -----------------------------
# Evaluation
# -----------------------------
def precision_at_k(y_true, y_scores, k):
    """Return the mean precision@k over all samples.

    For each sample the ``k`` highest-scoring label indices are selected and
    the number of them that are set in ``y_true`` is divided by ``k``.  The
    per-sample values are then averaged.

    Parameters
    ----------
    y_true : array of shape (n_samples, n_labels), 0/1 ground truth
    y_scores : array of shape (n_samples, n_labels), higher = more confident
    k : int
        Number of top-ranked labels to inspect per sample.
    """
    per_sample = [
        truth_row[np.argsort(-score_row)[:k]].sum() / k
        for truth_row, score_row in zip(y_true, y_scores)
    ]
    return np.mean(per_sample)

# -----------------------------
# Main
# -----------------------------
# Input files, resolved relative to the current working directory.
# info_file supplies the header line with the dataset dimensions; the two
# split files are read line by line as samples by load_delicious_dataset.
info_file = 'Delicious/Delicious_data.txt'
train_file = 'Delicious/delicious_trSplit.txt'
test_file  = 'Delicious/delicious_tstSplit.txt'
# n / nt fix the number of rows allocated for each split; lines beyond these
# counts are not read.
n = 12920  # Number of training samples
nt = 3185   # Number of test samples

# The loader returns (X, Y, num_features, num_labels), so p is the feature
# dimension and d the label dimension.
print("Loading training data...")
X_train, Y_train, p, d = load_delicious_dataset(info_file, train_file, n)
print("Loading test data...")
X_test, Y_test, _, _ = load_delicious_dataset(info_file, test_file, nt)

# A has one row per group test (m) and one column per label (d).
print("Building GT matrix...")
A = build_gt_matrix(d, m, method=GT_type)

# One binary classifier is fitted per row of A.
print("Training classifiers...")
classifiers = train_classifiers(X_train, Y_train, A)

# Y_scores holds one continuous score per (test sample, label).
print("Predicting test scores...")
Y_scores = predict_all_scores(X_test, classifiers, A)

# Y_pred is the 0/1 decoding of the scores using tolerance e.
print("Decoding predictions with thresholding...")
Y_pred = threshold_predictions(Y_scores, A, e)

# Hamming loss is computed on the binary predictions, precision@k on the
# continuous scores. k_eval is defined here independently of the `k`
# parameter at the top of the cell.
print("Evaluation")
k_eval = 10
hl = hamming_loss(Y_test, Y_pred)
p2 = precision_at_k(Y_test, Y_scores, k_eval)
print(f"Hamming Loss: {hl:.4f}")
print(f"Precision@{k_eval} : {p2:.4f}")

Loading training data...


  0%|          | 0/12920 [00:00<?, ?it/s]

100%|██████████| 12920/12920 [00:00<00:00, 22799.57it/s]


Loading test data...


100%|██████████| 3185/3185 [00:00<00:00, 18733.08it/s]


Building GT matrix...
Training classifiers...


KeyboardInterrupt: 

In [None]:
# Precision@k on the test set for every k from 1 to 10, drawn as a line plot
# and written to precision_at_k.png.

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 6))

k_values = range(1, 11)
precision_curve = [precision_at_k(Y_test, Y_scores, top_k) for top_k in k_values]

ax.plot(k_values, precision_curve, marker='o')
ax.set_title('Precision@k vs k')
ax.set_xlabel('k')
ax.set_ylabel('Precision@k')
ax.set_xticks(k_values)
ax.grid()
fig.savefig('precision_at_k.png')
plt.show()
