In [79]:
import numpy as np
from scipy import sparse
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier  # for fitclinear equivalent
from scipy.sparse import csr_matrix, lil_matrix, csc_matrix, issparse
from tqdm import tqdm
import time
from sklearn.linear_model import LogisticRegression  # or LinearSVC
from sklearn.exceptions import ConvergenceWarning
import warnings
import random
import os

# -----------------------------
# Parameters for rcv1x
# -----------------------------
d = 2456  # label cutoff handed to read_data: only label ids < d are kept
e = 234 # 3k*logd; not referenced by any other cell visible in this notebook
k = 10  # label sparsity, also the number of precision@k cutoffs computed
m = 240  # number of groups, i.e. rows of the group testing matrix
n = 10000  # number of training instances sampled by read_data
nt = 1000  # number of test instances sampled by read_data
p = 6000  # feature cutoff handed to read_data: only feature ids < p are kept


In [80]:
# -----------------------------
# Dataset Loader
# -----------------------------
def parse_line_lbls_fts(line):
    """Parse one data line of the form ``l1,l2,... f1:v1 f2:v2 ...``.

    The label list may be empty (the line then starts directly with a
    ``idx:val`` token, usually after a leading space) and the feature list
    may be empty as well; a blank line yields two empty lists.

    Parameters:
    line -- raw text line, with or without the trailing newline

    Returns:
    labels   -- list of int label ids
    features -- list of (int feature id, float value) tuples

    Raises:
    ValueError -- if a label or feature token is malformed
    """
    tokens = line.split()
    if not tokens:
        return [], []

    # A first token containing ':' is already a feature, which means the
    # instance carries no labels at all.
    if ':' in tokens[0]:
        label_token, feature_tokens = '', tokens
    else:
        label_token, feature_tokens = tokens[0], tokens[1:]

    labels = [int(lbl) for lbl in label_token.split(',') if lbl]

    features = []
    for token in feature_tokens:
        idx, val = token.split(':')
        features.append((int(idx), float(val)))
    return labels, features

def read_data(file_path, n, p, d, random_seed=42):
    """Read a sparse text dataset and return feature and label matrices.

    The first line of the file is a header with three integers (instances,
    features, labels). Every following non-blank line is parsed with
    parse_line_lbls_fts. A random subset of n lines is drawn without
    replacement and becomes the columns of the returned matrices.

    Parameters:
    file_path   -- path of the text file
    n           -- number of instances to sample; None means the instance
                   count from the header (capped at the lines available)
    p           -- keep only feature ids < p; None keeps all and sizes the
                   matrix to the largest id seen + 1
    d           -- keep only label ids < d; None keeps all and sizes the
                   matrix to the largest id seen + 1
    random_seed -- seed of the sampler

    Returns:
    ft_mat  -- csc_matrix of shape (p, n), feature values
    lbl_mat -- csc_matrix of shape (d, n), 1.0 per (label, instance) entry
    """
    with open(file_path, 'r') as f:
        num_inst, _num_ft, _num_lbl = map(int, f.readline().split())
        # Blank lines (e.g. a trailing newline) carry no instance and must
        # not be eligible for sampling.
        lines = [ln for ln in f if ln.strip()]

    if n is None:
        n = num_inst
    n = min(n, len(lines))

    # A private generator draws the same sample as seeding the module-level
    # one would, without resetting the global random state for the caller.
    rng = random.Random(random_seed)
    selected_lines = rng.sample(lines, n)

    feature_data = []
    feature_row = []
    feature_col = []

    label_data = []
    label_row = []
    label_col = []

    for idx, line in enumerate(selected_lines):
        labels, features = parse_line_lbls_fts(line)

        for ft_id, ft_val in features:
            if p is None or ft_id < p:
                feature_row.append(ft_id)
                feature_col.append(idx)
                feature_data.append(ft_val)

        for lbl_id in labels:
            if d is None or lbl_id < d:
                label_row.append(lbl_id)
                label_col.append(idx)
                label_data.append(1.0)

    if p is None:
        p = max(feature_row, default=0) + 1
    if d is None:
        d = max(label_row, default=0) + 1

    ft_mat = csc_matrix((feature_data, (feature_row, feature_col)), shape=(p, n))
    lbl_mat = csc_matrix((label_data, (label_row, label_col)), shape=(d, n))

    return ft_mat, lbl_mat

In [81]:
# Load n training instances; read_data returns features as (p x n) and labels as (d x n).
# NOTE(review): a later cell reads the same training file into X, Y again,
# so this call looks redundant — confirm before removing.
file = 'RCV1-x/rcv1x_train.txt'
X, Y = read_data(file,n, p, d)


In [82]:
def k_disjunct(m, d, p1):
    """
    Create a constant weight group testing matrix (sparse binary).

    The matrix is built from ceil(m / rep1) stacked blocks of rep1 rows,
    where rep1 = floor(d / p1). In the first block, row j covers the p1
    consecutive columns j*p1 .. (j+1)*p1 - 1. Every further block is the
    first block with its columns randomly permuted. The stack is cut to m
    rows and the columns are shuffled once more.

    Parameters:
    m -- number of rows
    d -- number of columns
    p1 -- number of ones per row

    Returns:
    A -- sparse binary csc_matrix of shape m x d

    Raises:
    ValueError -- if m is negative or p1 is outside 1..d
    """
    m, d, p1 = int(m), int(d), int(p1)
    if m < 0:
        raise ValueError("m must be non-negative, got %d" % m)
    if p1 < 1 or p1 > d:
        raise ValueError("p1 must satisfy 1 <= p1 <= d, got p1=%d, d=%d" % (p1, d))

    rep1 = int(np.floor(d / p1))   # rows per block
    rep2 = int(np.ceil(m / rep1))  # number of blocks needed to reach m rows

    # owner[c] is the row of the first block that holds a one in column c,
    # or -1 for the d - rep1*p1 leftover columns that no row covers.
    owner = np.full(d, -1, dtype=np.int64)
    owner[:rep1 * p1] = np.arange(rep1 * p1) // p1

    rows, cols = [], []
    for block in range(rep2):
        # Column c of a permuted block equals column perm[c] of the first one.
        block_owner = owner if block == 0 else owner[np.random.permutation(d)]
        covered = np.flatnonzero(block_owner >= 0)
        # Block b occupies rows b*rep1 .. (b+1)*rep1 - 1.
        rows.append(block * rep1 + block_owner[covered])
        cols.append(covered)

    if rows:
        row_idx = np.concatenate(rows)
        col_idx = np.concatenate(cols)
    else:
        row_idx = np.empty(0, dtype=np.int64)
        col_idx = np.empty(0, dtype=np.int64)

    # The last block may run past m rows; drop the overflow.
    keep = row_idx < m
    row_idx, col_idx = row_idx[keep], col_idx[keep]

    A2 = csc_matrix(
        (np.ones(row_idx.size, dtype=int), (row_idx, col_idx)), shape=(m, d)
    )

    rd = np.random.permutation(d)
    return csc_matrix(A2[:, rd])



def Sel_c_k_disjunct(Y, m, n, k, c1):
    """
    Pick the column sparsity whose group testing matrix decodes best.

    For every candidate in c1 a matrix A is drawn with k_disjunct, the first
    n label columns are encoded as Z = (A @ Y > 0), each column is decoded by
    taking the k labels with the largest A.T @ Z score, and the Hamming
    distance to the true label vector is averaged over the n columns. The
    candidate with the smallest average distance wins (first one on ties).

    Parameters:
    Y -- d x n label matrix (NumPy array or SciPy sparse matrix)
    m -- Number of groups (rows of A)
    n -- Number of training samples to use (capped at the columns of Y)
    k -- Label sparsity
    c1 -- List of column sparsity sweep values

    Returns:
    A -- m x d sparse binary matrix
    c -- selected column sparsity value from c1
    er -- minimum average Hamming loss error

    Raises:
    ValueError -- if c1 is empty
    """
    if len(c1) == 0:
        raise ValueError("c1 must contain at least one candidate value")

    # One sparse column-oriented view serves both dense and sparse inputs.
    Y = csc_matrix(Y)
    d, n_avail = Y.shape
    n = min(int(n), n_avail)
    Y = Y[:, :n]

    Err = []
    Atmp = []

    for c1_val in c1:
        p1 = int(np.floor(c1_val * d / m))
        A = k_disjunct(m, d, p1)

        # Binarise the group responses (0/1) in A's integer dtype.
        Z = ((A @ Y) > 0).astype(A.dtype)
        ATp = csc_matrix(A.transpose() @ Z)

        err = np.zeros(n)
        for l in range(n):
            # Sorting needs dense vectors; argsort cannot rank a sparse column.
            scores = ATp[:, l].toarray().ravel()
            truth = Y[:, l].toarray().ravel()

            Yp = np.zeros(d)
            # Stable sort: equal scores are resolved towards the lower label id.
            Yp[np.argsort(-scores, kind="stable")[:k]] = 1
            err[l] = np.sum(Yp != truth)

        Err.append(np.mean(err))
        Atmp.append(A)

    min_idx = np.argmin(Err)
    er = Err[min_idx]
    A = Atmp[min_idx]
    c = c1[min_idx]

    return A, c, er


def sort_sparse_mat(X):
    """
    Order the stored entries of every row of X by decreasing value.

    Row i of the result starts with the column indices of the stored
    entries of row i of X, largest value first. The remaining positions of
    the row stay 0, so a padding 0 cannot be told apart from column index 0.

    Parameters:
    X -- scipy.sparse.csr_matrix of shape (m, n); anything else is converted
         with csr_matrix(X)

    Returns:
    rank_mat -- numpy array of shape (m, n) with dtype int
    """
    csr = X if isinstance(X, csr_matrix) else csr_matrix(X)

    n_rows, n_cols = csr.shape
    rank_mat = np.zeros((n_rows, n_cols), dtype=int)

    for r in range(n_rows):
        stored = csr.getrow(r)
        if stored.data.size:
            # Negating the values turns the ascending argsort into a descending one.
            by_value = np.argsort(-stored.data)
            ranked_cols = stored.indices[by_value]
            rank_mat[r, :ranked_cols.size] = ranked_cols

    return rank_mat


def precision_k_new(score_mat, true_mat, K):
    """Return the length-K array computed by _helper for these inputs.

    Thin public entry point: the three arguments are forwarded unchanged
    to _helper and its result is returned as is.
    """
    return _helper(score_mat, true_mat, K)

def _helper(score_mat, true_mat, K):
    num_inst = score_mat.shape[1]
    num_lbl = score_mat.shape[0]

    score_mat = csr_matrix(score_mat)
    rank_mat = sort_sparse_mat(score_mat)  # custom ranking

    mat = []
    for j in range(num_inst):
        tmp = rank_mat[:, j].copy()
        tmp[tmp > K] = 0
        mat.append((tmp > 0).astype(int))

    mat = np.array(mat).T
    mat = csr_matrix(mat).multiply(true_mat)
    num = np.array(mat.sum(axis=0)).ravel()

    P = np.zeros(K)
    for k in range(1, K+1):
        num2 = np.minimum(num, k)
        P[k-1] = np.mean(num2 / k)

    return P




# Suppress ConvergenceWarning for the rest of the session; this is a
# process-wide filter, not limited to the classifiers fitted below.
warnings.filterwarnings("ignore", category=ConvergenceWarning)

def MLGT_train_test(X, Y, Xtest, Ytest, A, k):
    """
    Train and evaluate MLGT with given data and parameters.

    The labels are reduced to m group labels with Y2 = (A @ Y) > 0, one
    binary LogisticRegression is fitted per group, the group predictions on
    Xtest are mapped back to label scores with A.T, and the scores are
    evaluated with precision_k_new. Groups whose training labels are all
    negative or all positive cannot be fitted (a single class) and are
    predicted as constant 0 or 1 respectively.

    Parameters:
    X, Y        -- Training feature (samples x features) and label
                   (labels x samples) matrices
    Xtest, Ytest-- Testing feature and label matrices, same layout
    A           -- Group testing matrix (groups x labels)
    k           -- Label sparsity (number of labels per instance)

    Returns:
    Output -- dictionary with keys "Prec_k", "train_time", "test_time" and
              "total_times"; the times are process CPU seconds
    """
    m = A.shape[0]
    nt = Xtest.shape[0]
    Ztest = np.zeros((m, nt))
    Output = {}

    # --- Training ---
    t1 = time.process_time()
    Y2 = (A @ Y) > 0  # Shape [m, n]
    if issparse(Y2):
        # Rows are sliced m times below; CSR makes each slice cheap.
        Y2 = csr_matrix(Y2)
    n_train = Y2.shape[1]
    classifiers = {}

    for j in range(m):
        if issparse(Y2):
            y2 = Y2[j, :].toarray().ravel()
        else:
            y2 = np.asarray(Y2[j, :]).ravel()

        n_pos = np.count_nonzero(y2)
        if n_pos == 0:
            # Never positive in training: keep the all-zero prediction row.
            continue
        if n_pos == n_train:
            # Always positive in training: fitting would fail on one class.
            Ztest[j, :] = 1
            continue

        clf = LogisticRegression(solver='liblinear')
        clf.fit(X, y2)
        classifiers[j] = clf

    t2 = time.process_time()

    # --- Testing ---
    for j, clf in classifiers.items():
        Ztest[j, :] = clf.predict(Xtest)

    Ztest = csr_matrix(Ztest)
    ATp = A.transpose().dot(Ztest)

    t3 = time.process_time()

    # --- Evaluation ---
    prec_k = precision_k_new(ATp, Ytest, k)

    Output["Prec_k"] = prec_k
    Output["train_time"] = t2 - t1
    Output["test_time"] = t3 - t2
    Output["total_times"] = t3 - t1

    return Output

In [83]:
# Read the train and test splits and bring the features into the
# samples x features layout that the classifiers are fitted on.
# NOTE(review): `dir` shadows the Python builtin of the same name.
dir = 'RCV1-x'
file_train = os.path.join(dir+'/', 'rcv1x_train.txt')
file_test = os.path.join(dir+'/', 'rcv1x_test.txt')

print("Loading dataset...")
# read_data returns features as (p x instances) and labels as (d x instances).
X, Y = read_data(file_train, n, p, d)
Xtest, Ytest = read_data(file_test, nt, p, d)

# Transpose the features only; the label matrices stay labels x instances.
X = X.T
Xtest = Xtest.T
print("Dataset loaded.")

Loading dataset...
Dataset loaded.


In [84]:
# MLGT parameters
c1 = list(range(10, 70, 10))  # column sparsity sweep: 10, 20, ..., 60

# --- CW (Constant Weight MLGT) ---
# Select the group testing matrix with the lowest decoding error on the training labels.
# NOTE(review): the matrix is drawn with np.random and no seed is set in the
# visible cells, so A2 presumably differs between runs — confirm.
A2, c, Err2 = Sel_c_k_disjunct(Y, m, n, k, c1)
# print(A2.shape)

# Wall-clock bracket around train + test; start_test/end_test are not read
# again in the visible cells.
start_test = time.time()
Output2 = MLGT_train_test(X, Y, Xtest, Ytest, A2, k)
end_test = time.time()


# Entries 0, 2 and 4 of Prec_k, i.e. the cutoffs 1, 3 and 5.
CW_GT_Prec = [
    Output2['Prec_k'][0],
    Output2['Prec_k'][2],
    Output2['Prec_k'][4]
]
CW_times = [Output2['train_time'], Output2['test_time']]

In [85]:
# Display the full result dictionary returned by MLGT_train_test.
Output2

{'Prec_k': array([0.01      , 0.005     , 0.00333333, 0.0025    , 0.002     ,
        0.00166667, 0.00142857, 0.00125   , 0.00111111, 0.001     ]),
 'train_time': 2.3918431820000023,
 'test_time': 0.018048861000124816,
 'total_times': 2.409892043000127}