In [1]:
import os
import scipy.sparse as sp

## Helper functions

In [2]:
def extract_xc_data(content):
    """Parse the lines of an XC sparse-matrix text file into a CSR matrix.

    The first line is a header of the form ``"<num_rows> <num_cols>"``. Every
    following line describes one row as whitespace-separated ``col:value``
    pairs; a blank line is a row with no stored entries.

    Parameters
    ----------
    content : list of str
        Lines of the file, header first. Trailing newlines are optional, so
        a final line without ``"\\n"`` is parsed in full.

    Returns
    -------
    scipy.sparse.csr_matrix
        Float matrix whose shape is the one declared in the header, so that
        matrices from different splits keep the same number of columns even
        when the highest column indices are unused in one of them.

    Raises
    ------
    ValueError
        If the header or an entry is malformed, or if the number of rows or
        a column index does not fit the shape declared in the header.
    """
    num_rows, num_cols = (int(field) for field in content[0].split())

    indptr = [0]
    indices = []
    data = []
    for line in content[1:]:
        # split() without arguments drops the newline and any repeated or
        # trailing blanks, so no manual slicing or empty-token check is needed.
        for entry in line.split():
            col_num, value = entry.split(":")
            indices.append(int(col_num))
            data.append(float(value))
        # One indptr entry per line, including blank lines (empty rows).
        indptr.append(len(indices))

    if len(indptr) - 1 != num_rows:
        raise ValueError(
            f"header declares {num_rows} rows but {len(indptr) - 1} were found"
        )
    # The CSR constructor does not bounds-check column indices, so do it here.
    if indices and (min(indices) < 0 or max(indices) >= num_cols):
        raise ValueError(
            f"column index outside the declared range [0, {num_cols})"
        )

    return sp.csr_matrix(
        (data, indices, indptr), shape=(num_rows, num_cols), dtype=float
    )


In [3]:
def extract_xc_text(content):
    """Return the text part of each ``<id>-><text>`` line.

    Parameters
    ----------
    content : list of str
        Lines of the form ``"<id>-><text>"``.

    Returns
    -------
    list of str
        Everything after the first ``"->"`` of each line, unchanged (a
        trailing newline, if present, is kept).

    Raises
    ------
    ValueError
        If a line contains no ``"->"`` separator.
    """
    trn_x = []
    for line in content:
        # Split only on the first separator: the text itself may contain "->".
        _, text = line.split('->', 1)
        trn_x.append(text)
    return trn_x


In [4]:
def read_data(filename):
    """Read a UTF-8 text file and return its lines as a list.

    Each returned string keeps its line ending, exactly as read from the file.
    """
    with open(filename, encoding='utf-8') as handle:
        lines = list(handle)
    return lines


## Conversion

In [5]:
# Root directory holding the source XC datasets; every input path below is
# built as f'{xc_dir}/{xc_dataset_name}/...'.
# NOTE(review): this is an absolute, machine-specific path — adjust before running elsewhere.
xc_dir = "/mnt/b100/d0/anshumitts/scratch/XC/data"
# Name of the dataset sub-directory, used for both the input and output paths.
xc_dataset_name = 'G-LF-WikiSeeAlsoTitles-300K'

In [6]:
# Output root; converted files are written under f'{save_dir}/{xc_dataset_name}'.
# The path is relative, so it resolves against the notebook's working directory.
save_dir = "../Datasets"

### Classification-matrix

In [7]:
# Read the raw lines of the train and test label files.
trn_x_y_str = read_data(f'{xc_dir}/{xc_dataset_name}/trn_X_Y.txt')
tst_x_y_str = read_data(f'{xc_dir}/{xc_dataset_name}/tst_X_Y.txt')

# Parse each set of lines into a sparse CSR matrix.
trn_x_y, tst_x_y = (
    extract_xc_data(raw_lines) for raw_lines in (trn_x_y_str, tst_x_y_str)
)

In [8]:
# Display both label matrices so their shapes and stored-element counts can be checked.
trn_x_y, tst_x_y

(<641846x311696 sparse matrix of type '<class 'numpy.float64'>'
 	with 1353908 stored elements in Compressed Sparse Row format>,
 <280808x311696 sparse matrix of type '<class 'numpy.float64'>'
 	with 684324 stored elements in Compressed Sparse Row format>)

In [10]:
# Output directory for this dataset; created if it does not exist yet.
elias_dir = f'{save_dir}/{xc_dataset_name}'
os.makedirs(elias_dir, exist_ok=True)

# Store the train and test label matrices in SciPy's .npz sparse format.
sp.save_npz(f'{elias_dir}/Y.trn.npz', trn_x_y)
sp.save_npz(f'{elias_dir}/Y.tst.npz', tst_x_y)

### Text-data

In [11]:
# Directory from which the raw text files are read.
text_dir = f'{xc_dir}/{xc_dataset_name}/raw_data'

# elias_dir is rebound here to the 'raw' sub-directory of the output folder;
# the text files written by the following cells go there.
elias_dir = f'{save_dir}/{xc_dataset_name}/raw'
os.makedirs(elias_dir, exist_ok=True)

In [12]:
# Read the raw training lines and keep only the text after the "->" separator.
trn_x_txt = read_data(f'{text_dir}/train.raw.txt')
trn_x = extract_xc_text(trn_x_txt)

# The input is decoded as UTF-8, so write UTF-8 explicitly as well; without
# it, open() falls back to the platform's default encoding, which can fail
# on or mis-encode non-ASCII text.
with open(f'{elias_dir}/trn_X.txt', 'w', encoding='utf-8') as file:
    # Each extracted text still carries its original line ending.
    file.writelines(trn_x)

In [13]:
# Read the raw test lines and keep only the text after the "->" separator.
tst_x_txt = read_data(f'{text_dir}/test.raw.txt')
tst_x = extract_xc_text(tst_x_txt)

# The input is decoded as UTF-8, so write UTF-8 explicitly as well; without
# it, open() falls back to the platform's default encoding, which can fail
# on or mis-encode non-ASCII text.
with open(f'{elias_dir}/tst_X.txt', 'w', encoding='utf-8') as file:
    # Each extracted text still carries its original line ending.
    file.writelines(tst_x)

### Bag-of-words (BOW) features

In [14]:
# Directory containing the bag-of-words feature files.
# NOTE(review): the visible cells below build their paths from xc_dir and
# xc_dataset_name directly and do not reference bow_dir.
bow_dir = f'{xc_dir}/{xc_dataset_name}'

In [16]:
# Read the raw lines of the train and test feature files.
trn_bow_str = read_data(f'{xc_dir}/{xc_dataset_name}/train_X_Xf.txt')
tst_bow_str = read_data(f'{xc_dir}/{xc_dataset_name}/test_X_Xf.txt')

# Parse each set of lines into a sparse CSR matrix.
trn_bow, tst_bow = (
    extract_xc_data(raw_lines) for raw_lines in (trn_bow_str, tst_bow_str)
)

In [17]:
# Display both feature matrices so their shapes and stored-element counts can be checked.
trn_bow, tst_bow

(<641846x40001 sparse matrix of type '<class 'numpy.float64'>'
 	with 2553594 stored elements in Compressed Sparse Row format>,
 <280808x40001 sparse matrix of type '<class 'numpy.float64'>'
 	with 1119273 stored elements in Compressed Sparse Row format>)

In [18]:
# Output directory for this dataset; created if it does not exist yet.
elias_dir = f'{save_dir}/{xc_dataset_name}'
os.makedirs(elias_dir, exist_ok=True)

# The X.* files hold the bag-of-words features, so save trn_bow / tst_bow.
# Saving trn_x_y / tst_x_y here would write the label matrices a second time
# under the feature file names.
sp.save_npz(f'{elias_dir}/X.trn.npz', trn_bow)
sp.save_npz(f'{elias_dir}/X.tst.npz', tst_bow)