In [66]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import xclib.data.data_utils as du
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from IPython.display import display
from timeit import default_timer as timer
import xclib.evaluation.xc_metrics as xc_metrics

# Code

In [67]:
def read_map(filename):
    """Read a text file into a list with one entry per line.

    Only the line terminator is removed from each line; every other
    character (including leading/trailing spaces) is preserved, and blank
    lines are kept as empty strings so list positions keep matching line
    numbers.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: The file's lines, in file order, without trailing newlines.
    """
    with open(filename) as file:
        # rstrip("\n") rather than line[:-1]: a final line that has no
        # trailing newline must not lose its last real character.
        return [line.rstrip("\n") for line in file]


In [68]:
def read_XC_data(data_dir, meta_tag=""):
    """Load the sparse matrices and raw-text maps stored under ``data_dir``.

    Args:
        data_dir: Directory holding the ``*_X_Y.txt`` sparse files and a
            ``raw_data`` sub-directory with the ``*.raw.txt`` text files.
        meta_tag: Optional name of a metadata variant. When truthy, the
            sparse files are read with a ``"<meta_tag>_"`` filename prefix and
            ``<meta_tag>_lbl_X_Y.txt`` plus ``raw_data/<meta_tag>.raw.txt``
            are loaded as well. Any falsy value (``""``, ``None``) selects
            the un-prefixed files.

    Returns:
        tuple: ``((trn_xy, tst_xy), (trn_map, tst_map, lbl_map))`` when no
        ``meta_tag`` is given, otherwise ``((trn_xy, tst_xy, lbl_xy),
        (trn_map, tst_map, lbl_map, meta_map))``. Matrices are whatever
        ``du.read_sparse_file`` returns; maps are lists of lines.
    """
    # Always fall back to an empty string: interpolating a falsy non-string
    # such as None would otherwise put the text "None" into the filenames.
    data_tag = f"{meta_tag}_" if meta_tag else ""

    trn_xy = du.read_sparse_file(f"{data_dir}/{data_tag}trn_X_Y.txt")
    tst_xy = du.read_sparse_file(f"{data_dir}/{data_tag}tst_X_Y.txt")

    # The train/test/label texts are shared by every variant, so their
    # filenames never carry the tag prefix.
    trn_map = read_map(f"{data_dir}/raw_data/train.raw.txt")
    tst_map = read_map(f"{data_dir}/raw_data/test.raw.txt")
    lbl_map = read_map(f"{data_dir}/raw_data/label.raw.txt")

    if not meta_tag:
        return (trn_xy, tst_xy), (trn_map, tst_map, lbl_map)

    # Tagged variants additionally provide a label-side matrix and the raw
    # texts of the metadata items themselves.
    lbl_xy = du.read_sparse_file(f"{data_dir}/{data_tag}lbl_X_Y.txt")
    meta_map = read_map(f"{data_dir}/raw_data/{meta_tag}.raw.txt")
    return (trn_xy, tst_xy, lbl_xy), (trn_map, tst_map, lbl_map, meta_map)


# Load

In [69]:
# Load the "category" variant of the dataset. With a meta_tag set,
# read_XC_data returns xy_data = (trn_xy, tst_xy, lbl_xy) and
# xy_maps = (trn_map, tst_map, lbl_map, meta_map).
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
data_dir = "/home/scai/phd/aiz218323/tmp/XC/data/G-LF-WikiSeeAlsoTitles-300K/"
xy_data, xy_maps = read_XC_data(data_dir, meta_tag="category")



In [70]:
def purge_data(xy_data, xy_map):
    """Drop the rows of ``xy_data`` that contain no stored entries.

    Args:
        xy_data: Sparse matrix supporting ``getnnz(axis=1)`` and row indexing.
        xy_map: Sequence of per-row texts, positionally aligned with the rows
            of ``xy_data``.

    Returns:
        tuple: ``(valid_idx, valid_data, valid_map)`` — the kept row positions
        as a list of ints, ``xy_data`` restricted to those rows, and the
        matching texts as a list.
    """
    # Boolean mask over rows: True where the row stores at least one entry.
    row_has_entries = xy_data.getnnz(axis=1) > 0
    kept_rows = np.nonzero(row_has_entries)[0].tolist()

    # The same positional index is applied to both the matrix and the texts
    # so that they stay aligned after filtering.
    return kept_rows, xy_data[kept_rows], np.array(xy_map)[kept_rows].tolist()


In [71]:
# Filter every (matrix, text map) pair with purge_data and collect the results
# position-wise: index 0 = train, 1 = test, 2 = label.
# xy_maps[:-1] leaves out the last map (the meta_tag texts), which has no
# matrix of its own in xy_data.
correct_idx, correct_data, correct_map = [], [], []

for data, text in zip(xy_data, xy_maps[:-1]):
    valid_idx, valid_data, valid_map = purge_data(data, text)
    correct_idx.append(valid_idx)
    correct_data.append(valid_data)
    correct_map.append(valid_map)
    

# Save

In [72]:
import os
from scipy.sparse import vstack

In [91]:
def purge_invalid_labels(trn_xy, tst_xy, label_map):
    """Keep only the columns that have at least one stored entry in ``trn_xy``.

    The column selection is derived from the training matrix alone and then
    applied unchanged to the test matrix and to the label texts. Rows are not
    re-filtered here, so a row whose only entries sat in removed columns
    comes back empty.

    Args:
        trn_xy: Sparse training matrix supporting ``getnnz(axis=0)`` and
            column indexing.
        tst_xy: Sparse test matrix with the same column layout as ``trn_xy``.
        label_map: Sequence of per-column texts.

    Returns:
        tuple: ``(trn_X_Y, tst_X_Y, label_raw)`` — both matrices restricted to
        the kept columns, and the kept texts as a NumPy array.
    """
    kept_cols = np.nonzero(trn_xy.getnnz(axis=0) > 0)[0]
    return (
        trn_xy[:, kept_cols],
        tst_xy[:, kept_cols],
        np.array(label_map)[kept_cols],
    )


In [73]:
# Output directory for the derived dataset written by the cells below.
# NOTE(review): absolute, user-specific path — adjust before running elsewhere.
save_dir = "/home/scai/phd/aiz218323/tmp/XC/data/LF-WikiTitles-700K/"

In [74]:
# Create the output directory and its raw_data sub-directory;
# exist_ok=True makes re-running this cell harmless.
os.makedirs(save_dir, exist_ok=True)
os.makedirs(f"{save_dir}/raw_data", exist_ok=True)

In [90]:
# New train split = purged train rows (index 0) stacked on top of the purged
# label rows (index 2); the test split (index 1) is used as is.
trn_xy = vstack([correct_data[0], correct_data[2]])
tst_xy = correct_data[1]

# Concatenate the texts in the same order as the vstack above so that row i
# of trn_xy still corresponds to train_raw[i].
train_raw = correct_map[0] + correct_map[2]
test_raw = correct_map[1]

In [92]:
# Drop the columns that have no entry in the stacked train matrix. xy_maps[-1]
# is the last map returned by read_XC_data (the meta_tag texts), used here as
# the per-column texts.
trn_X_Y, tst_X_Y, label_raw = purge_invalid_labels(trn_xy, tst_xy, xy_maps[-1])

In [95]:
# Provenance table for the new train split: "index" is the row position in the
# source matrix before purging, and "is_label" marks rows that came from the
# label-side matrix (index 2) rather than the original train matrix (index 0).
# The order matches the vstack used to build trn_xy.
is_label = [False]*len(correct_idx[0]) + [True]*len(correct_idx[2])
train_idx = pd.DataFrame({"index": correct_idx[0] + correct_idx[2],
                          "is_label": is_label})

# Same table for the test split, which contains no label-derived rows.
# is_label is rebound here, so it no longer describes the train rows afterwards.
is_label = [False]*len(correct_idx[1])
test_idx = pd.DataFrame({"index": correct_idx[1],
                        "is_label": is_label})

In [98]:
def write_map(texts, filename):
    """Write every item of ``texts`` to ``filename``, one item per line.

    Args:
        texts: Iterable of items; each is formatted with ``str.format``
            semantics (as in an f-string) and followed by a newline.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'w') as out:
        out.writelines(f"{item}\n" for item in texts)
            

In [99]:
# Persist the purged train/test matrices.
du.write_sparse_file(trn_X_Y, f"{save_dir}/trn_X_Y.txt")
du.write_sparse_file(tst_X_Y, f"{save_dir}/tst_X_Y.txt")

# Persist the raw texts, one per line, in the same order as the matrix
# rows (train/test) and columns (label).
write_map(train_raw, f"{save_dir}/raw_data/train.raw.txt")
write_map(test_raw, f"{save_dir}/raw_data/test.raw.txt")
write_map(label_raw, f"{save_dir}/raw_data/label.raw.txt")

# Persist the provenance tables; to_csv is called with defaults, so the
# DataFrame's own row index is written as an extra leading column.
train_idx.to_csv(f"{save_dir}/train.idx.csv")
test_idx.to_csv(f"{save_dir}/test.idx.csv")