In [3]:
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import xclib.data.data_utils as du
from scipy.sparse import csr_matrix
from scipy.sparse import coo_matrix
from IPython.display import display
from timeit import default_timer as timer
import xclib.evaluation.xc_metrics as xc_metrics

# Code

In [7]:
def read_id(filename, rm_suffix_idx=None):
    """Read the id part of every record in an ``<id>-><text>`` raw file.

    Parameters
    ----------
    filename : str
        Path of a text file holding one ``<id>-><text>`` record per line.
    rm_suffix_idx : int or None, optional
        End index of the slice applied to every id (``id[:rm_suffix_idx]``);
        for example ``-2`` drops the last two characters. ``None`` keeps the
        id unchanged.

    Returns
    -------
    list of str
        One (possibly trimmed) id per line, in file order. A line that does
        not contain ``'->'`` is returned whole (before trimming).
    """
    with open(filename) as file:
        # Strip only the line terminator. Slicing with ``line[:-1]`` would
        # eat a real character when the last line has no trailing newline.
        return [
            line.rstrip('\n').split('->', maxsplit=1)[0][:rm_suffix_idx]
            for line in file
        ]

def read_ids(data_dir, rm_suffix_idx=None, meta_tag=""):
    """Read the ids of the train, test and label raw files of a dataset.

    Parameters
    ----------
    data_dir : str
        Dataset directory; the files are read from ``{data_dir}/raw_data``.
    rm_suffix_idx : int or None, optional
        Forwarded to ``read_id``: every id is sliced as ``id[:rm_suffix_idx]``.
    meta_tag : str, optional
        When non-empty, ``{data_dir}/raw_data/{meta_tag}.raw.txt`` is read as
        well and its ids are appended to the result.

    Returns
    -------
    tuple of list of str
        ``(trn_id, tst_id, lbl_id)``, or ``(trn_id, tst_id, lbl_id, meta_id)``
        when ``meta_tag`` is given.
    """
    trn_id = read_id(f"{data_dir}/raw_data/train.raw.txt", rm_suffix_idx)
    tst_id = read_id(f"{data_dir}/raw_data/test.raw.txt", rm_suffix_idx)
    lbl_id = read_id(f"{data_dir}/raw_data/label.raw.txt", rm_suffix_idx)
    if meta_tag:
        # Must be read_id: read_map accepts only a filename (so passing
        # rm_suffix_idx raised TypeError) and returns the text column, not ids.
        meta_id = read_id(f"{data_dir}/raw_data/{meta_tag}.raw.txt", rm_suffix_idx)
        return trn_id, tst_id, lbl_id, meta_id
    return trn_id, tst_id, lbl_id


In [8]:
def read_map(filename):
    """Read the text part of every record in an ``<id>-><text>`` raw file.

    Parameters
    ----------
    filename : str
        Path of a text file holding one ``<id>-><text>`` record per line.

    Returns
    -------
    list of str
        Everything after the first ``'->'`` of each line, in file order.

    Raises
    ------
    IndexError
        If a line does not contain ``'->'``.
    """
    with open(filename) as file:
        # Strip only the line terminator. Slicing with ``line[:-1]`` would
        # chop the last character of the text when the final line has no
        # trailing newline.
        return [line.rstrip('\n').split('->', maxsplit=1)[1] for line in file]


In [9]:
def read_XC_data(data_dir, meta_tag=""):
    """Load the sparse matrices and the raw text lists of a dataset.

    Parameters
    ----------
    data_dir : str
        Dataset directory. Matrices are read from ``data_dir`` itself with
        ``du.read_sparse_file``; text lists are read from
        ``{data_dir}/raw_data`` with ``read_map``.
    meta_tag : str, optional
        When non-empty, the matrix file names are prefixed with
        ``{meta_tag}_``, and the ``{meta_tag}_lbl_X_Y.txt`` matrix plus the
        ``{meta_tag}.raw.txt`` text list are loaded in addition.

    Returns
    -------
    tuple
        ``((trn_xy, tst_xy), (trn_map, tst_map, lbl_map))`` without
        ``meta_tag``; ``((trn_xy, tst_xy, lbl_xy), (trn_map, tst_map,
        lbl_map, meta_map))`` with it.
    """
    if meta_tag:
        data_tag = f"{meta_tag}_"
    else:
        data_tag = meta_tag

    matrices = (
        du.read_sparse_file(f"{data_dir}/{data_tag}trn_X_Y.txt"),
        du.read_sparse_file(f"{data_dir}/{data_tag}tst_X_Y.txt"),
    )
    texts = (
        read_map(f"{data_dir}/raw_data/train.raw.txt"),
        read_map(f"{data_dir}/raw_data/test.raw.txt"),
        read_map(f"{data_dir}/raw_data/label.raw.txt"),
    )

    # Plain dataset: nothing else to load.
    if not meta_tag:
        return matrices, texts

    lbl_xy = du.read_sparse_file(f"{data_dir}/{data_tag}lbl_X_Y.txt")
    meta_map = read_map(f"{data_dir}/raw_data/{meta_tag}.raw.txt")
    return matrices + (lbl_xy,), texts + (meta_map,)


In [10]:
def compute_diff_inter(data_1, data_2):
    """Locate the entries of ``data_1`` that are / are not present in ``data_2``.

    NOTE: a second ``compute_diff_inter`` is defined later in this notebook
    and returns the pair in the opposite order (``diff_idx, inter_idx``);
    once that cell has run it replaces this definition.

    Parameters
    ----------
    data_1, data_2 : array_like
        1-D collections of comparable values.

    Returns
    -------
    inter_idx : numpy.ndarray
        Index into ``data_1`` of the first occurrence of every value shared
        with ``data_2``, ordered by sorted value (``np.intersect1d``).
    diff_idx : numpy.ndarray
        Ascending positions of every entry of ``data_1`` whose value does not
        occur in ``data_2``.
    """
    only_in_first = np.setdiff1d(data_1, data_2)
    missing_mask = np.isin(data_1, only_in_first)
    diff_idx = np.nonzero(missing_mask)[0]

    # Element 1 of the triple holds the indices into the first argument.
    inter_idx = np.intersect1d(data_1, data_2, return_indices=True)[1]

    return inter_idx, diff_idx


# Load

In [11]:
# Location of the G-LF-WikiSeeAlsoTitles-300K dataset.
# NOTE(review): absolute, user-specific paths; they must be edited before the
# notebook can run on another machine. The trailing "/" combines with the "/"
# the helpers insert, so the resulting paths contain "//".
data_dir = "/home/scai/phd/aiz218323/tmp/XC/data/G-LF-WikiSeeAlsoTitles-300K/"
# result_dir is not referenced anywhere else in the visible notebook.
result_dir = "/home/scai/phd/aiz218323/tmp/XC/results/NGAME/STransformer/\
G-LF-WikiSeeAlsoTitles-300K/v_0_100/"

# No meta_tag is passed, so xy_data == (trn_xy, tst_xy) and
# xy_maps == (trn_map, tst_map, lbl_map).
xy_data, xy_maps = read_XC_data(data_dir)

In [12]:
# xy_ids == (trn_id, tst_id, lbl_id). rm_suffix_idx=-2 slices the last two
# characters off every id; presumably a dataset-specific suffix that must go
# before these ids can be matched against the 1M ids -- TODO confirm.
xy_ids = read_ids(data_dir, rm_suffix_idx=-2)

In [8]:
# Location of the G-LF-WikiTitles-1M dataset (absolute, user-specific path).
meta_data_dir = "/home/scai/phd/aiz218323/tmp/XC/data/G-LF-WikiTitles-1M/"
# meta_result_dir is not referenced anywhere else in the visible notebook.
meta_result_dir = "/home/scai/phd/aiz218323/tmp/XC/results/NGAME/STransformer/\
G-LF-WikiTitles-1M/v_0_100/"

# No meta_tag is passed, so meta_data == (trn_xy, tst_xy) and
# meta_maps == (trn_map, tst_map, lbl_map) of the 1M dataset.
meta_data, meta_maps = read_XC_data(meta_data_dir)

In [9]:
# meta_ids == (trn_id, tst_id, lbl_id) of the 1M dataset. Unlike xy_ids, the
# ids are kept whole here (no rm_suffix_idx).
meta_ids = read_ids(meta_data_dir)

## Variant `_tst`: remove G-LF-WikiSeeAlsoTitles-300K test points from the 1M training set

In [136]:
# Positions in the 1M *training* ids (meta_ids[0]) of the ids that also occur
# among the 300K test ids (xy_ids[1]). np.intersect1d yields one index per
# unique shared value (its first occurrence), ordered by sorted id.
# NOTE: tst_meta_idx is assigned again further down with a different meaning
# (positions in meta_ids[1]); which value is live depends on the cells run.
_, tst_meta_idx, _ = np.intersect1d(meta_ids[0], xy_ids[1], return_indices=True)

In [137]:
# Ids of the 1M training set that do not occur among the 300K test ids ...
diff_ids = np.setdiff1d(meta_ids[0], xy_ids[1])
# ... and the positions in meta_ids[0] of every entry carrying such an id.
# NOTE: trn_meta_idx is assigned again further down (ids absent from both
# 300K splits); which value is live depends on the cells run.
trn_meta_idx = np.where(np.isin(meta_ids[0], diff_ids))[0]

## Variant `_trn_tst`: remove G-LF-WikiSeeAlsoTitles-300K train and test points from both 1M splits

In [97]:
def compute_diff_inter(data_ids_1, data_ids_2):
    """Locate the ids of ``data_ids_1`` that are absent from / shared with ``data_ids_2``.

    NOTE: this redefines the ``compute_diff_inter`` declared earlier in the
    notebook, which returns the same two arrays in the opposite order.

    Parameters
    ----------
    data_ids_1, data_ids_2 : array_like
        1-D collections of comparable ids.

    Returns
    -------
    diff_idx : numpy.ndarray
        Ascending positions of every entry of ``data_ids_1`` whose id does
        not occur in ``data_ids_2`` (duplicates included).
    inter_idx : numpy.ndarray
        Index into ``data_ids_1`` of the first occurrence of every id shared
        with ``data_ids_2``, ordered by sorted id (``np.intersect1d``).
        Repeated shared ids are reported once, so ``diff_idx`` and
        ``inter_idx`` together cover ``data_ids_1`` only if its ids are unique.
    """
    _, inter_idx, _ = np.intersect1d(data_ids_1, data_ids_2, return_indices=True)

    # "Not in data_ids_2" is a single inverted membership test. The previous
    # setdiff1d + isin round trip produced the same mask with an extra pass.
    diff_idx = np.where(np.isin(data_ids_1, data_ids_2, invert=True))[0]

    return diff_idx, inter_idx


In [98]:
# Positions in the 1M training ids (meta_ids[0]) whose id is absent from the
# 300K train ids (first line) and from the 300K test ids (second line).
# Relies on the compute_diff_inter variant that returns (diff_idx, inter_idx);
# the intersection indices are discarded.
trn_trn_diff_idx, _ = compute_diff_inter(meta_ids[0], xy_ids[0])
tst_trn_diff_idx, _ = compute_diff_inter(meta_ids[0], xy_ids[1])

In [102]:
# 1M training rows whose id occurs in neither 300K split (positions common to
# both difference sets). Overwrites the trn_meta_idx of the `_tst` variant.
trn_meta_idx = np.intersect1d(trn_trn_diff_idx, tst_trn_diff_idx)

In [103]:
# Same as for the training split, but for the 1M *test* ids (meta_ids[1]):
# positions whose id is absent from the 300K train ids, then from the 300K
# test ids. The intersection indices are discarded.
trn_tst_diff_idx, _ = compute_diff_inter(meta_ids[1], xy_ids[0])
tst_tst_diff_idx, _ = compute_diff_inter(meta_ids[1], xy_ids[1])

In [104]:
# 1M test rows whose id occurs in neither 300K split. Overwrites the
# tst_meta_idx of the `_tst` variant: these positions index meta_ids[1],
# whereas the earlier ones index meta_ids[0].
tst_meta_idx = np.intersect1d(trn_tst_diff_idx, tst_tst_diff_idx)

In [105]:
# Number of 1M train / test positions currently selected.
len(trn_meta_idx), len(tst_meta_idx)

(3594124, 1535668)

# Save

In [12]:
import os
from scipy.sparse import vstack

In [138]:
# Output directory of the filtered dataset (absolute, user-specific path).
save_dir = "/home/scai/phd/aiz218323/tmp/XC/data/LF-WikiTitles-1M/"

In [139]:
# Create the output directory and its raw_data sub-directory; exist_ok makes
# the cell safe to re-run.
os.makedirs(save_dir, exist_ok=True)
os.makedirs(f"{save_dir}/raw_data", exist_ok=True)

In [140]:
# Keep only the selected rows of the 1M training label matrix ...
trn_X_Y = meta_data[0][trn_meta_idx]

# ... together with the matching raw texts and ids (converted to arrays for
# fancy indexing, then back to plain lists).
train_raw = np.array(meta_maps[0])[trn_meta_idx].tolist()
train_ids = np.array(meta_ids[0])[trn_meta_idx].tolist()

## Variant `_trn_tst` (remove G-LF-WikiSeeAlsoTitles-300K_trn_tst): test rows are the selected rows of the 1M test split

In [124]:
# `_trn_tst` variant: tst_meta_idx must hold positions in the 1M *test* split
# (the np.intersect1d(trn_tst_diff_idx, tst_tst_diff_idx) value).
# NOTE: the next variant cell assigns the same three names; run only one of
# the two, paired with the matching tst_meta_idx.
tst_X_Y = meta_data[1][tst_meta_idx]

test_raw = np.array(meta_maps[1])[tst_meta_idx].tolist()
test_ids = np.array(meta_ids[1])[tst_meta_idx].tolist()

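## Variant `_tst` (remove G-LF-WikiSeeAlsoTitles-300K_tst): test rows are the whole 1M test split plus the 1M train rows that overlap the 300K test set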

In [141]:
# `_tst` variant: the full 1M test matrix is stacked on top of the 1M *train*
# rows selected by tst_meta_idx. tst_meta_idx must therefore hold positions in
# meta_ids[0] (the np.intersect1d(meta_ids[0], xy_ids[1], ...) value), not the
# value computed for the `_trn_tst` variant.
tst_X_Y = vstack([meta_data[1], meta_data[0][tst_meta_idx]])

# List concatenation, in the same row order as the vstack above.
test_raw = meta_maps[1] + np.array(meta_maps[0])[tst_meta_idx].tolist()
test_ids = meta_ids[1] + np.array(meta_ids[0])[tst_meta_idx].tolist()

In [142]:
def purge_invalid_labels(trn_xy, tst_xy, label_map, label_ids):
    """Drop every label column that has no stored entry in the training matrix.

    Parameters
    ----------
    trn_xy, tst_xy : scipy sparse matrix
        Train and test matrices sharing the same column (label) axis.
    label_map, label_ids : sequence
        Per-label text and id, aligned with the columns.

    Returns
    -------
    tuple
        ``(trn_X_Y, tst_X_Y, label_raw, label_ids)``: both matrices restricted
        to the kept columns, and the kept label texts / ids as numpy arrays.
    """
    # A column is kept when the training matrix stores at least one entry in it.
    keep_cols = np.nonzero(trn_xy.getnnz(axis=0) > 0)[0]

    kept_text = np.array(label_map)[keep_cols]
    kept_ids = np.array(label_ids)[keep_cols]

    return trn_xy[:, keep_cols], tst_xy[:, keep_cols], kept_text, kept_ids


In [143]:
def purge_invalid_datapoints(xy, xy_raw, xy_ids):
    """Drop every row (datapoint) that has no stored entry in ``xy``.

    Parameters
    ----------
    xy : scipy sparse matrix
        Datapoint-by-label matrix.
    xy_raw, xy_ids : sequence
        Per-datapoint text and id, aligned with the rows of ``xy``.

    Returns
    -------
    tuple
        ``(xy, xy_raw, xy_ids)`` restricted to the kept rows; the text and id
        sequences are returned as numpy arrays.
    """
    # A row is kept when it stores at least one entry.
    keep_rows = np.nonzero(xy.getnnz(axis=1) > 0)[0]

    kept_matrix = xy[keep_rows]
    kept_text = np.array(xy_raw)[keep_rows]
    kept_ids = np.array(xy_ids)[keep_rows]

    return kept_matrix, kept_text, kept_ids


In [144]:
# Drop labels without any training entry; meta_maps[2] / meta_ids[2] are the
# 1M label texts / ids. Not idempotent across cells: trn_X_Y and tst_X_Y are
# overwritten, so rebuild them before re-running this cell.
trn_X_Y, tst_X_Y, label_raw, label_ids = purge_invalid_labels(trn_X_Y, tst_X_Y, 
                                                              meta_maps[2], meta_ids[2])

In [145]:
# Drop train / test rows that have no stored label left. Removing empty rows
# does not change any column's entry count, so this does not invalidate the
# label purge done in the previous cell.
trn_X_Y, train_raw, train_ids = purge_invalid_datapoints(trn_X_Y, train_raw, train_ids)
tst_X_Y, test_raw, test_ids = purge_invalid_datapoints(tst_X_Y, test_raw, test_ids)

In [146]:
def write_map(ids, texts, filename):
    """Write ``<id>-><text>`` records, one per line, to ``filename``.

    ``ids`` and ``texts`` are paired positionally with ``zip``, so the output
    stops at the shorter of the two. An existing file is overwritten.
    """
    with open(filename, 'w') as out:
        out.writelines(f"{key}->{value}\n" for key, value in zip(ids, texts))
            

In [148]:
# Persist the filtered label matrices ...
du.write_sparse_file(trn_X_Y, f"{save_dir}/trn_X_Y.txt")
du.write_sparse_file(tst_X_Y, f"{save_dir}/tst_X_Y.txt")

# ... and the matching "<id>-><text>" files, row/column aligned with them.
write_map(train_ids, train_raw, f"{save_dir}/raw_data/train.raw.txt")
write_map(test_ids, test_raw, f"{save_dir}/raw_data/test.raw.txt")
write_map(label_ids, label_raw, f"{save_dir}/raw_data/label.raw.txt")

## Verify

In [149]:
# For each matrix print: number of columns without any stored entry, then
# number of rows without any stored entry.
print(len(np.where(trn_X_Y.getnnz(axis=0) < 1)[0]), len(np.where(trn_X_Y.getnnz(axis=1) < 1)[0]))
print(len(np.where(tst_X_Y.getnnz(axis=0) < 1)[0]), len(np.where(tst_X_Y.getnnz(axis=1) < 1)[0]))

0 0
0 0


In [150]:
# Display the final train / test matrices (shape, dtype, stored entries).
trn_X_Y, tst_X_Y

(<4066131x1069780 sparse matrix of type '<class 'numpy.float32'>'
 	with 19042723 stored elements in Compressed Sparse Row format>,
 <1990819x1069780 sparse matrix of type '<class 'numpy.float32'>'
 	with 9425522 stored elements in Compressed Sparse Row format>)

In [151]:
# The text lists must match the matrix dimensions: train rows, test rows,
# label columns.
len(train_raw), len(test_raw), len(label_raw)

(4066131, 1990819, 1069780)

In [152]:
# The id lists must match the matrix dimensions as well.
len(train_ids), len(test_ids), len(label_ids)

(4066131, 1990819, 1069780)

# Verify the saved files (reloaded from disk)

In [1]:
# Re-declared so this section can be run without the cells above (the import
# cell is still required for `du` and `np`).
save_dir = "/home/scai/phd/aiz218323/tmp/XC/data/LF-WikiTitles-1M/"

In [4]:
# Read back the matrices that were written to save_dir.
trn_xy = du.read_sparse_file(f"{save_dir}/trn_X_Y.txt")
tst_xy = du.read_sparse_file(f"{save_dir}/tst_X_Y.txt")

In [5]:
# Same check as on the in-memory matrices: per matrix, the number of columns
# and then the number of rows without any stored entry.
print(len(np.where(trn_xy.getnnz(axis=0) < 1)[0]), len(np.where(trn_xy.getnnz(axis=1) < 1)[0]))
print(len(np.where(tst_xy.getnnz(axis=0) < 1)[0]), len(np.where(tst_xy.getnnz(axis=1) < 1)[0]))

0 0
0 0
