In [1]:
import os
import random
import pandas as pd
import numpy as np
from tqdm import tqdm, tqdm_notebook
from scipy.sparse import csr_matrix

In [2]:
%load_ext autoreload
%autoreload 2

## Code

In [3]:
def read_data(filename):
    with open(filename, encoding='utf-8') as file:
        df = file.readlines()
    return df

In [4]:
def extract_xc_data(content):
    header = content[0]
    num_rows, num_cols = header[:-1].split(" ")
    num_rows = int(num_rows)
    num_cols = int(num_cols)

    indptr = [0]
    indices = []
    data = []
    for line in content[1:]:

        line = line[:-1]
        column_value = line.split(" ")
        for cv in column_value:
            if len(cv):
                col_num, value = cv.split(":")
                col_num = int(col_num)
                value = int(value)

                indices.append(col_num)
                data.append(value)
        indptr.append(len(indices))

    train_x_y_mat = csr_matrix((data, indices, indptr), dtype=int)

    return train_x_y_mat

In [5]:
def xc_kgcl_kg(trn_x_y, rel=0):
    str_repr = ""
    for r, row in enumerate(trn_x_y):
        cols = row.indices
        for c in cols:
            str_repr += f"{r} {rel} {c}\n"
    return str_repr


In [6]:
def xc_kgcl_classification(trn_x_y):
    str_repr = ""
    for r, row in enumerate(trn_x_y):
        cols = row.indices
        row_str = str(r)+" "+" ".join(map(str, cols))+"\n"
        str_repr += row_str
    return str_repr


In [7]:
def extract_xc_node_id(filename):
    ids = []
    with open(filename) as file:
        for line in file:
            ids.append(line[:-1].split('->')[0])
    return ids


## Read data

In [22]:
xc_dir = "../../../data/G-LF-WikiSeeAlsoTitles-300K/"

- reading the graphs 

In [24]:
train_file = f"{xc_dir}/trn_X_Y.txt"
trn_x_y_str = read_data(train_file)
trn_x_y_mat = extract_xc_data(trn_x_y_str)

test_file = f"{xc_dir}/tst_X_Y.txt"
tst_x_y_str = read_data(test_file)
tst_x_y_mat = extract_xc_data(tst_x_y_str)


graph_train_file = f"{xc_dir}/graph_trn_X_Y.txt"
graph_trn_x_y_str = read_data(graph_train_file)
graph_trn_x_y_mat = extract_xc_data(graph_trn_x_y_str)

graph_test_file = f"{xc_dir}/graph_tst_X_Y.txt"
graph_tst_x_y_str = read_data(graph_test_file)
graph_tst_x_y_mat = extract_xc_data(graph_tst_x_y_str)


graph_label_file = f"{xc_dir}/graph_lbl_X_Y.txt"
graph_lbl_x_y_str = read_data(graph_label_file)
graph_lbl_x_y_mat = extract_xc_data(graph_lbl_x_y_str)

In [25]:
trn_x_y_mat.shape, tst_x_y_mat.shape, graph_trn_x_y_mat.shape, graph_tst_x_y_mat.shape, graph_lbl_x_y_mat.shape

((641846, 311696),
 (280808, 311696),
 (641846, 3074447),
 (280808, 3074447),
 (311696, 3831711))

- reading the text information

In [26]:
train_id_file = f"{xc_dir}/raw_data/train.raw.txt"
train_id = extract_xc_node_id(train_id_file)

test_id_file = f"{xc_dir}/raw_data/test.raw.txt"
test_id = extract_xc_node_id(test_id_file)

label_id_file = f"{xc_dir}/raw_data/label.raw.txt"
label_id = extract_xc_node_id(label_id_file)

graph_id_file = f"{xc_dir}/raw_data/graph.raw.txt"
graph_id = extract_xc_node_id(graph_id_file)

In [27]:
len(train_id), len(test_id), len(label_id), len(graph_id)

(641846, 280808, 311696, 3831711)

### Classification

In [9]:
trn_str_repr = xc_kgcl_classification(trn_x_y_mat)

save_file = "../data/G-LF-WikiSeeAlsoTitles-300K/train.txt"
os.makedirs(os.path.dirname(save_file), exist_ok=True)

with open(save_file, 'w') as file:
    file.write(trn_str_repr)

In [11]:
tst_str_repr = xc_kgcl_classification(tst_x_y_mat)

save_file = "../data/G-LF-WikiSeeAlsoTitles-300K/test.txt"
os.makedirs(os.path.dirname(save_file), exist_ok=True)

with open(save_file, 'w') as file:
    file.write(tst_str_repr)

### Knowledge graph

In [31]:
def create_knowledge_graph(graph, x_ids, y_ids, vocabulary, kg_str, relation):
    for r, row in tqdm_notebook(enumerate(graph), total=graph.shape[0]):
        col = row.indices
        for c in col:
            node_a = vocabulary.setdefault(x_ids[r], len(vocabulary))
            node_b = vocabulary.setdefault(y_ids[c], len(vocabulary))
            kg_str += f'{node_a} {relation} {node_b}\n'
    return kg_str
    

In [32]:
vocabulary, kg_str = dict(), ""

In [33]:
relation = 0
kg_str = create_knowledge_graph(graph_trn_x_y_mat, train_id, graph_id, vocabulary, kg_str, relation)


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for r, row in tqdm_notebook(enumerate(graph), total=graph.shape[0]):


  0%|          | 0/641846 [00:00<?, ?it/s]

In [34]:
kg_str = create_knowledge_graph(graph_tst_x_y_mat, test_id, graph_id, vocabulary, kg_str, relation)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for r, row in tqdm_notebook(enumerate(graph), total=graph.shape[0]):


  0%|          | 0/280808 [00:00<?, ?it/s]

In [35]:
kg_str = create_knowledge_graph(graph_lbl_x_y_mat, label_id, graph_id, vocabulary, kg_str, relation)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for r, row in tqdm_notebook(enumerate(graph), total=graph.shape[0]):


  0%|          | 0/311696 [00:00<?, ?it/s]

In [17]:
save_file = "../data/G-LF-WikiSeeAlsoTitles-300K/kg.txt"
with open(save_file, "w") as file:
    file.write(kg_str)