# Convert Data to C4 Dataset

## Check dependencies

In [1]:
from pathlib import Path
import hashlib
from gensim.models import KeyedVectors
import datasets

The following files are generated by [parsec/20.32](https://github.com/seantyh/parsec/blob/main/etc/20.32-tencent-StandfordNLP-small.ipynb); the code after each name is the first six hex digits of the file's SHA-1 digest:
```
tencent_nn_embeddings-s.txt.gz b237d2
tencent_vocab_nn_compounds_small.txt a29857
```

In [2]:
vec_path = Path("../data/tencent_nn_embeddings-s.txt.gz")
nn_path = Path("../data/tencent_vocab_nn_compounds_small.txt")
# Expected SHA-1 prefixes (first 6 hex digits) of the two input files.
expected_sha1 = {vec_path: "b237d2", nn_path: "a29857"}
for path_x, expected_x in expected_sha1.items():
    h = hashlib.sha1()
    h.update(path_x.read_bytes())
    digest_x = h.hexdigest()[:6]
    print("{} {}".format(path_x.name, digest_x))
    # Fail early instead of silently building the dataset from the wrong inputs.
    assert digest_x == expected_x, (
        "{}: SHA-1 prefix {} does not match expected {}".format(
            path_x.name, digest_x, expected_x))

tencent_nn_embeddings-s.txt.gz b237d2
tencent_vocab_nn_compounds_small.txt a29857


## Convert to Datasets

In [3]:
# Load the embedding table at vec_path (word2vec format) into a gensim KeyedVectors object.
kv = KeyedVectors.load_word2vec_format(vec_path)

In [4]:
from tqdm.auto import tqdm
from collections import Counter
import pandas as pd
import numpy as np
# Tally how many vocabulary keys there are of each character length.
Counter(map(len, kv.index_to_key))

Counter({1: 10818, 2: 27901, 4: 192882})

In [5]:
# Headerless CSV: column 1 is the compound (nn), columns 2-3 are its two constituents.
compounds = pd.read_csv(
    nn_path,
    header=None,
    names=["nn", "c1", "c2"],
)

## Build C4 Dataset

In [6]:
# Look up the raw (norm=False) vector of every compound and of its two
# constituents, one row per line of `compounds`, in table order.
comp_rows, c1_rows, c2_rows = [], [], []
for entry in tqdm(compounds.itertuples(), total=compounds.shape[0]):
    comp_rows.append(kv.get_vector(entry.nn, norm=False))
    c1_rows.append(kv.get_vector(entry.c1, norm=False))
    c2_rows.append(kv.get_vector(entry.c2, norm=False))
# Stack each list of vectors into a 2-D array with one row per compound.
comp_mat, c1_mat, c2_mat = (
    np.vstack(rows) for rows in (comp_rows, c1_rows, c2_rows)
)


  0%|          | 0/192882 [00:00<?, ?it/s]

In [7]:
# Sanity check: (number of compounds, embedding dimension) of the stacked compound matrix.
comp_mat.shape

(192882, 200)

In [8]:
N = comp_mat.shape[0]
# Shuffle the row positions with a fixed seed so the split is reproducible.
rng = np.random.RandomState(123)
split_idxs = np.arange(N)
rng.shuffle(split_idxs)
# Cut the shuffled positions at 80% and 90%: train / val / test.
train_end, val_end = int(N*.8), int(N*.9)
train_idxs = split_idxs[:train_end]
val_idxs = split_idxs[train_end:val_end]
test_idxs = split_idxs[val_end:]
# The three splits must not share any row.
train_set, val_set, test_set = map(set, (train_idxs, val_idxs, test_idxs))
assert train_set.isdisjoint(val_set)
assert train_set.isdisjoint(test_set)
assert val_set.isdisjoint(test_set)
print("train/val/test: ", len(train_idxs), len(val_idxs), len(test_idxs))

train/val/test:  154305 19288 19289


In [9]:
# Constituent features: c1 and c2 vectors side by side, one row per compound.
consts = np.hstack((c1_mat, c2_mat))
# Every quantity is sliced with the same three index arrays, in train/val/test order.
split_order = (train_idxs, val_idxs, test_idxs)
train_comps, val_comps, test_comps = (
    comp_mat[idx, :] for idx in split_order)
train_consts, val_consts, test_consts = (
    consts[idx, :] for idx in split_order)
# Matching text columns (compound and both constituents) for each split.
train_comps_text, val_comps_text, test_comps_text = (
    compounds.nn[idx] for idx in split_order)
train_c1_text, val_c1_text, test_c1_text = (
    compounds.c1[idx] for idx in split_order)
train_c2_text, val_c2_text, test_c2_text = (
    compounds.c2[idx] for idx in split_order)

In [10]:
# Report array shapes and text-column lengths for each split, one line per field.
split_report = {
    "comps train/val/test: ": (train_comps.shape, val_comps.shape, test_comps.shape),
    "consts train/val/test: ": (train_consts.shape, val_consts.shape, test_consts.shape),
    "comps_text train/val/test: ": (len(train_comps_text), len(val_comps_text), len(test_comps_text)),
    "c1_text train/val/test: ": (len(train_c1_text), len(val_c1_text), len(test_c1_text)),
    "c2_text train/val/test: ": (len(train_c2_text), len(val_c2_text), len(test_c2_text)),
}
for heading, sizes in split_report.items():
    print(heading, *sizes)

comps train/val/test:  (154305, 200) (19288, 200) (19289, 200)
consts train/val/test:  (154305, 400) (19288, 400) (19289, 400)
comps_text train/val/test:  154305 19288 19289
c1_text train/val/test:  154305 19288 19289
c2_text train/val/test:  154305 19288 19289


### Build dataset dict

In [11]:
# Field names shared by every split, in the order they are stored.
field_names = ("comps", "consts", "comps_text", "c1_text", "c2_text")
# Per-split values, listed in the same order as field_names.
split_values = {
    "train": (train_comps, train_consts, train_comps_text,
              train_c1_text, train_c2_text),
    "val": (val_comps, val_consts, val_comps_text,
            val_c1_text, val_c2_text),
    "test": (test_comps, test_consts, test_comps_text,
             test_c1_text, test_c2_text),
}
# Nested dict: split name -> field name -> array / text column.
c4_dataset = {
    split_name: dict(zip(field_names, values))
    for split_name, values in split_values.items()
}

## Write to file

```
tencent-compound-c4.pkl 0eb1e3
```

In [12]:
import pickle
c4_path = "../data/tencent-compound-c4.pkl"
c4_file = Path(c4_path)
# Serialise the nested dataset dict to disk.
with c4_file.open("wb") as fout:
    pickle.dump(c4_dataset, fout)
# Print the first six hex digits of the written file's SHA-1 as a fingerprint.
c4_sha1 = hashlib.sha1(c4_file.read_bytes())
print(c4_file.name, c4_sha1.hexdigest()[:6])

tencent-compound-c4.pkl 0eb1e3
