In [1]:
import sys
import os

from pathlib import Path

# Root of the demo data, given relative to the current working directory.
data_path = Path("../data")
# Publish the same location through the environment so that code outside
# this cell can read it (presumably the helper modules imported below --
# TODO confirm which one consumes it).
os.environ["VECTORIAN_DEMO_DATA_PATH"] = str(data_path)

# Make the local "code" directory importable; the nbutils and gold imports
# that follow come after this line (presumably they live there -- TODO confirm).
sys.path.append("code")
import nbutils
import gold
import json

from vectorian.importers import TextImporter
from vectorian.embeddings import SpacyVectorEmbedding
from vectorian.corpus import Corpus
from tqdm import tqdm

# Pipeline handed to the TextImporter; make_nlp is called with None here,
# i.e. without one of the model names listed below.
nlp = nbutils.make_nlp(None)

# Gold-standard data loaded from raw_data/gold.json under data_path.
gold_data = gold.load_data(data_path / "raw_data" / "gold.json")

# Names of the additional models to wrap as embeddings. Leaving this list
# empty yields an empty sbert_embeddings mapping.
sbert_model_names = [
    "en_paraphrase_distilroberta_base_v1",
    "msmarco-distilbert-base-v4",
]

# Model name -> SpacyVectorEmbedding built on a pipeline created for that
# model. NOTE(review): 768 is presumably the vector dimension of these
# models -- confirm against SpacyVectorEmbedding's signature.
sbert_embeddings = {
    model_name: SpacyVectorEmbedding(nbutils.make_nlp(model_name), 768)
    for model_name in sbert_model_names
}

def prepare_docs():
    """Import the referenced gold nodes as documents and add them to the corpus.

    A ``TextImporter`` is created on the module-level ``nlp`` pipeline; when
    ``sbert_embeddings`` is non-empty its values are passed along as the
    importer's ``embeddings``. The corpus is opened at
    ``data_path / "processed_data" / "corpus"``. Every node of ``gold_data``
    whose in-degree is at least 1 is imported from its ``"context"`` text and
    added to that corpus; nodes with a smaller in-degree are skipped.

    Returns nothing. The corpus object is local to this function.
    NOTE(review): presumably ``Corpus`` persists added documents under the
    given path -- confirm against the vectorian documentation.
    """
    # Only pass ``embeddings`` when there is at least one configured, so the
    # importer otherwise falls back to its own default.
    importer_kwargs = {}
    if sbert_embeddings:
        importer_kwargs["embeddings"] = list(sbert_embeddings.values())
    importer = TextImporter(nlp, **importer_kwargs)

    target_corpus = Corpus(data_path / "processed_data" / "corpus")

    degree_pairs = gold_data.in_degree(gold_data.nodes)
    for node_key, in_degree in tqdm(degree_pairs, desc="Importing"):
        if in_degree < 1:
            # No incoming edge: this node is not imported.
            continue

        record = gold_data.nodes[node_key]

        imported = importer(
            record["context"],
            title=record["source"]["book"],
            author=record["source"]["author"],
            # Keep the gold node id on the document's metadata.
            extra_metadata={'gold_id': record["id"]},
            show_progress=False,
        )

        target_corpus.add_doc(imported)

# Run the import; the tqdm bar labelled "Importing" reports progress.
prepare_docs()

Importing: 100%|███████████████████████████████████████████████████████████████████| 120/120 [04:42<00:00,  2.35s/it]
