In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
from sentence_transformers import SentenceTransformer, util

In [2]:
dir_root = "Data/"

In [3]:
with open(f"{dir_root}qaa.json","r") as f:
    data = json.load(f)
len(data)

18004

In [4]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

## Embedding the sentences

In [9]:
sentences = []
title = []
cord_uid = []
pmc_json_files = []
arxiv_id = []
for item in tqdm(data):
    tmp = sent_tokenize(item["answare"])
    ls = len(tmp)
    sentences += tmp
    title += [item["title"]] * ls
    cord_uid += [item["cord_uid"]] * ls
    pmc_json_files += [item["pmc_json_files"]] * ls
    arxiv_id += [item["arxiv_id"]] * ls
sentences = pd.DataFrame({
    "sentence" : sentences,
    "title" : title,
    "cord_uid" : cord_uid,
    "pmc_json_files" : pmc_json_files,
    "arxiv_id" : arxiv_id,
})
sentences["sentences"] = sentences["sentence"].apply(lambda x: x.strip())
sentences["wc"] = sentences["sentence"].apply(lambda x: len(x.split(" ")))
sentences = sentences[(sentences["wc"] >= 3)]
sentences.reset_index()
sentences

100% 18004/18004 [00:01<00:00, 12599.64it/s]


Unnamed: 0,sentence,title,cord_uid,pmc_json_files,arxiv_id,sentences,wc
0,A prevalent symptom of Parkinson’s disease (PD...,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,A prevalent symptom of Parkinson’s disease (PD...,124
1,The dataset consists of 1812 videos from 604 (...,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,The dataset consists of 1812 videos from 604 (...,156
2,The participants without PD were recruited thr...,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,The participants without PD were recruited thr...,13
3,These participants self-identified as not havi...,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,These participants self-identified as not havi...,7
4,The participants with PD were recruited by ema...,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,The participants with PD were recruited by ema...,28
...,...,...,...,...,...,...,...
58708,The very high case loads observed in some coun...,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,The very high case loads observed in some coun...,39
58709,There is no need to attribute the highest case...,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,There is no need to attribute the highest case...,14
58710,"New York City, where confirmed cases substanti...",On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,"New York City, where confirmed cases substanti...",15
58711,"Brendan K. Beare: Methodology, Formal Analysis...",On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,"Brendan K. Beare: Methodology, Formal Analysis...",10


In [10]:
sentences = sentences.drop(columns=["sentence","wc"])
sentences.to_json(f"{dir_root}sentences.json",orient="records")
sentences

Unnamed: 0,title,cord_uid,pmc_json_files,arxiv_id,sentences
0,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,A prevalent symptom of Parkinson’s disease (PD...
1,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,The dataset consists of 1812 videos from 604 (...
2,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,The participants without PD were recruited thr...
3,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,These participants self-identified as not havi...
4,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,The participants with PD were recruited by ema...
...,...,...,...,...,...
58708,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,The very high case loads observed in some coun...
58709,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,There is no need to attribute the highest case...
58710,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,"New York City, where confirmed cases substanti..."
58711,On the emergence of a power law in the distrib...,hx8bnrsi,2004.12772,2004.12772,"Brendan K. Beare: Methodology, Formal Analysis..."


In [11]:
embbeded_vectors = model.encode(sentences["sentences"].values, show_progress_bar=True)
len(embbeded_vectors)

Batches:   0%|          | 0/1777 [00:00<?, ?it/s]

56861

In [12]:
with open(f"{dir_root}sentences.emb", "wb") as f:
    np.save(f, embbeded_vectors)

## Embeding the questions

In [19]:
indexes = []
questions = []

for i in tqdm(range(len(data))):
    for q in data[i]["questions"]:
        indexes.append(i)
        questions.append(q)

len(indexes), max(indexes), len(questions)

100% 18004/18004 [00:00<00:00, 834098.23it/s]


(58290, 18003, 58290)

In [20]:
with open(f"{dir_root}qaa.idx", 'w') as f:
    for item in tqdm(indexes):
        f.write(f"{str(item)}\n")

100% 58290/58290 [00:00<00:00, 1847352.20it/s]


In [21]:
embbeded_vectors = model.encode(questions, show_progress_bar=True)
len(embbeded_vectors)

Batches:   0%|          | 0/1822 [00:00<?, ?it/s]

58290

In [24]:
with open(f"{dir_root}qaa.emb", "wb") as f:
    np.save(f, embbeded_vectors)