In [1]:
import spacy
from spacy.tokens import DocBin

import pandas as pd
import pickle
import json

from tqdm import tqdm

# Blank English pipeline: spacy.blank() loads no pretrained model or trained components.
# It is called on each sentence further down to turn the raw text into Doc objects.
nlp = spacy.blank("en")

In [2]:
# Per-token annotations. The CSV is read with explicit column names; `tag_`
# holds the string tag (ENT / REL / OTHERS in the displayed rows).
label_columns = ["sentence_id", "token", "pos", "seq_id", "tag_"]
labels = pd.read_csv("data/labels.csv", names=label_columns)

# Integer-encode the string tags; codes follow order of first appearance.
tag_codes = pd.factorize(labels["tag_"])[0]
labels["tag"] = tag_codes
labels

Unnamed: 0,sentence_id,token,pos,seq_id,tag_,tag
0,0,G,NN,0,ENT,0
1,0,quartet,NN,1,ENT,0
2,0,oligonucleotides,VBZ,2,OTHERS,1
3,0,that,WDT,3,OTHERS,1
4,0,target,VBP,4,REL,2
...,...,...,...,...,...,...
1224468,44226,about,RB,8,ENT,0
1224469,44226,2,CD,9,ENT,0
1224470,44226,%,NN,10,ENT,0
1224471,44226,of,IN,11,REL,2


In [3]:
# Token-to-token relations. Rows whose type is "NIL" are dropped, then the
# remaining relation types are integer-encoded into `label`.
#
# Built as a single chain so that `label` is added to a fresh frame: assigning
# a column onto the result of a `.loc[...]` filter triggers pandas'
# SettingWithCopyWarning because the filtered frame may be a view of the original.
rels = (
    pd.read_csv("data/rels.csv", names=["sentence_id", "source_id", "target_id", "edge", "type"])
    .loc[lambda df: df["type"] != "NIL"]
    .assign(label=lambda df: pd.factorize(df["type"])[0])
)
rels

Unnamed: 0,sentence_id,source_id,target_id,edge,type,label
0,0,1,4,"(0, 1, 4)",ENTREL,0
1,0,4,5,"(0, 4, 5)",RELENT,1
44,1,15,16,"(1, 15, 16)",ENTREL,0
45,1,16,17,"(1, 16, 17)",RELENT,1
46,1,13,16,"(1, 13, 16)",ENTREL,0
...,...,...,...,...,...,...
6066024,44226,3,4,"(44226, 3, 4)",RELENT,1
6066025,44226,6,7,"(44226, 6, 7)",ENTREL,0
6066026,44226,7,8,"(44226, 7, 8)",RELENT,1
6066027,44226,10,11,"(44226, 10, 11)",ENTREL,0


In [4]:
# Manifest of fact files to load. Columns are assigned explicitly; `name` is
# the file name that is later joined onto "data/facts/".
manifest_columns = ["name", "time", "model"]
filelist = pd.read_csv("data/select-nlp.csv", names=manifest_columns)
filelist

Unnamed: 0,name,time,model
0,8168603_extracted_facts.txt,1660084952,en_core_web_sm
1,6514494_extracted_facts.txt,1660085053,en_core_web_sm
2,7071205_extracted_facts.txt,1660085216,en_core_web_sm
3,6610906_extracted_facts.txt,1660085605,en_core_web_sm
4,8247050_extracted_facts.txt,1660085853,en_core_web_sm
...,...,...,...
4200,8736473_extracted_facts.txt,1672930779,en_core_web_trf
4201,5580778_extracted_facts.txt,1672930887,en_core_web_trf
4202,4781246_extracted_facts.txt,1672930915,en_core_web_trf
4203,5336678_extracted_facts.txt,1672930970,en_core_web_trf


In [5]:
# Collect, in manifest order, every sentence that has more than one extracted fact.
sentences = []
for item in tqdm(filelist["name"].tolist()):
    # Explicit UTF-8: the sentences contain non-ASCII characters (e.g. "α"), so
    # relying on the platform default encoding is not portable. The `with`
    # block closes the file on exit, so no separate close() call is needed.
    with open("data/facts/" + item, "r", encoding="utf-8") as f:
        patentInfo = json.load(f)

    for key, lines in patentInfo.items():
        # "PATENT NUMBER" is the one key that does not hold a list of sentence records.
        if key == "PATENT NUMBER":
            continue
        sentences.extend(line["sentence"] for line in lines if len(line["facts"]) > 1)

len(sentences), sentences[:5]

100%|██████████| 4205/4205 [00:01<00:00, 2681.09it/s]


(44227,
 ['G quartet oligonucleotides that target hypoxia inducible factor 1 α ',
  'The present invention concerns particular G quartet oligonucleotides that are employed for the treatment and prevention of cancer ',
  'In specific cases, the G quartet oligonucleotides inhibit HIF 1α ',
  'The present invention generally relates at least to the fields of molecular biology, cell biology, and medicine, in particular cancer ',
  'Specifically, the invention concerns methods and compositions for the treatment and prevention of cancer '])

In [35]:
# How many leading sentences to convert, and how many of those go to the
# training set (the remainder goes to the test set).
N_SENTENCES = 1000
N_TRAIN = 800

n_docs = min(N_SENTENCES, len(sentences))

# Index the annotations once up front instead of boolean-masking the full
# `labels` / `rels` frames on every iteration of the loop below.
labels_head = labels.loc[labels["sentence_id"] < n_docs]
tag_counts = labels_head.groupby("sentence_id").size().to_dict()

# (sentence_id, target token index) -> [(source token index, relation type), ...]
# kept in original row order, so the last relation listed for a token still wins.
rels_head = rels.loc[rels["sentence_id"] < n_docs]
incoming = {}
for sid, src, dep, tgt in zip(
    rels_head["sentence_id"].tolist(),
    rels_head["source_id"].tolist(),
    rels_head["type"].tolist(),
    rels_head["target_id"].tolist(),
):
    incoming.setdefault((sid, tgt), []).append((src, dep))

doc_bin = DocBin()
test_bin = DocBin()
for j in tqdm(range(n_docs)):
    doc = nlp(sentences[j])

    # The relation indices are only meaningful when the tokenizer yields exactly
    # one token per annotated row; otherwise skip the sentence and report its index.
    if len(doc) != tag_counts.get(j, 0):
        print(j)
        continue

    for k in range(len(doc)):
        token = doc[k]
        # Default: the token is its own head, labelled ROOT.
        token.head = token
        token.dep_ = "ROOT"
        # Any relation pointing at this token overrides the default.
        for src, dep in incoming.get((j, k), ()):
            token.head = doc[src]
            token.dep_ = dep

    if j < N_TRAIN:
        doc_bin.add(doc)
    else:
        test_bin.add(doc)

doc_bin.to_disk("train.spacy")
test_bin.to_disk("test.spacy")

100%|██████████| 1000/1000 [00:02<00:00, 338.36it/s]
