In [1]:
import scispacy
import spacy
import pickle

In [2]:
nlp = spacy.load("en_core_sci_sm")

In [3]:
text = """Myeloid derived suppressor cells (MDSC) are immature 
myeloid cells with immunosuppressive activity. 
They accumulate in tumor-bearing mice and humans 
with different types of cancer, including hepatocellular 
carcinoma (HCC)."""

In [4]:
doc = nlp(text)

In [5]:
for ent in doc.ents:
    print(ent.text,ent.start_char,ent.end_char,ent.label_)

Myeloid 0 7 ENTITY
suppressor cells 16 32 ENTITY
MDSC 34 38 ENTITY
immature 44 52 ENTITY
immunosuppressive activity 73 99 ENTITY
accumulate 107 117 ENTITY
tumor-bearing mice 121 139 ENTITY
humans 144 150 ENTITY
cancer 176 182 ENTITY
hepatocellular 
carcinoma 194 219 ENTITY
HCC 221 224 ENTITY


In [54]:
from spacy import displacy
#displacy.render(next(doc.sents), style='dep', jupyter=True) #dependancy image

In [7]:
displacy.render(next(doc.sents), jupyter=True, style='ent')

In [8]:
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [9]:
#add single entity to model
nlp = spacy.blank('en')
ner = nlp.create_pipe("ner")

In [10]:
LABEL = "ANIMAL"
TRAIN_DATA = [
    (
        "Horses are too tall and they pretend to care about your feelings",
        {"entities": [(0, 6, LABEL)]},
    ),
    ("Do they bite?", {"entities": []}),
    (
        "horses are too tall and they pretend to care about your feelings",
        {"entities": [(0, 6, LABEL)]},
    ),
    ("horses pretend to care about your feelings", {"entities": [(0, 6, LABEL)]}),
    (
        "they pretend to care about your feelings, those horses",
        {"entities": [(48, 54, LABEL)]},
    ),
    ("horses?", {"entities": [(0, 6, LABEL)]}),
]

In [11]:
nlp.add_pipe(ner)
ner.add_label(LABEL)
ner.add_label("VEGETABLE")
optimizer = nlp.begin_training()

In [14]:
for itn in range(2):
    random.shuffle(TRAIN_DATA)
    losses = {}
    batches = minibatch(TRAIN_DATA, size=2)
    for batch in batches:
        texts = [text for text,annotations in batch]
        annotations = [annotations for text,annotations in batch]
        nlp.update(texts, annotations, losses = losses)
        print(losses)

{'ner': 1.0999109639778838e-10}
{'ner': 1.1280088603617894e-10}
{'ner': 2.6444271714444655e-08}
{'ner': 1.6084335576751693e-08}
{'ner': 1.6306131142039298e-08}
{'ner': 1.6307120743688314e-08}


In [15]:
TEST_DATA = ['This horses are too good to be true'] 

In [16]:
for doc in nlp.pipe(TEST_DATA):
    print(doc.text)
    for ent in doc.ents:
        print(ent.text,ent.start_char,ent.end_char,ent.label_)

This horses are too good to be true
horses 5 11 ANIMAL


In [17]:
import json
import logging
import sys
import pandas as pd
#convert trial data to spacy format
df = pd.read_fwf('/Users/patsnap/Desktop/Neo4J_and_other_codes/SPacy_trial_data/8184428/train.txt',header =None)
df.columns = ['Words']
df['Words'] = df['Words'].str.split(" ")
df = pd.DataFrame(df.Words.values.tolist()).add_prefix('code_')
df = df.drop(['code_2','code_3'], axis=1)

In [18]:
import string
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [19]:
df.columns = ['words','labels']
punctuations = [x for x in string.punctuation]

for index, row in df.iterrows():
    if row['words'] in punctuations:
        row['labels'] == 'O'

df.head()

Unnamed: 0,words,labels
0,variable,O
1,temperature,O
2,electron,B-CMT
3,paramagnetic,I-CMT
4,resonance,I-CMT


In [20]:
df_check_O = df[df.labels == 'O']
df_check_O.head(2)

Unnamed: 0,words,labels
0,variable,O
1,temperature,O


In [21]:
df.labels = df.labels.str.replace("B-MAT","Inorganic material")
df.labels = df.labels.str.replace("I-MAT","Inorganic material")
df.labels = df.labels.str.replace("B-MA","Inorganic material")
df.labels = df.labels.str.replace("I-MA","Inorganic material")
df.labels = df.labels.str.replace("B-M","Inorganic material")
df.labels = df.labels.str.replace("I-M","Inorganic material")
df.labels = df.labels.str.replace("I-SPL","Symmetry Label")
df.labels = df.labels.str.replace("B-SPL","Symmetry Label")
df.labels = df.labels.str.replace("I-DSC","Sample descriptor")
df.labels = df.labels.str.replace("B-DSC","Sample descriptor")
df.labels = df.labels.str.replace("I-DS","Sample descriptor")
df.labels = df.labels.str.replace("B-DS","Sample descriptor")
df.labels = df.labels.str.replace("I-D","Sample descriptor")
df.labels = df.labels.str.replace("B-D","Sample descriptor")
df.labels = df.labels.str.replace("I-PRO","Material property")
df.labels = df.labels.str.replace("B-PRO","Material property")
df.labels = df.labels.str.replace("I-PR","Material property")
df.labels = df.labels.str.replace("B-PR","Material property")
df.labels = df.labels.str.replace("I-P","Material property")
df.labels = df.labels.str.replace("B-P","Material property")
df.labels = df.labels.str.replace("I-APL","Material application")
df.labels = df.labels.str.replace("B-APL","Material application")
df.labels = df.labels.str.replace("I-AP","Material application")
df.labels = df.labels.str.replace("B-AP","Material application")
df.labels = df.labels.str.replace("I-A","Material application")
df.labels = df.labels.str.replace("B-A","Material application")
df.labels = df.labels.str.replace("I-SMT","Synthesis method")
df.labels = df.labels.str.replace("B-SMT","Synthesis method")
df.labels = df.labels.str.replace("I-SM","Synthesis method")
df.labels = df.labels.str.replace("B-SM","Synthesis method")
df.labels = df.labels.str.replace("I-S","Synthesis method")
df.labels = df.labels.str.replace("B-S","Synthesis method")
df.labels = df.labels.str.replace("I-CMT","Characterization method")
df.labels = df.labels.str.replace("B-CMT","Characterization method")
df.labels = df.labels.str.replace("I-CM","Characterization method")
df.labels = df.labels.str.replace("B-CM","Characterization method")
df.labels = df.labels.str.replace("I-C","Characterization method")
df.labels = df.labels.str.replace("B-C","Characterization method")
df.to_csv('/Users/patsnap/Desktop/Neo4J_and_other_codes/SPacy_trial_data/8184428/train.tsv', sep = '\t',header=False, index =False)

In [22]:
label_list = df.labels.tolist()
label_list =list(set(label_list))
label_list

['O',
 '',
 'Sample descriptor',
 'Material property',
 'I',
 None,
 'Material application',
 'Synthesis method',
 '1/2',
 'B',
 'Symmetry Label',
 'Characterization method',
 'B-',
 'I-',
 'Inorganic material']

In [23]:
unknown_label_list = ['O','B-','I-','1/2','B','I','']

In [24]:
def tsv_to_json_format(input_path,output_path,unknown_label_list):
    try:
        f=open(input_path,'r') # input file
        fp=open(output_path, 'w') # output file
        data_dict={}
        annotations =[]
        label_dict={}
        s=''
        start=0
        for line in f:
            if line[0:len(line)-1]!='.\tO':
                word,entity=line.split('\t')
                s+=word+" "
                entity=entity[:len(entity)-1]
                if entity not in unknown_label_list:
                    if len(entity) != 1:
                        d={}
                        d['text']=word
                        d['start']=start
                        d['end']=start+len(word)-1  
                        try:
                            label_dict[entity].append(d)
                        except:
                            label_dict[entity]=[]
                            label_dict[entity].append(d) 
                start+=len(word)+1
            else:
                data_dict['content']=s
                s=''
                label_list=[]
                for ents in list(label_dict.keys()):
                    for i in range(len(label_dict[ents])):
                        if(label_dict[ents][i]['text']!=''):
                            l=[ents,label_dict[ents][i]]
                            for j in range(i+1,len(label_dict[ents])): 
                                if(label_dict[ents][i]['text']==label_dict[ents][j]['text']):  
                                    di={}
                                    di['start']=label_dict[ents][j]['start']
                                    di['end']=label_dict[ents][j]['end']
                                    di['text']=label_dict[ents][i]['text']
                                    l.append(di)
                                    label_dict[ents][j]['text']=''
                            label_list.append(l)                          
                            
                for entities in label_list:
                    label={}
                    label['label']=[entities[0]]
                    label['points']=entities[1:]
                    annotations.append(label)
                data_dict['annotation']=annotations
                annotations=[]
                json.dump(data_dict, fp)
                fp.write('\n')
                data_dict={}
                start=0
                label_dict={}
    except Exception as e:
        logging.exception("Unable to process file" + "\n" + "error = " + str(e))
        return None

In [25]:
tsv_to_json_format("/Users/patsnap/Desktop/Neo4J_and_other_codes/SPacy_trial_data/8184428/train.tsv",'/Users/patsnap/Desktop/Neo4J_and_other_codes/SPacy_trial_data/8184428/train.json',unknown_label_list)

In [26]:
training_data = []
lines=[]
with open("/Users/patsnap/Desktop/Neo4J_and_other_codes/SPacy_trial_data/8184428/train.json", 'r') as f:
    lines = f.readlines()
for line in lines:
    data = json.loads(line)
    text = data['content']
    entities = []
    for annotation in data['annotation']:
        point = annotation['points'][0]
        labels = annotation['label']
        if not isinstance(labels, list):
            labels = [labels]

        for label in labels:
            entities.append((point['start'], point['end'] + 1 ,label))


    training_data.append((text, {"entities" : entities}))

In [27]:
# Training additional entity types using spaCy
from __future__ import unicode_literals, print_function
import pickle
import plac
import random
from pathlib import Path
import spacy
from spacy.util import minibatch, compounding

In [28]:
LABEL = df['labels'].tolist()
LABEL = list(set(LABEL))
LABEL = [x for x in LABEL if x] 
LABEL = [x for x in LABEL if x not in unknown_label_list]

In [29]:
LABEL

['Sample descriptor',
 'Material property',
 'Material application',
 'Synthesis method',
 'Symmetry Label',
 'Characterization method',
 'Inorganic material']

In [30]:
TRAIN_DATA = training_data

In [31]:
#adding new entities to blank model
nlp = spacy.blank('en')
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
for i in LABEL:
    ner.add_label(i)
optimizer = nlp.begin_training()

In [32]:
# #SPacy Training syntax - Code 1
# other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
# with nlp.disable_pipes(*other_pipes):  # only train NER
#     for itn in range(500):
#         random.shuffle(TRAIN_DATA)
#         losses = {}
#         batches = minibatch(TRAIN_DATA, size=2)
#         for batch in batches:
#             texts = [text for text,annotations in batch]
#             annotations = [annotations for text,annotations in batch]
#             nlp.update(texts, annotations, losses = losses)
#             print(losses)

In [33]:
#SPacy Training syntax - Code 2
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):  # only train NER
    for itn in range(2):
        random.shuffle(TRAIN_DATA)
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                       losses=losses)
        print('Losses', losses)
#number of iterations - try 100

Losses {'ner': 36906.452446693314}
Losses {'ner': 27985.315468629782}


In [34]:
text = 'The biological application of photoactivatable ruthenium anticancer prodrugs is limited by the need to use poorly penetrating high-energy visible light for their activation. Upconverting nanoparticles (UCNPs), which produce high-energy light under near-infrared (NIR) excitation, can solve this issue, provided that they form stable, water (H2O)-dispersible nanoconjugates with the prodrug and that there is efficient energy transfer from the UCNP to the ruthenium complex. Herein, we report on the synthesis and photochemistry of the ruthenium(II) polypyridyl complex [Ru(bpy)2(3H)](PF6)2 ([1](PF6)2), where bpy = 2,2-bipyridine and 3H is a photocleavable bis(thioether) ligand modified with two phosphonate moieties. This ligand was coordinated to the ruthenium center through its thioether groups and could be dissociated under blue-light irradiation. Complex [1](PF6)2 was bound to the surface of NaYF4:Yb3+,Tm3+@NaYF4:Nd3+@NaYF4 core–shell–shell (CSS-)UCNPs through its bis(phosphonate) group, thereby creating a H2O-dispersible, thermally stable nanoconjugate (CSS-UCNP@[1]). Conjugation to the nanoparticle surface was found to be most efficient in neutral to slightly basic conditions, resulting in up to 2.4 × 103 RuII ions per UCNP. The incorporation of a neodymium-doped shell layer allowed for the generation of blue light using low-energy, deep-penetrating light (796 nm). This wavelength prevents the undesired heating seen with conventional UCNPs activated at 980 nm. Irradiation of CSS-UCNP@[1] with NIR light led to activation of the ruthenium complex [1](PF6)2. Although only one of the two thioether groups was dissociated under irradiation at 50 W·cm–2, we provide the first demonstration of the photoactivation of a ruthenium thioether complex using 796 nm irradiation of a H2O-dispersible nanoconjugate.' 

In [35]:
 from spacy import displacy
# doc2 = nlp(text_2)
# displacy.render(doc2, style="ent")

In [36]:
# #update existing scispacy model with matscholar data - (Code 3)
# nlp = spacy.load("en_core_sci_sm")
# if "ner" not in nlp.pipe_names:
#     ner = nlp.create_pipe("ner")
#     nlp.add_pipe(ner, last=True)
# else:
#     ner = nlp.get_pipe("ner")
# for i in LABEL:
#     ner.add_label(i)

In [37]:
# pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
# other_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]
# with nlp.disable_pipes(*other_pipes):
#     nlp.begin_training()
#     for itn in range(500):
#         random.shuffle(TRAIN_DATA)
#         losses = {}
#         batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
#         for batch in batches:
#             texts, annotations = zip(*batch)
#             nlp.update(texts, annotations, drop=0.35,
#                        losses=losses)
#         print('Losses', losses)

In [38]:
nlp.to_disk('/Users/patsnap/Desktop/Neo4J_and_other_codes/Spacy_codes/spacy_models')

In [39]:
doc2 = nlp(text)
displacy.render(doc2, style="ent")

In [40]:
df2 = pd.read_fwf('/Users/patsnap/Desktop/Neo4J_and_other_codes/SPacy_trial_data/8184428/dev.txt',header =None)
df2.columns = ['Words', 'unnecessary']
df2 = df2.drop(['unnecessary'], axis = 1)
df2 = df2.fillna("")

In [41]:
df2['Words'] = df2['Words'].str.split(" ")
df2.head()

Unnamed: 0,Words
0,"[hydrogen, B-CMT]"
1,"[diffusion, I-CMT]"
2,"[studies, I-CMT]"
3,"[in, O]"
4,"[Zr, B-MAT]"


In [42]:
df2.isnull().sum().sum()
df2 = pd.DataFrame(df2.Words.values.tolist()).add_prefix('code_')
df2.head()

Unnamed: 0,code_0,code_1
0,hydrogen,B-CMT
1,diffusion,I-CMT
2,studies,I-CMT
3,in,O
4,Zr,B-MAT


In [43]:
#import dev set
df2.columns = ['words','labels']
punctuations = [x for x in string.punctuation]

for index, row in df2.iterrows():
    if row['words'] in punctuations:
        row['labels'] == 'O'

In [44]:
df2.labels = df2.labels.str.replace("B-MAT","Inorganic material")
df2.labels = df2.labels.str.replace("I-MAT","Inorganic material")
df2.labels = df2.labels.str.replace("B-MA","Inorganic material")
df2.labels = df2.labels.str.replace("I-MA","Inorganic material")
df2.labels = df2.labels.str.replace("B-M","Inorganic material")
df2.labels = df2.labels.str.replace("I-M","Inorganic material")
df2.labels = df2.labels.str.replace("I-SPL","Symmetry Label")
df2.labels = df2.labels.str.replace("B-SPL","Symmetry Label")
df2.labels = df2.labels.str.replace("I-DSC","Sample descriptor")
df2.labels = df2.labels.str.replace("B-DSC","Sample descriptor")
df2.labels = df2.labels.str.replace("I-DS","Sample descriptor")
df2.labels = df2.labels.str.replace("B-DS","Sample descriptor")
df2.labels = df2.labels.str.replace("I-D","Sample descriptor")
df2.labels = df2.labels.str.replace("B-D","Sample descriptor")
df2.labels = df2.labels.str.replace("I-PRO","Material property")
df2.labels = df2.labels.str.replace("B-PRO","Material property")
df2.labels = df2.labels.str.replace("I-PR","Material property")
df2.labels = df2.labels.str.replace("B-PR","Material property")
df2.labels = df2.labels.str.replace("I-P","Material property")
df2.labels = df2.labels.str.replace("B-P","Material property")
df2.labels = df2.labels.str.replace("I-APL","Material application")
df2.labels = df2.labels.str.replace("B-APL","Material application")
df2.labels = df2.labels.str.replace("I-AP","Material application")
df2.labels = df2.labels.str.replace("B-AP","Material application")
df2.labels = df2.labels.str.replace("I-A","Material application")
df2.labels = df2.labels.str.replace("B-A","Material application")
df2.labels = df2.labels.str.replace("I-SMT","Synthesis method")
df2.labels = df2.labels.str.replace("B-SMT","Synthesis method")
df2.labels = df2.labels.str.replace("I-SM","Synthesis method")
df2.labels = df2.labels.str.replace("B-SM","Synthesis method")
df2.labels = df2.labels.str.replace("I-S","Synthesis method")
df2.labels = df2.labels.str.replace("B-S","Synthesis method")
df2.labels = df2.labels.str.replace("I-CMT","Characterization method")
df2.labels = df2.labels.str.replace("B-CMT","Characterization method")
df2.labels = df2.labels.str.replace("I-CM","Characterization method")
df2.labels = df2.labels.str.replace("B-CM","Characterization method")
df2.labels = df2.labels.str.replace("I-C","Characterization method")
df2.labels = df2.labels.str.replace("B-C","Characterization method")
df2.to_csv('/Users/patsnap/Desktop/Neo4J_and_other_codes/SPacy_trial_data/8184428/dev.tsv', sep = '\t',header=False, index =False)

In [45]:
label_list = df.labels.tolist()
label_list =list(set(label_list))
label_list

['O',
 '',
 'Sample descriptor',
 'Material property',
 'I',
 None,
 'Material application',
 'Synthesis method',
 '1/2',
 'B',
 'Symmetry Label',
 'Characterization method',
 'B-',
 'I-',
 'Inorganic material']

In [46]:
tsv_to_json_format("/Users/patsnap/Desktop/Neo4J_and_other_codes/SPacy_trial_data/8184428/dev.tsv",'/Users/patsnap/Desktop/Neo4J_and_other_codes/SPacy_trial_data/8184428/dev.json',unknown_label_list)

In [47]:
dev_data = []
dev_dataframe = []
lines=[]
with open("/Users/patsnap/Desktop/Neo4J_and_other_codes/SPacy_trial_data/8184428/dev.json", 'r') as f:
    lines = f.readlines()
for line in lines:
    data = json.loads(line)
    text = data['content']
    entities = []
    for annotation in data['annotation']:
        point = annotation['points'][0]
        labels = annotation['label']
        if not isinstance(labels, list):
            labels = [labels]

        for label in labels:
            entities.append((point['start'], point['end'] + 1 ,label))


    dev_data.append((text, {"entities" : entities}))
    dev_dataframe.append((text,entities))

In [48]:
ner_model = spacy.load('/Users/patsnap/Desktop/Neo4J_and_other_codes/Spacy_codes/spacy_models') 

In [49]:
df_dev = pd.DataFrame(dev_dataframe, columns=['text', 'actual_value'])
df_dev.head()

Unnamed: 0,text,actual_value
0,hydrogen diffusion studies in Zr - based laves...,"[(0, 8, Characterization method), (9, 18, Char..."
1,the diffusion constants have been determined...,"[(6, 15, Material property), (16, 25, Material..."
2,the results have been discussed on the basis...,"[(50, 54, Characterization method), (55, 57, C..."
3,the dependence of diffusion constant on allo...,"[(20, 29, Material property), (30, 38, Materia..."
4,activation energy has been obtained from the...,"[(2, 12, Material property), (13, 19, Material..."


In [50]:
def space_text(text):
    my_list = []
    doc = ner_model(text)
    for ent in doc.ents:
        my_list.append((ent.start_char, ent.end_char, ent.label_))
    return my_list  

In [51]:
df_dev['predicted_value'] = df_dev['text'].apply(lambda x:space_text(x))
df_dev.head()

Unnamed: 0,text,actual_value,predicted_value
0,hydrogen diffusion studies in Zr - based laves...,"[(0, 8, Characterization method), (9, 18, Char...","[(0, 8, Synthesis method), (30, 32, Inorganic ..."
1,the diffusion constants have been determined...,"[(6, 15, Material property), (16, 25, Material...","[(6, 15, Material property), (16, 25, Material..."
2,the results have been discussed on the basis...,"[(50, 54, Characterization method), (55, 57, C...","[(50, 54, Characterization method), (55, 57, C..."
3,the dependence of diffusion constant on allo...,"[(20, 29, Material property), (30, 38, Materia...","[(42, 47, Symmetry Label), (48, 59, Material p..."
4,activation energy has been obtained from the...,"[(2, 12, Material property), (13, 19, Material...","[(2, 12, Material property), (13, 19, Material..."


In [52]:
actual_value_list = []
for l in df_dev['actual_value']:
    for a in l:
        actual_value_list.append(a)
predicted_value_list = []
for l in df_dev['predicted_value']:
    for a in l:
        predicted_value_list.append(a)

In [53]:
diff = list(set(actual_value_list)-set(predicted_value_list))
accuracy = ((len(actual_value_list) - len(diff))/len(actual_value_list)) * 100
accuracy
#got accuracy 79.47% after training the model for 500 iterations - updated scispacy model(code 3)
#got accuracy 83.15% after training the model for 500 iterations - blank model(code 2)
# got accuracy 76.80% after training model for 500 iterations - blank model (code 1)

76.35716404627708