#### This notebook generates NER results using SciSpacy and does the comparison

In [1]:
!pip install scispacy

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_bc5cdr_md-0.2.5.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_bionlp13cg_md-0.2.5.tar.gz

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_core_sci_sm-0.2.5.tar.gz (33.1 MB)
Building wheels for collected packages: en-core-sci-sm
  Building wheel for en-core-sci-sm (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-sci-sm: filename=en_core_sci_sm-0.2.5-py3-none-any.whl size=33155835 sha256=4f8f1accfd85ea2562b4881130247b67a2b1fa71b3d82480a45c26154662fe19
  Stored in directory: /home/ec2-user/.cache/pip/wheels/aa/fa/98/9e290a31f3079f3e67030a95e67174bb3052904d3fa6f7d5b5
Successfully built en-core-sci-sm
You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.[0m
Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_bc5cdr_md-0.2.5.tar.gz
  Using cached https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/

In [3]:
!pip install sklearn_crfsuite

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/python -m pip install --upgrade pip' command.[0m


In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils import data
import numpy as np

import scispacy
import spacy

from sklearn.metrics import classification_report
from sklearn_crfsuite.utils import flatten

nlp = spacy.load("en_ner_bc5cdr_md")

#### Test the results

In [4]:
#### Defines the labels for the model
class HParams:
    def __init__(self, vocab_type):
        self.VOCAB_DICT = {
            'bc5cdr': ('<PAD>', 'O', 'B-Chemical', 'B-Disease' , 'I-Disease', 'I-Chemical'),
            'bionlp3g' : ('<PAD>', 'O' ,'B-Amino_acid', 'B-Anatomical_system', 'B-Cancer', 'B-Cell', 
                        'B-Cellular_component', 'B-Developing_anatomical_structure', 'B-Gene_or_gene_product', 
                        'B-Immaterial_anatomical_entity', 'B-Multi-tissue_structure', 'B-Organ', 'B-Organism', 
                        'B-Organism_subdivision', 'B-Organism_substance', 'B-Pathological_formation', 
                        'B-Simple_chemical', 'B-Tissue', 'I-Amino_acid', 'I-Anatomical_system', 'I-Cancer', 
                        'I-Cell', 'I-Cellular_component', 'I-Developing_anatomical_structure', 'I-Gene_or_gene_product', 
                        'I-Immaterial_anatomical_entity', 'I-Multi-tissue_structure', 'I-Organ', 'I-Organism', 
                        'I-Organism_subdivision', 'I-Organism_substance', 'I-Pathological_formation', 'I-Simple_chemical', 
                        'I-Tissue')
        }
        self.VOCAB = self.VOCAB_DICT[vocab_type]
        self.tag2idx = {v:k for k,v in enumerate(self.VOCAB)}
        self.idx2tag = {k:v for k,v in enumerate(self.VOCAB)}

In [5]:
### Reads the input file and returns the data
def get_data(path):
    instances = open(path).read().strip().split('\n\n')
    sents = []
    tags_li = []
    for entry in instances:
        words = [line.split()[0] for line in entry.splitlines()]
        tags = ([line.split()[-1] for line in entry.splitlines()])
        sents.append(words)
        tags_li.append(tags)
    return (sents, tags_li)
    

In [6]:
def calculate_output(sents):
    #print(' '.join(sents))
    doc = nlp(' '.join(sents))
    #spacy.displacy.render(doc, jupyter = True, style = 'ent')
    
    ###Get the output from Spacy
    item_dict = {}
    for item in doc.ents:
        item_part = str(item).split()
        item_dict[item_part[0]] = "B-" + item.label_.capitalize()
        if len(item_part) > 1:    
            for i in range(1,len(item_part)):
                item_dict[item_part[i]] = "I-" + item.label_.capitalize()
                
    predicted_output = []
    for item in sents:
        if item in item_dict.keys():
            predicted_output.append(item_dict[item])
        else:
            predicted_output.append('O')
    
    return predicted_output,item_dict
        
                
        

In [7]:
def get_the_results(sents,tags_li,hp):
    
    Words,Tags, Predicted_tags = [], [], []
    for sent, tag in zip(sents, tags_li):
        Words.extend(sent)
        Tags.extend(tag)
        predicted_tag, _ = calculate_output(sent)
        assert len(predicted_tag) == len(tag), "label and prediction lengths are not same"
        Predicted_tags.extend(predicted_tag)
        
    ## calc metric
    y_true =  np.array([hp.tag2idx[item] for item in Tags])
    y_pred =  np.array([hp.tag2idx[item] for item in Predicted_tags])
    
    print(classification_report(y_true, y_pred, target_names=list(hp.tag2idx.keys())[1:]))
    

### Results for BC5CDR tags

In [15]:
hp = HParams('bc5cdr')
sents,tags_li = get_data("./../data/BC5CDR-IOB/test.tsv")

In [16]:
### display for one row
doc = nlp(' '.join(sents[7]))
displacy_image = spacy.displacy.render(doc, jupyter = True, style = 'ent')

In [17]:
get_the_results(sents,tags_li,hp)

              precision    recall  f1-score   support

           O       0.98      0.98      0.98    110576
  B-Chemical       0.91      0.88      0.90      5385
   B-Disease       0.82      0.81      0.81      4424
   I-Disease       0.74      0.74      0.74      2737
  I-Chemical       0.63      0.57      0.60      1628

    accuracy                           0.96    124750
   macro avg       0.82      0.80      0.81    124750
weighted avg       0.96      0.96      0.96    124750



#### Results for BioNLP13CG tags

In [21]:
nlp = spacy.load("en_ner_bionlp13cg_md")
hp = HParams('bionlp3g')
sents,tags_li = get_data("./../data/BioNLP13CG-IOB/test.tsv")

In [22]:
### display for one row
doc = nlp(' '.join(sents[7]))
displacy_image = spacy.displacy.render(doc, jupyter = True, style = 'ent')

In [23]:
get_the_results(sents,tags_li,hp)

                                   precision    recall  f1-score   support

                                O       0.97      0.97      0.97     40642
                     B-Amino_acid       0.00      0.00      0.00        62
              B-Anatomical_system       0.00      0.00      0.00        17
                         B-Cancer       0.80      0.76      0.78       924
                           B-Cell       0.78      0.75      0.77      1013
             B-Cellular_component       0.76      0.74      0.75       180
B-Developing_anatomical_structure       0.00      0.00      0.00        17
           B-Gene_or_gene_product       0.87      0.85      0.86      2520
   B-Immaterial_anatomical_entity       0.55      0.19      0.29        31
         B-Multi-tissue_structure       0.73      0.68      0.70       303
                          B-Organ       0.64      0.59      0.61       156
                       B-Organism       0.85      0.82      0.83       518
           B-Organism_su

  _warn_prf(average, modifier, msg_start, len(result))
