## Import

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import time
import os
import argparse
from subprocess import call


import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
tqdm.pandas()

from IPython.display import display, Markdown, Latex, HTML

import torch

from transformers import GPT2Tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer


from verisci.covid import AbstractRetriever, RationaleSelector, LabelPredictor
from verisci.evaluate.lib.data import GoldDataset
#import wandb

  from pandas import Panel


In [3]:
loc_df_scispacy_sentence_word_unq_ner_abr_filtered ='../../dfs_generated/linguistic/df_scispacy_sentence_word_unq_ner_abr_filtered.pkl'

## Scifact

### Load Data

In [4]:
loc_corpus = '../../data/scifact/corpus.jsonl'
loc_claim_train = '../../data/scifact/claims_train.jsonl'
loc_claim_test = '../../data/scifact/claims_test.jsonl'
loc_claim_dev = '../../data/scifact/claims_dev.jsonl'

In [5]:
from verisci.evaluate.lib.data import GoldDataset

In [6]:
def get_claim_label_from_jsonl(dataset_jsonl):
    claim_label_list_train = []


    for cur_claim in dataset_jsonl:
        claim_txt = cur_claim.claim

        for doc_id, evidence in cur_claim.evidence.items():

            ev_doc = cur_claim.release.corpus.get_document(doc_id)

            claim_label = evidence.label.name

            tmp_dic = {"claim" : claim_txt, "label" : claim_label}

            claim_label_list_train.append(tmp_dic)
    return claim_label_list_train

In [7]:
def get_claim_label_evidence_from_jsonl(dataset_jsonl, source):
    claim_label_list_train = []


    for cur_claim in dataset_jsonl:
        claim_txt = cur_claim.claim

        for doc_id, evidence in cur_claim.evidence.items():

            ev_doc = cur_claim.release.corpus.get_document(doc_id)

            claim_label = evidence.label.name
            
            list_rationales = []
            for i, sents in enumerate(evidence.rationales):
                list_rationales = [sent for i, sent in enumerate(ev_doc.sentences) if i in sents]

            tmp_dic = {"claim" : claim_txt, "label" : claim_label, "list_rationales" :list_rationales, "source" :source}

            claim_label_list_train.append(tmp_dic)
    return claim_label_list_train

In [8]:
ds_train = GoldDataset(loc_corpus, loc_claim_train)
# claim_train = ds_train.get_claim(39)
# claim_train.pretty_print()
dic_train = get_claim_label_evidence_from_jsonl(ds_train, source = "train")

In [9]:
ds_valid = GoldDataset(loc_corpus, loc_claim_dev)
# claim_valid = ds_valid.get_claim(42)
# claim_valid.pretty_print()
dic_valid = get_claim_label_evidence_from_jsonl(ds_valid, source = "dev")

In [10]:
df_claim_evid_label = pd.concat([pd.DataFrame(dic_train), pd.DataFrame(dic_valid)], ignore_index=True)

df_claim_evid_label

Unnamed: 0,claim,label,list_rationales,source
0,1 in 5 million in UK have abnormal PrP positiv...,REFUTES,"[RESULTS Of the 32,441 appendix samples 16 wer...",train
1,32% of liver transplantation programs required...,SUPPORTS,[Policies requiring discontinuation of methado...,train
2,40mg/day dosage of folic acid and 2mg/day dosa...,SUPPORTS,[CONCLUSION Treatment with high doses of folic...,train
3,76-85% of people with severe mental disorder r...,SUPPORTS,[Although disorder severity was correlated wit...,train
4,A T helper 2 cell (Th2) environment impedes di...,REFUTES,"[Thus, in Lyn(-/-) mice, basophils and IgE aut...",train
...,...,...,...,...
768,Women with a higher birth weight are more like...,SUPPORTS,[Increased risk of breast cancer was noted wit...,dev
769,Women with a higher birth weight are more like...,SUPPORTS,[RESULTS We found that heavier birth weights w...,dev
770,aPKCz causes tumour enhancement by affecting g...,REFUTES,"[Taken together, this demonstrates that PKCζ i...",dev
771,cSMAC formation enhances weak ligand signalling.,SUPPORTS,[This conclusion was supported by experiments ...,dev


## Abbreviations

In [11]:
import spacy

# nlp = spacy.load("en_core_web_trf")

from spacy import displacy
from nltk import word_tokenize
import nltk

In [12]:
nlp_sci_brt = spacy.load("en_core_sci_scibert")
nlp_sci_lg = spacy.load("en_core_sci_lg")

#import en_ner_bc5cdr_md
nlp_cr = spacy.load("en_ner_craft_md")
nlp_bc = spacy.load("en_ner_bc5cdr_md")
nlp_bi = spacy.load("en_ner_bionlp13cg_md")
nlp_jn = spacy.load("en_ner_jnlpba_md")

nlp_core_web_trf = spacy.load("en_core_web_trf")

In [13]:
from scispacy.abbreviation import AbbreviationDetector

In [14]:
nlp_sci_brt.add_pipe("abbreviation_detector")
nlp_sci_lg.add_pipe("abbreviation_detector")
nlp_cr.add_pipe("abbreviation_detector")
nlp_bc.add_pipe("abbreviation_detector")
nlp_bi.add_pipe("abbreviation_detector")
nlp_jn.add_pipe("abbreviation_detector")

<scispacy.abbreviation.AbbreviationDetector at 0x7f8d27001dd0>

In [31]:
list_sci_brt = set()
list_sci_lg  = set()
list_cr = set()
list_bc = set()
list_bi = set()
list_jn = set()


In [25]:
list_jn.difference(list_jn)

set()

In [36]:
list_jn.difference(list_sci_brt)

set()

In [37]:
list_jn.difference(list_sci_brt.union(list_sci_lg, list_cr, list_bc, list_bi))

set()

In [33]:
all_sentence_abbr = []
for cur_indx, cur_row in tqdm(df_claim_evid_label.iterrows(), total = len(df_claim_evid_label["claim"])):
    
    cur_claim = cur_row['claim']
    doc_sci_brt = nlp_sci_brt(cur_claim)
    
    doc_sci_lg = nlp_sci_lg(cur_claim)
    
    doc_cr = nlp_cr(cur_claim)
    
    
    doc_bc = nlp_bc(cur_claim)
    
    doc_bi = nlp_bi(cur_claim)
    
    doc_jn = nlp_jn(cur_claim)
    
    
    
    
    
    cur_all_abbr = {}
    
    for cur_abr in doc_sci_brt._.abbreviations:  
        list_sci_brt.add(str(cur_abr))
        
        cur_all_abbr[str(cur_abr)] = {'abrv_text': str(cur_abr), 'abr_definition' : str(cur_abr._.long_form), 
                                      'abr_model' : 'en_core_sci_scibert', 'claim' : cur_claim, 'sf_start_char' : cur_abr.start_char, 'sf_end_char' : cur_abr.end_char,
                                     'lf_start_char' : cur_abr._.long_form.start_char, 'lf_end_char' : cur_abr._.long_form.end_char, 
                                     'org_label' : cur_row['label'], 'list_rationales' : cur_row['list_rationales'],
                                      'data_source' : cur_row['source']}
        cur_all_abbr[str(cur_abr)]['abr_whole_start'] = -1
        cur_all_abbr[str(cur_abr)]['abr_whole_end'] = -1
        # if short and long form of abbraviation exists
        if((cur_abr.end_char - cur_abr.start_char > 0) & (cur_abr._.long_form.end_char - cur_abr._.long_form.start_char > 0)):
            # Aaa Bbb Rrr (ABR) or ABR (Aaa Bbb Rrr)  
            if (cur_abr.start_char - cur_abr._.long_form.end_char   < 4 )| (cur_abr._.long_form.start_char - cur_abr.end_char < 4):
                cur_all_abbr[str(cur_abr)]['abr_whole_start'] = min(cur_abr.start_char, cur_abr._.long_form.start_char)
                cur_all_abbr[str(cur_abr)]['abr_whole_end'] = max(cur_abr.end_char, cur_abr._.long_form.end_char)
    
    for cur_abr in doc_sci_lg._.abbreviations:  
        list_sci_lg.add(str(cur_abr))
        cur_all_abbr[str(cur_abr)] = {'abrv_text': str(cur_abr), 'abr_definition' : str(cur_abr._.long_form), 
                                      'abr_model' : 'en_core_sci_lg', 'claim' : cur_claim, 'sf_start_char' : cur_abr.start_char, 'sf_end_char' : cur_abr.end_char,
                                     'lf_start_char' : cur_abr._.long_form.start_char, 'lf_end_char' : cur_abr._.long_form.end_char, 
                                     'org_label' : cur_row['label'], 'list_rationales' : cur_row['list_rationales'],
                                      'data_source' : cur_row['source']}
        
        cur_all_abbr[str(cur_abr)]['abr_whole_start'] = -1
        cur_all_abbr[str(cur_abr)]['abr_whole_end'] = -1
        # if short and long form of abbraviation exists
        if((cur_abr.end_char - cur_abr.start_char > 0) & (cur_abr._.long_form.end_char - cur_abr._.long_form.start_char > 0)):
            # Aaa Bbb Rrr (ABR) or ABR (Aaa Bbb Rrr)  
            if (cur_abr.start_char - cur_abr._.long_form.end_char   < 4 )| (cur_abr._.long_form.start_char - cur_abr.end_char < 4):
                cur_all_abbr[str(cur_abr)]['abr_whole_start'] = min(cur_abr.start_char, cur_abr._.long_form.start_char)
                cur_all_abbr[str(cur_abr)]['abr_whole_end'] = max(cur_abr.end_char, cur_abr._.long_form.end_char)
                
                
    for cur_abr in doc_cr._.abbreviations:  
        list_cr.add(str(cur_abr))
        cur_all_abbr[str(cur_abr)] = {'abrv_text': str(cur_abr), 'abr_definition' : str(cur_abr._.long_form), 
                                      'abr_model' : 'en_ner_craft_md', 'claim' : cur_claim, 'sf_start_char' : cur_abr.start_char, 'sf_end_char' : cur_abr.end_char,
                                      'lf_start_char' : cur_abr._.long_form.start_char, 'lf_end_char' : cur_abr._.long_form.end_char, 
                                      'org_label' : cur_row['label'], 'list_rationales' : cur_row['list_rationales'],
                                      'data_source' : cur_row['source']}
    
    
        cur_all_abbr[str(cur_abr)]['abr_whole_start'] = -1
        cur_all_abbr[str(cur_abr)]['abr_whole_end'] = -1
        # if short and long form of abbraviation exists
        if((cur_abr.end_char - cur_abr.start_char > 0) & (cur_abr._.long_form.end_char - cur_abr._.long_form.start_char > 0)):
            # Aaa Bbb Rrr (ABR) or ABR (Aaa Bbb Rrr)  
            if (cur_abr.start_char - cur_abr._.long_form.end_char   < 4 )| (cur_abr._.long_form.start_char - cur_abr.end_char < 4):
                cur_all_abbr[str(cur_abr)]['abr_whole_start'] = min(cur_abr.start_char, cur_abr._.long_form.start_char)
                cur_all_abbr[str(cur_abr)]['abr_whole_end'] = max(cur_abr.end_char, cur_abr._.long_form.end_char)
                
                
    for cur_abr in doc_bc._.abbreviations:
        list_bc.add(str(cur_abr))
        cur_all_abbr[str(cur_abr)] = {'abrv_text': str(cur_abr), 'abr_definition' : str(cur_abr._.long_form), 
                                      'abr_model' : 'en_ner_bc5cdr_md', 'claim' : cur_claim, 'sf_start_char' : cur_abr.start_char, 'sf_end_char' : cur_abr.end_char,
                                     'lf_start_char' : cur_abr._.long_form.start_char, 'lf_end_char' : cur_abr._.long_form.end_char, 
                                     'org_label' : cur_row['label'], 'list_rationales' : cur_row['list_rationales'],
                                      'data_source' : cur_row['source']}
        
        
        cur_all_abbr[str(cur_abr)]['abr_whole_start'] = -1
        cur_all_abbr[str(cur_abr)]['abr_whole_end'] = -1
        # if short and long form of abbraviation exists
        if((cur_abr.end_char - cur_abr.start_char > 0) & (cur_abr._.long_form.end_char - cur_abr._.long_form.start_char > 0)):
            # Aaa Bbb Rrr (ABR) or ABR (Aaa Bbb Rrr)  
            if (cur_abr.start_char - cur_abr._.long_form.end_char   < 4 )| (cur_abr._.long_form.start_char - cur_abr.end_char < 4):
                cur_all_abbr[str(cur_abr)]['abr_whole_start'] = min(cur_abr.start_char, cur_abr._.long_form.start_char)
                cur_all_abbr[str(cur_abr)]['abr_whole_end'] = max(cur_abr.end_char, cur_abr._.long_form.end_char)
                
                
    for cur_abr in doc_bi._.abbreviations:
        list_bi.add(str(cur_abr))
        cur_all_abbr[str(cur_abr)] = {'abrv_text': str(cur_abr), 'abr_definition' : str(cur_abr._.long_form), 
                                      'abr_model' : 'en_ner_bionlp13cg_md', 'claim' : cur_claim, 'sf_start_char' : cur_abr.start_char, 'sf_end_char' : cur_abr.end_char,
                                     'lf_start_char' : cur_abr._.long_form.start_char, 'lf_end_char' : cur_abr._.long_form.end_char, 
                                     'org_label' : cur_row['label'], 'list_rationales' : cur_row['list_rationales'],
                                      'data_source' : cur_row['source']}
    
    
        cur_all_abbr[str(cur_abr)]['abr_whole_start'] = -1
        cur_all_abbr[str(cur_abr)]['abr_whole_end'] = -1
        # if short and long form of abbraviation exists
        if((cur_abr.end_char - cur_abr.start_char > 0) & (cur_abr._.long_form.end_char - cur_abr._.long_form.start_char > 0)):
            # Aaa Bbb Rrr (ABR) or ABR (Aaa Bbb Rrr)  
            if (cur_abr.start_char - cur_abr._.long_form.end_char   < 4 )| (cur_abr._.long_form.start_char - cur_abr.end_char < 4):
                cur_all_abbr[str(cur_abr)]['abr_whole_start'] = min(cur_abr.start_char, cur_abr._.long_form.start_char)
                cur_all_abbr[str(cur_abr)]['abr_whole_end'] = max(cur_abr.end_char, cur_abr._.long_form.end_char)
                
                
    for cur_abr in doc_jn._.abbreviations:
        list_jn.add(str(cur_abr))
        cur_all_abbr[str(cur_abr)] = {'abrv_text': str(cur_abr), 'abr_definition' : str(cur_abr._.long_form), 
                                      'abr_model' : 'en_ner_jnlpba_md', 'claim' : cur_claim, 'sf_start_char' : cur_abr.start_char, 'sf_end_char' : cur_abr.end_char,
                                     'lf_start_char' : cur_abr._.long_form.start_char, 'lf_end_char' : cur_abr._.long_form.end_char, 
                                     'org_label' : cur_row['label'], 'list_rationales' : cur_row['list_rationales'],
                                      'data_source' : cur_row['source']}
        
        
        cur_all_abbr[str(cur_abr)]['abr_whole_start'] = -1
        cur_all_abbr[str(cur_abr)]['abr_whole_end'] = -1
        # if short and long form of abbraviation exists
        if((cur_abr.end_char - cur_abr.start_char > 0) & (cur_abr._.long_form.end_char - cur_abr._.long_form.start_char > 0)):
            # Aaa Bbb Rrr (ABR) or ABR (Aaa Bbb Rrr)  
            if (cur_abr.start_char - cur_abr._.long_form.end_char   < 4 )| (cur_abr._.long_form.start_char - cur_abr.end_char < 4):
                cur_all_abbr[str(cur_abr)]['abr_whole_start'] = min(cur_abr.start_char, cur_abr._.long_form.start_char)
                cur_all_abbr[str(cur_abr)]['abr_whole_end'] = max(cur_abr.end_char, cur_abr._.long_form.end_char)+1        
    
    
    all_sentence_abbr.extend(list(cur_all_abbr.values()))
    
# en_core_sci_scibert and   en_ner_jnlpba_md having same abbrs

100%|██████████| 773/773 [01:14<00:00, 10.36it/s]


In [24]:
all_sentence_abbr

[{'abrv_text': 'CKD',
  'abr_definition': 'chronic kidney disease',
  'abr_model': 'en_ner_jnlpba_md',
  'claim': '40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.',
  'sf_start_char': 104,
  'sf_end_char': 107,
  'lf_start_char': 80,
  'lf_end_char': 102,
  'org_label': 'SUPPORTS',
  'list_rationales': ['CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease.   \n'],
  'data_source': 'train',
  'abr_whole_start': 80,
  'abr_whole_end': 108},
 {'abrv_text': 'Th2',
  'abr_definition': 'T helper 2 cell',
  'abr_model': 'en_ner_jnlpba_md',
  'claim': 'A T helper 2 cell (Th2) environment impedes disease development in patients with systemic lupus erythematosus (SLE).',
  'sf_start_char': 19,
  'sf_end_char': 22,
  'lf_start_char': 2,
  'lf_end_char': 17,
  'org_lab

In [16]:
df_sentence_word_unq_abr = pd.DataFrame(all_sentence_abbr)

In [17]:
df_sentence_word_unq_abr

Unnamed: 0,abrv_text,abr_definition,abr_model,claim,sf_start_char,sf_end_char,lf_start_char,lf_end_char,org_label,list_rationales,data_source,abr_whole_start,abr_whole_end
0,CKD,chronic kidney disease,en_ner_jnlpba_md,40mg/day dosage of folic acid and 2mg/day dosa...,104,107,80,102,SUPPORTS,[CONCLUSION Treatment with high doses of folic...,train,80,108
1,Th2,T helper 2 cell,en_ner_jnlpba_md,A T helper 2 cell (Th2) environment impedes di...,19,22,2,17,REFUTES,"[Thus, in Lyn(-/-) mice, basophils and IgE aut...",train,2,23
2,SLE,systemic lupus erythematosus,en_ner_jnlpba_md,A T helper 2 cell (Th2) environment impedes di...,111,114,81,109,REFUTES,"[Thus, in Lyn(-/-) mice, basophils and IgE aut...",train,81,115
3,AMPK,AMP-activated protein kinase,en_ner_jnlpba_md,AMP-activated protein kinase (AMPK) activation...,30,34,0,28,SUPPORTS,[These studies implicate deficient AMPK activa...,train,0,35
4,ESCs,embryonic stem cells,en_ner_jnlpba_md,Androgenetic haploid mouse embryonic stem cell...,49,53,27,47,SUPPORTS,[Our results demonstrate that AG-haESCs can be...,train,27,54
...,...,...,...,...,...,...,...,...,...,...,...,...,...
79,IgG,Immunoglobulin G,en_ner_jnlpba_md,Human T-lymphotropic virus type-I-associated m...,132,135,114,130,SUPPORTS,[Antibodies to hnRNP-A1 cross-reacted with HTL...,dev,114,136
80,NSC,Neural Stem Cell,en_ner_jnlpba_md,MicroRNA is involved in the regulation of Neur...,60,63,42,58,SUPPORTS,[High levels of miR-184 promoted proliferation...,dev,42,64
81,NETs,Neutrophil extracellular traps,en_ner_jnlpba_md,Neutrophil extracellular traps (NETs) are rele...,32,36,0,30,SUPPORTS,"[Here we show that chromatin fibers, so-called...",dev,0,37
82,E. coli,Escherichia coli,en_ner_jnlpba_md,The tip of the inner tube of the toxic type VI...,115,122,97,113,SUPPORTS,[Our results indicate a new model of the T6SS ...,dev,97,123


In [18]:
df_sentence_word_unq_abr['abr_model'].value_counts()

en_ner_jnlpba_md    84
Name: abr_model, dtype: int64

## NER

In [19]:
from pattern.en import quantify, numerals, number,reflect

from nltk.corpus import wordnet as wn_nltk
from itertools import chain

from pattern.en import NOUN, VERB, ADJECTIVE, ADVERB, NOUN, VERB, ADJ, ADV, PRON, DET, PREP, ADP, NUM, CONJ, INTJ, PRT, PUNC, X
from pattern.en import  POS, CHUNK, INFINITIVE, PRESENT, PAST, FUTURE, FIRST, SECOND, THIRD,SINGULAR, PLURAL, SG, PL, PROGRESSIVE, PARTICIPLE, WORD
from pattern.text import UNIVERSAL
from pattern.en import conjugate, lemma, lexeme,PRESENT,SG, verbs, PARTICIPLE, ngrams
from pattern.en import wordnet as wn_pattern

from pattern.en import tree
from pattern.text.tree import Word
from pattern.en import parse, parsetree, tag, pprint

In [38]:
df_claim_evid_label.shape

(773, 4)

In [20]:
all_sentence_word_ner = []
for cur_index, cur_row in tqdm(df_claim_evid_label.iterrows(), total = len(df_claim_evid_label)):
#     doc_sci_brt = nlp_sci_brt(txt)
#     doc_sci_lg = nlp_sci_lg(txt)

    cur_claim = cur_row['claim']
    doc_cr = nlp_cr(cur_claim)
    doc_bc = nlp_bc(cur_claim)
    doc_bi = nlp_bi(cur_claim)
    doc_jn = nlp_jn(cur_claim)
    
    

    
#         all_tags.append({'ner_text': cur_ent.text, 'ner_label' : cur_ent.label_, 'ner_model' : 'en_core_sci_scibert', 'claim' : txt})
    
#     for cur_ent in doc_sci_lg.ents:
#         all_tags.append({'ner_text': cur_ent.text, 'ner_label' : cur_ent.label_, 'ner_model' : 'en_core_sci_lg', 'claim' : txt})

    
    cur_all_tags = {}
    
#     for cur_ent in doc_sci_brt.ents:
#         cur_all_tags[cur_ent.text] = {'ner_text': cur_ent.text, 'ner_label' : cur_ent.label_, 'ner_model' : 'en_core_sci_scibert', 'claim' : txt, 'start_char' : cur_ent.start_char, 'end_char' : cur_ent.end_char}

    for cur_ent in doc_cr.ents:      
        
        cur_all_tags[cur_ent.text] = {'ner_text': cur_ent.text, 'ner_label' : cur_ent.label_, 
                                      'ner_model' : 'en_ner_craft_md', 'claim' : cur_claim, 'start_char' : cur_ent.start_char, 
                                      'end_char' : cur_ent.end_char, 'org_label' : cur_row['label'], 'list_rationales' : cur_row['list_rationales'],
                                      'data_source' : cur_row['source']}
    
    for cur_ent in doc_bc.ents:
        cur_all_tags[cur_ent.text] = {'ner_text': cur_ent.text, 'ner_label' : cur_ent.label_, 
                                      'ner_model' : 'en_ner_bc5cdr_md', 'claim' : cur_claim, 'start_char' : cur_ent.start_char, 
                                      'end_char' : cur_ent.end_char, 'org_label' : cur_row['label'], 'list_rationales' : cur_row['list_rationales'],
                                      'data_source' : cur_row['source']}
        
    for cur_ent in doc_bi.ents:
        cur_all_tags[cur_ent.text] = {'ner_text': cur_ent.text, 'ner_label' : cur_ent.label_, 
                                      'ner_model' : 'en_ner_bionlp13cg_md', 'claim' : cur_claim, 'start_char' : cur_ent.start_char, 
                                      'end_char' : cur_ent.end_char, 'org_label' : cur_row['label'], 'list_rationales' : cur_row['list_rationales'],
                                      'data_source' : cur_row['source']}
    
    for cur_ent in doc_jn.ents:
        cur_all_tags[cur_ent.text] = {'ner_text': cur_ent.text, 'ner_label' : cur_ent.label_, 
                                      'ner_model' : 'en_ner_jnlpba_md', 'claim' : cur_claim, 'start_char' : cur_ent.start_char, 
                                      'end_char' : cur_ent.end_char, 'org_label' : cur_row['label'], 'list_rationales' : cur_row['list_rationales'],
                                      'data_source' : cur_row['source']}
    
    
    all_sentence_word_ner.extend(list(cur_all_tags.values()))

100%|██████████| 773/773 [00:22<00:00, 33.82it/s]


In [21]:
df_sentence_word_unq_ner = pd.DataFrame(all_sentence_word_ner)

In [22]:
df_sentence_word_unq_abr_ext = df_sentence_word_unq_abr.copy()

In [23]:
df_sentence_word_unq_abr_ext['comment'] = 'abbreviation'
df_sentence_word_unq_abr_ext['whole_text'] = df_sentence_word_unq_abr_ext['abr_definition'] + " ("+ df_sentence_word_unq_abr_ext['abrv_text'] + ")"

df_sentence_word_unq_abr_ext.rename(columns = {'whole_text' : 'ner_text', 'comment' : 'ner_label', 'abr_model' :'ner_model', 
                                                'claim' : 'claim', 'abr_whole_start' : 'start_char', 'abr_whole_end' : 'end_char'}, inplace= True)

In [24]:
df_sentence_word_unq_ner_abr = df_sentence_word_unq_ner.append(df_sentence_word_unq_abr_ext[['ner_label', 'ner_text', 'ner_model', 
                                                'claim', 'start_char', 'end_char', 'org_label', 'list_rationales', 'data_source']], ignore_index= True)

df_sentence_word_unq_ner_abr.sort_values(by = ['claim', 'start_char', 'end_char'], inplace= True)

df_sentence_word_unq_ner_abr.reset_index(drop= True, inplace=True)


In [25]:
HTML(df_sentence_word_unq_ner_abr.loc[9:19, :].sort_values(by = ['claim']).to_html())

Unnamed: 0,ner_text,ner_label,ner_model,claim,start_char,end_char,org_label,list_rationales,data_source
9,methadone,SIMPLE_CHEMICAL,en_ner_bionlp13cg_md,32% of liver transplantation programs required patients to discontinue methadone treatment in 2001.,71,80,SUPPORTS,[Policies requiring discontinuation of methadone in 32% of all programs contradict the evidence base for efficacy of long-term replacement therapies and potentially result in relapse of previously stable patients.],train
10,40mg/day,SIMPLE_CHEMICAL,en_ner_bionlp13cg_md,40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.,0,8,SUPPORTS,[CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. \n],train
11,folic acid,SIMPLE_CHEMICAL,en_ner_bionlp13cg_md,40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.,19,29,SUPPORTS,[CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. \n],train
12,2mg/day,CHEMICAL,en_ner_bc5cdr_md,40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.,34,41,SUPPORTS,[CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. \n],train
13,vitamin,CHEBI,en_ner_craft_md,40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.,52,59,SUPPORTS,[CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. \n],train
14,vitamin B12,SIMPLE_CHEMICAL,en_ner_bionlp13cg_md,40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.,52,63,SUPPORTS,[CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. \n],train
15,chronic kidney disease,DISEASE,en_ner_bc5cdr_md,40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.,80,102,SUPPORTS,[CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. \n],train
16,chronic kidney disease (CKD),abbreviation,en_ner_jnlpba_md,40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.,80,108,SUPPORTS,[CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. \n],train
17,kidney,ORGAN,en_ner_bionlp13cg_md,40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.,88,94,SUPPORTS,[CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. \n],train
18,CKD,CANCER,en_ner_bionlp13cg_md,40mg/day dosage of folic acid and 2mg/day dosage of vitamin B12 does not affect chronic kidney disease (CKD) progression.,104,107,SUPPORTS,[CONCLUSION Treatment with high doses of folic acid and B vitamins did not improve survival or reduce the incidence of vascular disease in patients with advanced chronic kidney disease or end-stage renal disease. \n],train


In [26]:
def get_overlaped_span(start_1, end_1, start_2, end_2):
    if( max(end_1, end_2) - min(start_1, start_2) <= (end_1 - start_1) + (end_2 - start_2)):
        return (min(start_1, start_2) , max(end_1, end_2))
    else :
        return None

In [27]:
df_grp_sentence_word_unq_ner = df_sentence_word_unq_ner_abr.loc[:, :].groupby('claim')

In [28]:
df_sentence_word_unq_ner_abr_filtered = pd.DataFrame(columns=['ner_text', 'ner_label', 'ner_model', 
                                                'claim', 'start_char', 'end_char'])

for cur_claim, cur_df_group in tqdm(df_grp_sentence_word_unq_ner):
    cur_df_group.sort_values(by = ['start_char', 'end_char'], inplace= True)
    counter_index_outer = 0
    while counter_index_outer < len(cur_df_group):
        cur_span_start = cur_df_group['start_char']
        counter_index_inner = counter_index_outer+1
        while counter_index_inner < len(cur_df_group):  
            overlapped_span = get_overlaped_span(cur_df_group['start_char'].iloc[counter_index_outer], cur_df_group['end_char'].iloc[counter_index_outer],
                                 cur_df_group['start_char'].iloc[counter_index_inner], cur_df_group['end_char'].iloc[counter_index_inner])
            if overlapped_span != None:
                cur_df_group.loc[cur_df_group.index[counter_index_outer], 'start_char'] = overlapped_span[0]
                cur_df_group.loc[cur_df_group.index[counter_index_outer], 'end_char'] = overlapped_span[1]
                cur_df_group.drop(cur_df_group.index[[counter_index_inner]], inplace = True)
            else:
                counter_index_inner += 1 
        counter_index_outer += 1
    df_sentence_word_unq_ner_abr_filtered = df_sentence_word_unq_ner_abr_filtered.append(cur_df_group, ignore_index= True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
100%|██████████| 668/668 [00:03<00:00, 197.31it/s]


In [29]:
df_grp_sentence_word_unq_ner_abr_filtered = df_sentence_word_unq_ner_abr_filtered.loc[:, :].groupby('claim')

In [30]:
df_sentence_word_unq_ner_abr_filtered.to_pickle(loc_df_scispacy_sentence_word_unq_ner_abr_filtered)

In [31]:
dic_map_strpos_patternpos = {'NOUN' :NOUN, 'VERB' : VERB, 'ADJECTIVE': ADJECTIVE, 'ADVERB' : ADVERB}

In [32]:
css_style = """
        <style>
        .org_claim_container {
          background-color: #f0f6fc;
          color: #000000cc;
          border: 2px ridge #72aee655;
          border-radius: 10px;
          margin: 5px;
          padding: 10px;
        }

        .term_container{
          background-color: #faebf6;
          color: #af0000dc;
          border-radius: 5px;
          border: 2px ridge #faebf613;
          font-size: 12pt;
          font-weight: bold;
        }

        .non_term_container{
          background-color: #fff7fd;
          color: #001630;
          font-size: 12pt;
        }   
        
        .meta_info_token{
        display: block;
        background-color: #00a6f309;
        border: 1px ridge #00a6f333;
        margine : 5px;
        padding : 5px;
        font-weight: 400;
        text-shadow: 0.2px 0.2px #00000055;
        box-shadow: 1px 1px #00660022;
        }
        </style>
    """

gen_html = css_style

list_dic_of_sent_tag_info = []

for cur_claim, cur_df_group in tqdm(df_grp_sentence_word_unq_ner_abr_filtered):
    doc_sci_brt = nlp_sci_brt(cur_claim)
    list_term_ranges = cur_df_group[['start_char', 'end_char']].values
    gen_html += '<div class="org_claim_container">'
    
    
    
    previous_term_end = 0
    for cur_indx, cur_term_range in enumerate(list_term_ranges):
        
        gen_html += '<span class = "non_term_container">'+ cur_claim[previous_term_end:cur_term_range[0]]+'</span>'
        gen_html += '<span class = "term_container">'+ cur_claim[cur_term_range[0]:cur_term_range[1]]+'</span>'
        
        doc_sci_brt = nlp_sci_brt(cur_claim[previous_term_end:cur_term_range[0]])
        
        
        for cur_token in doc_sci_brt:
            dic_token_all_info = {}
#             gen_html += '<span class = "meta_info_token">'+ str({'<b>token_text': cur_token.text, '</b>token_lemma' : cur_token.lemma_, 
#                                                                  'pos' : cur_token.pos_,'dep' : cur_token.dep_, 
#                                                                  'tag' : cur_token.tag_,
#                                                                  'tag_explain' : spacy.explain(cur_token.tag_),
#                                                                 })+'</span>'
            
            
            dic_token_all_info['claim'] = cur_claim, 
            dic_token_all_info['token'] = cur_token.text 
            
#             print('org label ', set(cur_df_group['org_label']))
            dic_token_all_info['org_label'] = list(set(cur_df_group['org_label']))[0]
            dic_token_all_info['list_rationales'] = list(cur_df_group['list_rationales'])[0]
            dic_token_all_info['data_source'] = list(set(cur_df_group['data_source']))[0]
#             dic_token_all_info['claim'] = 
            
#             print('\n\n#### spacy ####')
            
#             print({'token_text': cur_token.text, 'token_lemma' : cur_token.lemma_, 
#                                                                  'pos' : cur_token.pos_,'dep' : cur_token.dep_, 
#                                                                  'tag' : cur_token.tag_,
#                                                                  'tag_explain' : spacy.explain(cur_token.tag_),                                                                 
#                                                                 })
            
            
            dic_token_all_info['token_text_spacy'] = cur_token.text 
            dic_token_all_info['token_lemma_spacy'] = cur_token.lemma_ 
            dic_token_all_info['pos_spacy'] = cur_token.pos_
            dic_token_all_info['dep_spacy'] = cur_token.dep_
            dic_token_all_info['tag_spacy'] = cur_token.tag_
            dic_token_all_info['tag_explain_spacy'] = spacy.explain(cur_token.tag_),                                                                 
                                                                
#             print('\n#### NLTK ####')
            syns_wn_nltk = wn_nltk.synsets(cur_token.text)
            synset_wn_nltk = list(set(chain.from_iterable([word.lemma_names() for word in syns_wn_nltk])))
#             print('nltk syn : ', cur_token.text, ' >>> ', synset_wn_nltk)
            dic_token_all_info['wn_nltk_syn'] = synset_wn_nltk
            
            
#             print('morph :', cur_token.morph.to_dict())
            dic_token_all_info['morf_nltk'] = cur_token.morph.to_dict()
            
#             print('\n#### Pattern ####')
            pattern_syns_list = []
            
            try:
                pattern_syns_list.append(numerals(cur_token.text))
            except :
                print()
#             word_pattern = Word(cur_claim, cur_token.text)
#             print('word_pattern > ', word_pattern.string)


#             print('lemma', ' > ', lemma(cur_token.text))
            
            
            if(cur_token.pos_ in ['NOUN', 'VERB', 'ADJECTIVE', 'ADVERB'] ):
                cur_pos = str(cur_token.pos_).upper()
#                 print("cur_token > ", cur_token.text)
                
                for synset in wn_pattern.synsets(cur_token.text, pos=dic_map_strpos_patternpos[cur_token.pos_]):
#                     print("Token pos : ", synset.pos)
#                     print("Token similar : ", synset.similar())
#                     print("synonyms : %s" % synset.synonyms)      
#                     print("pos  : %s" % synset.lexname )    
                    pattern_syns_list += synset.synonyms
            pattern_syns_list = list(set(pattern_syns_list))
            dic_token_all_info['wn_pattern_syn'] = pattern_syns_list
#             print('pattern synonyms :', pattern_syns_list)
            
            list_dic_of_sent_tag_info.append(dic_token_all_info)
        previous_term_end = cur_term_range[1]
        
    gen_html += '<span class = "non_term_container">'+cur_claim[previous_term_end:]+'</span>'
    gen_html += '</div>'
    

    


  0%|          | 1/668 [00:00<02:36,  4.27it/s]










  0%|          | 2/668 [00:00<02:36,  4.25it/s]






















  0%|          | 3/668 [00:01<05:00,  2.21it/s]












  1%|          | 4/668 [00:01<04:19,  2.55it/s]









  1%|          | 5/668 [00:02<04:12,  2.62it/s]












  1%|          | 6/668 [00:02<03:32,  3.12it/s]










  1%|          | 7/668 [00:02<03:09,  3.49it/s]












  1%|▏         | 9/668 [00:02<02:31,  4.35it/s]


















  1%|▏         | 10/668 [00:03<02:31,  4.35it/s]











  2%|▏         | 11/668 [00:03<02:23,  4.59it/s]













  2%|▏         | 12/668 [00:03<02:16,  4.81it/s]


















  2%|▏         | 14/668 [00:03<02:17,  4.76it/s]

















  2%|▏         | 15/668 [00:04<02:17,  4.73it/s]














  2%|▏         | 16/668 [00:04<02:27,  4.43it/s]















  3%|▎         | 17/668 [00:04<02:19,  4.67it/s]























  3%|▎         | 18/668 [00:04<02:23,  4.52it/s]











  3%|▎         | 20/668 [00:05<02:05,  5.17it/s]













  3%|▎         | 22/668 [00:05<01:53,  5.71it/s]



















  4%|▎         | 24/668 [00:05<01:47,  6.02it/s]


















  4%|▎         | 25/668 [00:05<01:49,  5.85it/s]













  4%|▍         | 26/668 [00:06<02:12,  4.85it/s]











  4%|▍         | 27/668 [00:06<02:26,  4.38it/s]















  4%|▍         | 28/668 [00:06<02:16,  4.69it/s]
















  4%|▍         | 30/668 [00:06<01:51,  5.74it/s]












  5%|▍         | 31/668 [00:07<02:00,  5.28it/s]











  5%|▍         | 32/668 [00:07<02:03,  5.15it/s]








  5%|▍         | 33/668 [00:07<01:58,  5.38it/s]









  5%|▌         | 34/668 [00:07<02:00,  5.28it/s]











  5%|▌         | 35/668 [00:07<02:08,  4.91it/s]






  6%|▌         | 37/668 [00:08<02:05,  5.02it/s]








  6%|▌         | 39/668 [00:08<01:42,  6.16it/s]









  6%|▌         | 41/668 [00:08<01:29,  6.97it/s]





  6%|▋         | 42/668 [00:09<01:48,  5.77it/s]








  7%|▋         | 44/668 [00:09<01:47,  5.82it/s]










  7%|▋         | 46/668 [00:09<01:29,  6.93it/s]




























  7%|▋         | 48/668 [00:09<01:28,  6.97it/s]

















  7%|▋         | 49/668 [00:10<01:33,  6.64it/s]









  8%|▊         | 51/668 [00:10<01:31,  6.72it/s]















  8%|▊         | 54/668 [00:10<01:10,  8.74it/s]










  8%|▊         | 56/668 [00:10<01:11,  8.61it/s]

















  9%|▉         | 59/668 [00:11<01:24,  7.23it/s]



















  9%|▉         | 60/668 [00:11<01:32,  6.56it/s]
















  9%|▉         | 61/668 [00:11<01:47,  5.65it/s]

















  9%|▉         | 63/668 [00:11<01:32,  6.52it/s]









 10%|▉         | 64/668 [00:12<01:45,  5.71it/s]

















 10%|▉         | 65/668 [00:12<01:59,  5.06it/s]














 10%|▉         | 66/668 [00:12<02:09,  4.65it/s]











 10%|█         | 67/668 [00:12<02:22,  4.22it/s]















 10%|█         | 69/668 [00:13<01:56,  5.14it/s]

















 10%|█         | 70/668 [00:13<01:55,  5.17it/s]














 11%|█         | 72/668 [00:13<01:48,  5.50it/s]



















 11%|█         | 73/668 [00:13<01:39,  6.01it/s]











 11%|█         | 75/668 [00:14<01:34,  6.29it/s]















 12%|█▏        | 77/668 [00:14<01:28,  6.67it/s]

















 12%|█▏        | 79/668 [00:14<01:19,  7.42it/s]











 12%|█▏        | 81/668 [00:15<01:22,  7.16it/s]














 12%|█▏        | 82/668 [00:15<01:24,  6.91it/s]














 12%|█▏        | 83/668 [00:15<02:02,  4.79it/s]




















 13%|█▎        | 85/668 [00:15<01:39,  5.84it/s]













 13%|█▎        | 86/668 [00:16<03:00,  3.22it/s]
















 13%|█▎        | 87/668 [00:16<02:51,  3.38it/s]














 13%|█▎        | 89/668 [00:16<01:58,  4.87it/s]













 14%|█▎        | 91/668 [00:17<01:31,  6.28it/s]















 14%|█▍        | 93/668 [00:17<01:27,  6.60it/s]
















 14%|█▍        | 95/668 [00:17<01:30,  6.33it/s]

















 15%|█▍        | 97/668 [00:18<01:25,  6.69it/s]


















 15%|█▍        | 98/668 [00:18<01:21,  6.98it/s]










 15%|█▌        | 101/668 [00:18<01:20,  7.05it/s]










 15%|█▌        | 102/668 [00:18<01:22,  6.82it/s]











 16%|█▌        | 104/668 [00:19<01:38,  5.73it/s]












 16%|█▌        | 107/668 [00:19<01:28,  6.35it/s]
















 16%|█▌        | 108/668 [00:19<01:24,  6.63it/s]








 16%|█▋        | 110/668 [00:20<01:29,  6.26it/s]








 17%|█▋        | 111/668 [00:20<01:51,  5.01it/s]













 17%|█▋        | 112/668 [00:20<02:07,  4.37it/s]











 17%|█▋        | 113/668 [00:20<02:07,  4.34it/s]











 17%|█▋        | 116/668 [00:21<01:31,  6.07it/s]




















 18%|█▊        | 118/668 [00:21<01:24,  6.54it/s]
























 18%|█▊        | 120/668 [00:21<01:13,  7.46it/s]












 18%|█▊        | 122/668 [00:21<01:06,  8.27it/s]








 19%|█▊        | 124/668 [00:22<01:09,  7.77it/s]














 19%|█▊        | 125/668 [00:22<01:29,  6.09it/s]















 19%|█▉        | 127/668 [00:22<01:23,  6.51it/s]


















 19%|█▉        | 129/668 [00:23<01:20,  6.73it/s]





















 20%|█▉        | 131/668 [00:23<01:16,  7.01it/s]















 20%|█▉        | 132/668 [00:23<01:30,  5.94it/s]



















 20%|█▉        | 133/668 [00:23<01:37,  5.46it/s]









 20%|██        | 134/668 [00:24<01:53,  4.69it/s]











 20%|██        | 136/668 [00:24<01:35,  5.56it/s]












 21%|██        | 138/668 [00:24<01:29,  5.92it/s]

















 21%|██        | 139/668 [00:24<01:30,  5.83it/s]















 21%|██        | 140/668 [00:25<01:51,  4.74it/s]














 21%|██        | 141/668 [00:25<01:59,  4.43it/s]

















 21%|██▏       | 142/668 [00:25<01:40,  5.23it/s]
















 22%|██▏       | 144/668 [00:25<01:35,  5.48it/s]






















 22%|██▏       | 145/668 [00:25<01:30,  5.75it/s]














 22%|██▏       | 148/668 [00:26<01:15,  6.85it/s]








 22%|██▏       | 149/668 [00:26<01:11,  7.23it/s]





















 23%|██▎       | 152/668 [00:27<01:24,  6.09it/s]











 23%|██▎       | 154/668 [00:27<01:15,  6.77it/s]











 23%|██▎       | 155/668 [00:27<01:26,  5.92it/s]
























 23%|██▎       | 156/668 [00:27<01:59,  4.28it/s]









 24%|██▎       | 157/668 [00:28<02:20,  3.64it/s]













 24%|██▎       | 158/668 [00:28<02:05,  4.07it/s]











 24%|██▍       | 160/668 [00:28<01:43,  4.93it/s]















 24%|██▍       | 161/668 [00:28<01:30,  5.58it/s]



















 24%|██▍       | 162/668 [00:29<01:36,  5.24it/s]




















 24%|██▍       | 163/668 [00:29<01:46,  4.72it/s]









 25%|██▍       | 164/668 [00:29<01:59,  4.21it/s]














 25%|██▍       | 165/668 [00:29<01:53,  4.42it/s]















 25%|██▍       | 166/668 [00:30<01:56,  4.32it/s]













 25%|██▌       | 167/668 [00:30<01:56,  4.31it/s]










 25%|██▌       | 169/668 [00:30<01:37,  5.11it/s]
















 26%|██▌       | 171/668 [00:30<01:26,  5.75it/s]














 26%|██▌       | 173/668 [00:31<01:15,  6.56it/s]















 26%|██▌       | 175/668 [00:31<01:34,  5.22it/s]










 26%|██▋       | 176/668 [00:31<01:24,  5.82it/s]

















 27%|██▋       | 178/668 [00:32<01:24,  5.81it/s]






 27%|██▋       | 179/668 [00:32<01:25,  5.71it/s]











 27%|██▋       | 180/668 [00:32<01:46,  4.58it/s]













 27%|██▋       | 182/668 [00:33<01:32,  5.27it/s]
















 28%|██▊       | 184/668 [00:33<01:22,  5.89it/s]




















 28%|██▊       | 186/668 [00:33<01:12,  6.62it/s]








 28%|██▊       | 187/668 [00:33<01:06,  7.25it/s]






 28%|██▊       | 190/668 [00:34<01:11,  6.72it/s]



















 29%|██▊       | 191/668 [00:34<01:15,  6.33it/s]
















 29%|██▉       | 193/668 [00:34<01:20,  5.90it/s]

















 29%|██▉       | 194/668 [00:34<01:22,  5.75it/s]

















 29%|██▉       | 195/668 [00:35<01:23,  5.65it/s]















 30%|██▉       | 198/668 [00:35<01:15,  6.21it/s]









 30%|██▉       | 199/668 [00:35<01:18,  5.98it/s]













 30%|███       | 201/668 [00:35<01:20,  5.80it/s]













 30%|███       | 202/668 [00:36<01:32,  5.03it/s]












 31%|███       | 204/668 [00:36<01:18,  5.92it/s]
















 31%|███       | 205/668 [00:36<01:18,  5.91it/s]











 31%|███       | 206/668 [00:36<01:18,  5.89it/s]












 31%|███       | 208/668 [00:37<01:25,  5.39it/s]













 31%|███▏      | 209/668 [00:37<01:22,  5.57it/s]
























 31%|███▏      | 210/668 [00:37<01:20,  5.72it/s]





















 32%|███▏      | 212/668 [00:37<01:24,  5.40it/s]



















 32%|███▏      | 213/668 [00:38<01:39,  4.56it/s]
















 32%|███▏      | 214/668 [00:38<01:28,  5.15it/s]





















 32%|███▏      | 217/668 [00:39<01:21,  5.50it/s]








 33%|███▎      | 219/668 [00:39<01:07,  6.61it/s]






















 33%|███▎      | 220/668 [00:39<01:16,  5.83it/s]











 33%|███▎      | 221/668 [00:39<01:27,  5.13it/s]











 33%|███▎      | 222/668 [00:39<01:23,  5.35it/s]














 34%|███▎      | 224/668 [00:40<01:20,  5.53it/s]




















 34%|███▍      | 226/668 [00:40<01:07,  6.59it/s]













 34%|███▍      | 228/668 [00:40<01:03,  6.90it/s]
















 34%|███▍      | 229/668 [00:40<01:00,  7.24it/s]



















 34%|███▍      | 230/668 [00:41<01:11,  6.15it/s]










 35%|███▍      | 231/668 [00:41<01:35,  4.59it/s]










 35%|███▍      | 233/668 [00:41<01:25,  5.09it/s]











 35%|███▌      | 234/668 [00:42<01:29,  4.86it/s]















 35%|███▌      | 235/668 [00:42<01:47,  4.04it/s]













 35%|███▌      | 236/668 [00:42<01:35,  4.54it/s]























 35%|███▌      | 237/668 [00:42<01:55,  3.73it/s]























 36%|███▌      | 238/668 [00:43<02:16,  3.15it/s]











 36%|███▌      | 239/668 [00:43<01:58,  3.63it/s]





















 36%|███▌      | 240/668 [00:43<01:55,  3.72it/s]













 36%|███▌      | 242/668 [00:44<01:27,  4.87it/s]
















 37%|███▋      | 244/668 [00:44<01:08,  6.19it/s]





















 37%|███▋      | 246/668 [00:44<01:05,  6.49it/s]












 37%|███▋      | 248/668 [00:44<01:01,  6.84it/s]













 37%|███▋      | 250/668 [00:45<00:53,  7.77it/s]





 38%|███▊      | 252/668 [00:45<00:55,  7.48it/s]




















 38%|███▊      | 254/668 [00:45<00:53,  7.67it/s]
















 38%|███▊      | 255/668 [00:45<01:11,  5.75it/s]
















 38%|███▊      | 257/668 [00:46<00:59,  6.87it/s]









 39%|███▊      | 258/668 [00:46<01:02,  6.51it/s]













 39%|███▉      | 259/668 [00:46<01:04,  6.29it/s]














 39%|███▉      | 260/668 [00:46<01:11,  5.74it/s]















 39%|███▉      | 261/668 [00:46<01:17,  5.22it/s]
























 39%|███▉      | 263/668 [00:47<01:24,  4.80it/s]











 40%|███▉      | 264/668 [00:47<01:33,  4.33it/s]


















 40%|███▉      | 265/668 [00:48<01:38,  4.07it/s]


















 40%|███▉      | 266/668 [00:48<01:39,  4.05it/s]









 40%|███▉      | 267/668 [00:48<01:28,  4.54it/s]













 40%|████      | 268/668 [00:48<01:29,  4.46it/s]










 40%|████      | 269/668 [00:48<01:35,  4.20it/s]











 40%|████      | 270/668 [00:49<01:32,  4.28it/s]
























 41%|████      | 272/668 [00:49<01:23,  4.76it/s]






























 41%|████      | 274/668 [00:49<01:13,  5.33it/s]

























 41%|████      | 275/668 [00:50<01:27,  4.51it/s]










 41%|████▏     | 276/668 [00:50<01:35,  4.11it/s]










 42%|████▏     | 278/668 [00:50<01:25,  4.54it/s]








 42%|████▏     | 279/668 [00:51<01:32,  4.20it/s]




















 42%|████▏     | 281/668 [00:51<01:09,  5.56it/s]























 42%|████▏     | 282/668 [00:51<01:08,  5.60it/s]













 42%|████▏     | 283/668 [00:51<01:11,  5.40it/s]










 43%|████▎     | 285/668 [00:52<01:15,  5.07it/s]










 43%|████▎     | 286/668 [00:52<01:18,  4.86it/s]












 43%|████▎     | 287/668 [00:52<01:21,  4.67it/s]













 43%|████▎     | 288/668 [00:52<01:14,  5.07it/s]













 43%|████▎     | 290/668 [00:53<01:06,  5.66it/s]












 44%|████▎     | 291/668 [00:53<01:05,  5.74it/s]















 44%|████▎     | 292/668 [00:53<01:12,  5.16it/s]














 44%|████▍     | 294/668 [00:53<01:12,  5.15it/s]










 44%|████▍     | 295/668 [00:54<01:20,  4.62it/s]


















 44%|████▍     | 296/668 [00:54<01:16,  4.88it/s]











 44%|████▍     | 297/668 [00:54<01:08,  5.40it/s]








 45%|████▍     | 300/668 [00:54<01:00,  6.08it/s]









 45%|████▌     | 301/668 [00:55<01:07,  5.46it/s]











 45%|████▌     | 302/668 [00:55<01:10,  5.21it/s]











 46%|████▌     | 305/668 [00:55<00:56,  6.44it/s]





















 46%|████▌     | 307/668 [00:55<00:57,  6.31it/s]











 46%|████▌     | 308/668 [00:55<00:55,  6.50it/s]















 46%|████▋     | 309/668 [00:56<01:12,  4.94it/s]
















 46%|████▋     | 310/668 [00:56<01:11,  5.03it/s]












 47%|████▋     | 311/668 [00:56<01:16,  4.66it/s]






 47%|████▋     | 313/668 [00:57<01:12,  4.87it/s]













 47%|████▋     | 315/668 [00:57<01:00,  5.79it/s]














 47%|████▋     | 317/668 [00:57<00:52,  6.64it/s]

















 48%|████▊     | 319/668 [00:57<00:50,  6.91it/s]















 48%|████▊     | 320/668 [00:58<00:57,  6.06it/s]










 48%|████▊     | 321/668 [00:58<00:57,  6.03it/s]









 48%|████▊     | 323/668 [00:58<00:57,  6.00it/s]















 49%|████▊     | 324/668 [00:58<00:56,  6.10it/s]












 49%|████▉     | 326/668 [00:59<01:03,  5.39it/s]
















 49%|████▉     | 327/668 [00:59<00:55,  6.14it/s]





 49%|████▉     | 329/668 [00:59<00:49,  6.85it/s]















 49%|████▉     | 330/668 [00:59<01:09,  4.88it/s]







 50%|████▉     | 331/668 [01:00<01:19,  4.22it/s]












 50%|████▉     | 332/668 [01:00<01:22,  4.07it/s]














 50%|████▉     | 333/668 [01:00<01:24,  3.96it/s]














 50%|█████     | 334/668 [01:01<01:23,  4.02it/s]
















 50%|█████     | 336/668 [01:01<01:11,  4.63it/s]















 51%|█████     | 338/668 [01:01<00:56,  5.87it/s]











 51%|█████     | 340/668 [01:01<00:50,  6.55it/s]


















 51%|█████     | 341/668 [01:02<00:46,  6.96it/s]













 51%|█████▏    | 344/668 [01:02<00:50,  6.43it/s]









 52%|█████▏    | 345/668 [01:02<00:50,  6.45it/s]



















 52%|█████▏    | 346/668 [01:02<00:50,  6.35it/s]










 52%|█████▏    | 347/668 [01:03<01:02,  5.12it/s]
















 52%|█████▏    | 348/668 [01:03<01:11,  4.48it/s]














 52%|█████▏    | 349/668 [01:03<01:17,  4.14it/s]












 52%|█████▏    | 350/668 [01:03<01:12,  4.36it/s]












 53%|█████▎    | 351/668 [01:04<01:14,  4.26it/s]




















 53%|█████▎    | 352/668 [01:04<01:15,  4.20it/s]





















 53%|█████▎    | 354/668 [01:04<00:55,  5.70it/s]









 53%|█████▎    | 355/668 [01:04<00:49,  6.30it/s]














 53%|█████▎    | 356/668 [01:04<00:55,  5.57it/s]


























 53%|█████▎    | 357/668 [01:05<01:01,  5.05it/s]


























 54%|█████▎    | 358/668 [01:05<01:05,  4.73it/s]
















 54%|█████▎    | 359/668 [01:05<01:08,  4.50it/s]














 54%|█████▍    | 360/668 [01:05<01:09,  4.40it/s]









 54%|█████▍    | 361/668 [01:06<01:16,  3.99it/s]










 54%|█████▍    | 362/668 [01:06<01:20,  3.82it/s]















 54%|█████▍    | 363/668 [01:06<01:17,  3.93it/s]

















 55%|█████▍    | 365/668 [01:07<01:03,  4.77it/s]





















 55%|█████▍    | 367/668 [01:07<00:52,  5.69it/s]







 55%|█████▌    | 368/668 [01:07<00:54,  5.54it/s]















 55%|█████▌    | 370/668 [01:07<00:48,  6.08it/s]













 56%|█████▌    | 373/668 [01:08<00:35,  8.32it/s]







 56%|█████▌    | 375/668 [01:08<00:32,  9.08it/s]












 57%|█████▋    | 378/668 [01:08<00:33,  8.57it/s]











 57%|█████▋    | 379/668 [01:08<00:34,  8.48it/s]









 57%|█████▋    | 381/668 [01:09<00:40,  7.12it/s]









 57%|█████▋    | 383/668 [01:09<00:38,  7.39it/s]























 57%|█████▋    | 384/668 [01:09<01:01,  4.58it/s]









 58%|█████▊    | 385/668 [01:10<01:13,  3.87it/s]















 58%|█████▊    | 386/668 [01:10<01:04,  4.38it/s]











 58%|█████▊    | 387/668 [01:10<00:58,  4.81it/s]








 58%|█████▊    | 390/668 [01:10<00:51,  5.40it/s]














 59%|█████▊    | 391/668 [01:11<00:53,  5.15it/s]








 59%|█████▉    | 393/668 [01:11<00:52,  5.27it/s]















 59%|█████▉    | 395/668 [01:11<00:48,  5.61it/s]





















 59%|█████▉    | 397/668 [01:11<00:41,  6.58it/s]










 60%|█████▉    | 399/668 [01:12<00:39,  6.73it/s]













 60%|██████    | 401/668 [01:12<00:40,  6.60it/s]


















 60%|██████    | 402/668 [01:12<00:49,  5.36it/s]















 60%|██████    | 403/668 [01:13<00:50,  5.28it/s]
























 61%|██████    | 405/668 [01:13<00:43,  6.09it/s]








 61%|██████    | 407/668 [01:13<00:36,  7.11it/s]









 61%|██████    | 408/668 [01:13<00:35,  7.41it/s]













 61%|██████▏   | 410/668 [01:14<00:43,  5.98it/s]














 62%|██████▏   | 411/668 [01:14<00:42,  6.03it/s]











 62%|██████▏   | 412/668 [01:14<00:50,  5.08it/s]











 62%|██████▏   | 414/668 [01:14<00:46,  5.45it/s]












 62%|██████▏   | 415/668 [01:14<00:44,  5.73it/s]










 62%|██████▏   | 416/668 [01:15<00:46,  5.40it/s]











 62%|██████▏   | 417/668 [01:15<00:45,  5.50it/s]












 63%|██████▎   | 419/668 [01:15<00:46,  5.33it/s]











 63%|██████▎   | 420/668 [01:15<00:43,  5.64it/s]













 63%|██████▎   | 422/668 [01:16<00:40,  6.07it/s]
















 63%|██████▎   | 423/668 [01:16<00:42,  5.70it/s]
















 63%|██████▎   | 424/668 [01:16<00:51,  4.74it/s]





















 64%|██████▎   | 425/668 [01:16<00:49,  4.88it/s]















 64%|██████▍   | 426/668 [01:17<00:55,  4.39it/s]


















 64%|██████▍   | 427/668 [01:17<00:53,  4.53it/s]















 64%|██████▍   | 429/668 [01:17<00:46,  5.19it/s]



















 64%|██████▍   | 430/668 [01:17<00:42,  5.57it/s]

















 65%|██████▍   | 432/668 [01:18<00:44,  5.28it/s]














 65%|██████▍   | 433/668 [01:18<00:44,  5.27it/s]

















 65%|██████▌   | 435/668 [01:18<00:36,  6.44it/s]








 65%|██████▌   | 436/668 [01:18<00:54,  4.29it/s]














 65%|██████▌   | 437/668 [01:19<01:01,  3.76it/s]



















 66%|██████▌   | 438/668 [01:19<01:03,  3.65it/s]











 66%|██████▌   | 439/668 [01:19<01:07,  3.41it/s]
















 66%|██████▌   | 441/668 [01:20<00:47,  4.76it/s]















 66%|██████▋   | 443/668 [01:20<00:40,  5.60it/s]






















 67%|██████▋   | 445/668 [01:20<00:34,  6.39it/s]




















 67%|██████▋   | 447/668 [01:21<00:30,  7.36it/s]























 67%|██████▋   | 448/668 [01:21<00:28,  7.63it/s]









 68%|██████▊   | 451/668 [01:21<00:27,  7.96it/s]









 68%|██████▊   | 453/668 [01:21<00:28,  7.56it/s]












 68%|██████▊   | 455/668 [01:22<00:29,  7.20it/s]











 68%|██████▊   | 456/668 [01:22<00:31,  6.73it/s]




















 68%|██████▊   | 457/668 [01:22<00:32,  6.44it/s]



















 69%|██████▊   | 459/668 [01:22<00:32,  6.39it/s]

















 69%|██████▉   | 460/668 [01:22<00:32,  6.37it/s]

















 69%|██████▉   | 461/668 [01:23<00:33,  6.11it/s]























 69%|██████▉   | 463/668 [01:23<00:37,  5.43it/s]












 70%|██████▉   | 465/668 [01:23<00:32,  6.23it/s]













 70%|██████▉   | 466/668 [01:23<00:29,  6.90it/s]








 70%|███████   | 468/668 [01:24<00:29,  6.76it/s]














 71%|███████   | 472/668 [01:24<00:20,  9.38it/s]

















 71%|███████   | 474/668 [01:24<00:23,  8.30it/s]

















 71%|███████▏  | 477/668 [01:25<00:26,  7.25it/s]

















 72%|███████▏  | 478/668 [01:25<00:27,  6.96it/s]













 72%|███████▏  | 479/668 [01:25<00:29,  6.34it/s]









 72%|███████▏  | 481/668 [01:25<00:29,  6.27it/s]










 72%|███████▏  | 482/668 [01:25<00:27,  6.85it/s]




















 72%|███████▏  | 483/668 [01:26<00:31,  5.82it/s]




















 73%|███████▎  | 485/668 [01:26<00:34,  5.37it/s]















 73%|███████▎  | 486/668 [01:26<00:35,  5.09it/s]























 73%|███████▎  | 487/668 [01:27<00:43,  4.20it/s]









 73%|███████▎  | 488/668 [01:27<00:47,  3.81it/s]














 73%|███████▎  | 489/668 [01:27<00:45,  3.91it/s]










 73%|███████▎  | 490/668 [01:28<00:48,  3.66it/s]


















 74%|███████▎  | 491/668 [01:28<00:45,  3.86it/s]
















 74%|███████▎  | 492/668 [01:28<00:44,  3.99it/s]
















 74%|███████▍  | 493/668 [01:28<00:39,  4.45it/s]
















 74%|███████▍  | 494/668 [01:29<00:50,  3.47it/s]

















 74%|███████▍  | 495/668 [01:29<00:58,  2.98it/s]







 74%|███████▍  | 497/668 [01:29<00:44,  3.83it/s]
















 75%|███████▍  | 498/668 [01:30<00:40,  4.16it/s]

















 75%|███████▍  | 499/668 [01:30<00:40,  4.18it/s]




















 75%|███████▌  | 501/668 [01:30<00:33,  5.02it/s]













 75%|███████▌  | 502/668 [01:30<00:31,  5.25it/s]












 75%|███████▌  | 504/668 [01:31<00:28,  5.72it/s]



























 76%|███████▌  | 505/668 [01:31<00:28,  5.75it/s]























 76%|███████▌  | 508/668 [01:31<00:25,  6.28it/s]




























 76%|███████▋  | 511/668 [01:32<00:22,  6.97it/s]




















 77%|███████▋  | 512/668 [01:32<00:24,  6.50it/s]















 77%|███████▋  | 513/668 [01:32<00:28,  5.45it/s]



















 77%|███████▋  | 515/668 [01:32<00:29,  5.13it/s]






















 77%|███████▋  | 517/668 [01:33<00:23,  6.39it/s]













 78%|███████▊  | 518/668 [01:33<00:24,  6.10it/s]













 78%|███████▊  | 520/668 [01:33<00:26,  5.64it/s]














 78%|███████▊  | 521/668 [01:34<00:31,  4.65it/s]


























 78%|███████▊  | 522/668 [01:34<00:33,  4.31it/s]


























 79%|███████▊  | 525/668 [01:34<00:24,  5.87it/s]
















 79%|███████▊  | 526/668 [01:34<00:24,  5.88it/s]

























 79%|███████▉  | 529/668 [01:35<00:19,  7.20it/s]







 79%|███████▉  | 530/668 [01:35<00:25,  5.39it/s]













 79%|███████▉  | 531/668 [01:35<00:29,  4.58it/s]



























 80%|███████▉  | 533/668 [01:36<00:29,  4.59it/s]






















 80%|████████  | 536/668 [01:36<00:25,  5.26it/s]





















 81%|████████  | 538/668 [01:36<00:20,  6.36it/s]











 81%|████████  | 540/668 [01:37<00:18,  6.81it/s]











 81%|████████  | 542/668 [01:37<00:18,  6.88it/s]












 81%|████████▏ | 543/668 [01:37<00:19,  6.47it/s]













 81%|████████▏ | 544/668 [01:37<00:20,  6.20it/s]














 82%|████████▏ | 545/668 [01:38<00:22,  5.52it/s]

















 82%|████████▏ | 546/668 [01:38<00:27,  4.48it/s]












 82%|████████▏ | 548/668 [01:38<00:21,  5.58it/s]






















 82%|████████▏ | 550/668 [01:38<00:19,  6.19it/s]





















 82%|████████▏ | 551/668 [01:39<00:17,  6.58it/s]















 83%|████████▎ | 552/668 [01:39<00:25,  4.47it/s]














 83%|████████▎ | 554/668 [01:39<00:21,  5.38it/s]



















 83%|████████▎ | 555/668 [01:39<00:22,  4.99it/s]










 83%|████████▎ | 556/668 [01:40<00:26,  4.28it/s]










 83%|████████▎ | 557/668 [01:40<00:24,  4.61it/s]









 84%|████████▎ | 558/668 [01:40<00:26,  4.08it/s]












 84%|████████▎ | 559/668 [01:41<00:30,  3.60it/s]













 84%|████████▍ | 560/668 [01:41<00:27,  3.95it/s]









 84%|████████▍ | 561/668 [01:41<00:24,  4.38it/s]











 84%|████████▍ | 562/668 [01:41<00:24,  4.38it/s]















 84%|████████▍ | 563/668 [01:41<00:22,  4.69it/s]









 84%|████████▍ | 564/668 [01:42<00:22,  4.69it/s]



















 85%|████████▍ | 565/668 [01:42<00:24,  4.15it/s]




















 85%|████████▍ | 566/668 [01:42<00:25,  3.96it/s]













 85%|████████▍ | 567/668 [01:42<00:26,  3.86it/s]
































 85%|████████▌ | 568/668 [01:43<00:30,  3.28it/s]














 85%|████████▌ | 569/668 [01:43<00:26,  3.71it/s]
















 85%|████████▌ | 570/668 [01:43<00:23,  4.12it/s]


















 85%|████████▌ | 571/668 [01:44<00:27,  3.58it/s]
















 86%|████████▌ | 572/668 [01:44<00:26,  3.68it/s]














 86%|████████▌ | 573/668 [01:44<00:25,  3.79it/s]


















 86%|████████▌ | 574/668 [01:44<00:23,  3.96it/s]









 86%|████████▌ | 575/668 [01:45<00:25,  3.71it/s]

















 86%|████████▌ | 576/668 [01:45<00:23,  3.88it/s]


















 86%|████████▋ | 577/668 [01:45<00:19,  4.59it/s]

















 87%|████████▋ | 579/668 [01:45<00:18,  4.94it/s]
















 87%|████████▋ | 580/668 [01:46<00:17,  5.01it/s]

















 87%|████████▋ | 581/668 [01:46<00:16,  5.16it/s]













 87%|████████▋ | 582/668 [01:46<00:17,  4.91it/s]














 87%|████████▋ | 583/668 [01:46<00:20,  4.24it/s]







 87%|████████▋ | 584/668 [01:47<00:21,  3.85it/s]















 88%|████████▊ | 585/668 [01:47<00:21,  3.91it/s]















 88%|████████▊ | 586/668 [01:47<00:20,  4.07it/s]









 88%|████████▊ | 588/668 [01:47<00:16,  4.99it/s]



















 88%|████████▊ | 589/668 [01:48<00:16,  4.83it/s]











 88%|████████▊ | 590/668 [01:48<00:15,  4.91it/s]









 89%|████████▊ | 592/668 [01:48<00:13,  5.81it/s]



















 89%|████████▉ | 594/668 [01:48<00:10,  6.93it/s]











 89%|████████▉ | 595/668 [01:49<00:13,  5.55it/s]













 89%|████████▉ | 596/668 [01:49<00:15,  4.52it/s]




















 89%|████████▉ | 597/668 [01:49<00:17,  3.99it/s]































 90%|████████▉ | 598/668 [01:50<00:18,  3.68it/s]










 90%|████████▉ | 599/668 [01:50<00:17,  3.94it/s]












 90%|████████▉ | 600/668 [01:50<00:18,  3.59it/s]











 90%|████████▉ | 601/668 [01:50<00:17,  3.73it/s]












 90%|█████████ | 602/668 [01:51<00:18,  3.57it/s]























 90%|█████████ | 603/668 [01:51<00:16,  3.98it/s]














 90%|█████████ | 604/668 [01:51<00:17,  3.66it/s]

















 91%|█████████ | 605/668 [01:51<00:15,  4.17it/s]

























 91%|█████████ | 606/668 [01:52<00:17,  3.48it/s]









 91%|█████████ | 607/668 [01:52<00:17,  3.47it/s]












 91%|█████████ | 609/668 [01:52<00:15,  3.93it/s]













 91%|█████████▏| 610/668 [01:53<00:13,  4.42it/s]


















 92%|█████████▏| 612/668 [01:53<00:11,  4.92it/s]














 92%|█████████▏| 613/668 [01:53<00:11,  4.99it/s]


















 92%|█████████▏| 614/668 [01:53<00:10,  5.05it/s]












 92%|█████████▏| 616/668 [01:54<00:10,  5.19it/s]



















 93%|█████████▎| 618/668 [01:54<00:08,  5.82it/s]










 93%|█████████▎| 619/668 [01:54<00:09,  5.18it/s]












 93%|█████████▎| 620/668 [01:54<00:08,  5.79it/s]


















 93%|█████████▎| 622/668 [01:55<00:07,  6.13it/s]










 93%|█████████▎| 623/668 [01:55<00:08,  5.55it/s]














 93%|█████████▎| 624/668 [01:55<00:08,  5.23it/s]














 94%|█████████▎| 625/668 [01:55<00:07,  5.76it/s]




















 94%|█████████▎| 626/668 [01:56<00:08,  5.08it/s]

















 94%|█████████▍| 628/668 [01:56<00:07,  5.44it/s]







 94%|█████████▍| 629/668 [01:56<00:06,  5.78it/s]









 94%|█████████▍| 631/668 [01:56<00:06,  6.09it/s]















 95%|█████████▍| 632/668 [01:57<00:06,  5.50it/s]















 95%|█████████▍| 633/668 [01:57<00:06,  5.31it/s]











 95%|█████████▍| 634/668 [01:57<00:07,  4.34it/s]








 95%|█████████▌| 635/668 [01:58<00:08,  3.70it/s]











 95%|█████████▌| 636/668 [01:58<00:09,  3.51it/s]

















 95%|█████████▌| 637/668 [01:58<00:08,  3.56it/s]






























 96%|█████████▌| 638/668 [01:58<00:09,  3.30it/s]












 96%|█████████▌| 640/668 [01:59<00:07,  3.88it/s]













 96%|█████████▌| 641/668 [01:59<00:06,  3.99it/s]











 96%|█████████▌| 642/668 [01:59<00:06,  4.03it/s]




















 96%|█████████▋| 643/668 [02:00<00:06,  4.15it/s]




















 97%|█████████▋| 645/668 [02:00<00:04,  5.16it/s]


























 97%|█████████▋| 649/668 [02:00<00:03,  5.45it/s]














 97%|█████████▋| 651/668 [02:01<00:02,  6.35it/s]


















 98%|█████████▊| 652/668 [02:01<00:02,  6.99it/s]



















 98%|█████████▊| 654/668 [02:01<00:02,  6.51it/s]


















 98%|█████████▊| 655/668 [02:01<00:01,  7.19it/s]





















 98%|█████████▊| 657/668 [02:02<00:01,  6.02it/s]












 99%|█████████▉| 660/668 [02:02<00:01,  7.49it/s]









 99%|█████████▉| 662/668 [02:02<00:00,  7.08it/s]


















 99%|█████████▉| 664/668 [02:03<00:00,  7.59it/s]













100%|█████████▉| 666/668 [02:03<00:00,  6.85it/s]














100%|█████████▉| 667/668 [02:03<00:00,  5.62it/s]














100%|██████████| 668/668 [02:03<00:00,  5.40it/s]










In [1]:
HTML(gen_html)

NameError: name 'HTML' is not defined

## POS

In [43]:
df_non_term_toekn_info.iloc[:10,:]

Unnamed: 0,claim,token,org_label,list_rationales,data_source,token_text_spacy,token_lemma_spacy,pos_spacy,dep_spacy,tag_spacy,tag_explain_spacy,wn_nltk_syn,morf_nltk,wn_pattern_syn
0,(1 in 5 million in UK have abnormal PrP positi...,1,REFUTES,"[RESULTS Of the 32,441 appendix samples 16 wer...",train,1,1,NUM,compound,CD,"(cardinal number,)","[single, ane, i, I, one, unity, ace, 1]",{'NumType': 'Card'},[one]
1,(1 in 5 million in UK have abnormal PrP positi...,in,REFUTES,"[RESULTS Of the 32,441 appendix samples 16 wer...",train,in,in,ADP,quantmod,IN,"(conjunction, subordinating or preposition,)","[atomic_number_49, In, inwards, inward, inch, ...",{},[]
2,(1 in 5 million in UK have abnormal PrP positi...,5,REFUTES,"[RESULTS Of the 32,441 appendix samples 16 wer...",train,5,5,NUM,compound,CD,"(cardinal number,)","[V, five, 5, quintet, fivesome, pentad, quint,...",{'NumType': 'Card'},[five]
3,(1 in 5 million in UK have abnormal PrP positi...,million,REFUTES,"[RESULTS Of the 32,441 appendix samples 16 wer...",train,million,million,NUM,nmod:npmod,CD,"(cardinal number,)","[one_thousand_thousand, trillion, zillion, bil...",{'NumType': 'Card'},[]
4,(1 in 5 million in UK have abnormal PrP positi...,in,REFUTES,"[RESULTS Of the 32,441 appendix samples 16 wer...",train,in,in,ADP,ROOT,IN,"(conjunction, subordinating or preposition,)","[atomic_number_49, In, inwards, inward, inch, ...",{},[]
5,(1 in 5 million in UK have abnormal PrP positi...,,REFUTES,"[RESULTS Of the 32,441 appendix samples 16 wer...",train,,,SPACE,dep,_SP,"(None,)",[],{'PunctType': 'Peri'},[]
6,(1 in 5 million in UK have abnormal PrP positi...,have,REFUTES,"[RESULTS Of the 32,441 appendix samples 16 wer...",train,have,have,VERB,ROOT,VB,"(verb, base form,)","[own, stimulate, get, give, sustain, wealthy_p...",{'VerbForm': 'Inf'},"[own, stimulate, get, give, sustain, experienc..."
7,(1 in 5 million in UK have abnormal PrP positi...,abnormal,REFUTES,"[RESULTS Of the 32,441 appendix samples 16 wer...",train,abnormal,abnormal,ADJ,xcomp,JJ,"(adjective,)","[abnormal, unnatural]",{'Degree': 'Pos'},[]
8,"(1,000 genomes project enables mapping of gene...",1000,SUPPORTS,"[In conclusion, uncommon or rare genetic varia...",dev,1000,1000,NUM,ROOT,CD,"(cardinal number,)",[],{'NumType': 'Card'},[]
9,"(1,000 genomes project enables mapping of gene...",,SUPPORTS,"[In conclusion, uncommon or rare genetic varia...",dev,,,SPACE,dep,_SP,"(None,)",[],{'PunctType': 'Peri'},[]


In [40]:
HTML(df_non_term_toekn_info[['claim', 'token', 'token_text_spacy', 'pos_spacy', 'tag_spacy', 'tag_explain_spacy']].to_html())

Unnamed: 0,claim,token,token_text_spacy,pos_spacy,tag_spacy,tag_explain_spacy
0,"(1 in 5 million in UK have abnormal PrP positivity.,)",1,1,NUM,CD,"(cardinal number,)"
1,"(1 in 5 million in UK have abnormal PrP positivity.,)",in,in,ADP,IN,"(conjunction, subordinating or preposition,)"
2,"(1 in 5 million in UK have abnormal PrP positivity.,)",5,5,NUM,CD,"(cardinal number,)"
3,"(1 in 5 million in UK have abnormal PrP positivity.,)",million,million,NUM,CD,"(cardinal number,)"
4,"(1 in 5 million in UK have abnormal PrP positivity.,)",in,in,ADP,IN,"(conjunction, subordinating or preposition,)"
5,"(1 in 5 million in UK have abnormal PrP positivity.,)",,,SPACE,_SP,"(None,)"
6,"(1 in 5 million in UK have abnormal PrP positivity.,)",have,have,VERB,VB,"(verb, base form,)"
7,"(1 in 5 million in UK have abnormal PrP positivity.,)",abnormal,abnormal,ADJ,JJ,"(adjective,)"
8,"(1,000 genomes project enables mapping of genetic sequence variation consisting of rare variants with larger penetrance effects than common variants.,)",1000,1000,NUM,CD,"(cardinal number,)"
9,"(1,000 genomes project enables mapping of genetic sequence variation consisting of rare variants with larger penetrance effects than common variants.,)",,,SPACE,_SP,"(None,)"


In [32]:
df_non_term_toekn_info = pd.DataFrame(list_dic_of_sent_tag_info)

In [33]:
df_non_term_toekn_info['pos_spacy'].value_counts()

NOUN     1070
ADP      1048
SPACE     979
VERB      868
ADJ       498
DET       407
PUNCT     106
ADV        84
CCONJ      80
PART       78
NUM        45
SCONJ      34
PROPN      27
PRON       22
AUX        17
X           9
SYM         3
Name: pos_spacy, dtype: int64

In [34]:
df_non_term_toekn_info['tag_spacy'].value_counts()

IN       1081
_SP       979
NN        851
JJ        446
DT        399
VBZ       338
NNS       219
VBN       183
VB        139
VBG        98
VBP        97
CC         80
RB         78
-RRB-      48
JJR        46
CD         45
TO         40
,          39
NNP        27
RBR        19
MD         17
WRB        16
VBD        13
-LRB-      10
FW          9
EX          8
WDT         8
PRP$        7
JJS         6
POS         6
WP          5
''          4
RBS         3
``          2
$           2
PRP         2
HYPH        2
RP          1
SYM         1
NFP         1
Name: tag_spacy, dtype: int64

In [35]:
df_non_term_toekn_info.groupby(['pos_spacy', 'tag_spacy']).size()

pos_spacy  tag_spacy
ADJ        JJ            446
           JJR            46
           JJS             6
ADP        IN           1047
           RP              1
ADV        RB             46
           RBR            19
           RBS             3
           WRB            16
AUX        MD             17
CCONJ      CC             80
DET        DT            399
           WDT             8
NOUN       NN            851
           NNS           219
NUM        CD             45
PART       POS             6
           RB             32
           TO             40
PRON       EX              8
           PRP             2
           PRP$            7
           WP              5
PROPN      NNP            27
PUNCT      ''              4
           ,              39
           -LRB-          10
           -RRB-          48
           HYPH            2
           NFP             1
           ``              2
SCONJ      IN             34
SPACE      _SP           979
SYM        $          

## Debug

In [None]:
!conda env list

In [None]:
!/home/computeruser/anaconda3/envs/scifact/bin/pip install pattern

In [None]:
!/home/computeruser/anaconda3/envs/scifact/bin/pip install --no-binary :all: nmslib

In [None]:
!conda install pattern