<a href="https://colab.research.google.com/github/sohiniroych/AI_with_Sohini_Notebooks/blob/main/NER_functions_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install datasets transformers seqeval

Collecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 5.1 MB/s 
[?25hCollecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 35.3 MB/s 
[?25hCollecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[K     |████████████████████████████████| 43 kB 1.6 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 45.4 MB/s 
[?25hCollecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 53.9 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 39.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
 

In [None]:
# Mount Google Drive (forcing a re-mount, which triggers the authorization flow)
# and switch into the project's code folder so that the relative paths used in
# later cells resolve against it.
import os

from google.colab import drive

drive.mount('/content/drive', force_remount=True)
os.chdir('/content/drive/MyDrive/Colab Notebooks/NER/StackOverflowNER/code/')

Mounted at /content/drive


In [None]:
# Text file that is fed to NER_for_Tech below. The path is relative, so it is
# resolved against the working directory selected with os.chdir above.
data_path='./DataReader/text_files/13352832.txt'

In [None]:
# Author: Sohini.roychowdhury@accenture.com
# This file contains utility functions to enable Named Entity Recognition of technical details from written text.
# Function NER_for_Tech performs NER for the tech stack.
# Function Display_NER displays the detected entities (B-entity, I-entity) using displaCy.
from transformers import AutoModelForTokenClassification, pipeline, AutoTokenizer
import spacy
from spacy import displacy
import numpy as np

# NER map is hardcoded from BERT-overflow. It can change 

#ner_map={"0": "B-Algorithm", "1": "B-Application", "2": "B-Class", "3": "B-Code_Block", "4": "B-Data_Structure", "5": "B-Data_Type", "6": "B-Device", "7": "B-File_Name", "8": "B-File_Type", "9": "B-Function", "10": "B-HTML_XML_Tag", "11": "B-Language", "12": "B-Library", "13": "B-Operating_System", "14": "B-Output_Block", "15": "B-User_Interface_Element", "16": "B-User_Name", "17": "B-Variable", "18": "B-Version", "19": "B-Website", "20": "I-Algorithm", "21": "I-Application", "22": "I-Class", "23": "I-Code_Block", "24": "I-Data_Structure", "25": "I-Data_Type", "26": "I-Device", "27": "I-File_Name", "28": "I-File_Type", "29": "I-Function", "30": "I-HTML_XML_Tag", "31": "I-Language", "32": "I-Library", "33": "I-Operating_System", "34": "I-Output_Block", "35": "I-User_Interface_Element", "36": "I-User_Name", "37": "I-Variable", "38": "I-Version", "39": "I-Website", "40": "O"}

def NER_for_Tech(path_to_file, model_checkpoint="jeniya/BERTOverflow", BI=True, min_score=0.05):
    """Run token-classification NER over a text file and package it for displaCy.

    Args:
        path_to_file: Path of the text file to analyse (read as UTF-8).
        model_checkpoint: Hugging Face checkpoint name or path, loaded through
            AutoTokenizer / AutoModelForTokenClassification.
        BI: If truthy, every ``B-<type>`` token is merged with the ``I-`` tokens
            that immediately follow it and the result is returned as a spaCy
            ``Doc`` whose ``ents`` are those merged spans. Otherwise the raw
            per-token predictions are returned.
        min_score: Minimum score the ``B-`` token must have for its span to be
            kept. Only used when ``BI`` is truthy. Defaults to the previously
            hard-coded 0.05.

    Returns:
        When ``BI`` is truthy: a spaCy ``Doc`` built with ``spacy.blank("en")``.
        Otherwise: ``{'ents': [...], 'text': <file contents>}`` where each
        prediction dict carries its tag under ``'label'`` instead of
        ``'entity'`` (the form passed to displaCy with ``manual=True``).
    """
    # 1. Read the file contents. The encoding is explicit so the result does
    #    not depend on the platform's default locale.
    with open(path_to_file, encoding="utf-8") as f:
        contents = f.read()

    # 2. Run the token-classification pipeline on the whole text.
    stack_tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
    stack_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
    ner_pipeline = pipeline("ner", model=stack_model, tokenizer=stack_tokenizer)
    ner_results = ner_pipeline(contents)

    if not BI:
        # Raw mode: only rename the tag key so the dicts suit displaCy.
        for prediction in ner_results:
            prediction['label'] = prediction.pop('entity')
        return {'ents': ner_results, 'text': contents}

    # 3. Consolidate B-/I- runs into (type, start_char, end_char) triples and
    #    discard runs whose opening token scores below the threshold.
    entities = []
    n_tokens = len(ner_results)
    for i, prediction in enumerate(ner_results):
        tag = prediction['entity']
        if not tag.startswith('B-') or prediction['score'] < min_score:
            continue
        j = i + 1
        # Any I- tag extends the run, whatever entity type it names.
        while j < n_tokens and ner_results[j]['entity'].startswith('I-'):
            j += 1
        # maxsplit=1 keeps entity types that themselves contain a hyphen.
        entity_type = tag.split('-', 1)[1]
        entities.append((entity_type, prediction['start'], ner_results[j - 1]['end']))

    # 4. Create a spaCy document that displaCy can render.
    nlp = spacy.blank("en")  # English language
    doc = nlp(contents)
    spans = []
    for entity_type, start, end in entities:
        if end <= start + 1:
            continue  # drop spans that cover at most one character
        span = doc.char_span(start, end, label=entity_type)
        # char_span yields None when it cannot build a span for these offsets.
        if span is not None:
            spans.append(span)
    doc.ents = spans
    return doc
    
    
def Display_NER(doc, BI=True):
    """Render NER output inline with displaCy's entity ("ent") style.

    Args:
        doc: The object to render, e.g. the value returned by NER_for_Tech
            called with the same ``BI`` setting.
        BI: When equal to True, ``doc`` is rendered as is. Otherwise the extra
            ``manual=True`` flag is passed to ``displacy.render``.

    Returns:
        None. The function has no return statement; it only triggers the
        rendering call.
    """
    render_options = {"style": "ent", "jupyter": True}
    if not (BI == True):
        # Raw prediction dicts are rendered through displaCy's manual mode.
        render_options["manual"] = True
    displacy.render(doc, **render_options)
    

In [None]:
# Sanity check of the working directory selected with os.chdir earlier.
# NOTE(review): os.getcwd() is not the last expression of the cell, so its value
# is not displayed; only the `!ls` listing shows up in the output.
os.getcwd()
!ls

Attentive_BiLSTM   Flow_NER_BERT_Overflow    __pycache__
BERT_NER	   NER_BERT_Overflow	     Readme.md
BERT_NER_Utils.py  NER_functions_test.ipynb  SOTokenizer
DataReader	   ner_map.txt


In [None]:
# Run NER with the default checkpoint ("jeniya/BERTOverflow") and B-/I- consolidation.
# NOTE(review): the load warning printed below reports that classifier.weight and
# classifier.bias were newly initialized for this checkpoint, i.e. the tagging
# head is untrained — confirm whether a fine-tuned NER checkpoint was intended.
doc=NER_for_Tech(data_path)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at jeniya/BERTOverflow and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Render the consolidated entities of `doc` with displaCy.
Display_NER(doc)

In [None]:
# Same file through 'bert-base-uncased', keeping the raw per-token predictions (BI=False).
# NOTE(review): the load output below again reports BertForTokenClassification
# weights that were not initialized from the checkpoint — confirm the
# predictions of this run are meaningful before relying on them.
doc1=NER_for_Tech(data_path, model_checkpoint='bert-base-uncased', BI=False)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-u

In [None]:
# Render the raw per-token predictions; BI=False must match the call that produced doc1.
Display_NER(doc1, BI=False)

# Now we try spacy-annotator for display

In [None]:
! pip install ner-annotator

Collecting ner-annotator
  Downloading ner_annotator-0.1.1-py3-none-any.whl (17 kB)
Collecting pyqt5==5.12
  Downloading PyQt5-5.12-5.12.1_a-cp35.cp36.cp37.cp38-abi3-manylinux1_x86_64.whl (61.1 MB)
[K     |████████████████████████████████| 61.1 MB 6.1 kB/s 
[?25hCollecting PyQt5_sip<4.20,>=4.19.14
  Downloading PyQt5_sip-4.19.19-cp37-cp37m-manylinux1_x86_64.whl (67 kB)
[K     |████████████████████████████████| 67 kB 2.3 MB/s 
[?25hInstalling collected packages: PyQt5-sip, pyqt5, ner-annotator
Successfully installed PyQt5-sip-4.19.19 ner-annotator-0.1.1 pyqt5-5.12


In [None]:
# Load the raw text of the sample file for the annotation experiment below.
# The encoding is explicit so the result does not depend on the platform default.
with open(data_path, encoding="utf-8") as f:
    contents = f.read()

In [None]:
import pandas as pd
import ner_annotator as ann


In [None]:
# Wrap the file contents in a one-row DataFrame with a single "text" column.
df=pd.DataFrame({"text":[contents]})

In [None]:
# Display the one-row frame built above.
df

Unnamed: 0,text
0,Our business unit relies on big data processin...


In [None]:
# NOTE(review): this line is not valid Python — the cell output below is a
# SyntaxError. It looks like a command-line style invocation; TODO confirm the
# intended command or remove this cell.
ann data_path -e 'GPE' 'Person'

SyntaxError: ignored

In [None]:
# NOTE(review): neither `spa` nor `nlp` is defined in any cell shown in this
# notebook (the import above binds `ner_annotator` as `ann`), so this cell
# depends on hidden kernel state and will fail after Restart & Run All —
# TODO add the missing import and model definition.
annotator=spa.Annotator(labels=["GPE","Person","Noun"], model=nlp)

In [None]:
# Start annotating the "text" column of df; the output below is an interactive
# widget, so the content of df_labels depends on what is entered manually.
df_labels=annotator.annotate(df=df, col_text="text", shuffle=True)

HTML(value='-1 examples annotated, 2 examples left')

Text(value='', description='GPE', layout=Layout(width='auto'), placeholder='ent one, ent two, ent three')

Text(value='', description='Person', layout=Layout(width='auto'), placeholder='ent one, ent two, ent three')

Text(value='', description='Noun', layout=Layout(width='auto'), placeholder='ent one, ent two, ent three')

HBox(children=(Button(button_style='success', description='submit', style=ButtonStyle()), Button(button_style=…

Output()

In [None]:
# Display the annotated frame (the output shows an added "annotations" column).
df_labels

Unnamed: 0,text,annotations
0,Our business unit relies on big data processin...,(Our business unit relies on big data processi...
