The review of NER models is taken from:
https://arxiv.org/pdf/2205.00034.pdf

In [1]:
! pip install stanza
! pip install spacy
! pip install spark-nlp==4.0.2 pyspark==3.2.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting spark-nlp==4.0.2
  Downloading spark_nlp-4.0.2-py2.py3-none-any.whl (532 kB)
[K     |████████████████████████████████| 532 kB 5.1 MB/s 
[?25hCollecting pyspark==3.2.1
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 38 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 54.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=885e18953eb3c9d9ce427b4f15d55005770882ccad0ec6d220102dcec90020c3
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, spark-nlp, pyspar

In [2]:
import pandas as pd
from tqdm import tqdm
import stanza
import spacy
from sparknlp.base import *
from sparknlp.annotator import *
from sparknlp.pretrained import PretrainedPipeline
import sparknlp

In [4]:
# Load the input CSV into `df`; the NER cells below read its 'question' column.
df = pd.read_csv('data_train_marked.csv')

### Prediction by pretrained models

In [None]:
# NER with Stanza: English pipeline restricted to the tokenize and ner processors.
nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')

# Tag each question in turn; the texts of the entities found are stored as one
# comma-separated string in the 'ner_stanza' column ('' when none are found).
for row in tqdm(range(len(df))):
    found = nlp(df.loc[row, 'question']).ents
    df.loc[row, 'ner_stanza'] = ','.join([ent.text for ent in found])

2022-10-02 13:08:17 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 191kB [00:00, 8.71MB/s]                    
2022-10-02 13:08:17 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | combined  |
| ner       | ontonotes |

2022-10-02 13:08:17 INFO: Use device: cpu
2022-10-02 13:08:17 INFO: Loading: tokenize
2022-10-02 13:08:17 INFO: Loading: ner
2022-10-02 13:08:18 INFO: Done loading processors!
100%|██████████| 19481/19481 [21:55<00:00, 14.81it/s]


In [None]:
# NER spacy
! python3 -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")

for j in tqdm(range(len(df))):
    cur_question = df.loc[j, 'question']
    doc = nlp(cur_question)
    df.loc[j, 'ner_spacy'] =','.join([ent.text for ent in doc.ents])

100%|██████████| 19481/19481 [01:32<00:00, 209.89it/s]


In [5]:
# NER with Spark NLP

# NB (author's note): this implementation only works in Colab.
# Launch a Spark session with Spark NLP.
spark = sparknlp.start()

# Fetch the pretrained English 'explain_document_dl' pipeline.
pipeline = PretrainedPipeline('explain_document_dl', lang='en')

# Annotate each question in turn; the pipeline's 'entities' output is stored as
# one comma-separated string in the 'ner_spark' column.
for row in tqdm(range(len(df))):
    annotations = pipeline.annotate(df.loc[row, 'question'])
    df.loc[row, 'ner_spark'] = ','.join(annotations['entities'])

explain_document_dl download started this may take some time.
Approx size to download 169.4 MB
[OK!]


100%|██████████| 19481/19481 [15:26<00:00, 21.04it/s]


In [6]:
# Write the frame, including the ner_* columns added above, to CSV without the
# index column; the results section below reloads this file.
df.to_csv('train_ner_pretrained.csv', index=False)

### NER results (fraction of rows where no entities were found)

In [7]:
# Reload the saved predictions. Rows where a model found no entities were
# stored as empty strings (join over an empty list); with read_csv's default
# NA handling those come back as NaN, which the counts below rely on.
df = pd.read_csv('train_ner_pretrained.csv')

In [None]:
# Stanza: fraction of rows whose 'ner_stanza' value is missing (NaN)
count = int(df['ner_stanza'].isna().sum())
print(count / len(df))

0.7437503208254196


In [None]:
# spaCy: fraction of rows whose 'ner_spacy' value is missing (NaN)
count = int(df['ner_spacy'].isna().sum())
print(count / len(df))

0.6418561675478671


In [9]:
# Spark NLP: fraction of rows whose 'ner_spark' value is missing (NaN)
count = int(df['ner_spark'].isna().sum())
print(count / len(df))

0.8881474256968328
