In [7]:
import en_core_web_lg
import pandas as pd
import re
import random
import spacy
from spacy.util import minibatch, compounding
import warnings
import matplotlib.pyplot as plt
import re
import os
import numpy as np
import json

In [2]:
# Display the installed spaCy version as the cell output; the CLI commands used
# later in this notebook (`spacy init config`, `spacy train`) are spaCy v3 commands.
spacy.__version__

'3.3.0'

## Custom Dataset Generation

To generate the custom dataset, I used a food-related dataset from Kaggle named "Amazon Fine Food Reviews" (https://www.kaggle.com/datasets/snap/amazon-fine-food-reviews).

First, I created a list of common foods (18 in total) and a list of helper words that are likely to appear in sentences about food.

In [3]:
# Food names searched for in the (lowercased) review texts.
# Multi-word names such as 'seven up' are matched as a single substring.
ENTITY_NAMES = [
    'gravy', 'ramen', 'burger', 'pizza', 'pasta', 'wings',
    'coke', 'sprite', 'water', 'fanta', 'pepsi', 'seven up',
    'biriyani', 'rice', 'pulao', 'bread', 'flat bread', 'rice bowl',
]

# Descriptive words used to keep only lines that actually talk about food.
helper_words = [
    'flavour', 'flavours', 'tasty', 'delicious',
    'juicy', 'spicy', 'soft', 'chewy',
]


Duplicate reviews were removed and the review texts were converted to lowercase.

In [76]:
# Load the reviews, keep a single row per distinct review text, and lowercase
# the text so the later substring searches are case-insensitive.
sentence_df = (
    pd.read_csv("Reviews.csv")
    .drop_duplicates("Text")
    .assign(Text=lambda reviews: reviews["Text"].str.lower())
)

In [None]:
# For every food name, report how many reviews mention it and how many of those
# also contain at least one helper word.
# NOTE: this is plain substring matching, so e.g. 'rice' also matches 'price'.
review_texts = sentence_df['Text']

# The helper-word mask does not depend on the food, so compute it once instead
# of on every iteration. na=False makes missing texts count as "no match".
helper_pattern = '|'.join(re.escape(word) for word in helper_words)
has_helper_word = review_texts.str.contains(helper_pattern, na=False)

for food in ENTITY_NAMES:
    # regex=False: the food name is a literal string, not a pattern.
    has_food = review_texts.str.contains(food, regex=False, na=False)
    print("Total # of reviews with the word '{}' in them: ".format(food), has_food.sum())
    print("Total # of reviews with the word '{}' and helper words in them: ".format(food), (has_food & has_helper_word).sum())

For every food, the first 500 reviews containing it were taken and split into lines at each full stop. A line was written to the file "food_sentences.txt" only if it contained a food and a helper word. Some minor text formatting was done, such as removing HTML tags (e.g. `<br />`). Also, for each food, lines containing any of the previous foods in the list were discarded to avoid repetition.

In [None]:
# Maximum number of reviews examined per food name.
take_first = 500

# Lines already written, so the same line is never emitted twice (a review that
# mentions several foods is visited once per food it contains).
written_lines = set()

# Explicit UTF-8: the platform default encoding may be unable to encode
# characters that occur in the review texts.
with open('food_sentences.txt', 'w', encoding='utf-8') as file:
    for food_no, food in enumerate(ENTITY_NAMES):
        earlier_foods = ENTITY_NAMES[:food_no]
        # regex=False: literal substring match; na=False: missing texts never match.
        mentions_food = sentence_df['Text'].str.contains(food, regex=False, na=False)
        for sentence in sentence_df['Text'][mentions_food].iloc[:take_first]:
            for line in sentence.split('.'):
                # Keep a line only if it names some food, names none of the foods
                # handled in earlier iterations, and contains a helper word.
                if not any(x in line for x in ENTITY_NAMES):
                    continue
                if any(x in line for x in earlier_foods):
                    continue
                if not any(x in line for x in helper_words):
                    continue
                # Remove the tag first, then strip, so whitespace that was next
                # to a tag does not survive at the ends of the line.
                cleaned = line.replace('<br />', '').strip()
                if cleaned in written_lines:
                    continue
                written_lines.add(cleaned)
                file.write(cleaned + '\n')

The sentences saved in the file were then manually annotated using an NER annotator tool and saved in the file "annotations.json". The annotator tool that was used is: https://tecoholic.github.io/ner-annotator/ .

In [180]:
# Load the manually annotated sentences. The context manager closes the file
# handle (the bare open() call left it open); `json` is imported in the first cell.
with open('annotations.json', encoding='utf-8') as f:
    TRAIN_DATA = json.load(f)

In [191]:
# Preview the first five annotated sentences together with their entity spans.
print("Annotations of first 5 sentences \n")
preview = TRAIN_DATA["annotations"][:5]
for entry in preview:
    sentence_text, entity_info = entry[0], entry[1]
    print("Sentence: ", sentence_text)
    print("Entities:", entity_info, '\n')

Annotations of first 5 sentences 

Sentence:  this mix makes the most delicious pancakes! my gluten eating family wanted mine instead of theirs! i have also make sausage balls and biscuits for biscuits and gravy with it! so good!
Entities: {'entities': [[34, 42, 'FOOD'], [116, 123, 'FOOD'], [134, 142, 'FOOD'], [147, 155, 'FOOD'], [160, 165, 'FOOD']]} 

Sentence:  lemon juice and fruit concentrates mingle with the sparkling water to create a delicate, smooth, slightly tingly and delicious drink
Entities: {'entities': [[0, 11, 'FOOD'], [16, 21, 'FOOD'], [61, 66, 'FOOD'], [127, 132, 'FOOD']]} 

Sentence:  it only takes two cans of chicken broth to one package of quinoa to make a lot of very tasty and filling finished product which can be used with gravy, sauce, mixed vegetables or anything you like
Entities: {'entities': [[26, 39, 'FOOD'], [58, 64, 'FOOD'], [145, 150, 'FOOD'], [152, 157, 'FOOD'], [165, 175, 'FOOD']]} 

Sentence:  my role is to greet them with a tasty treat - usually a gra

In [None]:
# NOTE(review): this whole cell is commented out and has no effect when run.
# It sketches running `nlp` over REVISION_TEXTS / TEST_TEXTS and rendering the
# entities with displacy; the ("Where is ICICI bank located", ...) tuple inside
# each loop is a bare expression and would do nothing even if re-enabled.
# with open('revision_sentences.txt','r') as file:
#     REVISION_TEXTS = file.read().split('\n')[:-1]
# REVISION_TEXTS
# for text in REVISION_TEXTS:
#     doc = nlp(text)
#     ("Where is ICICI bank located", {"entities": [(9, 18, "ORG")]})
#     ent_list = []
#     spacy.displacy.render(doc, style="ent", jupyter=True)
#     for ent in doc.ents:
#         pass
#         #print(ent.text, ent.start_char, ent.end_char, ent.label_)
#         #ent_list.append((ent.start_char, ent.end_char, ent.label_))

# for text in TEST_TEXTS:
#     doc = nlp(text)
#     ("Where is ICICI bank located", {"entities": [(9, 18, "ORG")]})
#     ent_list = []
#     spacy.displacy.render(doc, style="ent", jupyter=True)
#     for ent in doc.ents:
#         pass
#         #print(ent.text, ent.start_char, ent.end_char, ent.label_)
#         #ent_list.append((ent.start_char, ent.end_char, ent.label_))

In [None]:
# NOTE(review): this whole cell is commented out and has no effect when run.
# It sketches building REVISION_DATA entries of the form
# [text, {"entities": [[start_char, end_char, label], ...]}] from `doc.ents`;
# `sentence_list` and `nlp` are not defined in this cell.
# REVISION_DATA = []

# for text in sentence_list:
#     doc = nlp(text)
    
#     ent_list = []
#     for ent in doc.ents:
#         print(ent.text, ent.start_char, ent.end_char, ent.label_)
#         ent_list.append([ent.start_char, ent.end_char, ent.label_])
    
#     REVISION_DATA.append([text,{"entities": ent_list}])
#     colors = {"FOOD": "#F67DE3"}
#     options = {"colors": colors} 

#     #spacy.displacy.render(doc, style="ent", options= options, jupyter=True)

The training data were converted into the `DocBin` format used by spaCy v3.

In [107]:
from spacy.tokens import DocBin
from spacy.util import filter_spans
from tqdm import tqdm
import numpy as np

# Fixed seed so the shuffled order (and therefore the saved file) is reproducible.
SHUFFLE_SEED = 42

# Only tokenization is needed to build the Docs, so a blank English pipeline is
# used here. The previous code called `nlp.make_doc`, but no cell above this
# one defines `nlp`, so it failed on a fresh kernel.
blank_nlp = spacy.blank("en")

db = DocBin()

# Shuffle via an index permutation over a copy, leaving TRAIN_DATA itself
# untouched so re-running this cell (or earlier preview cells) is idempotent.
annotations = list(TRAIN_DATA['annotations'])
shuffled_order = np.random.default_rng(SHUFFLE_SEED).permutation(len(annotations))

for idx in tqdm(shuffled_order):
    text, annot = annotations[idx]
    doc = blank_nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # char_span returns None when the character offsets cannot be mapped
        # onto tokens under the "contract" alignment mode.
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity", (start, end, label), "in:", text)
        else:
            ents.append(span)
    # doc.ents raises on overlapping spans; filter_spans drops overlaps,
    # preferring the longest span.
    doc.ents = filter_spans(ents)
    db.add(doc)

db.to_disk("./training_data.spacy")

100%|██████████| 295/295 [00:00<00:00, 308.59it/s]


In [None]:
# Load the pretrained English pipeline and list the entity labels its NER
# component already knows, each with spaCy's built-in explanation.
nlp = spacy.load('en_core_web_lg')
ner = nlp.get_pipe("ner")
for ent_label in ner.labels:
    description = spacy.explain(ent_label)
    print(ent_label, description)

Generate config file with code

In [52]:
! python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency


✘ The provided output file already exists. To force overwriting the config file,
set the --force or -F flag.



# Training

In [108]:
# Train the pipeline described by config.cfg and write the results to the
# current directory (--output ./).
# NOTE(review): --paths.dev points at the same file as --paths.train, so the
# ENTS_F / ENTS_P / ENTS_R scores printed during training are measured on the
# training data itself, not on held-out sentences.
! python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy

ℹ Saving to output directory: .

[2022-06-20 22:29:53,860] [INFO] Set up nlp object from config
[2022-06-20 22:29:53,873] [INFO] Pipeline: ['tok2vec', 'ner']
[2022-06-20 22:29:53,878] [INFO] Created vocabulary
[2022-06-20 22:29:53,880] [INFO] Finished initializing nlp object
[2022-06-20 22:29:54,400] [INFO] Initialized pipeline components: ['tok2vec', 'ner']



ℹ Using CPU
ℹ To switch to GPU 0, use the option: --gpu-id 0
[1m
✔ Initialized pipeline
[1m
ℹ Pipeline: ['tok2vec', 'ner']
ℹ Initial learn rate: 0.001
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     41.83   12.33   10.01   16.07    0.12
  2     200         58.16   1800.90   89.99   90.95   89.06    0.90
  4     400        257.27    547.78   95.77   95.84   95.71    0.96
  8     600        116.65    265.07   98.47   98.75   98.20    0.98
 12     800        141.17    145.73   99.45   99.45   99.45    0.99
 17    1000        211.44    137.66   99.38   99.45   99.31    0.99
 23    1200        158.19    108.80   99.44   99.72   99.17    0.99
 31    1400        185.02    126.63   99.24   99.03   99.45    0.99
 41    1600        193.28    131.88   99.38   99.58   99.17    0.99
 53    1800        152.59    143.73   99.51   99.58   99.45    1.00
 68    2000        124.68    1

## Select Best Model

In [175]:
# Load the best checkpoint produced by training. The train command writes to
# `--output ./`, so the model is loaded relative to the working directory
# rather than from a machine-specific absolute path.
nlp_food = spacy.load("./model-best")
ner_food = nlp_food.get_pipe("ner")
# Give the "ner" component its own copy of the tok2vec layer so it no longer
# depends on the shared "tok2vec" listener when it is sourced into another pipeline.
nlp_food.replace_listeners("tok2vec", "ner", ["model.tok2vec"])

# Load new model

In [177]:
# Load a fresh copy of the pretrained English pipeline and keep a handle to its
# built-in NER component.
# NOTE(review): presumably reloaded here so the pipeline does not already
# contain a "food_ner" pipe when the next cell adds one - confirm.
nlp = spacy.load('en_core_web_lg')
ner = nlp.get_pipe("ner")

# Combine two model pipes

In [178]:
# Copy the trained "ner" component out of nlp_food into the pretrained
# pipeline under the name "food_ner", placed before the built-in "ner".
nlp.add_pipe(
    'ner',
    source=nlp_food,
    name="food_ner",
    before='ner',
)



<spacy.pipeline.ner.EntityRecognizer at 0x2a3df9d9fc0>

In [192]:
# Read one test sentence per line. splitlines() with a blank-line filter keeps
# the final sentence even when the file has no trailing newline; the previous
# `split('\n')[:-1]` silently dropped it in that case.
with open('test_sentences.txt', 'r') as file:
    TEST_TEXTS = [line for line in file.read().splitlines() if line.strip()]
TEST_TEXTS

['I ate 10 burgers for lunch today.',
 'I had a pizza at 10:30 AM.',
 'The 2024 Olympics will be hosted by France.',
 'I went to Paris in this summer.',
 'United States of America is located between the Pacific and the Atlantic Ocean.',
 'I speak English, French and Bengali.',
 'Microsoft is currently the 2nd largest company in the world.',
 'Apple is releasing a new phone priced at 100$.',
 'Christians are the majority in England.',
 'Neil Armstrong was the first man to land on the moon.',
 'The United Nations is working in over 150 countries.',
 'Elon Musk owns 80% of the Tesla Stock.',
 'Nelson Mandela is the father of the nation of South Africa.',
 'The bus arrives at the station at 2:50 PM.',
 'I carried a 10 Kilogram bag of rice.',
 'I will be there in 1 hour.']

In [193]:
# Run the combined pipeline over each test sentence and render the detected
# entities inline in the notebook.
for test_doc in map(nlp, TEST_TEXTS):
    spacy.displacy.render(test_doc, style="ent", jupyter=True)

<spacy.lang.en.English at 0x1a01165e260>