In [1]:
# Import required libraries
import spacy
import warnings
import random

from spacy.tokens import DocBin
from tqdm import tqdm
import json

In [9]:
import sys
import os

# Add the path to the directory containing the attribute_cleaner module.
# The membership guard keeps this cell idempotent: re-running it does not
# append the same directory to sys.path again.
transformer_dir = os.path.abspath("../transformer")
if transformer_dir not in sys.path:
    sys.path.append(transformer_dir)

# Now you can import text_cleaner
from attribute_cleaner.general_string_cleaner import text_cleaner

In [2]:
# reading the data in doccano jsonl output format
SHUFFLE_SEED = 42  # fixed seed so the shuffled order is the same on every run

training_data: list = []

# The annotations are character offsets into the text, so the file must be
# decoded as UTF-8 explicitly; a platform-default codec could mis-decode the
# non-ASCII characters and shift every offset after them.
with open('data/merged_data.jsonl', 'r', encoding='utf-8') as f:
    # Stream the file line by line instead of materialising it in a list first
    for line in f:
        # Blank lines (e.g. a trailing newline block) are not valid JSON; skip them
        if not line.strip():
            continue
        row = json.loads(line)
        # Keep only rows that carry at least one annotation
        if row['label']:
            training_data.append([row["text"], {"entities": row["label"]}])

# A dedicated Random instance gives a reproducible shuffle without touching
# the global random state
random.Random(SHUFFLE_SEED).shuffle(training_data)

print(len(training_data))

183


In [3]:
# Peek at one shuffled example: [text, {"entities": [[start, end, label], ...]}]
training_data[0]

['"our latest washed coffee is also the sweetest ! sourced from krishnagiri estate , high up in the chandra drona hills ( a . k . a the giris ) of chikmagalur , this lot of washed arabica fascinated us with its bean density and versatility ( traits we have come to associate with the coffee from krishnagiri estate ) . ripe cherries of the chandragiri varietal were picked for this lot and rested before pulping . the cherries were rested some more post pulping and then washed and slow-dried . the resulting cup is very sweet and delightfully nutty with a mildly cherry like finish . this sugar like sweetness reminds us of the rāga mohanakalyāni and this coffee is the new daily driver at kāpikottai hq . mohanakalyāni the coffee , is a delight across all manual brews and also shines as a single origin espresso"',
 {'entities': [[12, 18, 'PROCESSING'],
   [62, 73, 'ESTATE'],
   [98, 117, 'LOCATION'],
   [145, 156, 'LOCATION'],
   [178, 185, 'COFFEE TYPE'],
   [294, 305, 'ESTATE'],
   [338, 349

In [4]:
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(file, data):
  """Convert annotated examples into a spaCy DocBin.

  Args:
    file: Writable text file object. Annotations that cannot be turned into
      a span, and documents that cannot be added, are logged to it.
    data: Iterable of ``(text, {"entities": [(start, end, label), ...]})``
      pairs, where start/end are character offsets into ``text``.

  Returns:
    A DocBin holding one Doc per example that could be added.
  """
  # Create a blank spaCy pipeline (tokenizer only)
  nlp = spacy.blank('en')
  db = DocBin()

  # Iterate through the data
  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)

    ents = []
    # Character positions already claimed by an earlier annotation; a set
    # keeps the overlap check O(1) per position instead of scanning a list
    entity_indices = set()

    # Extract entities from the annotations
    for start, end, label in annot['entities']:
      # Skip annotations that overlap one seen earlier in this text
      if not entity_indices.isdisjoint(range(start, end)):
        continue

      entity_indices.update(range(start, end))
      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except Exception as exc:
        # Best-effort: record the failure and move on. Unlike a bare
        # `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        file.write(str([start, end]) + "    " + str(text) + "    " + repr(exc) + "\n")
        continue

      if span is None:
        # Log errors for annotations that couldn't be processed
        # (offsets that do not line up with token boundaries)
        err_data = str([start, end]) + "    " + str(text) + "\n"
        file.write(err_data)
      else:
        ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except Exception as exc:
      # The document is dropped from the DocBin; log it so the loss is visible
      file.write("doc skipped: " + repr(exc) + "    " + str(text) + "\n")

  return db

In [5]:
# Split the annotated data into training and testing sets
from sklearn.model_selection import train_test_split
# A fixed random_state makes the split repeatable for a given input order
train, test = train_test_split(training_data, test_size=0.1, random_state=42)

# Display the number of items in the training and testing sets
print(len(train), len(test))

# Open a file to log errors during annotation processing.
# The context manager closes the log even if conversion or saving raises,
# and the explicit UTF-8 encoding lets non-ASCII text be logged on any platform.
with open('train_file_try_4.txt', 'w', encoding='utf-8') as file:
    # Create spaCy DocBin objects for training and testing data
    train_db = get_spacy_doc(file, train)
    train_db.to_disk('train_doccano_try_4.spacy')

    test_db = get_spacy_doc(file, test)
    test_db.to_disk('test_doccano_try_4.spacy')

164 19


  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 164/164 [00:00<00:00, 1048.46it/s]
100%|██████████| 19/19 [00:00<00:00, 642.87it/s]


In [6]:
# Configure spaCy for the custom NER model using a base config file - see https://spacy.io/usage/training#config

# !python -m spacy init fill-config base_config.cfg config.cfg

### Model Training

In [7]:
# Train the pipeline described by config.cfg on GPU 0, reading the train/dev DocBin files and saving to output_try_4
!python -m spacy train config.cfg  --output  output_try_4  --paths.train train_doccano_try_4.spacy  --paths.dev  test_doccano_try_4.spacy --gpu-id 0

[38;5;2m✔ Created output directory: output_try_3[0m
[38;5;4mℹ Saving to output directory: output_try_3[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.amp.autocast(self._mixed_precision):
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  with torch.cuda.amp.autocast(self._mixed_precision):
  0       0        1042.32   1636.80    0.54    0.28    5.26    0.01
  2      50       23101.30  34825.88    0.00    0.00    0.00    0.00
  5     100        9760.66   6768.00   48.16   66.06

### Loading and trying the model with new data

In [8]:
# Import the spaCy library
import spacy

# Load the trained spaCy NER model from the specified path.
# The training command in this notebook writes to `output_try_4`, so the best
# checkpoint is loaded from that directory rather than from an earlier run.
nlp = spacy.load('output_try_4/model-best')

  self._model.load_state_dict(torch.load(filelike, map_location=device))


In [11]:
# Sample coffee description that the next cell passes through text_cleaner and the loaded model
text = """
Jasmine kissed cranberry - 100 % arabica that is sourced from Chikmagalur, grown at an altitude of approximately 4,100ft to 4,500ft. It gets its unique name through a process which include a strain of yeast used to create a carbon dioxide-rich environment during fermentation in stainless-steel fermenters. A remarkable level of complexity in the beans is created by this process, laying the groundwork for a wonderful cup of coffee. Steady drying on raised beds is the next important step, which lets the flavors gradually develop and intensify. After another 30 days of continuous stirring, the coffee undergo additional drying developing scent of jasmine, complex notes of raspberries, cranberries, and sparkling malic acidity with a lingering floral aftertaste 
"""

In [12]:
# Pass the raw text through text_cleaner, then run the loaded NER pipeline on the result
cleaned_text = text_cleaner(text)
doc = nlp(cleaned_text)

# Show every entity the model predicted, one per line, next to its label
for entity in doc.ents:
  print(entity.text, "  ->>>>  ", entity.label_)

  with torch.cuda.amp.autocast(self._mixed_precision):


jasmine   ->>>>   TASTING NOTES
arabica   ->>>>   COFFEE TYPE
chikmagalur   ->>>>   LOCATION
4 , 100ft   ->>>>   ELEVATION
jasmine   ->>>>   TASTING NOTES
raspberries   ->>>>   TASTING NOTES
cranberries   ->>>>   TASTING NOTES
malic   ->>>>   ACIDITY
floral   ->>>>   COFFEE_PROPERTIES


In [13]:
# Second sample coffee description that the next cell passes through text_cleaner and the loaded model
text ="""
Single origin Indian Coffee From Moganad Estate, Tamil Nadu which is a 100% Arabiaca coffee with 
an exquisite blend of balanced sweetness and brightness. A Medium Dark Roast coffee with 
flavor notes of Cocoa, Caramel and Nut which can also be enjoyed in a French press, moka pot, aeropress & espresso .
 It is washed processed, grown at an altitude of around 4430 ft

"""

In [14]:
# Pass the raw text through text_cleaner, then run the loaded NER pipeline on the result
cleaned_text = text_cleaner(text)
doc = nlp(cleaned_text)

# Show every entity the model predicted, one per line, next to its label
for entity in doc.ents:
  print(entity.text, "  ->>>>  ", entity.label_)

moganad   ->>>>   ESTATE
tamil nadu   ->>>>   LOCATION
arabiaca   ->>>>   COFFEE TYPE
medium dark   ->>>>   ROAST LEVEL
cocoa   ->>>>   TASTING NOTES
caramel   ->>>>   TASTING NOTES
nut   ->>>>   TASTING NOTES
washed   ->>>>   PROCESSING
4430 ft   ->>>>   ELEVATION
