In [1]:
# Import required libraries
import spacy
import warnings

from spacy.tokens import DocBin
from tqdm import tqdm
import json

In [2]:
# Read the doccano JSONL export: one JSON object per line.
# UTF-8 is requested explicitly so the text decodes the same way on every platform.
with open('data/admin.jsonl', 'r', encoding='utf-8') as f:
    lines = list(f)

# Each kept example is [text, {"entities": <the row's "label" value>}]
training_data: list = []

for line in lines:
    # A blank (e.g. trailing) line is not valid JSON; skip it instead of crashing
    if not line.strip():
        continue
    row = json.loads(line)
    # Keep only annotated rows; .get() tolerates rows that have no "label" key at all
    if row.get('label'):
        training_data.append([row["text"], {"entities": row["label"]}])

print(len(training_data))

149


In [3]:
# Peek at the first example; each item is [text, {"entities": ...}]
training_data[0]

['"#1 fruit forward lots | 1 . 2 ell\'sworth black honey orange , berry fruit characteristics , green grapes & lemon crisp acidity . suitable for all brew methods . origin chikmagalur , karnataka process hand picked cherries are pulped , sun dried on raised beds and then eventually washed before they are hulled . dry aroma mild florals wet aroma red ripe fruits based on sensory evaluation orange , berry fruit characteristics , green grapes & lemon crisp acidity . med body , clean mouthfeel & long prevailing aftertaste . varietal chandragiri roast profile light-med roast with variable drum speed and a lower charge temperature to achieve a 13% dtr . altitude 1130 masl minimum resting period filter 6 days | espresso 14 days a song that pairs well roaster thoughts : as black honey produces fruit forward flavors and consists of jammy characteristics . we roasted this coffee at 205 degrees to extract the dense properties , fruitier flavor notes and a heavier body . after cupping the coffee a

In [4]:
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(file, data):
  """Convert annotated examples into a spaCy ``DocBin``.

  Args:
    file: Writable text file-like object. Every annotation or document that
      cannot be converted is logged to it, one line per problem.
    data: Iterable of ``(text, {"entities": [(start, end, label), ...]})``
      pairs, where ``start``/``end`` are character offsets into ``text``.

  Returns:
    A ``DocBin`` containing one ``Doc`` per example that could be converted.
    Annotations overlapping an already accepted entity are skipped, and
    annotations that do not line up with token boundaries are logged and
    skipped.
  """
  # Create a blank spaCy pipeline (tokenizer only)
  nlp = spacy.blank('en')
  db = DocBin()

  # Iterate through the data
  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)

    ents = []
    # Character offsets already claimed by an accepted entity; a set keeps the
    # overlap test cheap regardless of how many entities a text has
    taken = set()

    # Extract entities from the annotations
    for start, end, label in annot['entities']:
      char_range = range(start, end)
      if not taken.isdisjoint(char_range):
        # Overlaps an entity that was already accepted; first one wins
        continue

      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except Exception as exc:
        # Best effort: record the failure and move on, without swallowing
        # KeyboardInterrupt/SystemExit the way a bare ``except`` would
        file.write("char_span failed " + str([start, end]) + " " + repr(exc) + "    " + str(text) + "\n")
        continue

      if span is None:
        # Log errors for annotations that couldn't be processed
        err_data = str([start, end]) + "    " + str(text) + "\n"
        file.write(err_data)
        continue

      # Reserve the offsets only once the span is known to be valid, so a
      # misaligned annotation cannot block a later, correct one
      taken.update(char_range)
      ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except Exception as exc:
      # The document is skipped, but the reason is recorded instead of lost
      file.write("doc skipped " + repr(exc) + "    " + str(text) + "\n")

  return db

In [5]:
# Split the annotated data into training and testing sets
from sklearn.model_selection import train_test_split

# A fixed seed puts the same examples in each split on every run, so scores
# from different training runs stay comparable
train, test = train_test_split(training_data, test_size=0.1, random_state=42)

# Display the number of items in the training and testing sets
print(len(train), len(test))

# Log errors found during annotation processing. The context manager closes the
# log even if conversion raises; UTF-8 avoids encode errors on non-ASCII text.
with open('train_file_try_2.txt', 'w', encoding='utf-8') as file:
    # Create spaCy DocBin objects for training and testing data
    db = get_spacy_doc(file, train)
    db.to_disk('train_doccano_try_2.spacy')

    db = get_spacy_doc(file, test)
    db.to_disk('test_doccano_try_2.spacy')

134 15


  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
100%|██████████| 134/134 [00:00<00:00, 885.41it/s]
100%|██████████| 15/15 [00:00<00:00, 680.57it/s]


In [7]:
# Configure spaCy for the custom NER model: expand base_config.cfg into a complete
# training config, config.cfg. Reference: https://spacy.io/usage/training#config

!python -m spacy init fill-config base_config.cfg config.cfg

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### Model Training

In [8]:
# Train with config.cfg on the exported DocBin files; the 10% hold-out split is passed as the dev set.
# Results are written to output_try_2, and --gpu-id 0 runs training on GPU 0.
!python -m spacy train config.cfg  --output  output_try_2  --paths.train train_doccano_try_2.spacy  --paths.dev  test_doccano_try_2.spacy --gpu-id 0

  _torch_pytree._register_pytree_node(
[38;5;2m✔ Created output directory: output_try_2[0m
[38;5;4mℹ Saving to output directory: output_try_2[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  with torch.cuda.amp.autocast(self._mixed_precision):
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['transformer', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.0[0m
E    #       LOSS TRANS...  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  -------------  --------  ------  ------  ------  ------
  with torch.cuda.amp.autocast(self._mixed_precision):
  0       0        1475.28   1145.36    0.00    0.00    0.00    0.00
 11 

### Loading and trying the model with new data

In [24]:
# Import the spaCy library
import spacy

# Load the trained spaCy NER model from the training output directory
# (output_try_2, the --output path passed to `spacy train`)
nlp = spacy.load('output_try_2/model-best')


In [37]:
# New product description to try the trained model on (lower-cased before inference in the next cell)
text = """
Jasmine kissed cranberry - 100 % arabica that is sourced from Chikmagalur, grown at an altitude of approximately 4,100ft to 4,500ft. It gets its unique name through a process which include a strain of yeast used to create a carbon dioxide-rich environment during fermentation in stainless-steel fermenters. A remarkable level of complexity in the beans is created by this process, laying the groundwork for a wonderful cup of coffee. Steady drying on raised beds is the next important step, which lets the flavors gradually develop and intensify. After another 30 days of continuous stirring, the coffee undergo additional drying developing scent of jasmine, complex notes of raspberries, cranberries, and sparkling malic acidity with a lingering floral aftertaste 
"""

In [38]:
# Run the trained NER pipeline over the lower-cased input text
doc = nlp(text.lower())

# Gather (entity text, label) pairs first, then print one line per predicted entity
predictions = [(ent.text, ent.label_) for ent in doc.ents]
for span_text, span_label in predictions:
  print(span_text, "  ->>>>  ", span_label)

jasmine kissed cranberry -   ->>>>   NAME
arabica   ->>>>   COFFEE TYPE
chikmagalur   ->>>>   LOCATION
jasmine   ->>>>   TASTING NOTES
raspberries   ->>>>   TASTING NOTES
cranberries   ->>>>   TASTING NOTES
malic   ->>>>   ACIDITY


In [41]:
# Second sample description (estate, roast level, tasting notes, altitude) to try the trained model on
text ="""
Single origin Indian Coffee From Moganad Estate, Tamil Nadu which is a 100% Arabiaca coffee with 
an exquisite blend of balanced sweetness and brightness. A Medium Dark Roast coffee with 
flavor notes of Cocoa, Caramel and Nut which can also be enjoyed in a French press, moka pot, aeropress & espresso .
 It is washed processed, grown at an altitude of around 4430 ft

"""

In [42]:
# Run the trained NER pipeline over the lower-cased input text
doc = nlp(text.lower())

# Gather (entity text, label) pairs first, then print one line per predicted entity
predictions = [(ent.text, ent.label_) for ent in doc.ents]
for span_text, span_label in predictions:
  print(span_text, "  ->>>>  ", span_label)

moganad   ->>>>   ESTATE
tamil nadu   ->>>>   LOCATION
medium dark   ->>>>   ROAST LEVEL
cocoa   ->>>>   TASTING NOTES
caramel   ->>>>   TASTING NOTES
nut   ->>>>   TASTING NOTES
4430 ft   ->>>>   ELEVATION
