In [1]:
# Import required libraries
import spacy
import warnings
import random

from spacy.tokens import DocBin
from tqdm import tqdm
import json

In [2]:
import sys
import os

# Add the directory containing the attribute_cleaner module to the import path.
# The membership check keeps the cell idempotent: re-running it does not pile up
# duplicate entries in sys.path.
_transformer_dir = os.path.abspath("../transformer")
if _transformer_dir not in sys.path:
    sys.path.append(_transformer_dir)

# Now you can import text_cleaner
from attribute_cleaner.general_string_cleaner import text_cleaner

In [3]:
# Read the data in doccano JSONL output format (one JSON object per line).
# An explicit encoding avoids platform-dependent decoding of non-ASCII text, and
# blank lines (e.g. a trailing newline at end of file) are dropped so json.loads
# never sees an empty string.
with open('data/merged_data.jsonl', 'r', encoding='utf-8') as f:
    lines = [line for line in f if line.strip()]

training_data: list = []

for line in lines:
    row = json.loads(line)
    # Keep only rows that carry at least one annotation; .get() tolerates rows
    # where the "label" key is absent altogether.
    if row.get('label'):
        training_data.append([row["text"], {"entities": row["label"]}])

# Shuffle with a dedicated seeded generator so the order is reproducible across
# kernel restarts without touching the global random state.
random.Random(42).shuffle(training_data)

print(len(training_data))

183


In [4]:
# Peek at the first shuffled example to sanity-check the
# [text, {"entities": [...]}] structure built in the previous cell.
training_data[0]

['"elevation : 1500 masl variety : s795 processing : burundi washed fermentation : two-stage anaerobic fermentation roast : medium-light producers : shreeraksha purnesh tasting notes : red grapes , sweet lime , hazelnuts sreeraksha is one of those planters without whom most coffee businesses will fail . he understands and supports , ensuring mutual success , not only does he produce incredible coffee , but he is also a fantastic person . the baarbara estate burundi washed is one of the most pleasant coffees we tasted this year , with its complex dual fermentation ( dry and wet ) and superbly focussed sorting and meticulous picking . this coffee is\'simply\'a delight . the roast profile is medium-light ; here , the longer development helps caramelise the sugars and make them easily soluble in the coffee . the complex fermentation brings out the flavours of red grapes , with the sweet acidity of sweet lime and a delicious hazelnutty aftertaste , making it a crowd-pleaser . this medium-li

In [5]:
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(file, data):
  """Convert annotated examples into a spaCy ``DocBin``.

  Args:
    file: Object with a ``write(str)`` method, used as an error log. One line is
      written for every annotation that cannot be turned into a span and for
      every document that has to be dropped.
    data: Iterable of ``(text, annot)`` pairs where ``annot['entities']`` is an
      iterable of ``(start, end, label)`` character-offset triples.

  Returns:
    A ``DocBin`` holding one ``Doc`` per example whose entities could be set.
  """
  # Create a blank spaCy pipeline (tokenizer only)
  nlp = spacy.blank('en')
  db = DocBin()

  # Iterate through the data
  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)

    ents = []
    # Character offsets already claimed by an accepted entity. A set gives O(1)
    # membership checks instead of rescanning a growing list.
    used_chars = set()

    # Extract entities from the annotations
    for start, end, label in annot['entities']:
      span_chars = range(start, end)

      # Skip an annotation that overlaps one we already accepted.
      if not used_chars.isdisjoint(span_chars):
        continue

      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except Exception:
        # Best effort: treat a failing annotation like a misaligned one, but
        # do not swallow KeyboardInterrupt/SystemExit as a bare except would.
        span = None

      if span is None:
        # Log errors for annotations that couldn't be processed
        err_data = str([start, end]) + "    " + str(text) + "\n"
        file.write(err_data)
        continue

      # Reserve the offsets only once the span is known to be valid, so a
      # rejected annotation cannot block a later, valid overlapping one.
      used_chars.update(span_chars)
      ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except Exception as err:
      # Keep going with the remaining documents, but leave a trace in the log
      # instead of silently dropping this one.
      file.write("doc skipped (" + repr(err) + ")    " + str(text) + "\n")

  return db

In [6]:
# Split the annotated data into training and testing sets
from sklearn.model_selection import train_test_split

# A fixed random_state puts the same examples in train/test on every run, so
# evaluation scores are comparable between training attempts.
train, test = train_test_split(training_data, test_size=0.1, random_state=42)

# Display the number of items in the training and testing sets
print(len(train), len(test))

# Log annotations that could not be converted. The context manager guarantees
# the log is flushed and closed even if conversion or serialization raises, and
# the explicit encoding keeps non-ASCII text from failing on write.
with open('train_file_try_4.txt', 'w', encoding='utf-8') as file:
    # Create spaCy DocBin objects for training and testing data
    train_db = get_spacy_doc(file, train)
    train_db.to_disk('train_doccano_try_4.spacy')

    test_db = get_spacy_doc(file, test)
    test_db.to_disk('test_doccano_try_4.spacy')

164 19


100%|██████████| 164/164 [00:00<00:00, 1138.32it/s]
100%|██████████| 19/19 [00:00<00:00, 863.17it/s]


In [7]:
# Configure spaCy for the custom NER model using a base config file: https://spacy.io/usage/training#config

# !python -m spacy init fill-config config_cpu_en_blank_base.cfg config_cpu_en_blank.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config_cpu_en_blank.cfg
You can now add your data and train your pipeline:
python -m spacy train config_cpu_en_blank.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### Model Training

In [8]:
# Train the pipeline described by config_cpu_en_blank.cfg on the .spacy files
# written by the split cell; results are saved under output_try_4 and the run
# uses GPU 0.
!python -m spacy train config_cpu_en_blank.cfg  --output  output_try_4  --paths.train train_doccano_try_4.spacy  --paths.dev  test_doccano_try_4.spacy --gpu-id 0

[38;5;2m✔ Created output directory: output_try_4[0m
[38;5;4mℹ Saving to output directory: output_try_4[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     99.88    0.00    0.00    0.00    0.00
  0      50        299.45   2391.04    0.00    0.00    0.00    0.00
  0     100        462.22   1068.04   42.46   49.29   37.30    0.42
  0     150        307.02    911.28   30.56   30.32   30.81    0.31
  1     200        880.29    865.58   49.46   49.73   49.19    0.49
  1     250        306.06    814.23   53.92   64.18   46.49    0.54
  1     300        186.18    952.95   52.43   52.43   52.43    0.52
  2     350        439.59    808.87   58.09   74.58   47.57    0.58
  2     400        325.14    735.45   59.20   58.42

### Loading and trying the model with new data

In [9]:
# Import the spaCy library
import spacy

# Load the trained spaCy NER model from the model-best directory inside the
# training output folder (output_try_4)
nlp = spacy.load('output_try_4/model-best')

In [10]:
# Sample product description used as input for the prediction cell below
text = """
Jasmine kissed cranberry - 100 % arabica that is sourced from Chikmagalur, grown at an altitude of approximately 4,100ft to 4,500ft. It gets its unique name through a process which include a strain of yeast used to create a carbon dioxide-rich environment during fermentation in stainless-steel fermenters. A remarkable level of complexity in the beans is created by this process, laying the groundwork for a wonderful cup of coffee. Steady drying on raised beds is the next important step, which lets the flavors gradually develop and intensify. After another 30 days of continuous stirring, the coffee undergo additional drying developing scent of jasmine, complex notes of raspberries, cranberries, and sparkling malic acidity with a lingering floral aftertaste 
"""

In [11]:
# Clean the raw text, then run it through the loaded spaCy NER model
doc = nlp(text_cleaner(text))

# Show every predicted entity next to its label, one per line
for span in doc.ents:
  fields = (span.text, "  ->>>>  ", span.label_)
  print(*fields)

jasmine   ->>>>   TASTING NOTES
cranberry   ->>>>   TASTING NOTES
arabica   ->>>>   COFFEE TYPE
chikmagalur   ->>>>   LOCATION
500ft   ->>>>   ELEVATION
jasmine   ->>>>   TASTING NOTES
raspberries   ->>>>   TASTING NOTES
cranberries   ->>>>   TASTING NOTES
floral   ->>>>   TASTING NOTES


In [12]:
# Second sample product description; rebinds `text` for the prediction cell below
text ="""
Single origin Indian Coffee From Moganad Estate, Tamil Nadu which is a 100% Arabiaca coffee with 
an exquisite blend of balanced sweetness and brightness. A Medium Dark Roast coffee with 
flavor notes of Cocoa, Caramel and Nut which can also be enjoyed in a French press, moka pot, aeropress & espresso .
 It is washed processed, grown at an altitude of around 4430 ft

"""

In [13]:
# Clean the raw text, then run it through the loaded spaCy NER model
doc = nlp(text_cleaner(text))

# Show every predicted entity next to its label, one per line
for span in doc.ents:
  fields = (span.text, "  ->>>>  ", span.label_)
  print(*fields)

moganad   ->>>>   ESTATE
tamil nadu   ->>>>   LOCATION
arabiaca   ->>>>   COFFEE TYPE
dark   ->>>>   ROAST LEVEL
cocoa   ->>>>   TASTING NOTES
caramel   ->>>>   TASTING NOTES
nut   ->>>>   TASTING NOTES
washed   ->>>>   PROCESSING
4430 ft   ->>>>   ELEVATION
