In [5]:
# Import required libraries
import spacy
import warnings
import random

from spacy.tokens import DocBin
from tqdm import tqdm
import json

In [4]:
import sys
import os

# Directory containing the attribute_cleaner package, resolved against the
# notebook's current working directory.
transformer_dir = os.path.abspath("../transformer")

# Re-running this cell must not keep appending the same entry to sys.path.
if transformer_dir not in sys.path:
    sys.path.append(transformer_dir)

# text_cleaner becomes importable once the directory above is on sys.path
from attribute_cleaner.general_string_cleaner import text_cleaner

In [3]:
# Read the doccano JSONL export: one JSON object per line with "text" and "label" keys.
# The encoding is pinned so the result does not depend on the platform default.
with open('data/merged_data.jsonl', 'r', encoding='utf-8') as f:
    # Drop blank lines (e.g. a trailing newline) so json.loads never sees an empty string
    lines = [line for line in f if line.strip()]

training_data: list = []

for line in lines:
    row = json.loads(line)
    # Keep only annotated rows; .get() also tolerates rows with no "label" key at all
    if row.get('label'):
        training_data.append([row["text"], {"entities": row["label"]}])

# A dedicated, seeded generator makes the ordering reproducible on Restart & Run All
SHUFFLE_SEED = 42
random.Random(SHUFFLE_SEED).shuffle(training_data)

print(len(training_data))

183


In [4]:
# Inspect one example after shuffling; each item is [text, {"entities": [...]}]
training_data[0]

['"savorworks x mooleh manay ! a collaboration that has been in the works for 4 years and it \' s shaping up to be something truly "" phenomenal "" . this year \' s phenom promises an extraordinary experience as we proudly announce our partnership with mooley manay estate , situated near the beautiful backwaters of the harangi reservoir in coorg , karnataka . producers komal and akshay dashrath have masterfully embraced a data - driven approach to farming and coffee processing . their unwavering belief in the pivotal role of data in optimizing agricultural techniques and ensuring the utmost quality in coffee production is truly inspiring . this coffee is a 64 hours yeast inoculated - natural . the process begins with hand - picking ripe coffee cherries and floating them in water . ripe , denser cherries sink , while unripe or damaged ones float and are removed . the coffee is then placed in sealed tanks for a controlled , anaerobic fermentation lasting 64 hours , during which a custom 

In [5]:
# Define a function to create spaCy DocBin objects from the annotated data
def get_spacy_doc(file, data):
  """Build a spaCy DocBin from character-offset entity annotations.

  Args:
    file: Writable text handle. Every annotation that cannot be turned into
      a span, and every document whose entities cannot be assigned, is
      logged here as one line.
    data: Iterable of ``[text, {"entities": [[start, end, label], ...]}]``
      items, where start/end are character offsets into ``text``.

  Returns:
    A DocBin containing one Doc per input text that was processed
    successfully.
  """
  # A blank English pipeline is enough: only its tokenizer is needed here
  nlp = spacy.blank('en')
  db = DocBin()

  for text, annot in tqdm(data):
    doc = nlp.make_doc(text)

    ents = []
    # Character offsets already claimed by an accepted entity (set => O(1) lookups)
    taken = set()

    for start, end, label in annot['entities']:
      char_range = range(start, end)

      # Entities must not overlap; the first accepted annotation wins
      if not taken.isdisjoint(char_range):
        continue

      try:
        span = doc.char_span(start, end, label=label, alignment_mode='strict')
      except Exception:
        # Treat a failing lookup like a misaligned span so it gets logged too
        span = None

      if span is None:
        # Log annotations that do not line up with token boundaries
        file.write(str([start, end]) + "    " + str(text) + "\n")
        continue

      # Reserve the offsets only once the span is known to be usable, so a
      # rejected annotation cannot block a later valid one
      taken.update(char_range)
      ents.append(span)

    try:
      doc.ents = ents
      db.add(doc)
    except Exception as err:
      # Keep the best-effort behaviour (skip the document) but leave a trace
      file.write("doc_skipped    " + repr(err) + "    " + str(text) + "\n")

  return db

In [6]:
# Split the annotated data into training and testing sets
from sklearn.model_selection import train_test_split

# Fixed seed so the same examples land in train/test on every run
SPLIT_SEED = 42
train, test = train_test_split(training_data, test_size=0.1, random_state=SPLIT_SEED)

# Display the number of items in the training and testing sets
print(len(train), len(test))

# The context manager closes the error log even if building a DocBin raises;
# the encoding is pinned because the logged texts may contain non-ASCII characters
with open('train_file_try_5.txt', 'w', encoding='utf-8') as file:
    # Create spaCy DocBin objects for training and testing data
    db = get_spacy_doc(file, train)
    db.to_disk('train_doccano_try_5.spacy')

    db = get_spacy_doc(file, test)
    db.to_disk('test_doccano_try_5.spacy')

164 19


100%|██████████| 164/164 [00:00<00:00, 1110.77it/s]
100%|██████████| 19/19 [00:00<00:00, 697.53it/s]


In [7]:
# Configure spaCy for a custom NER model using a base config file - https://spacy.io/usage/training#config

# !python -m spacy init fill-config config_cpu_en_blank_base.cfg config_ner_en_blank.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config_ner_en_blank.cfg
You can now add your data and train your pipeline:
python -m spacy train config_ner_en_blank.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


### Model Training

In [8]:
# Train from config_ner_en_blank.cfg: train_doccano_try_5.spacy is the training corpus,
# test_doccano_try_5.spacy is the dev corpus, results go to output_try_5, and GPU 0 is used.
!python -m spacy train config_ner_en_blank.cfg  --output  output_try_5  --paths.train train_doccano_try_5.spacy  --paths.dev  test_doccano_try_5.spacy --gpu-id 0

[38;5;2m✔ Created output directory: output_try_5[0m
[38;5;4mℹ Saving to output directory: output_try_5[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00    102.92    0.00    0.00    0.00    0.00
  0      50        360.85   3088.34    0.00    0.00    0.00    0.00
  0     100       1627.55   1139.89    2.25    8.33    1.30    0.02
  0     150         59.82    850.47   28.01   39.68   21.65    0.28
  1     200        229.60    807.99   43.97   57.75   35.50    0.44
  1     250         54.14    649.97   46.87   63.24   37.23    0.47
  1     300       1984.74    721.40   44.75   61.83   35.06    0.45
  2     350        356.52    625.71   53.40   60.77   47.62    0.53
  2     400        144.80    483.87   50.37   58.62

### Loading and trying the model with new data

In [6]:
# spaCy provides the loader for the trained pipeline
import spacy

# Location of the `model-best` directory inside the training output folder
best_model_path = 'output_try_5/model-best'
nlp = spacy.load(best_model_path)

In [7]:
# Example coffee description used to try out the loaded model
text = """
Jasmine kissed cranberry - 100 % arabica that is sourced from Chikmagalur, grown at an altitude of approximately 4,100ft to 4,500ft. It gets its unique name through a process which include a strain of yeast used to create a carbon dioxide-rich environment during fermentation in stainless-steel fermenters. A remarkable level of complexity in the beans is created by this process, laying the groundwork for a wonderful cup of coffee. Steady drying on raised beds is the next important step, which lets the flavors gradually develop and intensify. After another 30 days of continuous stirring, the coffee undergo additional drying developing scent of jasmine, complex notes of raspberries, cranberries, and sparkling malic acidity with a lingering floral aftertaste 
"""

In [8]:
# Normalize the raw description with text_cleaner, then run it through the NER pipeline
cleaned_text = text_cleaner(text)
doc = nlp(cleaned_text)

# One output line per predicted entity: its surface text followed by its label
for ent in doc.ents:
  print(ent.text, "  ->>>>  ", ent.label_)

jasmine   ->>>>   TASTING NOTES
arabica   ->>>>   COFFEE TYPE
chikmagalur   ->>>>   LOCATION
jasmine   ->>>>   TASTING NOTES
raspberries   ->>>>   TASTING NOTES
cranberries   ->>>>   TASTING NOTES
malic acidity   ->>>>   ESTATE


In [9]:
# Second example description used to try out the loaded model
text ="""
Single origin Indian Coffee From Moganad Estate, Tamil Nadu which is a 100% Arabiaca coffee with 
an exquisite blend of balanced sweetness and brightness. A Medium Dark Roast coffee with 
flavor notes of Cocoa, Caramel and Nut which can also be enjoyed in a French press, moka pot, aeropress & espresso .
 It is washed processed, grown at an altitude of around 4430 ft

"""

In [10]:
# Clean the current sample text first, then let the loaded pipeline annotate it
cleaned_text = text_cleaner(text)
doc = nlp(cleaned_text)

# Show each entity the model found next to the label it was given
for ent in doc.ents:
  print(ent.text, "  ->>>>  ", ent.label_)

moganad   ->>>>   ESTATE
medium dark roast   ->>>>   ROAST LEVEL
cocoa   ->>>>   TASTING NOTES
caramel   ->>>>   TASTING NOTES
4430 ft   ->>>>   ELEVATION
