In [1]:
import spacy
import numpy as np
import pandas as pd
import cupy as cp

nlp = spacy.load('en_core_web_trf')

In [2]:
s = 999
np.random.seed(s)
spacy.util.fix_random_seed(s)
ner=nlp.get_pipe("ner")

In [3]:
f = open('plain_cannabis.txt', 'r')
strains = f.read().splitlines()
text = ' '.join(strains)
print(strains)

['INDICA', 'Afghani', 'Afgoo', 'Berry White', 'Blueberry', 'Bubba Kush', 'G13', 'Granddaddy Purple', 'Grape Ape', 'Herijuana', 'Hindu Kush', 'Ingrid', 'Kosher Kush', 'Lavender', 'Master Kush', 'Northern Lights', 'Obama Kush', 'Pez', 'Plushberry', 'Presidential OG', 'Purple Urkle', 'Willy’s Wonder', 'HYBRID', 'ACDC', 'AK-47', 'Zkittlez', 'Ewok', 'Gelato', 'Banana OG', 'Blue Dream', 'Cannatonic', 'Chemdawg', 'Chernobyl', 'Cherry Pie', 'Cinderella 99', 'Dancehall', 'Double Dream', 'Dutch Treat', 'Fruity Pebbles', 'Headband', 'Jean Guy', 'Jillybean', 'Juicy Fruit', 'Larry OG', 'Lemonder', 'Lodi Dodi', 'Mango Kush', 'Mendocino Purps', 'Middlefork', 'OG Kush', 'Pineapple Chunk', 'Pineapple Express', 'Pink Kush', 'Raskal OG', 'SAGE', 'SFV OG', 'Shiatsu Kush', 'Skunk No. 1', 'Snoop’s Dream', 'Snowcap', 'Sour OG', 'Sour Tsunami', 'Space Queen', 'Sunset Sherbet', 'Tahoe OG', 'Tangerine Dream', 'Trainwreck', 'UK Cheese', 'White Fire OG', 'White Widow', 'XJ-13', 'SATIVA', 'Acapulco Gold', 'Alaskan

In [4]:
words = []
labels = []

for strain in strains:
    split_strain = strain.split()
    if len(split_strain) > 1:
        for i,word in enumerate(split_strain, 1):
            if i == 1:
                words.append(word)
                labels.append('B-STRAIN')
            elif i ==len(split_strain):
                words.append(word)
                labels.append('L-STRAIN')
            else:
                words.append(word)
                labels.append('I-STRAIN')
    else:  
        words.append(strain)
        labels.append('U-STRAIN') # As most of token will be non-entity (OUT). Replace this later with actual entity a/c the scheme.

df = pd.DataFrame({'word': words, 'label': labels})
df.to_csv('cannabis_data.bilou', index=False) # biluo in extension to indicate the type of encoding, it is ok to keep csv

In [5]:
dpath = 'cannabis_data_e.bilou'
df = pd.read_csv(dpath, sep=',')
words  = list(df.word.values)
ents = list(df.label.values)
text = ' '.join(words)

In [6]:
add_ents = ['STRAIN'] #

prev_ents = ner.move_names

for ent in add_ents:
    ner.add_label(ent)
    
new_ents = ner.move_names
# print('\n[All Entities] = ', ner.move_names)


In [7]:
#### Create Dataset
from spacy.training import Example
print(text)
doc = nlp.make_doc(text)
g = Example.from_dict(doc, {"entities": ents})
# Add examples as avaialble or needed
X = [doc]
Y = [ g]

INDICA Afghani Afgoo Berry White Blueberry Bubba Kush G13 Granddaddy Purple Grape Ape Herijuana Hindu Kush Ingrid Kosher Kush Lavender Master Kush Northern Lights Obama Kush Pez Plushberry Presidential OG Purple Urkle HYBRID ACDC Zkittlez Ewok Gelato Banana OG Blue Dream Cannatonic Chemdawg Chernobyl Cherry Pie Cinderella 99 Dancehall Double Dream Dutch Treat Fruity Pebbles Headband Jean Guy Jillybean Juicy Fruit Larry OG Lemonder Lodi Dodi Mango Kush Mendocino Purps Middlefork OG Kush Pineapple Chunk Pineapple Express Pink Kush Raskal OG SAGE SFV OG Shiatsu Kush Snowcap Sour OG Sour Tsunami Space Queen Sunset Sherbet Tahoe OG Tangerine Dream Trainwreck UK Cheese White Fire OG White Widow SATIVA Acapulco Gold Alaskan Thunder Fuck Allen Wrench Amnesia Bay 11 Chocolope Cinex Dirty Girl Durban Poison Ghost Train Haze Grapefruit Green Crack Harlequin Island Sweet Skunk Jack Herer Kali Mist Laughing Buddha Maui Wowie Panama Red Purple Haze Red Headed Stranger Schrom Sour Diesel Strawberry C

In [8]:
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
print(f'[OtherPipes] = {other_pipes} will be disabled')

[OtherPipes] = ['transformer', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer'] will be disabled


In [9]:
TRAIN_DATA = [
              ("Indica remains the most common form of the drug.", {"entities": [(0, 6, "STAIN")]}),
              ("The English word canvas sufficiently reveals its derivation from cannabis.", {"entities": [(66, 75, "STAIN")]}),
              ("ample evidence to suggest sativa be legal.", {"entities": [(27,34, "STAIN")]}),
              ("Lavender downgraded from a Class B to Class C drug.", {"entities": [(0,8, "STAIN")]}),
              ("The price for good herbal lemonder should be £ 120 an ounce.", {"entities": [(27,36, "STAIN")]}),
              ("What is GW's position on crude herbal g13?", {"entities": [(39,42, "STAIN")]}),
              ("cannabis blueberry.", {"entities": [(9,18, "STAIN")]})
              ]

In [10]:
import random
examples = []
for text, annots in TRAIN_DATA:
    examples.append(Example.from_dict(nlp.make_doc(text), annots))
nlp.initialize(lambda: examples)
for i in range(20):
    random.shuffle(examples)
    for batch in spacy.util.minibatch(examples, size=8):
        nlp.update(batch)



In [24]:
examples = []
for text, annots in TRAIN_DATA:
    examples.append(Example.from_dict(nlp.make_doc(text), annots))
nlp.initialize(lambda: examples)
losses = {}

for i in range(20):
    random.shuffle(examples)
    for batch in spacy.util.minibatch(examples, size=4):
        print()
        nlp.update(batch,                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses)
        print("Losses", losses)



Losses {'transformer': 85.78706187009811, 'tagger': 0.0, 'parser': 0.0, 'ner': 26.91504728794098}

Losses {'transformer': 144.1557461619377, 'tagger': 0.0, 'parser': 0.0, 'ner': 53.22083252668381}

Losses {'transformer': 206.93320339918137, 'tagger': 0.0, 'parser': 0.0, 'ner': 87.48028248548508}

Losses {'transformer': 258.3176002204418, 'tagger': 0.0, 'parser': 0.0, 'ner': 106.59361857175827}

Losses {'transformer': 297.2268190085888, 'tagger': 0.0, 'parser': 0.0, 'ner': 137.0761826634407}

Losses {'transformer': 304.23779578506947, 'tagger': 0.0, 'parser': 0.0, 'ner': 159.87776166200638}

Losses {'transformer': 328.80488486588, 'tagger': 0.0, 'parser': 0.0, 'ner': 193.06311017274857}

Losses {'transformer': 340.59306724369526, 'tagger': 0.0, 'parser': 0.0, 'ner': 215.0660789012909}

Losses {'transformer': 419.6733380109072, 'tagger': 0.0, 'parser': 0.0, 'ner': 252.37887692451477}

Losses {'transformer': 429.99520452320576, 'tagger': 0.0, 'parser': 0.0, 'ner': 270.2199483513832}

Los

In [19]:
doc = nlp('cannabis blueberry.')
for ent in doc.ents:
    print(ent.text, ent.label_)



cannabis sativa DATE
