## Import Libraries 

In [41]:
# Install and  Import the required libraries
import pandas as pd
import spacy
from spacy.tokens import DocBin
from tqdm import tqdm
from spacy.util import filter_spans

nlp = spacy.load("en_core_web_sm")
nlp.disable_pipe("parser")
doc_bin = DocBin() # create a DocBin object

## Training data Preparation

In [42]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
df = pd.DataFrame(columns=["Sentence", "tag", "token"])
train = open("/content/drive/MyDrive/Untitled folder/ner_train.txt", "r")

idx = 0
sent = 0
for line in train:
    if line=="\n":
        sent = sent+1
    else:
        val = line.split("\t")
        df = pd.concat([df, pd.DataFrame({"Sentence": "Sentence:" + str(sent) , "tag": val[0], "token": val[1][:-1]}, index=[idx])], axis=0)
        idx=idx+1

df.head()

Unnamed: 0,Sentence,tag,token
0,Sentence:0,O,what
1,Sentence:0,O,movies
2,Sentence:0,O,star
3,Sentence:0,B-ACTOR,bruce
4,Sentence:0,I-ACTOR,willis


In [52]:
# generating sentence
train_df = df
train_df = train_df.groupby("Sentence").agg(list)

# training model on only 200 samples due to resource unavailability
train_df = train_df.iloc[:200,:]
train_df['sent'] = train_df['token'].apply(lambda x: ' '.join(x))

train_df.head()

Unnamed: 0_level_0,tag,token,sent
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sentence:0,"[O, O, O, B-ACTOR, I-ACTOR]","[what, movies, star, bruce, willis]",what movies star bruce willis
Sentence:1,"[O, O, O, O, B-ACTOR, I-ACTOR, O, O, B-YEAR]","[show, me, films, with, drew, barrymore, from,...",show me films with drew barrymore from the 1980s
Sentence:10,"[O, O, O, O, B-PLOT]","[what, movie, is, references, zydrate]",what movie is references zydrate
Sentence:100,"[O, O, O, B-RATINGS_AVERAGE, I-RATINGS_AVERAGE...","[what, is, the, top, rated, martin, scorsesy, ...",what is the top rated martin scorsesy moive
Sentence:1000,"[O, O, O, B-ACTOR, I-ACTOR, O, O]","[show, me, a, christopher, lee, movie, trailer]",show me a christopher lee movie trailer


In [53]:
# defining sentence to ner tag function
def sent2label(sent, tag, token):
    lst = []
    for idx,val in enumerate(tag):
        if val!="O":
            lst.append({"start":sent.find(token[idx]), "end":sent.find(token[idx])+len(token[idx]), "label":val})
    return lst

In [54]:
# generating spans for Spacy format
train_df['spans'] = train_df.apply(lambda x: sent2label(x.sent, x.tag, x.token), axis=1)
train_df.head()

Unnamed: 0_level_0,tag,token,sent,spans
Sentence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Sentence:0,"[O, O, O, B-ACTOR, I-ACTOR]","[what, movies, star, bruce, willis]",what movies star bruce willis,"[{'start': 17, 'end': 22, 'label': 'B-ACTOR'},..."
Sentence:1,"[O, O, O, O, B-ACTOR, I-ACTOR, O, O, B-YEAR]","[show, me, films, with, drew, barrymore, from,...",show me films with drew barrymore from the 1980s,"[{'start': 19, 'end': 23, 'label': 'B-ACTOR'},..."
Sentence:10,"[O, O, O, O, B-PLOT]","[what, movie, is, references, zydrate]",what movie is references zydrate,"[{'start': 25, 'end': 32, 'label': 'B-PLOT'}]"
Sentence:100,"[O, O, O, B-RATINGS_AVERAGE, I-RATINGS_AVERAGE...","[what, is, the, top, rated, martin, scorsesy, ...",what is the top rated martin scorsesy moive,"[{'start': 12, 'end': 15, 'label': 'B-RATINGS_..."
Sentence:1000,"[O, O, O, B-ACTOR, I-ACTOR, O, O]","[show, me, a, christopher, lee, movie, trailer]",show me a christopher lee movie trailer,"[{'start': 10, 'end': 21, 'label': 'B-ACTOR'},..."


In [55]:
# genetaing train data is Spacy format
annotations = []

for x in range(train_df.shape[0]):
    spans_lst = train_df['spans'][x]
    if len(spans_lst)>0:
        for i in range(len(spans_lst)):
            dct={}
            spans_dict = spans_lst[i]
            dct['entities'] = [(int(spans_dict['start']), int(spans_dict['end']), spans_dict['label'])]
            dct['text'] = train_df['sent'][x]
            annotations.append(dct)
    
print(annotations[:5])

training_data = {'classes' : df['tag'].unique(), 'annotations' : annotations}

[{'entities': [(17, 22, 'B-ACTOR')], 'text': 'what movies star bruce willis'}, {'entities': [(23, 29, 'I-ACTOR')], 'text': 'what movies star bruce willis'}, {'entities': [(19, 23, 'B-ACTOR')], 'text': 'show me films with drew barrymore from the 1980s'}, {'entities': [(24, 33, 'I-ACTOR')], 'text': 'show me films with drew barrymore from the 1980s'}, {'entities': [(43, 48, 'B-YEAR')], 'text': 'show me films with drew barrymore from the 1980s'}]


In [56]:
# creating Docbin format for spacy
from spacy.util import filter_spans

for training_example  in tqdm(training_data['annotations']): 
    text = training_example['text']
    labels = training_example['entities']
    doc = nlp.make_doc(text) 
    ents = []
    for start, end, label in labels:
        span = doc.char_span(start, end, label=label, alignment_mode="contract")
        if span is None:
            print("Skipping entity")
        else:
            ents.append(span)
    filtered_ents = filter_spans(ents)
    doc.ents = filtered_ents 
    doc_bin.add(doc)

doc_bin.to_disk("training_data.spacy") # save the docbin object

100%|██████████| 569/569 [00:00<00:00, 6764.22it/s]

Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity
Skipping entity





## Define the Parameters for your model training.


In [57]:
# define the variables like Number of iterations,model output dir and model   etc..

!python -m spacy init fill-config base_config.cfg config.cfg

[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


## Model Training

In [58]:
# Code for model training

!python -m spacy train config.cfg --output ./ --paths.train ./training_data.spacy --paths.dev ./training_data.spacy --gpu-id 0

[38;5;4mℹ Saving to output directory: .[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
[2023-01-14 15:34:03,430] [INFO] Set up nlp object from config
INFO:spacy:Set up nlp object from config
[2023-01-14 15:34:03,440] [INFO] Pipeline: ['tok2vec', 'ner']
INFO:spacy:Pipeline: ['tok2vec', 'ner']
[2023-01-14 15:34:03,443] [INFO] Created vocabulary
INFO:spacy:Created vocabulary
[2023-01-14 15:34:03,444] [INFO] Finished initializing nlp object
INFO:spacy:Finished initializing nlp object
[2023-01-14 15:34:04,289] [INFO] Initialized pipeline components: ['tok2vec', 'ner']
INFO:spacy:Initialized pipeline components: ['tok2vec', 'ner']
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00     61.51    0.00    0.00    0.00    0.00
  4     200         94.20   4849.13    8.76   