## Creating a Spacy NER Model for laptops

The Brand, Model, Processor, RAM, OS, Disk, Dim are the entities we want to define.

In [1]:
##import libraries
import pandas as pd
import numpy as np
import spacy
import random
import time

### 1. Data


In [2]:
## loading data with the columns names as entities
data = pd.read_csv("catalogue/laptop.csv",index_col=None)
data

Unnamed: 0,Brand,Model,Processor,RAM,OS,Disk,Dim,Category
0,Lenovo,Ideapad,Intel Core i3 Processor (7th Gen),4 GB DDR4,64 bit Windows 10,1 TB HDD,39.62 cm (15.6 inch),Laptop
1,Lenovo,Ideapad,Intel Core i3 Processor (7th Gen),4 GB DDR4,64 bit Windows 10,1 TB HDD,39.62 cm (15.6 inch),Laptop
2,HP,EliteBook,Intel Core i3 Processor (7th Gen),8 GB DDR4,64 bit Windows 10,256 GB SSD,35.56 cm (14 inch),Laptop
3,Dell,Vostro,Intel Core i3 Processor (8th Gen),4 GB DDR4,Linux/Ubuntu,1 TB HDD,35.56 cm (14 inch),Laptop
4,HP,Zbook,Intel Core i5 Processor (8th Gen),8 GB DDR4,64 bit Windows 10,1 TB HDD,35.56 cm (14 inch),Laptop
...,...,...,...,...,...,...,...,...
411,Lenovo,Ideapad,Intel Core i5 Processor (6th Gen),4 GB DDR3,64 bit Windows 10,1 TB HDD,35.56 cm (14 inch),Laptop
412,Lenovo,Ideapad,Intel Core i7 Processor (8th Gen),8 GB DDR4,64 bit Windows 10,1 TB HDD,39.62 cm (15.6 inch),Laptop
413,Lenovo,Ideapad,AMD APU Quad Core A6 Processor,4 GB DDR3,64 bit Windows 10,1 TB HDD,39.62 cm (15.6 inch),Laptop
414,Lenovo,Legion,Intel Pentium Quad Core Processor (4th Gen),4 GB DDR3,D,500 GB HDD,39.62 cm (15.6 inch),Laptop


### 2. Pre-processing

The input data has to be in a particular format.

Steps:
1. Create a phrase with jumbled entities and the annotations for each entity.
2. Convert into json file with the content (phrase above) and the corresponding entities.
3. The json file is converted into a list.


####  a. Creating content for the laptops

In [3]:
## the entities
cols = data.columns
cols

Index(['Brand', 'Model', 'Processor', 'RAM', 'OS', 'Disk', 'Dim', 'Category'], dtype='object')

In [4]:
## number of entities
num_ent = len(data.columns)
ent_list = list(np.arange(num_ent))
ent_list

[0, 1, 2, 3, 4, 5, 6, 7]

In [5]:
# sample of an entity
data.iloc[0,2]

'Intel Core i3 Processor (7th Gen)'

In [6]:
## jumble indices of entities to create a phrase
prod_name = [] # list of all product names
prod_ann = [] # list of all the annotations
for i in range(len(data)): # loop for each laptop
    idx_list = random.sample(ent_list,num_ent) # shuffling indices
    cont = []
    ann = []
    ann_idx = 0 # pointer for annotating 
    for j in range(num_ent): # creating the jumbled product name
        col_num = idx_list[j] # column number according jumbled column index
        val = data.iloc[i,col_num] # value of the entity 
        cont.append(val) # appending list of entities into a single list
        ann.append((ann_idx, len(val)+ ann_idx, cols[col_num])) # annotations and entity name
        ann_idx = ann_idx + len(val) + 1 # updating the annotation pointer
        
    prod_name.append( ' '.join(cont)) # complete phrase for each laptop
    prod_ann.append(ann) 


    

In [7]:
## Example
## This is the content
sample_prod_name = prod_name[8]
sample_prod_name

'Intel Core i5 Processor (10th Gen) Modern 512 GB SSD Laptop 8 GB DDR4  MSI 35.56 cm (14 inch)  64 bit Windows 10'

In [8]:
## entities in it with their annotations
sample_prod_ent = prod_ann[8]
sample_prod_ent

[(0, 34, 'Processor'),
 (35, 41, 'Model'),
 (42, 52, 'Disk'),
 (53, 59, 'Category'),
 (60, 70, 'RAM'),
 (71, 74, 'Brand'),
 (75, 94, 'Dim'),
 (95, 112, 'OS')]

In [9]:
## take an entity in the prod 
st_id = sample_prod_ent[7][0]
end_id = sample_prod_ent[7][1]
ent = sample_prod_ent[7][2]
print(ent,':', sample_prod_name[st_id : end_id])


OS : 64 bit Windows 10


In [10]:
prod =[]
for i in range(len(data)):
    prod.append([prod_name[i], prod_ann[i]])

prod[4]

['1 TB HDD Zbook Intel Core i5 Processor (8th Gen) Laptop 8 GB DDR4  HP 35.56 cm (14 inch)  64 bit Windows 10',
 [(0, 8, 'Disk'),
  (9, 14, 'Model'),
  (15, 48, 'Processor'),
  (49, 55, 'Category'),
  (56, 66, 'RAM'),
  (67, 69, 'Brand'),
  (70, 89, 'Dim'),
  (90, 107, 'OS')]]

In [11]:
## creating a dataframe with product names and annotations
prod_data = pd.DataFrame(prod, columns = ['ProdName','Annotations'])
prod_data.head()

Unnamed: 0,ProdName,Annotations
0,4 GB DDR4 64 bit Windows 10 Ideapad 1 TB HDD ...,"[(0, 10, RAM), (11, 28, OS), (29, 36, Model), ..."
1,4 GB DDR4 Ideapad Lenovo Intel Core i3 Proces...,"[(0, 10, RAM), (11, 18, Model), (19, 25, Brand..."
2,256 GB SSD 35.56 cm (14 inch) Laptop EliteBoo...,"[(0, 10, Disk), (11, 30, Dim), (31, 37, Catego..."
3,Vostro Linux/Ubuntu Laptop Intel Core i3 Proce...,"[(0, 6, Model), (7, 19, OS), (20, 26, Category..."
4,1 TB HDD Zbook Intel Core i5 Processor (8th Ge...,"[(0, 8, Disk), (9, 14, Model), (15, 48, Proces..."


In [12]:
# converting into csv file
prod_data.to_csv('laptop_prodNames.csv', index= None)

#### b. Creating json file

In [13]:
# converting into json format
import csv
import json

csvfile = open('laptop_prodNames.csv', 'r')
jsonfile = open('laptop_prodNames.json', 'w')

fieldnames = ('ProdName', 'Annotations')
reader = csv.DictReader( csvfile, fieldnames)

for row in reader:
    json.dump(row, jsonfile)
    jsonfile.write('\n')

#### c. json to list (spacy format)

In [14]:
## function to convert json file into spacy traning data format
def convert_to_spacytrain(json_file):
    training_data = []
    lines=[]
    with open(json_file, 'r') as f:
        lines = f.readlines() # this has 416 lines
        
    for line in lines[1:400]: # loop for every product
        data = json.loads(line) # single row
        text = data['ProdName'] #this is complete phrase
        entities = data['Annotations']
        training_data.append((text, {"entities" : eval(entities)}))
        
    return training_data
    
    

In [15]:
train_data = convert_to_spacytrain('laptop_prodNames.json')
train_data

JSONDecodeError: Expecting value: line 1 column 14 (char 13)

### 3. Training NER model

In [33]:
from spacy.training.example import Example
def train_spacy(data,iterations):
    
    TRAIN_DATA = data
    nlp = spacy.blank('en')  # create blank Language class
    
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        # ner = nlp.create_pipe()
        ner = nlp.add_pipe('ner', last=True)
       

    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
                ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                #print(text, annotations)
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                nlp.update([example],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
                
            if (losses['ner'] <100):
                    break
            print(losses)
    return nlp

In [None]:
# can plot losses and choose best model
print(train_data)

In [34]:
%%time
prdnlp = train_spacy(train_data,10)

{'ner': 122.29532516133688}
Starting iteration 2
Wall time: 1min 8s


In [35]:
prdnlp_1 = train_spacy(train_data,20)

Starting iteration 0
{'ner': 1277.6230622168216}
Starting iteration 1


In [38]:
prdnlp.to_bytes()

{'paths': {'train': None, 'dev': None, 'vectors': None, 'init_tok2vec': None},
 'system': {'seed': 0, 'gpu_allocator': None},
 'nlp': {'lang': 'en',
  'pipeline': ['ner'],
  'disabled': [],
  'before_creation': None,
  'after_creation': None,
  'after_pipeline_creation': None,
  'batch_size': 1000,
  'tokenizer': {'@tokenizers': 'spacy.Tokenizer.v1'}},
 'components': {'ner': {'factory': 'ner',
   'incorrect_spans_key': None,
   'model': {'@architectures': 'spacy.TransitionBasedParser.v2',
    'state_type': 'ner',
    'extra_state_tokens': False,
    'hidden_width': 64,
    'maxout_pieces': 2,
    'use_upper': True,
    'tok2vec': {'@architectures': 'spacy.HashEmbedCNN.v2',
     'pretrained_vectors': None,
     'width': 96,
     'depth': 4,
     'embed_size': 2000,
     'window_size': 1,
     'maxout_pieces': 3,
     'subword_features': True},
    'nO': None},
   'moves': None,
   'scorer': {'@scorers': 'spacy.ner_scorer.v1'},
   'update_with_oracle_cut_size': 100}},
 'corpora': {'dev':

### 4. Testing the model

In [36]:
test_text = input("Enter your testing text: ")
doc = prdnlp(test_text)

for ent in doc.ents:

    print('Entity: ',ent.text)
    print('Details: ',ent.start_char, ent.end_char, ent.label_)

Entity:  Panasonic
Details:  0 9 Model
Entity:  Latitude
Details:  10 18 Model
Entity:  14 B5232 -
Details:  19 29 OS
Entity:  Solid
Details:  30 35 Model
Entity:  BX80621E52640)
Details:  41 55 Model
Entity:  160GB :
Details:  56 63 Disk
Entity:  Intel DVDRW | Laptop,14
Details:  64 87 Processor
Entity:  Laptop
Details:  88 94 Category
Entity:  ,
Details:  94 95 Model
Entity:  Kids /
Details:  95 101 OS
Entity:  Anodized
Details:  102 110 Brand
Entity:  Netbook -
Details:  111 120 Model
Entity:  7 8)
Details:  121 125 Model


In [37]:
test_text = input("Enter your testing text: ")
doc = prdnlp_1(test_text)

for ent in doc.ents:

    print('Entity: ',ent.text)
    print('Details: ',ent.start_char, ent.end_char, ent.label_)

Entity:  Panasonic
Details:  0 9 Model
Entity:  Latitude
Details:  10 18 Model
Entity:  14 B5232 -
Details:  19 29 Disk
Entity:  Solid
Details:  30 35 Model
Entity:  Duo (
Details:  36 41 Model
Entity:  )
Details:  54 55 Brand
Entity:  160GB :
Details:  56 63 Disk
Entity:  Intel DVDRW |
Details:  64 77 Processor
Entity:  Laptop,14
Details:  78 87 Brand
Entity:  Laptop
Details:  88 94 Category
Entity:  ,Kids
Details:  94 99 Model
Entity:  Anodized
Details:  102 110 Brand
Entity:  Netbook
Details:  111 118 Model
Entity:  7
Details:  121 122 Model
