In [1]:
import pandas as pd
import numpy as np
import spacy 
import random
import time

import csv
import json

import re

In [2]:
df = pd.read_csv('data/ebay_clean.csv', index_col = None)
df.columns
df = df.drop(columns ='Unnamed: 0')
df['Category'] = 'Laptop'
df.head()

Unnamed: 0,ProductBrand,ProductVer,Processor,RAM,OS,Disk,Dim,Category
0,Lenovo,Ideapad,Intel Core i3 Processor (7th Gen),4 GB DDR4,64 bit Windows 10,1 TB HDD,39.62 cm (15.6 inch),Laptop
1,HP,Envy,Intel Core i3 Processor (7th Gen),8 GB DDR4,64 bit Windows 10,256 GB SSD,35.56 cm (14 inch),Laptop
2,Dell,Vostro,Intel Core i3 Processor (8th Gen),4 GB DDR4,Linux/Ubuntu,1 TB HDD,35.56 cm (14 inch),Laptop
3,HP,EliteBook,Intel Core i3 Processor (7th Gen),4 GB DDR4,64 bit Windows 10,1 TB HDD,39.62 cm (15.6 inch),Laptop
4,Lenovo,Ideapad,Intel Core i3 Processor (7th Gen),4 GB DDR4,64 bit Windows 10,1 TB HDD,39.62 cm (15.6 inch),Laptop


### Tokenizing the data

In [3]:
# Entities 
entity = df.columns
num_entity = len(df.columns)
entity_list = list(np.arange(num_entity))
entity_list

[0, 1, 2, 3, 4, 5, 6, 7]

In [4]:
#creating phrases from entities
LaptopName = []
Annotations  = []

for i in range(len(df)):
    ind_list = random.sample(entity_list, num_entity)  #shuffle list
    phrase =  []
    ann = []
    ann_ind = 0
    for j in range(num_entity):
        col = ind_list[j]
        val = df.iloc[i, col]
        phrase.append(val)
        ann.append((ann_ind, len(val) + ann_ind, entity[col]))
        ann_ind = ann_ind + len(val) + 1
        
    LaptopName.append(' '.join(phrase))
    Annotations.append(ann)

In [5]:
LaptopName[0]

'4 GB DDR4  Laptop 39.62 cm (15.6 inch)  64 bit Windows 10 Intel Core i3 Processor (7th Gen) 1 TB HDD Lenovo Ideapad'

In [6]:
Annotations[0]

[(0, 10, 'RAM'),
 (11, 17, 'Category'),
 (18, 39, 'Dim'),
 (40, 57, 'OS'),
 (58, 91, 'Processor'),
 (92, 100, 'Disk'),
 (101, 107, 'ProductBrand'),
 (108, 115, 'ProductVer')]

In [7]:
#finding brand from tokens
entity_check = Annotations[0][4][2]
entity_id = Annotations[0][4][1]
name = Annotations[0][4][0]

print(entity_check, ':',LaptopName[0][name:entity_id] )

Processor : Intel Core i3 Processor (7th Gen)


In [8]:
#Preparing dataset of tokens
data = []
for i in range(len(df)):
    data.append([LaptopName[i], Annotations[i]])
    
LaptopData = pd.DataFrame(data, columns = ['Description', 'Annotations'])

In [9]:
LaptopData.head()

Unnamed: 0,Description,Annotations
0,4 GB DDR4 Laptop 39.62 cm (15.6 inch) 64 bit...,"[(0, 10, RAM), (11, 17, Category), (18, 39, Di..."
1,35.56 cm (14 inch) Laptop 8 GB DDR4 Envy HP...,"[(0, 19, Dim), (20, 26, Category), (27, 37, RA..."
2,Vostro 1 TB HDD 35.56 cm (14 inch) Laptop Del...,"[(0, 6, ProductVer), (7, 15, Disk), (16, 35, D..."
3,EliteBook Intel Core i3 Processor (7th Gen) 3...,"[(0, 10, ProductVer), (11, 44, Processor), (45..."
4,4 GB DDR4 Lenovo Laptop 39.62 cm (15.6 inch) ...,"[(0, 10, RAM), (11, 17, ProductBrand), (18, 24..."


In [10]:
LaptopData.to_csv('data/Desc_anno.csv', index= None)

### Creating json file

In [11]:
file = open('data/Desc_anno.csv', 'r')
json_file = open('data/Laptop_Description.json', 'w')

columns = ('Description', 'Annotations')
reader = csv.DictReader(file, columns)

for row in reader:
    json.dump(row, json_file)
    json_file.write('\n')

### Preparing Training Data

In [12]:
def create_train(file):
    try:
        training_data = []
        data = []
        with open(file, 'r') as f:
            data = f.readlines()
        print("Length of complete dataset : ",len(data))
        m = int(len(data)) # length of complete data
        training_len = round(m * 0.8) # length of training data
        print("Length of training dataset : ",training_len)
        
        invalid_span_tokens = re.compile(r'\s')
        
        for line in data[1:training_len]:
            row = json.loads(line)
            desc  = row['Description']  #Laptop description
            entity = row['Annotations']  # Annontation of description
            valid_entities = []
            for i in eval(entity):
                valid_start, valid_end, label = i[0],i[1],i[2]
                while valid_start < len(desc) and invalid_span_tokens.match(desc[valid_start]):
                    valid_start += 1
                while valid_end > 1 and invalid_span_tokens.match(desc[valid_end - 1]):
                    valid_end -= 1
                valid_entities.append([valid_start, valid_end, label])
            training_data.append((desc, {"entities" : valid_entities}))
        
        return training_data
        
                
    except Exception as e:
        logging.exception("Unable to process " + file + '\n' + "error = " + str(e))
        return None
    
                

In [13]:
training_data = []
data = []
with open(input_file, 'r') as f:
    data = f.readlines()
m = int(len(data))
training_len = round(m * 0.8)

'''for line in data[1:training_len]:
    row = json.loads(line)
    desc  = row['Description']
    entity = row['Annotations']
    print(entity)
    valid_entities = []
    for ent in entity.split(','):
        print(ent)'''
invalid_span_tokens = re.compile(r'\s')
row = json.loads(data[1])
#print(row)
desc  = row['Description']
entity = row['Annotations']
#print(entity)
valid_entities = []
for i in eval(entity):
    valid_start , valid_end , label = i[0],i[1],i[2]
    while valid_start < len(desc) and invalid_span_tokens.match(desc[valid_start]):
        valid_start += 1
    while valid_end > 1 and invalid_span_tokens.match(desc[valid_end - 1]):
        valid_end -= 1
    valid_entities.append([valid_start, valid_end, label])
    print(valid_entities)


NameError: name 'input_file' is not defined

In [14]:
input_file = 'data/Laptop_Description.json'
training_data = create_train(input_file)
training_data[0]

Length of complete dataset :  416
Length of training dataset :  333


('4 GB DDR4  Laptop 39.62 cm (15.6 inch)  64 bit Windows 10 Intel Core i3 Processor (7th Gen) 1 TB HDD Lenovo Ideapad',
 {'entities': [[0, 9, 'RAM'],
   [11, 17, 'Category'],
   [18, 38, 'Dim'],
   [40, 57, 'OS'],
   [58, 91, 'Processor'],
   [92, 100, 'Disk'],
   [101, 107, 'ProductBrand'],
   [108, 115, 'ProductVer']]})

### Training NER model

In [16]:
def training(train_data, iterations):
    
    nlp = spacy.blank('en') #blank language class
    
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

    
    # add labels
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    
    # disable other pips name
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for i in range(iterations):
            print("Starting iteration " + str(i))
            random.shuffle(train_data)
            losses = {}
            for desc, annotations in training_data:
                nlp.update(
                    [desc],
                    [annotations],
                    drop = 0.2,
                    sgd = optimizer,
                    losses = losses
                )
            #if (losses['ner'] < 50):
            #    break
            print(losses)
    return nlp
    

In [17]:
%%time
nlp_1 = training(training_data, 10)

  proc.begin_training(


Starting iteration 0
{'ner': 1933.5915573758327}
Starting iteration 1
{'ner': 190.12729213023496}
Starting iteration 2
{'ner': 71.32842122774716}
Starting iteration 3
{'ner': 100.84694997425362}
Starting iteration 4
{'ner': 63.450267644574296}
Starting iteration 5
{'ner': 74.00841993304392}
Starting iteration 6
{'ner': 54.06889538502306}
Starting iteration 7
{'ner': 77.30535836653793}
Starting iteration 8
{'ner': 75.6625128423183}
Starting iteration 9
{'ner': 85.10703722400429}
Wall time: 11min 3s


In [18]:
nlp_2 = training(training_data, 30)

Starting iteration 0
{'ner': 1851.2582731855505}
Starting iteration 1
{'ner': 124.47345139901309}
Starting iteration 2
{'ner': 69.23241189434714}
Starting iteration 3
{'ner': 116.61889072746945}
Starting iteration 4
{'ner': 51.460620769704136}
Starting iteration 5
{'ner': 70.04433799400528}
Starting iteration 6
{'ner': 64.04877992113097}
Starting iteration 7
{'ner': 71.2596126969744}
Starting iteration 8
{'ner': 78.6439628091487}
Starting iteration 9
{'ner': 56.25349386714585}
Starting iteration 10
{'ner': 86.57250023254396}
Starting iteration 11
{'ner': 44.38575762916058}
Starting iteration 12
{'ner': 62.15800921292055}
Starting iteration 13
{'ner': 36.557689213875115}
Starting iteration 14
{'ner': 89.09271829112207}
Starting iteration 15
{'ner': 46.97607737096716}
Starting iteration 16
{'ner': 46.64348932113302}
Starting iteration 17
{'ner': 31.64071704456642}
Starting iteration 18
{'ner': 35.46992082918963}
Starting iteration 19
{'ner': 60.72419901744476}
Starting iteration 20
{'ner

### Testing the model

In [19]:
test_val = str(input("Enter your testing text: "))

Enter your testing text: 256 GB SSD and Intel Core i7 Processor (8th Gen) HP


In [21]:
predict_data = nlp_1(test_val)

for i in predict_data.ents:
    print('Entity :', i.text)
    print('Details: ',i.start_char, i.end_char, i.label_)

Entity : 256 GB SSD
Details:  0 10 Disk
Entity : and
Details:  11 14 OS
Entity : Intel Core i7 Processor (8th Gen)
Details:  15 48 Processor
Entity : HP
Details:  49 51 ProductBrand


In [22]:
predict_data2 = nlp_2(test_val)

for i in predict_data2.ents:
    print('Entity :', i.text)
    print('Details: ',i.start_char, i.end_char, i.label_)

Entity : 256 GB SSD
Details:  0 10 Disk
Entity : and
Details:  11 14 ProductBrand
Entity : Intel Core i7 Processor (8th Gen)
Details:  15 48 Processor
Entity : HP
Details:  49 51 ProductBrand
