In [1]:
!pip install spacy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import spacy
import random
from spacy.tokens import Doc
from spacy.util import minibatch, compounding
from spacy import displacy
from spacy.training.example import Example
from tqdm import tqdm
from pathlib import Path
import numpy as np



Note: Download 'NER.txt' file from google drive. Link:https://drive.google.com/drive/folders/1ucaAX_uKWugfeEbarVygPI0LP0OZ6iGR?usp=share_link

In [3]:
with open('NER.txt', 'r') as f:
    data = f.readlines()
# Creation of custom entity data for input text where the company entity is to be identified by the model during prediction period
TRAIN_DATA = []

sentence = ""
entities = []

for line in data:
    if line.strip() == "":
        if sentence:
            TRAIN_DATA.append([sentence, {"entities": entities}])
            sentence = ""
            entities = []
    else:
        start = 0
        end = 0
        tag = ''
        word, entity = line.strip().split()
        if entity.startswith("B-"):
            if entities and entities[-1][2:] == entity[2:]:
                entities[-1][1] += len(entities[-1][0]) + 1
            entities.append([len(sentence), len(sentence) + len(word), entity[2:]])
        elif entity.startswith("I-"):
            if entities and entities[-1][2:][0] == entity[2:]:
                entities[-1][1] += len(word) + 1
            else:
                entities.append([len(sentence), len(sentence) + len(word), entity[2:]])

        sentence += word + " "
if sentence:
    TRAIN_DATA.append([sentence, {"entities": entities}])

# Train data is a list of tweets with their corresponding list of entities which could be person or company
# Format of train data : [tweet, {'entities':[]}]
print(TRAIN_DATA)

[['Lithium - ion battery explosions are now the third leading because of fires in the city , how much carbon and toxic materials are released with each fire . Also , electric vehicle are actually fossil fuel vehicles because only 1 - 5 % are run from environmentally irresponsible green sources . ', {'entities': []}], ['When an Electric Vehicle catches fire , it can t be put out as water or foam will cause an EXPLOSION . So what happens on the freeway when there s a pile - up if all vehicles were electric ? Many many MANY needless deaths ', {'entities': []}], ['The current Mini electric vehicle is delightful , and a recent survey found its owners more satisfied than those of any other affordable EV . But it has a short range . A new version appearing next year may fix that : ', {'entities': [[12, 16, 'company']]}], ['Vietnamese EV maker VinFast remains optimistic despite challenging entry to U.S. auto market : VinFast CEO says that there is a lot of room for players in the U.S. market a

In [4]:
def train_NER_model(new_model_name, label_name, output_dir, n_iter=20):
    
    nlp = spacy.blank('en')  # create blank model to carry out NER
    print("Created blank 'en' model")
    
    # set up the nlp pipeline with only 'ner' using create function
    ner = nlp.create_pipe('ner')
    nlp.add_pipe('ner')
    ner.add_label('company')   # add new entity label i.e 'company' to entity recognizer
    
    nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
    optimizer = nlp.begin_training() # To initialize the model, thus zero out existing entity types
    
    for itn in tqdm(range(n_iter)):
      random.shuffle(TRAIN_DATA)
      losses = {}
      batch_size = compounding(4.0, 16.0, 1.001)
      batches = minibatch(TRAIN_DATA, size=batch_size)
      for batch in batches:
        examples = []
        for text, annotations in batch:
          examples.append(Example.from_dict(nlp.make_doc(text), annotations))
          nlp.update(examples, sgd=optimizer, drop=0.5,losses=losses)
      print('Losses', losses)
    
    # save the trained model to output directory
    if output_dir is not None:
      nlp.meta['name'] = new_model_name
      save_updated_NER_model(nlp, output_dir)
        
    return nlp

# save the trained model to output directory
def save_updated_NER_model(model, output_dir):
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    
    model.to_disk(output_dir)
    print("Final model saved to", output_dir)
    
# load the trained NER model    
def load_trained_NER_model(output_dir):
    # load the trained model
    print("Loading from", output_dir)
    nlp_model = spacy.load(output_dir)
    return nlp_model

In [5]:
label = 'company'

In [6]:
out_dir='/content/drive/MyDrive/Colab Notebooks/Text Mining/NERMODEL' # Give the directory path where model needs to stored
nlp_model = train_NER_model(new_model_name='NER_EV_Model', label_name=label,output_dir=out_dir, n_iter=5)

Created blank 'en' model


 20%|██        | 1/5 [00:15<01:02, 15.66s/it]

Losses {'ner': 1508.0444300602321}


 40%|████      | 2/5 [00:24<00:35, 11.73s/it]

Losses {'ner': 630.2399900411579}


 60%|██████    | 3/5 [00:34<00:22, 11.09s/it]

Losses {'ner': 357.707203223397}


 80%|████████  | 4/5 [00:45<00:10, 10.88s/it]

Losses {'ner': 371.6255257488996}


100%|██████████| 5/5 [00:54<00:00, 10.83s/it]

Losses {'ner': 268.5609066221979}
Final model saved to /content/drive/MyDrive/Colab Notebooks/Text Mining/NERMODEL





In [7]:
def test_NER_model(nlp_model, document_test, show_entities=True, style_sentence=True):
    document = nlp_model(document_test)
    if show_entities:
      for entities in document.ents:
        if(entities.label_ == 'company' and entities.text != None and entities.label_ != None):
          print(entities.label_, entities.text)
    if style_sentence:
      colors = {'company': 'linear-gradient(90deg, #aa9cfc, #fc9ce7)'}
      displacy.render(document, jupyter=True, style='ent', options={'colors': colors})

In [8]:
nlp_trained_model = load_trained_NER_model(output_dir=out_dir)

# Predict entities in test tweets
company_list = []
final_list = []
sample_docs = ['Revised EV incentives are pushing buyers to models made in North America, boosting EVs from Tesla, Chevrolet and Volkswagen to the detriment of Korean imports from Hyundai and Kia, Experian data for new-vehicle registrations shows.',
               'BRILLIANT!!! Tesla spends ZERO DOLLARS on advertising yet gaining from Twitter and other EV companies - VW, Lucid, Chevy, BMW, Nissan, GM, Ford. @Tesla @elonmusk',
               '$TSLA No other company can even come close to quality, level of craftsmanship of Tesla when it comes to EVs. Now VW, BMW, and Ford are all trying hard. $TSLA']
for test_doc in sample_docs:
  test_NER_model(nlp_trained_model, test_doc)

Loading from /content/drive/MyDrive/Colab Notebooks/Text Mining/NERMODEL
company Tesla
company Volkswagen
company Hyundai
company Kia
company Experian


company Tesla
company Lucid
company Chevy
company BMW
company Nissan
company GM
company Ford


company Tesla
company BMW
company Ford
