<a href="https://colab.research.google.com/github/prith189/GLG_DL/blob/main/NER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [178]:
#The Hugging Face transformers library provides models pretrained on a large corpus; this notebook fine-tunes one of them to perform NER

In [179]:
!pip install transformers[sentencepiece]



In [180]:
# Mount Google Drive at /content/drive so FILE_DIR and the news CSV are reachable.
# force_remount=True is passed so the mount is redone even when one already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [209]:
import pandas as pd
import os
import random
from tqdm import tqdm
from pandas.core.groupby import groupby
import tensorflow as tf
import numpy as np
from transformers import AutoTokenizer, AutoConfig, TFAutoModel
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, multilabel_confusion_matrix
from gensim.summarization.textcleaner import split_sentences

In [182]:
# Hugging Face checkpoint used for the tokenizer, the config and the backbone.
PRETRAINED_MODEL_NAME = 'bert-base-uncased'
# File name (inside FILE_DIR) under which the fine-tuned weights are saved/loaded.
FINETUNED_MODEL_NAME = 'finetuned_' + PRETRAINED_MODEL_NAME
# Drive folder holding ner_dataset.csv and the fine-tuned weights.
FILE_DIR = '/content/drive/My Drive/fourthbrain/NER_Labels'
# Fixed token length: tokenizer max_length/padding target and model input width.
SEQUENCE_LENGTH = 128
# Passed to model.fit as batch_size and epochs.
BATCH_SIZE = 32
EPOCHS = 8
# True: fine-tune and save weights. False: load the previously saved weights.
RUN_TRAINING = False

In [183]:
class NER_Model:
    """Token-classification model: pretrained transformer backbone plus a softmax head.

    Typical use in this notebook: construct, call `build_model`, then either
    `train_model` (and `save_model`) or predict through `self.model`.
    """

    def __init__(self):
        # Load the pretrained backbone named by PRETRAINED_MODEL_NAME.
        self.config = AutoConfig.from_pretrained(PRETRAINED_MODEL_NAME)
        self.backbone = TFAutoModel.from_pretrained(PRETRAINED_MODEL_NAME, config=self.config)

    def _weights_path(self):
        """Location of the fine-tuned weights, shared by loading and saving."""
        return os.path.join(FILE_DIR, FINETUNED_MODEL_NAME)

    def build_model(self, num_classes, use_finetuned=False):
        """Assemble and compile the Keras model; optionally restore fine-tuned weights.

        Args:
            num_classes: width of the softmax output (number of tag classes).
            use_finetuned: when True, load weights previously saved under
                FILE_DIR/FINETUNED_MODEL_NAME after compiling.
        """
        tokens = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name='tokens', dtype=tf.int32)
        att_masks = tf.keras.layers.Input(shape=(SEQUENCE_LENGTH,), name='attention', dtype=tf.int32)

        # First element of the backbone output -- presumably the per-token hidden states.
        features = self.backbone(tokens, attention_mask=att_masks)[0]

        target = tf.keras.layers.Dropout(0.5)(features)
        target = tf.keras.layers.Dense(num_classes, activation='softmax')(target)

        self.model = tf.keras.Model([tokens, att_masks], target)

        # Targets are integer class ids, hence the sparse cross-entropy loss.
        self.model.compile(optimizer=tf.keras.optimizers.Adam(1e-5),
                           loss=tf.keras.losses.sparse_categorical_crossentropy,
                           metrics=['accuracy'])
        if use_finetuned:
            self.model.load_weights(self._weights_path())

    def train_model(self, x_data_in, x_data_att, y_data, x_data_in_val, x_data_att_val, y_data_val):
        """Fit the model for EPOCHS epochs with BATCH_SIZE batches.

        Args:
            x_data_in, x_data_att: training token ids and attention masks.
            y_data: training targets.
            x_data_in_val, x_data_att_val, y_data_val: the same three arrays for validation.

        Returns:
            The Keras History object produced by `fit` (previously discarded).
        """
        history = self.model.fit(x=[x_data_in, x_data_att],
                                 y=y_data,
                                 batch_size=BATCH_SIZE,
                                 epochs=EPOCHS,
                                 validation_data=([x_data_in_val, x_data_att_val], y_data_val))
        return history

    def save_model(self):
        """Save the model weights where `build_model(use_finetuned=True)` loads them.

        `self.model` is a plain `tf.keras.Model`, which has no `save_pretrained`
        method, so the weights are written with `save_weights` to mirror the
        `load_weights` call in `build_model`.
        """
        self.model.save_weights(self._weights_path())


In [184]:
from sklearn import preprocessing

class NERDataset:
    """Loads ner_dataset.csv and turns it into fixed-length arrays for the model.

    After `__init__`: `sentences` / `tags` hold one list of words / encoded tags
    per sentence. After `build_ner_dataset`: `input_ids`, `attention_masks`,
    `token_type_ids` (per-token word indices, despite the name) and `targets`
    each have one row of SEQUENCE_LENGTH entries per sentence.
    """

    def __init__(self):
        # Read the data file
        self.dataset = pd.read_csv(os.path.join(FILE_DIR, 'ner_dataset.csv'), encoding='ISO-8859-1')

        # Propagate each sentence id down to the following rows where it is missing.
        # Series.ffill() replaces the deprecated fillna(method="ffill").
        self.dataset["Sentence #"] = self.dataset["Sentence #"].ffill()

        # Convert tags into integer labels using a label encoder
        self.tag_encoder = preprocessing.LabelEncoder()
        self.dataset.loc[:, 'Tag'] = self.tag_encoder.fit_transform(self.dataset['Tag'])
        # Encoded id of the 'O' tag; used as the filler target for every
        # position that carries no word (special tokens, padding).
        self.background_class = self.tag_encoder.transform(['O'])[0]

        # Group once and reuse, so words and tags come from the same grouping.
        by_sentence = self.dataset.groupby("Sentence #")
        self.sentences = by_sentence["Word"].apply(list).values
        self.tags = by_sentence["Tag"].apply(list).values

    def build_ner_dataset(self, mode='train'):
        """Tokenize every sentence and build per-token targets.

        Args:
            mode: accepted for interface compatibility; not read by this method.
        """
        # Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME, normalization=True)
        self.config = AutoConfig.from_pretrained(PRETRAINED_MODEL_NAME)

        input_ids = []
        attention_masks = []
        word_ids = []
        for sentence in self.sentences:
            encoded = self.tokenizer.encode_plus(sentence,
                                                 add_special_tokens=True,
                                                 max_length=SEQUENCE_LENGTH,
                                                 is_split_into_words=True,
                                                 return_attention_mask=True,
                                                 padding='max_length',
                                                 truncation=True,
                                                 return_tensors='np')
            input_ids.append(encoded['input_ids'])
            attention_masks.append(encoded['attention_mask'])
            # word_ids(): for each token, the index of the word it came from,
            # or None for positions that belong to no word.
            word_ids.append(encoded.word_ids())
        self.input_ids = np.vstack(input_ids)
        self.attention_masks = np.vstack(attention_masks)
        self.token_type_ids = np.vstack(word_ids)

        # Expand word-level tags to token level: every sub-token inherits the
        # tag of the word it belongs to.
        self.tags_proper = []
        for sentence_tags, sentence_word_ids in zip(self.tags, self.token_type_ids):
            self.tags_proper.append([sentence_tags[i] for i in sentence_word_ids if i is not None])

        # Start with background everywhere, then write the token tags from
        # position 1 onwards (position 0 is left as background).
        self.targets = np.ones([self.input_ids.shape[0], SEQUENCE_LENGTH], dtype=np.int32) * self.background_class
        for row, token_tags in enumerate(self.tags_proper):
            self.targets[row, 1:len(token_tags) + 1] = np.array(token_tags)

    def test_train_split(self):
        """Split ids, masks, targets and word indices 80/20 with a fixed seed."""
        (self.seq_train, self.seq_test,
         self.mask_train, self.mask_test,
         self.target_train, self.target_test,
         self.word_id_train, self.word_id_test) = train_test_split(
            self.input_ids, self.attention_masks, self.targets, self.token_type_ids,
            test_size=0.20, random_state=42)
    


In [185]:
# Load the labelled corpus, tokenize it and create the train/test split.
dataset = NERDataset()
dataset.build_ner_dataset()
dataset.test_train_split()

# One output class per distinct tag seen by the label encoder.
n_classes = len(dataset.tag_encoder.classes_)

In [186]:
# Build the model. With RUN_TRAINING set it is fine-tuned and its weights are
# saved; otherwise the previously saved fine-tuned weights are loaded.
# NOTE(review): the test split doubles as validation data during training.
ner_modeler = NER_Model()
ner_modeler.build_model(n_classes, use_finetuned=not RUN_TRAINING)
if RUN_TRAINING:
    ner_modeler.train_model(
        dataset.seq_train, dataset.mask_train, dataset.target_train,
        dataset.seq_test, dataset.mask_test, dataset.target_test,
    )
    ner_modeler.model.save_weights(os.path.join(FILE_DIR, FINETUNED_MODEL_NAME))

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [187]:
# Predict on the held-out split, take the most likely class per token, and
# flatten predictions and targets so they can be scored token by token.
test_preds = ner_modeler.model.predict([dataset.seq_test, dataset.mask_test]).argmax(axis=2)
test_preds_flat = test_preds.flatten()
test_true_flat = dataset.target_test.flatten()

In [188]:
# Micro-averaged scores over every class, background included.
print('F1 Score when background class is included')
print('Precision:{}, Recall:{}, F1 Score:{}'.format(
    *(metric(test_true_flat, test_preds_flat, average='micro')
      for metric in (precision_score, recall_score, f1_score))
))

F1 Score when background class is included
Precision:0.9922868666597164, Recall:0.9922868666597164, F1 Score:0.9922868666597164


In [189]:
print('F1 Score when background class is excluded')
# Score every class except the background one. The label list is derived from
# the encoder instead of the hard-coded np.arange(0,16), which is only correct
# when there are exactly 17 classes and the background class is id 16.
entity_labels = [label for label in range(n_classes) if label != dataset.background_class]
print('Precision:{}, Recall:{}, F1 Score:{}'.format(
    precision_score(test_true_flat, test_preds_flat, labels=entity_labels, average='micro'),
    recall_score(test_true_flat, test_preds_flat, labels=entity_labels, average='micro'),
    f1_score(test_true_flat, test_preds_flat, labels=entity_labels, average='micro'),
))

F1 Score when background class is excluded
Precision:0.8387444459108706, Recall:0.8340661023251744, F1 Score:0.8363987321642045


In [190]:
#Function to test a new sample text
def test_new_text(sample_text):
    #Tokenize the sample text, and get the word ids
    encoded = dataset.tokenizer.encode_plus(sample_text,
                                        add_special_tokens = True,
                                        max_length = SEQUENCE_LENGTH,
                                        is_split_into_words=True,
                                        return_attention_mask=True,
                                        padding = 'max_length',
                                        truncation=True,return_tensors = 'np')
    input_seq = encoded['input_ids']
    att_mask = encoded['attention_mask']
    word_ids = encoded.word_ids()

    #Predict the classes for each token
    sample_out = ner_modeler.model.predict([input_seq, att_mask])
    sample_out = np.argmax(sample_out, axis=2)
    word_ids = np.array(word_ids)
    valid_sample_out = sample_out[0, word_ids!=None]
    valid_word_ids = word_ids[word_ids!=None]
    names = [sample_text[i] for i in valid_word_ids[valid_sample_out!=dataset.background_class]]
    labels = [dataset.tag_encoder.inverse_transform([i])[0] for i in valid_sample_out[valid_sample_out!=dataset.background_class]]

    #Combine the tokens and correponding labels. Output the final names and their corresponding classes
    full_names = []
    full_labels = []
    prev_index = -1
    completed = {}
    for name, label in zip(names, labels):
        if(name not in completed):
            if(label[0]=='B'):
                full_names.append(name)
                full_labels.append(label[2:])
                prev_index += 1
            else:
                full_names[prev_index] = full_names[prev_index] + ' ' + name
            completed[name] = 1
    return full_names, full_labels

In [194]:
# Smoke test: take one sentence (already a list of words) from the labelled
# dataset, print it as plain text, then print the (names, labels) found in it.
sample_text = dataset.sentences[-40000]
print(" ".join(sample_text))
print(test_new_text(sample_text))

The U.S. Senate passed the bill Thursday , after a new report showed the number of unemployed Americans signing up for benefits for the first time grew to the highest level in 16 years .
(['U.S. Senate', 'Thursday', 'Americans', '16'], ['org', 'tim', 'gpe', 'tim'])


In [192]:
# Path of the news-articles CSV read by NewsDataset.
csv_file = '/content/drive/My Drive/fourthbrain/all-the-news-2-1.csv'

In [215]:
import pandas as pd
import spacy

def display_ner(doc):
    """Render `doc` inline in the notebook using displaCy's entity ("ent") style.

    `manual=True` is forwarded to displaCy, so `doc` is presumably expected in
    displaCy's manual input format rather than as a spaCy Doc -- TODO confirm.
    """
    render_options = dict(style="ent", manual=True, jupyter=True)
    spacy.displacy.render(doc, **render_options)

class NewsDataset:
    """Loads the news CSV at `csv_file` and keeps a trimmed leading subset in `self.df`.

    Args:
        max_rows: number of leading rows to keep. Defaults to 10000, the value
            that used to be hard-coded; None keeps every row.
    """

    # Columns removed by `preprocess`.
    DROPPED_COLUMNS = ['Unnamed: 0', 'Unnamed: 0.1', 'date', 'year', 'month', 'day', 'title', 'publication']

    def __init__(self, max_rows=10000):
        self.max_rows = max_rows
        self.df = pd.read_csv(csv_file)
        self.preprocess()
        self.ner = None

    def preprocess(self):
        """Drop the unused columns and keep only the first `max_rows` rows.

        Raises:
            KeyError: if any of DROPPED_COLUMNS is missing from the frame.
        """
        # Non-mutating drop plus an explicit copy: self.df ends up as an
        # independent frame rather than a slice of the full table.
        trimmed = self.df.drop(columns=self.DROPPED_COLUMNS)
        self.df = trimmed.iloc[:self.max_rows, :].copy()

In [216]:
# Load and trim the news articles (reads csv_file).
news = NewsDataset()

  """Entry point for launching an IPython kernel.


In [226]:
#Create the end-to-end pipeline and test on new dataset
class NER_Pipeline:
    """End-to-end NER: raw text in, {entity name: entity label} out.

    Construction builds an `NERDataset` (for its tokenizer, tag encoder and
    background class) and an `NER_Model` with the fine-tuned weights loaded.
    """

    def __init__(self):
        # Load the tokenizer in the dataset
        self.dataset = NERDataset()
        self.dataset.build_ner_dataset()
        self.dataset.test_train_split()
        n_classes = self.dataset.tag_encoder.classes_.shape[0]

        # Load the finetuned model
        self.ner_modeler = NER_Model()
        self.ner_modeler.build_model(n_classes, True)

    def run_ner_on_sentence(self, sample_text):
        """Tag one pre-split sentence and group the tagged words into names.

        Args:
            sample_text: the sentence as a list of words.

        Returns:
            (full_names, full_labels): two parallel lists. Each name is one or
            more adjacent words joined by spaces; each label is the tag with
            its two character 'B-' / 'I-' prefix removed.
        """
        # Tokenize the sample text, and get the word ids
        encoded = self.dataset.tokenizer.encode_plus(sample_text,
                                                     add_special_tokens=True,
                                                     max_length=SEQUENCE_LENGTH,
                                                     is_split_into_words=True,
                                                     return_attention_mask=True,
                                                     padding='max_length',
                                                     truncation=True,
                                                     return_tensors='np')
        input_seq = encoded['input_ids']
        att_mask = encoded['attention_mask']

        # Predict the class of each token (single sentence -> row 0 of the batch)
        sample_out = self.ner_modeler.model.predict([input_seq, att_mask])
        token_classes = np.argmax(sample_out, axis=2)[0]

        background_class = self.dataset.background_class
        full_names = []
        full_labels = []
        # Words are tracked by index, not by string, so a word that occurs
        # twice in the sentence is reported twice while extra sub-tokens of a
        # single word are not.
        seen_word_ids = set()
        last_entity_word_id = None
        for word_id, class_id in zip(encoded.word_ids(), token_classes):
            if word_id is None or class_id == background_class:
                continue
            if word_id in seen_word_ids:
                # Later sub-token of a word already handled.
                continue
            seen_word_ids.add(word_id)

            label = self.dataset.tag_encoder.inverse_transform([class_id])[0]
            # A non-'B' tag extends the current name only when it directly
            # follows the previous entity word. Otherwise (including when no
            # name exists yet) it starts a new name instead of raising IndexError.
            extends_previous = label[0] != 'B' and last_entity_word_id == word_id - 1
            if extends_previous:
                full_names[-1] = full_names[-1] + ' ' + sample_text[word_id]
            else:
                full_names.append(sample_text[word_id])
                full_labels.append(label[2:])
            last_entity_word_id = word_id
        return full_names, full_labels

    def run_ner(self, full_text):
        """Split `full_text` into sentences, tag each one, and merge the results.

        Args:
            full_text: raw text (for example a whole article).

        Returns:
            dict mapping each extracted name to its label. A name found more
            than once keeps the label of its last occurrence.
        """
        # Use the argument: the previous code read the notebook-level
        # `sample_text` here and ignored `full_text`.
        names = []
        labels = []
        for sentence in split_sentences(full_text):
            snames, slabels = self.run_ner_on_sentence(sentence.split(' '))
            names.extend(snames)
            labels.extend(slabels)

        return dict(zip(names, labels))

In [224]:
# Build the pipeline: loads the labelled dataset, tokenizer and fine-tuned weights.
pipeline = NER_Pipeline()

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [217]:
# Pick one article (row position 7000 of the trimmed news frame) as demo input.
sample_text = news.df.iloc[7000]['article']

In [227]:
# Extract {name: label} pairs from the article; the dict is the cell output.
pipeline.run_ner(sample_text)

{'2013': 'tim',
 'American Civil Liberties Union,': 'org',
 'Amnesty International,': 'org',
 'Barack': 'per',
 'Barack Obama': 'per',
 'Clinton': 'per',
 'Donald Trump': 'per',
 'Edward Snowden': 'per',
 'Ewen MacAskill—': 'per',
 'General Eric Holder': 'per',
 'Guardian': 'org',
 'Hillary Clinton': 'per',
 'Hong Kong': 'geo',
 'Human Right Watch,': 'org',
 'Joseph Gordon-Levitt': 'per',
 'May': 'tim',
 'Obama': 'per',
 'Oliver Stone’s': 'per',
 'President Obama': 'per',
 'Snowden': 'per',
 'Snowden’s': 'per',
 'Snowden”': 'per',
 'Stone': 'per',
 'Toronto International Film Festival,': 'org',
 'Trump': 'per',
 'Tuesday': 'tim',
 'US': 'geo',
 'VICE’s': 'org',
 'Wednesday,': 'tim',
 '[Snowden]': 'per',
 'music.”': 'per',
 'past weekend': 'tim',
 'three': 'tim',
 '“Mr.': 'per',
 '“Snowden”': 'per'}

In [228]:
# Show the raw article text for comparison with the extracted entities.
sample_text

'Edward Snowden touched off an enormous public relations campaign on Tuesday calling for Barack Obama to grant him a presidential pardon. In an interview with Ewen MacAskill— one of The Guardian reporters who secretly met Snowden in Hong Kong three years ago in an unprecedented leak of state secrets — Snowden argued that the pardon power exists precisely for people like him. There are “things that may seem unlawful in letters on a page but when we look at them morally, when we look at them ethically,” he said, “these were vital things.” Snowden’s comments come on the eve of a “Pardon Snowden” operation by The American Civil Liberties Union, Human Right Watch, Amnesty International, and other human rights groups, as first reported by VICE’s Motherboard. On Wednesday, Oliver Stone’s new film “Snowden” starring Joseph Gordon-Levitt will also be shown in 700 theaters around the country with plans for a webinar discussion between Snowden and Stone following. This past weekend at the Toronto