In [None]:
!pip install datasets

In [None]:
!pip install seqeval

In [3]:
#Importing the required libraries

import os
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from datasets import load_dataset
from collections import Counter
from seqeval.metrics import precision_score, recall_score, f1_score, classification_report

In [4]:
#The conll dataset (2003) is used for training and validating the model
dataset = load_dataset("conll2003", revision="master")

Downloading builder script:   0%|          | 0.00/2.58k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

Downloading and preparing dataset conll2003/conll2003 (download: 959.94 KiB, generated: 9.78 MiB, post-processed: Unknown size, total: 10.72 MiB) to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee...


Downloading data:   0%|          | 0.00/983k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14042 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3251 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3454 [00:00<?, ? examples/s]

Dataset conll2003 downloaded and prepared to /root/.cache/huggingface/datasets/conll2003/conll2003/1.0.0/63f4ebd1bcb7148b1644497336fd74643d4ce70123334431a3c053b7ee4e96ee. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
#Creating all the building blocks required for constructing the model

#Dropout layer is used to avoid overfitting the model
def layer_dropout(rate):
  dropout = keras.layers.Dropout(rate)
  return dropout;

def layer_feedforward(seq_input, key_dim):
  feedforward = keras.Sequential(
      [
          keras.layers.Dense(seq_input, activation="relu"),
          keras.layers.Dense(key_dim),
      ]
  )
  return feedforward;

#Used as the activation function of hidden layers
#It is also possible to use leaky relu to avoid the dying neuron problem
def activation_relu(seq_input):
  feedforward = layers.Dense(seq_input, activation="relu")
  return feedforward

#Used as the activation function of final layer
def activation_softmax(seq_input):
  feedforward = layers.Dense(seq_input, activation="softmax")
  return feedforward

#Used for the multihead attention layer of the transformer model
def layer_attention(num_heads, key_dim):
  attn = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=key_dim)
  return attn

#Used for normalizing the addition of the input keys and the output of the attention layer in the transformer
def normalization():
  ln = keras.layers.LayerNormalization(epsilon=1e-3)
  return ln

#Used for creating word embeddings from the words
def word_embedding(input_dim_token, input_dim_pos, output_dim_token, output_dim_pos):
  embedding_token = keras.layers.Embedding(input_dim=input_dim_token, output_dim=output_dim_token)
  embedding_pos = keras.layers.Embedding(input_dim=input_dim_pos, output_dim=output_dim_pos)
  return embedding_token, embedding_pos

In [6]:
#Based on the paper "attention is all you need". In this code section, a transformer model is created which will help the model to learn the relationship between
# each word and the other words that are used in the sentence.

class model_transformer(layers.Layer):
    def __init__(self, embed_dim, num_heads, seq_input, rate=0.1):
        super(model_transformer, self).__init__()
        self.attn =              layer_attention(num_heads=num_heads, key_dim=embed_dim)
        self.ln =                normalization()
        self.feedforward =       layer_feedforward(seq_input, embed_dim)
        self.dropout =           layer_dropout(rate)

    def call(self, inputs, training=False):
        attn_output =            self.attn(inputs, inputs)
        attn_output =            self.dropout(attn_output, training=training)
        ln_out =                 self.ln(inputs + attn_output)
        feedforward_output =     self.feedforward(ln_out)
        feedforward_output =     self.dropout(feedforward_output, training=training)
        first_layer_out =        self.ln(ln_out + feedforward_output)

        return self.ln(ln_out + feedforward_output)

In [7]:
#Since BERT has shown to produce promising results, it makes sense to try to make the model more similar to it and capture more semantic relationships
#in the model. Since BERT uses more encoders (either 12 or 24 encoders), in this work the number of encoders are set to 4 to make the model more effective
# than a model that only uses one.

class name_entity_recognition_model(keras.Model):
    def __init__(
        self, num_tags, vocab_size, num_encoders, maxlen=128, embed_dim=32, num_heads=2, seq_input=32, rate=0.1
    ):
        super(name_entity_recognition_model, self).__init__()
        self.num_encoders = num_encoders
        self.embedding_token, self.embedding_pos =  word_embedding(vocab_size, maxlen, embed_dim, embed_dim)
        self.transformer =                          model_transformer(embed_dim, num_heads, seq_input)
        self.dropout =                              layer_dropout(rate)
        self.feedforward_relu =                     activation_relu(seq_input)
        self.feedforward_softmax =                  activation_softmax(num_tags)

    def call(self, inputs, training=False):
      
      maxlen = tf.shape(inputs)[-1]
      positions = tf.range(start=0, limit=maxlen, delta=1)
      position_embeddings = self.embedding_pos(positions)
      token_embeddings = self.embedding_token(inputs)
      embed_out = token_embeddings + position_embeddings

      transformer_out = self.transformer(embed_out)

      for i in range(self.num_encoders - 1):
        transformer_out = self.transformer(transformer_out)
      
      transformer_out = self.dropout(transformer_out, training=training)

      relu_out = self.feedforward_relu(transformer_out)
      relu_out = self.dropout(relu_out, training=training)
      softmax_out = self.feedforward_softmax(relu_out)
      return softmax_out

In [8]:
#Since we only need name, organization name, and location, the extra tags have been changed to 'O'
tag_dict = {0: '[PAD]', 1: 'O', 2: 'B-PER', 3: 'I-PER', 4: 'B-ORG', 5: 'I-ORG', 6: 'B-LOC', 7: 'I-LOC', 8: 'O', 9: 'O'}

In [9]:
def data_file_creation(file_path, input_data):
    with open(file_path, "w") as f:
        for record in input_data:
            ner_tags = record["ner_tags"]
            tokens = record["tokens"]
            f.write(
                str(len(tokens))
                + "\t"
                + "\t".join(tokens)
                + "\t"
                + "\t".join(map(str, ner_tags))
                + "\n"
            )

In [10]:
os.mkdir("data")
data_file_creation("./data/training_data.txt", dataset["train"])
data_file_creation("./data/validation_data.txt", dataset["validation"])

In [11]:
t = dataset["train"]["tokens"]
all_tokens = [item for sublist in t for item in sublist]
lower_all_tokens = [x.lower() for x in all_tokens]
array_all_tokens = np.array(lower_all_tokens)
counter = Counter(array_all_tokens)
num_tags = len(tag_dict)
vocabulary_size = 20000
vocabulary = [token for token, count in counter.most_common(vocabulary_size - 2)]

In [12]:
#Getting the training and validation data
train_data = tf.data.TextLineDataset("./data/training_data.txt")
val_data = tf.data.TextLineDataset("./data/validation_data.txt")
training_data = train_data.take(len(list(train_data.map(lambda x: x)))-2)
validation_data = val_data.take(len(list(val_data.map(lambda x: x)))-2)

In [13]:
#Using Keras string lookup to convert strings to integers
layer_string_lookup = keras.layers.StringLookup(vocabulary=vocabulary)

In [14]:
def record_mapping(record):
    record = tf.strings.split(record, sep="\t")
    length = tf.strings.to_number(record[0], out_type=tf.int32)
    tokens = record[1 : length + 1]
    tokens = tf.strings.lower(tokens)
    tokens = layer_string_lookup(tokens)
    tags = record[length + 1 :]
    tags = tf.strings.to_number(tags, out_type=tf.int64)
    tags += 1
    return tokens, tags

In [15]:
#The input of the model is of a fixed size. However, the inputs given to the model don't have the same size.
#Therefore, padding is performed for the inputs that have a lower number of tokens than what the model's expected input is.
batch_size = 16
def prepare_data(dttf):
  res = dttf.map(record_mapping)
  res = res.padded_batch(batch_size)
  return res
training_dataset = prepare_data(training_data)
validation_dataset = prepare_data(validation_data)

In [16]:
#Specifying the number of encoders used for the model(ex:4)
ner_model = name_entity_recognition_model(num_tags, vocabulary_size, num_encoders=4, embed_dim=32, num_heads=4, seq_input=64)

In [17]:
#Creating a customized loss functions since padding has been performed
class loss_function_customized(keras.losses.Loss):
    def __init__(self, name="custom_ner_loss"):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        loss_fn = keras.losses.SparseCategoricalCrossentropy(
            from_logits=True, reduction=keras.losses.Reduction.NONE
        )
        loss = loss_fn(y_true, y_pred)
        mask = tf.cast((y_true > 0), dtype=tf.float32)
        loss = loss * mask
        return tf.reduce_sum(loss) / tf.reduce_sum(mask)


loss = loss_function_customized()

In [18]:
#Model training
ner_model.compile(optimizer="adam", loss=loss)
ner_model.fit(training_dataset, epochs=12)

text = "She is also a pediatric cardiologist with Children's Healthcare of Atlanta"
tokens = text.split()
tokens = tf.strings.lower(tokens)
sample_input = layer_string_lookup(tokens)
sample_input = tf.reshape(sample_input, shape=[1, -1])
output = ner_model.predict(sample_input)
prediction = np.argmax(output, axis=-1)[0]
prediction = [tag_dict[i] for i in prediction]
print(prediction)

Epoch 1/10


  return dispatch_target(*args, **kwargs)


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
tf.Tensor([[   0   29   82    7    0    0   23    0 9853    4  470]], shape=(1, 11), dtype=int64)
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC']


In [19]:
tag_ids_actual_all, tag_ids_prediction_all = [], []

for x, y in validation_dataset:
    output = ner_model.predict(x)
    predictions = np.argmax(output, axis=-1)
    predictions = np.reshape(predictions, [-1])
    tag_ids_actual = np.reshape(y, [-1])
    masked = (tag_ids_actual > 0) & (predictions > 0)
    tag_ids_actual = tag_ids_actual[masked]
    tag_ids_prediction = predictions[masked]
    tag_ids_actual_all.append(tag_ids_actual)
    tag_ids_prediction_all.append(tag_ids_prediction)

tag_ids_actual_all = np.concatenate(tag_ids_actual_all)
tag_ids_prediction_all = np.concatenate(tag_ids_prediction_all)

In [20]:
#Since most words get the 'O' tag, accuracy is not a good measurement of the model's performance. Thus, precision, recall, and f1-score are used

real_tags = [[tag_dict[tag] for tag in tag_ids_actual_all]]
predicted_tags = [[tag_dict[tag] for tag in tag_ids_prediction_all]]
print ("predicted tags = ", predicted_tags)
print("real tag = ", real_tags)
print(classification_report(real_tags, predicted_tags))

predicted tags =  [['O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-ORG', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'B-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'B-LO

Web Scraping

In [21]:
from bs4 import BeautifulSoup as soup
import requests

In [22]:
#Web scraping NBC news for COVID-19 related news
nbc_url = 'https://www.nbcnews.com/health/health-news/covid-boosters-fda-advisers-meet-discuss-shots-need-rcna22824'

In [23]:
req = requests.get(nbc_url)

In [24]:
base = soup(req.content, 'html.parser')

In [25]:
object_b = base
div = object_b.find('div',{'class':'article-body__content'})
ps = div.findAll('p')
paragraphs = ''
for p in ps:
  if(p.string != None):
    paragraphs += p.string.strip() + ' '
print(paragraphs)

Food and Drug Administration advisers will meet Wednesday to hash out what the future of Covid-19 boosters looks like in the United States. Full coverage of the Covid-19 pandemic The Covid vaccination schedule in the U.S. has so far been determined in real time. When vaccines were first available, it was thought two doses of the Pfizer-BioNTech or the Moderna vaccines, or one dose of the Johnson & Johnson, would be sufficient. As the pandemic evolved, however, new variants emerged and immunity waned, leading to the need for boosters. But the U.S. never led the way on boosters, instead looking to other countries, including Israel and the United Kingdom, for data on when to boost. The hope, experts say, is that by the end of Wednesday's meeting, U.S. regulators will have a clearer picture of how to think about Covid boosters moving forward. The meeting doesn't include a formal vote on recommendations, though the discussion is expected to influence the agency’s thinking on the future of C

In [26]:
def lookup_lower_token(tokens):
    tokens = tf.strings.lower(tokens)
    return layer_string_lookup(tokens)

def tokenization(text):
    tokens = text.split()
    return lookup_lower_token(tokens)

In [27]:
answer = []
text_list = paragraphs.split('.')
print(text_list)
for i in range (len(text_list)):
  sample_input = tokenization(text_list[i])
  sample_input = tf.reshape(sample_input, shape=[1, -1])
  output = ner_model.predict(sample_input)
  prediction = np.argmax(output, axis=-1)[0]
  prediction = [tag_dict[i] for i in prediction]
  answer.append(prediction)

['Food and Drug Administration advisers will meet Wednesday to hash out what the future of Covid-19 boosters looks like in the United States', ' Full coverage of the Covid-19 pandemic The Covid vaccination schedule in the U', 'S', ' has so far been determined in real time', ' When vaccines were first available, it was thought two doses of the Pfizer-BioNTech or the Moderna vaccines, or one dose of the Johnson & Johnson, would be sufficient', ' As the pandemic evolved, however, new variants emerged and immunity waned, leading to the need for boosters', ' But the U', 'S', ' never led the way on boosters, instead looking to other countries, including Israel and the United Kingdom, for data on when to boost', " The hope, experts say, is that by the end of Wednesday's meeting, U", 'S', ' regulators will have a clearer picture of how to think about Covid boosters moving forward', " The meeting doesn't include a formal vote on recommendations, though the discussion is expected to influence th

In [None]:
import pickle
with open('answer.dat','wb') as file:
  pickle.dump(answer, file)
  file.close()
with open('text_list.dat','wb') as file:
  pickle.dump(text_list, file)
  file.close()

This section is for storing the results in database

In [None]:
!pip install psycopg2

In [None]:
#These commands need to get executed in Postgresql locally

# create database ner_data;
# CREATE TABLE data ( id serial PRIMARY KEY, content VARCHAR NOT NULL, ners VARCHAR);

In [None]:
#Executed locally
import psycopg2
with open('answer.dat','rb') as file:
  answer = pickle.load(file)
  
with open('text_list.dat','rb') as file:
  text_list = pickle.load(file)

#Your own password
conn = psycopg2.connect(dbname="ner_data", user='postgres', password='password', host='localhost', port= '5432')
conn.autocommit = True
cursor = conn.cursor()

for i,j in zip(answer, text_list):
    query = '''INSERT INTO data(content, ners) VALUES (%s, %s)'''
    if len(i) > 0:
        data = (j,(",").join(i))
        print(i,j)
        cursor.execute(query, data)

conn.commit()
print("Records inserted........")

conn.close()