# Knowledge Enhanced Masked Language Model <br>


This model is inspired by the paper on Knowledge Enhaned Masked Language Modeling on Stance Detection. <br>
In this notebook a similar model is developed for Sentiment Analysis that is trained on the twitter corpus of NLTK

# Importing libraries

In [1]:
import torch
import re
import numpy as np
import pandas as pd
import math
import random
from tqdm import tqdm

In [2]:
from sklearn.model_selection import train_test_split
from datasets import load_metric
from sklearn.metrics import f1_score

In [3]:
from transformers import TrainingArguments, BertTokenizer, BertForMaskedLM, AdamW, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertTokenizer, BertForSequenceClassification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

2023-11-19 10:39:25.884898: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
from sklearn.metrics import f1_score

In [5]:
import warnings
warnings.filterwarnings("ignore")

# Importing Dataset

In [6]:
import nltk 
from nltk.corpus import twitter_samples
nltk.download(['twitter_samples'])

[nltk_data] Downloading package twitter_samples to
[nltk_data]     /Users/shivika/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

In [7]:
pos_tweets = twitter_samples.strings('positive_tweets.json')
neg_tweets = twitter_samples.strings('negative_tweets.json')

In [8]:
# Function to pre-process a tweet
def preprocess_text(text):
    text = text.replace('_', ' ')
    text = text.replace('@', '')
    text = re.sub(r'[^a-zA-Z\s:(\):-PpD]', '', text)
    return text

In [9]:
pos_tweets_processed = [preprocess_text(text) for text in pos_tweets]
neg_tweets_processed = [preprocess_text(text) for text in neg_tweets]

In [10]:
assert(len(pos_tweets)==len(neg_tweets))
train_len = int(0.8*len(pos_tweets))
df_positive_train = pd.DataFrame({'Text': pos_tweets_processed[:train_len], 'Label': 'positive'})
df_negative_train = pd.DataFrame({'Text': neg_tweets_processed[:train_len], 'Label': 'negative'})

In [11]:
df_positive_test = pd.DataFrame({'Text': pos_tweets_processed[train_len:], 'Label': 'positive'})
df_negative_test = pd.DataFrame({'Text': neg_tweets_processed[train_len:], 'Label': 'negative'})

In [12]:
df_train = pd.concat([df_positive_train, df_negative_train])
df_train = df_train.sample(frac=1).reset_index(drop=True)
df_train.insert(0, 'Tweet_ID', df_train.index)

In [13]:
df_test = pd.concat([df_positive_test, df_negative_test])
df_test = df_test.sample(frac=1).reset_index(drop=True)
df_test.insert(0, 'Tweet_ID', df_test.index)

In [14]:
df_train.to_csv("./df_train.csv",index=False)

In [15]:
df_positive_train.insert(0, 'Tweet_ID', df_positive_train.index)
df_negative_train.insert(0, 'Tweet_ID', df_negative_train.index)

In [16]:
df_positive_train.to_csv("./df_positive_train.csv",index=False)
df_negative_train.to_csv("./df_negative_train.csv",index=False)

# Finding Sentiment Tokens

Source code to identify tokens important for sentiment analysis: https://github.com/kornosk/log-odds-ratio<br>

The above references repository computes the tokens most 'important' to the datasets using the Log-Odds Ratio. <br>
It returns a z_scores.txt file where 
- the top 'k' words are k most important words for one sentiment and 
- bottom k words are k most important words for the other sentiment <br>
(Order depends on which dataset is given first for computation) <br>

A set of sentiment tokens are thus creating using these words

In [17]:
ordered_token_list = []
with open('./z_scores.txt', 'r') as file:
    for line in file:
        token = line.strip()
        token = token.split(",")[0]
        ordered_token_list.append(token)

In [18]:
k = 40
sentiment_token_list = ordered_token_list[:k]
sentiment_token_list.extend(ordered_token_list[:-k-1:-1])

In [19]:
sentiment_token_list = set(sentiment_token_list)

# Pre-training MLM

In this step we strategically mask tokens in input tweets and pre-train a BERT model to make it more 'aware' of sentiment classification.

## Initialise Model

In [20]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [21]:
text = df_train["Text"][:100].tolist()

In [22]:
inputs = tokenizer(text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

In [23]:
inputs

{'input_ids': tensor([[  101, 22827,  9247,  ...,     0,     0,     0],
        [  101, 13698,  7559,  ...,     0,     0,     0],
        [  101, 16590, 17063,  ...,     0,     0,     0],
        ...,
        [  101, 16985,  5937,  ...,     0,     0,     0],
        [  101,  3582, 27439,  ...,     0,     0,     0],
        [  101,  2000,  2035,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [24]:
inputs['labels'] = inputs.input_ids.detach().clone()

In [25]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

## Masking using Sentiment Tokens

In [26]:
df_train_masked = df_train.copy()

We mask 15% tokens in input tweet, giving sentiment tokens a higher preference as they better represent the downstream task.


In [27]:
masked_tweets = []
for index, row in tqdm(df_train_masked.iterrows()):
  text = row["Text"]
  masked = 0
  masked_text = []
  for token in text.split(" "):
    if token in sentiment_token_list and masked <= 0.15*len(text.split(" ")):
      masked_text.append("[MASK]")
      masked += 1
    else:
      masked_text.append(token)

  indices = []
  for index, w in enumerate(masked_text):
      if w != "[MASK]":
          indices.append(index)

  while masked < 0.15*len(text.split(" ")):
    random_index = random.choice(indices)
    masked_text[random_index] = "[MASK]"
    indices.remove(random_index)
    masked += 1

  masked_text = ' '.join(masked_text)
  masked_tweets.append(masked_text)

8000it [00:00, 42102.82it/s]


In [28]:
df_train_masked = df_train.copy()
df_train_masked['Text'] = masked_tweets

In [29]:
df_train_masked

Unnamed: 0,Tweet_ID,Text,Label
0,0,Kandelamison Youre [MASK] [MASK] for followin...,positive
1,1,alpstart Powys close to the Shropshire border ...,positive
2,2,SkyeSOS we are friends [MASK] now [MASK] hahaha,positive
3,3,taesprout god [MASK] so [MASK] i hope you are ...,negative
4,4,selenagcyrus [MASK],negative
...,...,...,...
7995,7995,BIG thx bikechainricci [MASK] kind support SH...,positive
7996,7996,[MASK] I have in the past A hoodie covers a [M...,positive
7997,7997,Jumma Mubarik to all of you [MASK] Stay Happy ...,positive
7998,7998,I [MASK] his massages :(,negative


In [30]:
masked_text = df_train_masked["Text"][:100].tolist()

In [31]:
sentiment_tokenized = tokenizer(masked_text, return_tensors='pt', max_length=512, truncation=True, padding='max_length')

In [32]:
inputs.input_ids = sentiment_tokenized.input_ids

## Pre-Train

In [33]:
inputs.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])

In [34]:
# This will compute loss only for [MASK] tokens
for i in range(inputs.input_ids.shape[0]):
    for j in range(inputs.input_ids.shape[1]):
      if inputs.input_ids[i, j] != tokenizer.mask_token_id:
        inputs.labels[i, j] = -100

In [35]:
class Pretraining_Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings
    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
    def __len__(self):
        return len(self.encodings.input_ids)

In [36]:
dataset = Pretraining_Dataset(inputs)

In [37]:
loader = torch.utils.data.DataLoader(dataset, batch_size=5, shuffle=True)

In [38]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [39]:
model.train()
optim = AdamW(model.parameters(), lr=2e-3)

In [40]:
epochs = 1

for epoch in range(epochs):
    loop = tqdm(loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

Epoch 0: 100%|███████████████████████| 20/20 [14:12<00:00, 42.64s/it, loss=6.47]


In [41]:
model.save_pretrained("./")

# Fine Tuning BERT

In [42]:
fine_tuned_model = BertForSequenceClassification.from_pretrained("./", num_labels=2)

Some weights of the model checkpoint at ./ were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./ and are newly initialized: ['classifier

In [43]:
metric_1 = load_metric("f1")
metric_2 = load_metric("accuracy")

Using the latest cached version of the module from /Users/shivika/.cache/huggingface/modules/datasets_modules/metrics/f1/1ae7ede7c974ce472c931da73bdb2e3f0e1044a996d2348b328773d44dd1847b (last modified on Sun Sep 17 11:35:25 2023) since it couldn't be found locally at f1, or remotely on the Hugging Face Hub.


In [44]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric_1.compute(predictions=predictions, references=labels, average="weighted")

In [45]:
f1_metric = load_metric("f1")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return f1_metric.compute(predictions=predictions, references=labels)

In [46]:
X_train = df_train["Text"][:1200].tolist()
y_train = df_train["Label"][:1200].tolist()
y_train = [0 if item == "positive" else 1 for item in y_train]

X_val = df_test["Text"][:300].tolist()
y_val = df_test["Label"][:300].tolist()
y_val = [0 if item == "positive" else 1 for item in y_val]

X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val, padding=True, truncation=True, max_length=512)

In [47]:
X_train_tokenized.keys()

dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

In [48]:
len(X_train),len(X_val)

(1200, 300)

In [49]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [50]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [51]:
train_dataset[5]

{'input_ids': tensor([  101,  1062,  3593, 10930,  2243,  2474,  2232,  2156,  2017,  1024,
          1052,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0]),
 'labels': tensor(0)}

In [55]:
args = TrainingArguments(
    output_dir="output",
    num_train_epochs=5,
    per_device_train_batch_size=10

)

args.optimizers = AdamW(
    fine_tuned_model.parameters(),
    lr=2e-3,
    eps=1e-7,
    weight_decay=0.01
)


trainer = Trainer(
    model=fine_tuned_model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

In [56]:
trainer.train()

Step,Training Loss
500,0.6988


TrainOutput(global_step=600, training_loss=0.698488515218099, metrics={'train_runtime': 3551.4066, 'train_samples_per_second': 1.689, 'train_steps_per_second': 0.169, 'total_flos': 151083301320000.0, 'train_loss': 0.698488515218099, 'epoch': 5.0})

In [57]:
trainer.evaluate()

{'eval_loss': 0.6932433843612671,
 'eval_f1': 0.0,
 'eval_runtime': 51.6973,
 'eval_samples_per_second': 5.803,
 'eval_steps_per_second': 0.735,
 'epoch': 5.0}