# Word Sense Disambiguation (WSD)
### Sam Timmins, Alex Cerpa, Kas Taghavi

### Preprocessing

In [None]:
!pip install evaluate

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [2]:
import pandas as pd
import re

def remove_quotes(line):
    # Remove starting and ending quotes.
    if line.startswith('"'):
        line = line[1:]
    if line.endswith('"'):
        line = line[:-1]
    return line

def preprocess_sentence(s, word, senses):
    # Add the word and its senses to the input for the model.
    s += f' [SEP] {word}'
    for sense in senses:
        s += f' [SEP] {sense}'
    return s


def parse_file_to_df(filename):
    with open(filename) as f:
        lines = [remove_quotes(line.strip()) for line in f.readlines()]
        word = lines[0]
        senses = []
        
        # Read senses
        i = 2
        for i in range(2, len(lines)):
            if not re.search(r'^[0-9]:? \([a-z]+\)', lines[i]):
                break
            else:
                senses.append(lines[i])
        
        curr_sense = 1
        sentences = []
        sense = []
        # Read sentences
        for i in range(i, len(lines)):
            if not lines[i]:
                continue
            if re.match(r'[0-9]', lines[i]):
                curr_sense = int(lines[i])
            else:
                s = lines[i].strip()
                sentences.append(preprocess_sentence(s, word ,senses))
                sense.append(curr_sense - 1)
            
        
        
        return senses, pd.DataFrame({"sentence": sentences, "sense": sense})

In [11]:
rubbish_senses, rubbish_df = parse_file_to_df('rubbish.txt')
tissue_senses, tissue_df = parse_file_to_df('tissue.txt')
yarn_senses, yarn_df = parse_file_to_df('yarn.txt')

words = ['rubbish', 'tissue', 'yarn']
dfs = [rubbish_df, tissue_df, yarn_df]

In [13]:
for df in dfs:
    df['sentence'] = df['sentence'].str.replace('[^\w\s\[\]]','', regex=True)
    df['sentence'] = df['sentence'].str.strip()

In [14]:
dfs[0].iloc[0]['sentence']

'There was a thick film of dust on every exposed surface rubbish and the carcass of some small animal had liecn swept carelessly into a corner [SEP] Rubbish [SEP] 1 n rubbish trash scrap worthless material that is to be disposed of [SEP] 2 n folderol rubbish tripe trumpery trash wishwash applesauce codswallop nonsensical talk or writing'

### Training

In [268]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(rubbish[0])).to("cuda" if torch.cuda.is_available() else "cpu")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

In [16]:
from sklearn.model_selection import train_test_split
from tensorflow import keras

from torch.utils.data import Dataset

class WordSenseDataset(Dataset):
    def __init__(self, encodings, senses):
        self.encodings = encodings
        self.labels = senses

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels) 


def encode(texts, tokenizer, max_length=512):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt", add_special_tokens=True)


datasets = {}


for word, df in zip(words, dfs):
    X_train, X_test, y_train, y_test = train_test_split(
        df['sentence'].to_numpy(), df['sense'].to_numpy(), test_size=0.1)

    train_encodings = encode(X_train.tolist(), tokenizer=tokenizer)
    test_encodings = encode(X_test.tolist(), tokenizer=tokenizer)
    
    train_dataset = WordSenseDataset(train_encodings, y_train)
    test_dataset = WordSenseDataset(test_encodings, y_test)
    datasets[word] = (train_dataset, test_dataset)

datasets

{'rubbish': (<__main__.WordSenseDataset at 0x7ff240590a00>,
  <__main__.WordSenseDataset at 0x7ff240590a60>),
 'tissue': (<__main__.WordSenseDataset at 0x7ff2405908b0>,
  <__main__.WordSenseDataset at 0x7ff240590e20>),
 'yarn': (<__main__.WordSenseDataset at 0x7ff2483ea970>,
  <__main__.WordSenseDataset at 0x7ff2483ea850>)}

In [275]:
%%time
from transformers import Trainer, TrainingArguments
import evaluate

accuracy = evaluate.load("accuracy")
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=100,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
)

CPU times: user 85.5 ms, sys: 40.6 ms, total: 126 ms
Wall time: 433 ms


In [278]:
print(model)

for word, (train_dataset, test_dataset) in datasets.items():
    print(f'Training model for {word}:')
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )
    
    trainer.train()
    trainer.save_model(f'{word}-word-sense')

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 



Step,Training Loss,Validation Loss,Accuracy
10,0.6417,0.524612,1.0
20,0.4315,0.294198,1.0


Training model for tissue:




Step,Training Loss,Validation Loss,Accuracy
10,0.7226,0.609725,1.0


Training model for yarn:




Step,Training Loss,Validation Loss,Accuracy
10,0.6012,0.397013,1.0


### Inference

In [279]:
from transformers import pipeline
pipe = pipeline("text-classification", 
                model='rubbish-word-sense', 
                tokenizer=tokenizer)

In [241]:
for i, sense in enumerate(rubbish[0]):
    print(i, sense[1:])

0 : (n) rubbish, trash, scrap (worthless material that is to be disposed of)
1 : (n) folderol, rubbish, tripe, trumpery, trash, wish-wash, applesauce, codswallop (nonsensical talk or writing)


In [280]:
input_ = preprocess_sentence(
    "I couldn't stand listening to the lecturer's rubbish for another minute and walked out of the lecture hall.",
    'rubbish', rubbish[0])
print(input_)
pipe(input_)

I couldn't stand listening to the lecturer's rubbish for another minute and walked out of the lecture hall. [SEP] rubbish


[{'label': 'LABEL_1', 'score': 0.5070088505744934}]

In [281]:
pipe(preprocess_sentence(
    'The construction site was littered with rubbish, including scraps of metal and discarded building materials.',
    'rubbish', rubbish[0]))

[{'label': 'LABEL_0', 'score': 0.8271171450614929}]

In [232]:
trainer.evaluate()

{'eval_loss': 9.222963853972033e-05,
 'eval_accuracy': 1.0,
 'eval_runtime': 3.0946,
 'eval_samples_per_second': 2.262,
 'eval_steps_per_second': 0.323,
 'epoch': 3.0}

### Quantization

In [288]:
!pip install accelerate bitsandbytes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting accelerate
  Downloading accelerate-0.18.0-py3-none-any.whl (215 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m215.3/215.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.38.1-py3-none-any.whl (104.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: bitsandbytes, accelerate
Successfully installed accelerate-0.18.0 bitsandbytes-0.38.1


In [31]:
model = AutoModelForSequenceClassification.from_pretrained('rubbish-word-sense', 
                                                            num_labels=2)
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [24]:
import tensorflow as tf
# Convert the model to a TFLite-compatible format
converter = tf.lite.TFLiteConverter.from_keras_model(model)
converter.optimizations = [tf.lite.Optimize.DEFAULT]
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8]
converter.inference_input_type = tf.int8
converter.inference_output_type = tf.int8
converter.representative_dataset = datasets['rubbish'][0].encodings['input_ids'][:10]
quantized_model = converter.convert()