# Word Sense Disambiguation (WSD)
### Sam Timmins, Alex Cerpa, Kas Taghavi

### Preprocessing

In [None]:
!pip install evaluate

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [32]:
import pandas as pd
import re

def remove_quotes(line):
    # Remove starting and ending quotes.
    if line.startswith('"'):
        line = line[1:]
    if line.endswith('"'):
        line = line[:-1]
    return line

def preprocess_sentence(s, word, senses):
    # Add the word and its senses to the input for the model.
    s += f' [SEP] {word}'
    for sense in senses:
        s += f' [SEP] {sense}'
    return s


def parse_file_to_df(filename):
    with open(filename) as f:
        lines = [remove_quotes(line.strip()) for line in f.readlines()]
        word = lines[0]
        senses = []
        
        # Read senses
        i = 2
        for i in range(2, len(lines)):
            if not re.search(r'^[0-9]:? \([a-z]+\)', lines[i]):
                break
            else:
                sense_line = lines[i]
                sense_line = sense_line.replace('(', '').replace(')', '')
                senses.append(sense_line)
        
        curr_sense = 1
        sentences = []
        sense = []
        # Read sentences
        for i in range(i, len(lines)):
            if not lines[i]:
                continue
            if re.match(r'[0-9]', lines[i]):
                curr_sense = int(lines[i])
            else:
                s = lines[i].strip()
                sentences.append(preprocess_sentence(s, word ,senses))
                sense.append(curr_sense - 1)
            
        
        
        return senses, pd.DataFrame({"sentence": sentences, "sense": sense})

In [33]:
rubbish_senses, rubbish_df = parse_file_to_df('rubbish.txt')
tissue_senses, tissue_df = parse_file_to_df('tissue.txt')
yarn_senses, yarn_df = parse_file_to_df('yarn.txt')

words = ['rubbish', 'tissue', 'yarn']
dfs = [rubbish_df, tissue_df, yarn_df]

for df in dfs:
    df['sentence'] = df['sentence'].str.replace('[^\w\s\[\]]','', regex=True)
    df['sentence'] = df['sentence'].str.strip()

In [34]:
yarn_df

Unnamed: 0,sentence,sense
0,The yarn is no longer novel too many other wr...,0
1,I have just finished reading a rather lengthy ...,0
2,But spin those reporters some yarn and keep th...,0
3,They censored all that out of my copy made it...,0
4,Nan do you think that kind of yarn is going to...,0
...,...,...
104,The yarn connected people across lands and oce...,1
105,It was a reminder that no matter where we come...,1
106,In the end the humble yarn held within it the ...,1
107,It was a testament to the strength of communit...,1


In [18]:
dfs[2].iloc[0]['sentence']

'The yarn is no longer novel  too many other writers have since taken off from Galluns inspiration  but it is just as fine to me as it always was [SEP] Yarn [SEP] 1 n narration recital yarn the act of giving an account describing incidents or a course of events his narration was hesitant [SEP] 2 n thread yarn a fine cord of twisted fibers of cotton or silk or wool or nylon etc used in sewing and weaving'

### Training

In [36]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2).to("cpu")

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classi

In [37]:
from sklearn.model_selection import train_test_split
from tensorflow import keras

from torch.utils.data import Dataset

class WordSenseDataset(Dataset):
    def __init__(self, encodings, senses):
        self.encodings = encodings
        self.labels = senses

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels) 


def encode(texts, tokenizer, max_length=512):
    return tokenizer(texts, padding="max_length", truncation=False, max_length=max_length, return_tensors="pt", add_special_tokens=True)


datasets = {}


for word, df in zip(words, dfs):
    X_train, X_test, y_train, y_test = train_test_split(
        df['sentence'].to_numpy(), df['sense'].to_numpy(), test_size=0.1)

    train_encodings = encode(X_train.tolist(), tokenizer=tokenizer)
    test_encodings = encode(X_test.tolist(), tokenizer=tokenizer)
    
    train_dataset = WordSenseDataset(train_encodings, y_train)
    test_dataset = WordSenseDataset(test_encodings, y_test)
    datasets[word] = (train_dataset, test_dataset)

datasets

{'rubbish': (<__main__.WordSenseDataset at 0x7fe5a226c8e0>,
  <__main__.WordSenseDataset at 0x7fe5a226c820>),
 'tissue': (<__main__.WordSenseDataset at 0x7fe59193a520>,
  <__main__.WordSenseDataset at 0x7fe5a226bd30>),
 'yarn': (<__main__.WordSenseDataset at 0x7fe5a226bc10>,
  <__main__.WordSenseDataset at 0x7fe5a226bb50>)}

In [38]:
%%time
from transformers import Trainer, TrainingArguments
import evaluate

accuracy = evaluate.load("accuracy")
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=100,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
)

CPU times: user 86.3 ms, sys: 29.6 ms, total: 116 ms
Wall time: 389 ms


In [39]:
for word, (train_dataset, test_dataset) in datasets.items():
    # if word != 'yarn': continue
    print(f'Training model for {word}:')
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        compute_metrics=compute_metrics
    )
    
    trainer.train()
    trainer.save_model(f'{word}-word-sense')

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 



Step,Training Loss,Validation Loss,Accuracy
10,0.6826,0.644054,0.636364
20,0.6477,0.555471,0.636364
30,0.497,0.456069,0.727273


### Inference

In [46]:
from transformers import pipeline
model_name = 'yarn-word-sense'
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
pipe = pipeline("text-classification", 
                model=model_name, 
                tokenizer=tokenizer)

In [41]:
print(yarn_senses)

['1 n narration, recital, yarn the act of giving an account describing incidents or a course of events "his narration was hesitant', '2 n thread, yarn a fine cord of twisted fibers of cotton or silk or wool or nylon etc. used in sewing and weaving']


In [42]:
input_ = preprocess_sentence(
    'Her yarn about the enchanted forest captured the imagination of the listeners.',
    'yarn', yarn_senses)
print(input_)
pipe(input_)

Her yarn about the enchanted forest captured the imagination of the listeners. [SEP] yarn [SEP] 1 n narration, recital, yarn the act of giving an account describing incidents or a course of events "his narration was hesitant [SEP] 2 n thread, yarn a fine cord of twisted fibers of cotton or silk or wool or nylon etc. used in sewing and weaving


[{'label': 'LABEL_0', 'score': 0.7731480598449707}]

In [43]:
input_ = preprocess_sentence(
    'The sailor\'s yarn of his journey across the sea had everyone on the edge of their seats.',
    'yarn', yarn_senses)
print(input_)
pipe(input_)

The sailor's yarn of his journey across the sea had everyone on the edge of their seats. [SEP] yarn [SEP] 1 n narration, recital, yarn the act of giving an account describing incidents or a course of events "his narration was hesitant [SEP] 2 n thread, yarn a fine cord of twisted fibers of cotton or silk or wool or nylon etc. used in sewing and weaving


[{'label': 'LABEL_0', 'score': 0.6420883536338806}]

In [47]:
input_ = preprocess_sentence(
    "I couldn't stand listening to the lecturer's rubbish for another minute and walked out of the lecture hall.",
    'rubbish', rubbish_senses)
print(input_)
pipe(input_)

I couldn't stand listening to the lecturer's rubbish for another minute and walked out of the lecture hall. [SEP] rubbish [SEP] 1: n rubbish, trash, scrap worthless material that is to be disposed of [SEP] 2: n folderol, rubbish, tripe, trumpery, trash, wish-wash, applesauce, codswallop nonsensical talk or writing


[{'label': 'LABEL_0', 'score': 0.5794568061828613}]

In [29]:
pipe(preprocess_sentence(
    'The construction site was littered with rubbish, including scraps of metal and discarded building materials.',
    'rubbish', rubbish_senses))

[{'label': 'LABEL_0', 'score': 0.7427711486816406}]

In [28]:
pipe(preprocess_sentence(
    'Her yarn about the enchanted forest captured the imagination of the listeners.',
    'yarn', yarn_senses))

[{'label': 'LABEL_1', 'score': 0.5107261538505554}]

In [48]:
trainer.evaluate()

{'eval_loss': 0.34677550196647644,
 'eval_accuracy': 0.9090909090909091,
 'eval_runtime': 2.9576,
 'eval_samples_per_second': 3.719,
 'eval_steps_per_second': 0.676,
 'epoch': 3.0}