# Word Sense Disambiguation (WSD)
### Sam Timmins, Alex Cerpa, Kas Taghavi

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [210]:
import pandas as pd
import re

def remove_quotes(line):
    if line.startswith('"'):
        line = line[1:]
    if line.endswith('"'):
        line = line[:-1]
    return line

def preprocess_sentence(s, word, senses):
    s += f' [SEP] {word}'
    for sense in senses:
        s += f' [SEP] {sense}'
    return s


def parse_file_to_df(filename):
    with open(filename) as f:
        lines = [remove_quotes(line.strip()) for line in f.readlines()]
        word = lines[0]
        senses = []
        
        i = 2
        for i in range(2, len(lines)):
            if not re.search(r'^[0-9]:? \([a-z]+\)', lines[i]):
                break
            else:
                senses.append(lines[i])
        
        curr_sense = 1
        sentences = []
        sense = []
        for i in range(i, len(lines)):
            if not lines[i]:
                continue
            if re.match(r'[0-9]', lines[i]):
                curr_sense = int(lines[i])
            else:
                s = lines[i].strip()
                sentences.append(preprocess_sentence(s, word ,senses))
                sense.append(curr_sense - 1)
            
        
        
        return senses, pd.DataFrame({"sentence": sentences, "sense": sense})

In [211]:
rubbish = parse_file_to_df('rubbish.txt')
tissue = parse_file_to_df('tissue.txt')
yarn = parse_file_to_df('yarn.txt')
dfs = [rubbish[1], tissue[1], yarn[1]]

In [212]:
rubbish[1].iloc[0]['sentence']

'There was a thick film of dust on every exposed surface; rubbish and the carcass of some small animal had liecn swept carelessly into a corner. [SEP] Rubbish [SEP] 1: (n) rubbish, trash, scrap (worthless material that is to be disposed of) [SEP] 2: (n) folderol, rubbish, tripe, trumpery, trash, wish-wash, applesauce, codswallop (nonsensical talk or writing)'

In [213]:
# from nltk.corpus import stopwords
# stop = stopwords.words('english')

for df in dfs:
    df['sentence'] = df['sentence'].str.replace('[^\w\s\[\]]','')
    df['sentence'] = df['sentence'].str.strip()
    # df['sentence'] = df['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

  df['sentence'] = df['sentence'].str.replace('[^\w\s\[\]]','')


In [214]:
dfs[0].iloc[0]['sentence']

'There was a thick film of dust on every exposed surface rubbish and the carcass of some small animal had liecn swept carelessly into a corner [SEP] Rubbish [SEP] 1 n rubbish trash scrap worthless material that is to be disposed of [SEP] 2 n folderol rubbish tripe trumpery trash wishwash applesauce codswallop nonsensical talk or writing'

In [215]:
rubbish_max_length = dfs[0]['sentence'].str.split().apply(len).max()
rubbish_max_length

75

In [216]:
tissue_max_length = dfs[1]['sentence'].str.split().apply(len).max()
tissue_max_length

74

In [217]:
yarn_max_length = dfs[2]['sentence'].str.split().apply(len).max()
yarn_max_length

98

In [218]:
model_name = "bert-base-uncased"  # You can choose a different model if you prefer

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(rubbish[0])).to("cuda" if torch.cuda.is_available() else "cpu")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [219]:
from sklearn.model_selection import train_test_split
from tensorflow import keras

rubbish_X_train, rubbish_X_test, rubbish_y_train, rubbish_y_test = train_test_split(dfs[0]['sentence'].to_numpy(), dfs[0]['sense'].to_numpy(), test_size=0.1)
#tissue_X_train, tissue_X_test, tissue_y_train, tissue_y_test = train_test_split(tissue_X, dfs[1]['sense'].to_numpy(), test_size=0.1)
#yarn_X_train, yarn_X_test, yarn_y_train, yarn_y_test = train_test_split(yarn_X, dfs[2]['sense'].to_numpy(), test_size=0.1)

print('Rubbish: ', len(rubbish_X_train), len(rubbish_X_test))
#print('Tissue: ', len(tissue_X_train), len(tissue_X_test))
#print('Yarn: ', len(yarn_X_train), len(yarn_X_test))

Rubbish:  59 7


In [220]:
def encode(texts, tokenizer, max_length=512):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=max_length, return_tensors="pt", add_special_tokens=True)

rubbish_train_encodings = encode(rubbish_X_train.tolist(), tokenizer=tokenizer)
rubbish_test_encodings = encode(rubbish_X_test.tolist(), tokenizer=tokenizer)
print(tokenizer.convert_ids_to_tokens(rubbish_train_encodings[0].ids))

['[CLS]', 'and', 'the', 'winds', 'swept', 'away', 'from', 'the', 'picture', 'the', 'rubbish', 'the', 'shovel', 'and', 'many', 'of', 'the', 'objects', '[SEP]', 'rubbish', '[SEP]', '1', 'n', 'rubbish', 'trash', 'scrap', 'worthless', 'material', 'that', 'is', 'to', 'be', 'disposed', 'of', '[SEP]', '2', 'n', 'folder', '##ol', 'rubbish', 'trip', '##e', 'trump', '##ery', 'trash', 'wish', '##wash', 'apples', '##au', '##ce', 'cod', '##sw', '##all', '##op', 'non', '##sen', '##sic', '##al', 'talk', 'or', 'writing', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[P

In [221]:
from torch.utils.data import Dataset

class WordSenseDataset(Dataset):
    def __init__(self, encodings, senses):
        self.encodings = encodings
        self.labels = senses

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = WordSenseDataset(rubbish_train_encodings, rubbish_y_train)
test_dataset = WordSenseDataset(rubbish_test_encodings, rubbish_y_test)

In [231]:
%%time
from transformers import Trainer, TrainingArguments
import evaluate

accuracy = evaluate.load("accuracy")
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_steps=10,
    save_steps=100,
    evaluation_strategy="steps",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

trainer.train()



Step,Training Loss,Validation Loss,Accuracy
10,0.0011,0.000194,1.0
20,0.0002,9.9e-05,1.0


CPU times: user 33min 58s, sys: 6min 41s, total: 40min 39s
Wall time: 4min 43s


TrainOutput(global_step=24, training_loss=0.0005613863759208471, metrics={'train_runtime': 283.2442, 'train_samples_per_second': 0.625, 'train_steps_per_second': 0.085, 'total_flos': 46570656798720.0, 'train_loss': 0.0005613863759208471, 'epoch': 3.0})

In [233]:
trainer.save_model('word-sense-1')

In [228]:
!pip install evaluate

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0mta [36m0:00:01[0m
[?25hCollecting dill
  Downloading dill-0.3.6-py3-none-any.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2023.4.0-py3-none-any.whl (153 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.0/154.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash
  Downloading xxhash-3.2.0-cp39-cp39-macosx_10_9_x86_64.

In [234]:
from transformers import pipeline
pipe = pipeline("text-classification", model='word-sense-1', tokenizer=tokenizer)

In [237]:
pipe(preprocess_sentence("I couldn't stand listening to the lecturer's rubbish for another minute and walked out of the lecture hall.", 'rubbish', rubbish[0]))

[{'label': 'LABEL_1', 'score': 0.9994663596153259}]

In [238]:
pipe(preprocess_sentence('The construction site was littered with rubbish, including scraps of metal and discarded building materials.', 'rubbish', rubbish[0]))

[{'label': 'LABEL_0', 'score': 0.9999083280563354}]

In [232]:
trainer.evaluate()

{'eval_loss': 9.222963853972033e-05,
 'eval_accuracy': 1.0,
 'eval_runtime': 3.0946,
 'eval_samples_per_second': 2.262,
 'eval_steps_per_second': 0.323,
 'epoch': 3.0}