# Word Sense Disambiguation (WSD)
### Sam Timmins, Alex Cerpa, Kas Taghavi

In [1]:
import pandas as pd

def parse_file_to_df(filename):
    with open(filename) as f:
        lines = f.readlines()
        
        data = []
        for i, line in enumerate(lines):
            if '1\n' in lines[i]:
                i = i + 1
                while '2\n' not in lines[i] and lines[i][0] != '\n':
                    temp = lines[i].replace('\"', '')
                    temp = temp.replace('\n', '')
                    data.append((temp, 1))
                    i = i + 1
                last = i
                
        for i in range(last + 2, len(lines)):
            temp = lines[i].replace('\"', '')
            temp = temp.replace('\n', '')
            data.append((temp, 2))
            
        return pd.DataFrame(data, columns =['sentence', 'sense'])

In [2]:
rubbish = parse_file_to_df('rubbish.txt')

In [3]:
tissue = parse_file_to_df('tissue.txt')

In [4]:
yarn = parse_file_to_df('yarn.txt')

In [5]:
dfs = [rubbish, tissue, yarn]

In [6]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

for df in dfs:
    df['sentence'] = df['sentence'].str.replace('[^\w\s]','')
    df['sentence'] = df['sentence'].str.lower()
    df['sentence'] = df['sentence'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

  df['sentence'] = df['sentence'].str.replace('[^\w\s]','')


In [7]:
rubbish_max_length = dfs[0]['sentence'].str.split().apply(len).max()
rubbish_max_length

24

In [8]:
tissue_max_length = dfs[1]['sentence'].str.split().apply(len).max()
tissue_max_length

22

In [9]:
yarn_max_length = dfs[2]['sentence'].str.split().apply(len).max()
yarn_max_length

25

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
import numpy as np

rubbish_X = dfs[0]['sentence'].to_numpy()

NUM_TOP_WORDS = None

rubbish_tokenizer = Tokenizer(num_words=rubbish_max_length)
rubbish_tokenizer.fit_on_texts(rubbish_X)
rubbish_sequences = rubbish_tokenizer.texts_to_sequences(rubbish_X)

rubbish_word_index = rubbish_tokenizer.word_index
NUM_TOP_WORDS = len(rubbish_word_index) if NUM_TOP_WORDS==None else NUM_TOP_WORDS
rubbish_top_words = min((len(rubbish_word_index),NUM_TOP_WORDS))
print('Found %s unique tokens. Distilled to %d top words.' % (len(rubbish_word_index), rubbish_top_words))

rubbish_X = sequence.pad_sequences(rubbish_sequences, maxlen=rubbish_max_length)

print('Shape of data tensor:', rubbish_X.shape)
print(np.max(rubbish_X))

Found 543 unique tokens. Distilled to 543 top words.
Shape of data tensor: (66, 24)
23


In [11]:
tissue_X = dfs[1]['sentence'].to_numpy()

NUM_TOP_WORDS = None

tissue_tokenizer = Tokenizer(num_words=tissue_max_length)
tissue_tokenizer.fit_on_texts(tissue_X)
tissue_sequences = tissue_tokenizer.texts_to_sequences(tissue_X)

tissue_word_index = tissue_tokenizer.word_index
NUM_TOP_WORDS = len(tissue_word_index) if NUM_TOP_WORDS==None else NUM_TOP_WORDS
tissue_top_words = min((len(tissue_word_index),NUM_TOP_WORDS))
print('Found %s unique tokens. Distilled to %d top words.' % (len(tissue_word_index), tissue_top_words))

tissue_X = sequence.pad_sequences(tissue_sequences, maxlen=tissue_max_length)

print('Shape of data tensor:', tissue_X.shape)
print(np.max(tissue_X))

Found 400 unique tokens. Distilled to 400 top words.
Shape of data tensor: (50, 22)
21


In [12]:
yarn_X = dfs[2]['sentence'].to_numpy()

NUM_TOP_WORDS = None

yarn_tokenizer = Tokenizer(num_words=yarn_max_length)
yarn_tokenizer.fit_on_texts(yarn_X)
yarn_sequences = yarn_tokenizer.texts_to_sequences(yarn_X)

yarn_word_index = yarn_tokenizer.word_index
NUM_TOP_WORDS = len(yarn_word_index) if NUM_TOP_WORDS==None else NUM_TOP_WORDS
yarn_top_words = min((len(yarn_word_index),NUM_TOP_WORDS))
print('Found %s unique tokens. Distilled to %d top words.' % (len(yarn_word_index), yarn_top_words))

yarn_X = sequence.pad_sequences(yarn_sequences, maxlen=yarn_max_length)

print('Shape of data tensor:', yarn_X.shape)
print(np.max(yarn_X))

Found 466 unique tokens. Distilled to 466 top words.
Shape of data tensor: (50, 25)
24


In [23]:
from sklearn.model_selection import train_test_split
from tensorflow import keras

rubbish_X_train, rubbish_X_test, rubbish_y_train, rubbish_y_test = train_test_split(rubbish_X, dfs[0]['sense'].to_numpy(), test_size=0.1)
tissue_X_train, tissue_X_test, tissue_y_train, tissue_y_test = train_test_split(tissue_X, dfs[1]['sense'].to_numpy(), test_size=0.1)
yarn_X_train, yarn_X_test, yarn_y_train, yarn_y_test = train_test_split(yarn_X, dfs[2]['sense'].to_numpy(), test_size=0.1)

print('Rubbish: ', len(rubbish_X_train), len(rubbish_X_test))
print('Tissue: ', len(tissue_X_train), len(tissue_X_test))
print('Yarn: ', len(yarn_X_train), len(yarn_X_test))

Rubbish:  59 7
Tissue:  45 5
Yarn:  45 5


In [None]:
EMBED_SIZE = 50

embeddings_index = {}
f = open('glove.6B.50d.txt', encoding="utf8")
# save key/array pairs of the embeddings
#  the key of the dictionary is the word, the array is the embedding
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

# now fill in the matrix, using the ordering from the
#  keras word tokenizer from before
rubbish_found_words = 0
rubbish_embedding_matrix = np.zeros((len(rubbish_word_index) + 1, EMBED_SIZE))
for word, i in rubbish_word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be ALL-ZEROS
        rubbish_embedding_matrix[i] = embedding_vector
        rubbish_found_words = rubbish_found_words+1

print("Embedding Shape:",rubbish_embedding_matrix.shape, "\n",
      "Total words found:",rubbish_found_words, "\n",
      "Percentage:",100*rubbish_found_words/rubbish_embedding_matrix.shape[0])