# Training word vectors using SVD

In [1]:
import csv
import re
from collections import Counter

import torch
import torch.nn as nn
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from torchtext.vocab import build_vocab_from_iterator
# import scipy sparse matrix to use scipy.sparse.linalg.svds
import scipy.sparse as sp
import scipy.sparse.linalg as linalg
from tqdm import tqdm

nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /home2/sanika/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# load textual data from csv file as a list of strings
def load_data(file_path):
    """Read the second column of a CSV file as a list of strings.

    The first row is treated as a header and dropped. Blank lines are ignored.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A list with the value of column index 1 of every data row, in file
        order. An empty or header-only file yields an empty list.

    Raises:
        IndexError: If a non-blank data row has fewer than two columns.
    """
    # newline='' hands line-ending handling to the csv module, which is what
    # it needs to parse quoted fields that contain embedded newlines.
    with open(file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        # Skip the header; the default keeps an empty file from raising.
        next(reader, None)
        # csv.reader yields an empty list for a blank line; skip those rows.
        return [row[1] for row in reader if row]

In [3]:
# Load the second CSV column of the training file; load_data drops the header row.
data = load_data('./ANLP-2/train.csv')

In [None]:
class Preprocess():
    """Tokenize raw texts, build a vocabulary and convert tokens to indices.

    Calling an instance runs the whole pipeline and returns
    ``(indexed_data, vocab, tokenized_data)``.

    Args:
        data: Iterable of raw text strings.
        train: When True, rare tokens are replaced by ``<OOV>`` before the
            vocabulary is built. When False that step is skipped.
            NOTE(review): the vocabulary is built from ``data`` in both modes,
            so ``train=False`` does not reuse a training vocabulary - confirm
            this is what callers expect.
    """

    # Special tokens, shared by convert_to_outofvocab and build_vocab.
    OOV_TOKEN = '<OOV>'
    PAD_TOKEN = '<PAD>'
    BOS_TOKEN = '<BOS>'
    EOS_TOKEN = '<EOS>'

    def __init__(self, data, train=True):
        self.data = data
        self.train = train

    def tokenize(self, data):
        """Clean each text and split it into tokens with ``word_tokenize``.

        Backslashes and double quotes become spaces, digit runs become a
        number placeholder, and the text is lower-cased before tokenizing.

        Returns:
            A list with one token list per input text.
        """
        tokenized_data = []
        for text in data:
            # Backslashes and double quotes are each replaced by one space.
            text = re.sub(r'[\\"]', ' ', text)
            text = re.sub(r'\d+', '<NUMBER>', text)
            # NOTE(review): lower-casing happens after the substitution, so
            # the placeholder reaches the tokenizer as '<number>', and
            # word_tokenize may further split it on the angle brackets -
            # confirm whether a single placeholder token is expected.
            text = text.lower()
            tokenized_data.append(word_tokenize(text))
        return tokenized_data

    def convert_to_outofvocab(self, data, min_freq=2):
        """Replace rare tokens with the ``<OOV>`` token.

        Args:
            data: List of token lists.
            min_freq: Tokens occurring fewer than ``min_freq`` times across
                all of ``data`` are replaced. The default of 2 replaces
                tokens that occur exactly once.

        Returns:
            A new list of token lists; ``data`` itself is left unmodified.
        """
        counts = Counter(word for sentence in data for word in sentence)
        return [
            [word if counts[word] >= min_freq else self.OOV_TOKEN
             for word in sentence]
            for sentence in data
        ]

    def build_vocab(self, tokenized_data):
        """Build a torchtext vocabulary over ``tokenized_data``.

        The special tokens are listed first, with ``<OOV>`` leading.
        ``<OOV>`` is also registered as the default index so that looking up
        a token missing from the vocabulary returns the ``<OOV>`` index
        instead of raising.
        """
        vocab = build_vocab_from_iterator(
            tokenized_data,
            specials=[self.OOV_TOKEN, self.PAD_TOKEN, self.BOS_TOKEN, self.EOS_TOKEN],
        )
        vocab.set_default_index(vocab[self.OOV_TOKEN])
        return vocab

    def text_to_indices(self, tokenized_data, vocab):
        """Map every token to ``vocab[token]``, keeping the sentence structure."""
        return [[vocab[token] for token in sentence] for sentence in tokenized_data]

    def __call__(self):
        """Run the pipeline and return ``(indexed_data, vocab, tokenized_data)``."""
        tokenized_data = self.tokenize(self.data)
        if self.train:
            tokenized_data = self.convert_to_outofvocab(tokenized_data)
        vocab = self.build_vocab(tokenized_data)
        indexed_data = self.text_to_indices(tokenized_data, vocab)
        return indexed_data, vocab, tokenized_data

In [5]:
# Run the preprocessing pipeline with train=True: yields the index sequences,
# the vocabulary object and the token lists they were built from.
indexed_data, vocab, tokenized_data = Preprocess(data, True)()

In [6]:
# build a co-occurrence matrix
def build_co_occurrence_matrix(corpus, vocab, window_size=5):
    """Count how often pairs of token indices appear within a context window.

    Args:
        corpus: Iterable of sentences, each a sequence of integer token
            indices used directly as matrix row/column positions.
        vocab: Sized object; ``len(vocab)`` fixes both matrix dimensions.
        window_size: Number of positions to each side of a token that count
            as its context.

    Returns:
        A ``len(vocab)`` x ``len(vocab)`` float32 LIL matrix where entry
        ``[a, b]`` is the number of times ``b`` occurred in the window of ``a``.
    """
    vocab_size = len(vocab)
    counts = sp.lil_matrix((vocab_size, vocab_size), dtype=np.float32)
    for token_ids in tqdm(corpus):
        for center, center_id in enumerate(token_ids):
            # Clip the window to the sentence boundaries.
            window_start = max(center - window_size, 0)
            window_stop = min(center + window_size + 1, len(token_ids))
            for position in range(window_start, window_stop):
                if position == center:
                    # A token is not its own context.
                    continue
                counts[center_id, token_ids[position]] += 1
    return counts

def perform_svd(co_occurrence_matrix, k=300):
    """Compute word vectors from a truncated SVD of the co-occurrence matrix.

    Args:
        co_occurrence_matrix: 2-D sparse (or dense) matrix of co-occurrence
            counts.
        k: Requested number of singular vectors. ``svds`` only supports
            ``k < min(matrix.shape)``, so larger requests are reduced to
            ``min(matrix.shape) - 1`` instead of raising.

    Returns:
        The left singular vectors selected by ``get_word_vectors``: one row
        per matrix row and one column per retained singular vector. The
        singular values are not applied to the vectors.

    Raises:
        ValueError: If the matrix is too small for any truncated SVD.
    """
    # Convert once to CSR: svds multiplies by the matrix repeatedly and CSR is
    # the sparse format intended for fast matrix-vector products.
    matrix = sp.csr_matrix(co_occurrence_matrix)
    if not np.issubdtype(matrix.dtype, np.inexact):
        # svds works on floating-point data; promote integer counts.
        matrix = matrix.astype(np.float64)

    max_k = min(matrix.shape) - 1
    if max_k < 1:
        raise ValueError(
            f"matrix of shape {matrix.shape} is too small for a truncated SVD"
        )
    k = min(k, max_k)

    # svds returns (U, s, Vh); only U and s are passed on.
    U, S, _ = linalg.svds(matrix, k=k)
    word_vectors = get_word_vectors(U, S, k)
    return word_vectors

# Select word vectors
def get_word_vectors(U, S, k):
    """Return the first ``k`` columns of ``U``.

    ``S`` is accepted for signature symmetry with the SVD output but is not
    used.
    """
    leading_columns = slice(None, k)
    return U[:, leading_columns]

# Map words to their corresponding word vectors
def map_words_to_vectors(word_vectors, word_to_index):
    """Build a ``{word: vector}`` dictionary.

    Each word in ``word_to_index`` is paired with ``word_vectors[index]`` for
    its index; the key order follows ``word_to_index``.
    """
    return {
        token: word_vectors[position]
        for token, position in word_to_index.items()
    }


In [14]:
# Vocabulary size; build_co_occurrence_matrix uses it for both matrix dimensions.
len(vocab)

33981

In [7]:
# Count windowed co-occurrences over the indexed corpus (default window_size).
coocerrence_matrix = build_co_occurrence_matrix(indexed_data, vocab)
print("Cooccurence matrix built")
# Truncated SVD with the default k; rows of the result line up with vocabulary indices.
word_vectors = perform_svd(coocerrence_matrix)
print("Word vectors obtained")
# Token -> vector lookup built from the vocabulary's string-to-index mapping.
word_to_vector = map_words_to_vectors(word_vectors, vocab.get_stoi())

100%|██████████| 120000/120000 [03:07<00:00, 639.14it/s]


Cooccurence matrix built
Word vectors obtained


In [8]:
# Save the word-vector array as a pickle file; row i is the vector of the
# token with vocabulary index i.
# NOTE(review): this dumps the `word_vectors` array, not the `word_to_vector`
# dictionary built earlier - confirm which one downstream code expects.
import pickle
with open('word_vectors_2.pkl', 'wb') as f:
    pickle.dump(word_vectors, f)

In [10]:
# Save the vocabulary object as a pickle file so the token <-> index mapping
# can be restored alongside the saved word vectors.
import pickle

with open('vocab_2.pkl', 'wb') as file:
    pickle.dump(vocab, file)

In [25]:
# Dump the word vectors to a text file: one line per row, starting with the
# row index, then every component, each followed by a single space.
with open('word_vectors.txt', 'w') as file:
    for i, vector in enumerate(word_vectors):
        components = ''.join(f'{value} ' for value in vector)
        file.write(f'{i} ' + components + '\n')
