# Training word vectors using SVD

In [9]:
import torch
import torch.nn as nn
import numpy as np
import csv
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import re
from torchtext.vocab import build_vocab_from_iterator
# import scipy sparse matrix to use scipy.sparse.linalg.svds
import scipy.sparse as sp
import scipy.sparse.linalg as linalg
from tqdm import tqdm

[nltk_data] Downloading package punkt to /home2/sanika/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [1]:
# load textual data from csv file as a list of strings
def load_data(file_path):
    # load only the second column of the csv file
    data = []
    with open(file_path, 'r') as file:
        reader = csv.reader(file)
        for row in reader:
            data.append(row[1])
    # delete the first element of the list (header)
    del data[0]
    return data

In [3]:
data = load_data('./ANLP-2/train.csv')

In [4]:
class Preprocess():
    def __init__ (self, data):
        self.data = data

    def tokenize(self, data):
        tokenized_data = []
        for text in data:
            text = re.sub(r'\\', ' ', text)
            text = re.sub(r'\d+', '<NUMBER>', text)
            text = text.lower()
            # remove punctuation
            text = re.sub(r'[^\w\s]', ' ', text)
            tokenized_data.append(word_tokenize(text))
        return tokenized_data
    
    def build_vocab(self, tokenized_data):
        OUT_OF_VOCAB = '<OOV>'
        PAD_TAG = '<PAD>'
        vocab = build_vocab_from_iterator(tokenized_data, specials=[OUT_OF_VOCAB, PAD_TAG])
        return vocab
    
    def text_to_indices(self, tokenized_data, vocab):
        indexed_data = []
        for sentence in tokenized_data:
            indexed_data.append([vocab[token] for token in sentence])
        return indexed_data
    
    def __call__(self):
        tokenized_data = self.tokenize(self.data)
        vocab = self.build_vocab(tokenized_data)
        indexed_data = self.text_to_indices(tokenized_data, vocab)
        return indexed_data, vocab, tokenized_data 

In [5]:
indexed_data, vocab, tokenized_data = Preprocess(data)()

In [17]:
tokenized_data[0]

['reuters',
 'short',
 'sellers',
 'wall',
 'street',
 's',
 'dwindling',
 'band',
 'of',
 'ultra',
 'cynics',
 'are',
 'seeing',
 'green',
 'again']

In [27]:
# build a co-occurrence matrix
def build_co_occurrence_matrix(corpus, vocab, window_size=2):
    co_occurrence_matrix = sp.lil_matrix((len(vocab), len(vocab)), dtype=np.float32)
    for sentence in tqdm(corpus):
        for i, word in enumerate(sentence):
            for j in range(max(i - window_size, 0), min(i + window_size + 1, len(sentence))):
                if i != j:
                    co_occurrence_matrix[word, sentence[j]] += 1
    return co_occurrence_matrix

In [28]:
len(vocab)

57898

In [30]:
coocerrence_matrix = build_co_occurrence_matrix(indexed_data, vocab)

100%|██████████| 120000/120000 [02:24<00:00, 829.91it/s]


In [31]:
# svd decomposition
def svd(matrix, k):
    u, s, v = linalg.svds(matrix, k=k)
    return u, s, v

In [32]:
u, s, v = svd(coocerrence_matrix, 100)

In [33]:
# get word vectors
def get_word_vectors(u, s, v, k):
    # get sub-matrices for each word
    word_vectors = np.dot(u, np.diag(s))
    return word_vectors

In [34]:
word_vectors = get_word_vectors(u, s, v, 100)

In [35]:
# save word vectors along with actual words
def save_word_vectors(word_vectors, vocab, file_path):
    with open(file_path, 'w') as file:
        for i, word in enumerate(vocab.get_itos()):
            vector = word_vectors[i]
            vector_str = ' '.join([str(x) for x in vector])
            file.write(f'{word} {vector_str}\n')

In [36]:
save_word_vectors(word_vectors, vocab, 'word_vectors.txt')

In [None]:
# save the vocabulary as a pickle file
import pickle

with open('vocab.pkl', 'wb') as file:
    pickle.dump(vocab, file)