<a href="https://colab.research.google.com/github/rajattur-nlp/stanfordAssignments/blob/main/Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import sys
import os
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

In [2]:
#!/usr/bin/env python

import os
import nltk

# Windows only (os.name == 'nt'): replace the ssl module's default HTTPS
# context factory with its unverified variant before downloading.
# Presumably a workaround for certificate errors during nltk.download on
# Windows -- TODO confirm it is still needed.
if os.name == 'nt':
    import ssl
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        # This ssl module has no _create_unverified_context attribute;
        # leave the default context factory untouched.
        pass
    else:
        # NOTE(review): this disables HTTPS certificate verification for
        # anything that relies on ssl's default HTTPS context afterwards.
        ssl._create_default_https_context = _create_unverified_https_context

# Download the Reuters corpus data, then import its corpus reader.
nltk.download('reuters')
from nltk.corpus import reuters

# Sentinel tokens that read_corpus places at the start and end of each document.
START_TOKEN = '<START>'
END_TOKEN = '<END>'

def read_corpus(category="crude"):
    """ Load every file of one Reuters category as a list of tokens.

        Each token is lower-cased, and every document is wrapped with
        START_TOKEN at the front and END_TOKEN at the back.

        Params:
            category (string): Reuters category name
        Return:
            list of lists of strings, one inner list per file in the category
    """
    documents = []
    for file_id in reuters.fileids(category):
        lowered_tokens = [token.lower() for token in reuters.words(file_id)]
        documents.append([START_TOKEN, *lowered_tokens, END_TOKEN])
    return documents

[nltk_data] Downloading package reuters to /root/nltk_data...


In [3]:
# Load the corpus using read_corpus's default category ("crude").
DATA = read_corpus()

In [4]:
def distinct_words(corpus):
    """ Collect the vocabulary of a corpus.

        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): every distinct word found in the corpus,
                ordered by python's built-in 'sorted'
            num_corpus_words (integer): how many distinct words there are
    """
    # A set drops duplicates across all documents in a single pass.
    vocabulary = {token for doc in corpus for token in doc}
    corpus_words = sorted(vocabulary)
    return corpus_words, len(corpus_words)

In [5]:
def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Build a word-by-word co-occurrence count matrix for a corpus.

        Every position in every document is treated as the center of a window that reaches
        up to window_size tokens to the left and to the right. Windows are clipped at the
        document boundaries, so words near an edge see fewer neighbours.

        Example: for the document "START All that glitters is not gold END" and a window
        size of 4, "All" co-occurs with "START", "that", "glitters", "is", and "not".

        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): number of tokens considered on each side of the center word
        Return:
            M (numpy array of shape (number of unique words, number of unique words)):
                co-occurrence counts; rows and columns follow the word order returned by
                distinct_words.
            word2Ind (dict): maps each word to its row/column index in M.
    """
    words, num_words = distinct_words(corpus)

    # Position of each word in the sorted vocabulary is its matrix index.
    word2Ind = dict(zip(words, range(num_words)))
    M = np.zeros((num_words, num_words))

    for doc in corpus:
        doc_length = len(doc)
        for center_pos, center_word in enumerate(doc):
            window_start = max(center_pos - window_size, 0)
            window_stop = min(center_pos + window_size + 1, doc_length)

            # Neighbours on both sides of the center, excluding the center itself.
            neighbours = doc[window_start:center_pos] + doc[center_pos + 1:window_stop]

            # Row view into M, so the increments below write straight into the matrix.
            center_row = M[word2Ind[center_word]]
            for neighbour in neighbours:
                center_row[word2Ind[neighbour]] += 1

    return M, word2Ind

In [6]:
# Build the co-occurrence matrix for the loaded corpus with the default window_size (4).
M, word2Ind = compute_co_occurrence_matrix(DATA)
# Optional: uncomment to make NumPy print arrays in full instead of summarizing them.
# np.set_printoptions(threshold=sys.maxsize)

In [8]:
def reduce_to_k_dim(M, k=2):
    """ Project a (num_corpus_words, num_corpus_words) co-occurrence matrix down to
        (num_corpus_words, k) with Scikit-Learn's TruncatedSVD:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

        Note: this reseeds NumPy's global random state (np.random.seed) on every call.

        Params:
            M (numpy matrix of shape (number of unique words in the corpus , number of unique words in the corpus)): co-occurrence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy matrix of shape (number of unique words in the corpus, k)): matrix of k-dimensional word embeddings.
                    In terms of the SVD from math class, this actually returns U * S
    """
    n_iter = 10     # number of iterations handed to TruncatedSVD
    np.random.seed(4355)
    print("Running Truncated SVD over %i words..." % (M.shape[0]))

    # Fit the decomposition and project M onto its k components in one call.
    M_reduced = TruncatedSVD(n_components=k, n_iter=n_iter).fit_transform(M)

    print("Done.")
    return M_reduced

In [9]:
# Reduce the co-occurrence matrix to reduce_to_k_dim's default of k=2 dimensions per word.
reducedM = reduce_to_k_dim(M)

Running Truncated SVD over 8185 words...
Done.


In [10]:
# Show the reduced embeddings as the cell output (NumPy elides the middle rows).
reducedM

array([[ 7.32630060e+02, -1.16894192e+02],
       [ 1.26000427e+00, -1.61923588e-01],
       [ 2.80304332e-01,  6.47334603e-02],
       ...,
       [ 1.04145879e+00, -3.06320300e-01],
       [ 6.19972477e-01, -1.25537234e-01],
       [ 2.42230659e+00,  2.28089719e-01]])