In [51]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from collections import Counter, defaultdict
import joblib

## 1 Distributional Counting (24pt)

### 1.1 implement distributional counting (12pt)

In [2]:
def distrib_counting(doc, w, V, Vc, wv):
    """
    Accumulate windowed co-occurrence counts for one document into wv.

    inputs:
        - doc: list of words; a **single** document to do distributional counting on
        - w: integer; window size (positions considered on each side of the center word)
        - V: set of words; vocabulary (only these words are counted as centers)
        - Vc: set of words; context vocabulary (only these words are counted as contexts)
        - wv: dict of counters; wv[center][context] holds the running count
    outputs:
        none. wv is updated in place.
    """
    n_tokens = len(doc)

    for pos, target in enumerate(doc):

        # words outside the vocabulary are never used as a center
        if target not in V:
            continue

        # window bounds clipped to the document; hi is exclusive
        lo = max(0, pos - w)
        hi = min(pos + w + 1, n_tokens)

        # left neighbours first, then right neighbours; the center position itself is left out
        for j in [*range(lo, pos), *range(pos + 1, hi)]:
            neighbour = doc[j]
            if neighbour in Vc:
                wv[target][neighbour] += 1

In [3]:
# sanity check with toy example: 7-token document, window size 3,
# V and Vc both contain every word of the document
wv = defaultdict(Counter)
distrib_counting(
    doc=['rest', 'for', 'the', 'rest', 'of', 'the', 'day'],
    w=3, 
    V=set(['day', 'for', 'of', 'rest', 'the']),
    Vc=set(['day', 'for', 'of', 'rest', 'the']),
    wv=wv
)
# bare expression so the notebook displays the resulting counts
wv

defaultdict(collections.Counter,
            {'rest': Counter({'for': 2,
                      'the': 3,
                      'rest': 2,
                      'of': 1,
                      'day': 1}),
             'for': Counter({'rest': 2, 'the': 1, 'of': 1}),
             'the': Counter({'rest': 3,
                      'for': 1,
                      'of': 2,
                      'the': 2,
                      'day': 1}),
             'of': Counter({'for': 1, 'the': 2, 'rest': 1, 'day': 1}),
             'day': Counter({'rest': 1, 'of': 1, 'the': 1})})

In [4]:
# sanity check with toy example: same document and window as the previous cell,
# but Vc is reduced to three words so 'the' and 'of' are not counted as contexts
wv = defaultdict(Counter)
distrib_counting(
    doc=['rest', 'for', 'the', 'rest', 'of', 'the', 'day'],
    w=3, 
    V=set(['day', 'for', 'of', 'rest', 'the']),
    Vc=set(['day', 'for', 'rest']),
    wv=wv
)
# bare expression so the notebook displays the resulting counts
wv

defaultdict(collections.Counter,
            {'rest': Counter({'for': 2, 'rest': 2, 'day': 1}),
             'for': Counter({'rest': 2}),
             'the': Counter({'rest': 3, 'for': 1, 'day': 1}),
             'of': Counter({'for': 1, 'rest': 1, 'day': 1}),
             'day': Counter({'rest': 1})})

### 1.2 run distributional counting on data (6pt)

In [115]:
def process_corpus(corpus_path, V, Vc, w):
    """
    Run distrib_counting() over every document of a corpus file and return the accumulated counts.

    inputs:
        - corpus_path: string; path to txt corpus where each line is a whitespace-separated document
        - V: set of words; vocabulary
        - Vc: set of words; context vocabulary
        - w: integer; window size
    outputs:
        - defaultdict of Counters mapping V - Vc - counts, accumulated across all lines of the corpus
    """
    counts = defaultdict(Counter)

    with open(corpus_path, 'r') as corpus:
        # one line == one document; distrib_counting() adds this document's counts into `counts`
        for raw_line in corpus:
            distrib_counting(doc=raw_line.split(), w=w, V=V, Vc=Vc, wv=counts)

    return counts

In [9]:
# load vocab-15kws.txt: one entry per line, collected into a set
# (passed as the vocabulary V to process_corpus below)
vocab15kws_path = 'data/vocab-15kws.txt'
with open(vocab15kws_path, 'r') as file:
    vocab15kws = set(file.read().splitlines())
# display the number of distinct entries
len(vocab15kws)

15228

In [10]:
# load vocab-5k.txt: one entry per line, collected into a set
# (passed as the context vocabulary Vc to process_corpus, evalWS and IDF below)
vocab5k_path = 'data/vocab-5k.txt'
with open(vocab5k_path, 'r') as file:
    vocab5k = set(file.read().splitlines())
# display the number of distinct entries
len(vocab5k)

5000

In [None]:
# set corpus path; this file is read line by line by process_corpus() and IDF()
corpus_path = 'data/wiki-1percent.txt'

#### w = 3

In [116]:
# w = 3: co-occurrence counts with center words from vocab15kws and context words from vocab5k
wv_V15k_Vc5k_win3 = process_corpus(
                        corpus_path=corpus_path,
                        V=vocab15kws,
                        Vc=vocab5k,
                        w=3
                    )

In [27]:
# save the w = 3 counts to disk with joblib
# NOTE(review): presumably requires the results/ directory to exist already; confirm before a fresh run
joblib.dump(wv_V15k_Vc5k_win3, 'results/wv_V15k_Vc5k_win3.pkl')

['results/wv_V15k_Vc5k_win3.pkl']

In [117]:
# retrieve co-occurrence counts: count stored for center 'coffee' with context 'coffee' (w = 3)
wv_V15k_Vc5k_win3['coffee']['coffee']

4

#### w = 6

In [35]:
# w = 6: same vocabularies and corpus as the w = 3 run, only the window size differs
wv_V15k_Vc5k_win6 = process_corpus(
                        corpus_path=corpus_path,
                        V=vocab15kws,
                        Vc=vocab5k,
                        w=6
                    )

In [36]:
# save the w = 6 counts to disk with joblib
# NOTE(review): presumably requires the results/ directory to exist already; confirm before a fresh run
joblib.dump(wv_V15k_Vc5k_win6, 'results/wv_V15k_Vc5k_win6.pkl')

['results/wv_V15k_Vc5k_win6.pkl']

In [43]:
# retrieve co-occurrence counts: count stored for center 'coffee' with context 'coffee' (w = 6)
wv_V15k_Vc5k_win6['coffee']['coffee']

36

### 1.3 EvalWS (6pt)

In [99]:
def cos_sim(a, b):
    """
    Cosine similarity between two dense vectors.

    inputs:
        - a, b: array-likes supporting the @ operator (e.g. 1d numpy arrays)
    outputs:
        - a@b divided by the product of the two 2-norms;
          0 when that product is zero, i.e. at least one vector has zero length
    """
    numerator = a @ b
    denominator = np.linalg.norm(a) * np.linalg.norm(b)

    # a zero denominator would divide by zero; similarity is defined as 0 in that case
    return numerator / denominator if denominator else 0


def cos_sim_sparse(w1, w2, Vc, wv_sparse):

    """
    Cosine similarity between two words' sparse count vectors, taken over the context vocabulary Vc.

    inputs:
        - w1, w2: string; the two words to compute cosine similarity between
        - Vc: set of words; context vocabulary (entries stored under keys outside Vc are ignored)
        - wv_sparse: dict or defaultdict of counters; mapping V - Vc - counts
    outputs:
        - cos_sim between w1 and w2 computed using model wv_sparse;
          0 if either word has no stored vector or a zero-length vector
    notes:
        - wv_sparse is only read: words without a vector are NOT inserted into it.
        - the dot product and both norms use the same key set (stored keys that are in Vc),
          so the result is a true cosine even if a vector holds keys outside Vc.
    """

    # .get() instead of indexing: indexing a defaultdict would insert an empty entry for every unseen word
    vec1 = wv_sparse.get(w1) or {}
    vec2 = wv_sparse.get(w2) or {}

    # only contexts stored in BOTH vectors contribute to the dot product,
    # so walk the shorter vector instead of the whole context vocabulary
    shorter, longer = (vec1, vec2) if len(vec1) <= len(vec2) else (vec2, vec1)
    ab_dot = sum(
        count * longer.get(w_context, 0)
        for w_context, count in shorter.items()
        if w_context in Vc
    )

    # missing (zero) entries do not affect the 2-norm, so the stored values restricted to Vc suffice
    norm1 = np.linalg.norm([count for w_context, count in vec1.items() if w_context in Vc])
    norm2 = np.linalg.norm([count for w_context, count in vec2.items() if w_context in Vc])
    ab_norm = norm1 * norm2

    if ab_norm:   # both word vectors have non-zero length
        return ab_dot / ab_norm
    else:   # either word has a 0-length word vector: define cos_sim as 0
        return 0

In [118]:
# sanity check: cosine similarity of 'sun' and 'sunlight' under the w = 3 count vectors
cos_sim_sparse(w1='sun', w2='sunlight', Vc=vocab5k, wv_sparse=wv_V15k_Vc5k_win3)

0.895510424327929

In [112]:
def evalWS(word_pairs, wv, Vc, true_sims):
    """
    Score a word-vector model against ground-truth word similarities.

    inputs:
        - word_pairs: list of 2-word tuples to compute cosine similarity between
        - wv: defaultdict; wv model to be evaluated, mapping V - Vc - counts
        - Vc: set of words; context vocabulary
        - true_sims: 1d array of ground-truth similarity of each word_pair
    outputs:
        - result of spearmanr(true_sims, model similarities), with the model
          similarities in the same order as word_pairs
    """
    # one model similarity per pair, via cos_sim_sparse
    predicted = [
        cos_sim_sparse(w1=first, w2=second, Vc=Vc, wv_sparse=wv)
        for first, second in word_pairs
    ]

    return spearmanr(true_sims, predicted)

#### simlex-999

In [82]:
# load simlex-999 into a list of word pairs and a parallel array of similarity scores
simlex999_path = 'data/simlex-999.txt'

simlex999_words, simlex999_sims = [], []
with open(simlex999_path, 'r') as file:
    # the first line is skipped (presumably a header row; confirm against the data file)
    for line in file.read().splitlines()[1:]:
        # each remaining line must hold exactly three whitespace-separated fields: word1 word2 score
        w1, w2, sim = line.strip().split()
        simlex999_words.append((w1, w2))
        simlex999_sims.append(float(sim))
simlex999_sims = np.array(simlex999_sims)

# both lengths should match: one score per word pair
print(f'simlex999_words len: {len(simlex999_words)}; simlex999_sims len: {len(simlex999_sims)}')

simlex999_words len: 999; simlex999_sims len: 999


In [113]:
# evaluate the raw w = 3 count vectors on simlex-999
evalWS(word_pairs=simlex999_words, wv=wv_V15k_Vc5k_win3, Vc=vocab5k, true_sims=simlex999_sims)

SpearmanrResult(correlation=0.05876135331349779, pvalue=0.06337563925440041)

#### MEN

In [84]:
# load MEN into a list of word pairs and a parallel array of similarity scores
MEN_path = 'data/men.txt'

MEN_words, MEN_sims = [], []
with open(MEN_path, 'r') as file:
    # the first line is skipped (presumably a header row; confirm against the data file)
    for line in file.read().splitlines()[1:]:
        # each remaining line must hold exactly three whitespace-separated fields: word1 word2 score
        w1, w2, sim = line.strip().split()
        MEN_words.append((w1, w2))
        MEN_sims.append(float(sim))
MEN_sims = np.array(MEN_sims)

# both lengths should match: one score per word pair
print(f'MEN_words len: {len(MEN_words)}; MEN_sims len: {len(MEN_sims)}')

MEN_words len: 3000; MEN_sims len: 3000


In [114]:
# evaluate the raw w = 3 count vectors on MEN
evalWS(word_pairs=MEN_words, wv=wv_V15k_Vc5k_win3, Vc=vocab5k, true_sims=MEN_sims)

SpearmanrResult(correlation=0.2251396048448754, pvalue=8.800788745595221e-36)

## 2 Inverse Document Frequency (IDF) (10pt)

In [124]:
def IDF(corpus_path, Vc):

    """
    Compute the inverse document frequency of each context word over a corpus file.

    inputs:
        - corpus_path: string; path to txt corpus where each line is one whitespace-separated document
        - Vc: set of words; context vocabulary
    outputs:
        - dict mapping each word of Vc that occurs in the corpus to
          (number of lines in the corpus) / (number of lines containing the word).
          words of Vc that never occur are absent from the dict.
          an empty corpus yields an empty dict.
    side effects:
        - prints the number of lines (documents) in the corpus
    """

    # number of docs/sentences containing each word in context vocab
    Vc_doc = defaultdict(int)

    # stays 0 if the file has no lines, so an empty corpus no longer raises
    corpus_size = 0

    # process corpus to count Vc occurrence
    with open(corpus_path, 'r') as file:

        # numbering lines from 1 leaves corpus_size equal to the total line count after the loop
        for corpus_size, line in enumerate(file, start=1):

            # a set of the line's words, so each word is counted at most once per document;
            # intersecting with Vc keeps only context-vocabulary words
            for word in set(line.split()).intersection(Vc):
                Vc_doc[word] += 1

    print(corpus_size)

    # compute IDF: corpus_size / freq
    return {word: (corpus_size / doc_count) for word, doc_count in Vc_doc.items()}

In [125]:
# compute IDF for Vc over the corpus; IDF() also prints the number of documents it read
Vc_IDF = IDF(corpus_path, vocab5k)

997898


In [140]:
def TF_IDF(wv_TF, Vc_IDF):
    """
    Reweight co-occurrence counts by the IDF of their context word.

    inputs:
        - wv_TF: dict of counters; mapping center word - context word - count
        - Vc_IDF: dict; mapping context word - IDF value (must contain every context word in wv_TF)
    outputs:
        - defaultdict of Counters with the same center/context keys, each value being count * IDF.
          center words whose counter is empty get no entry.
    """
    weighted = defaultdict(Counter)

    for center, context_counts in wv_TF.items():
        scaled = {
            context: n * Vc_IDF[context]
            for context, n in context_counts.items()
        }
        # skip centers without any context so no empty entries are created
        if scaled:
            weighted[center] = Counter(scaled)

    return weighted

In [141]:
# compute TF_IDF for wv_V15k_Vc5k_win3: multiply each w = 3 count by the IDF of its context word
wv_V15k_Vc5k_win3_IDF = TF_IDF(wv_TF=wv_V15k_Vc5k_win3, Vc_IDF=Vc_IDF)

In [143]:
# evaluate the IDF-weighted w = 3 vectors on simlex999
evalWS(word_pairs=simlex999_words, wv=wv_V15k_Vc5k_win3_IDF, Vc=vocab5k, true_sims=simlex999_sims)

SpearmanrResult(correlation=0.12778528309126566, pvalue=5.111621348174487e-05)

In [144]:
# evaluate the IDF-weighted w = 3 vectors on MEN
evalWS(word_pairs=MEN_words, wv=wv_V15k_Vc5k_win3_IDF, Vc=vocab5k, true_sims=MEN_sims)

SpearmanrResult(correlation=0.4388165624442915, pvalue=1.9006405060621283e-141)

## 3 Pointwise Mutual Information (PMI) (14pt)

## 4 Quantitative Comparisons (12pt)

## 5 Qualitative Analysis (25pt)