### <center>Word2Vec Modelling</center>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
from warnings import filterwarnings
from collections import Counter
from pylab import rcParams
# Ignore DeprecationWarning messages raised after this point so they do not clutter the output.
filterwarnings(action='ignore', category=DeprecationWarning)

from utils.utils import *

%matplotlib inline

In [2]:
from scipy import spatial
from nltk import word_tokenize
from gensim.models import Word2Vec

# Default size for every matplotlib figure created afterwards: 15 x 15 (width, height),
# in inches per matplotlib's figure.figsize convention.
rcParams['figure.figsize'] = 15, 15

In [4]:
class HOCw2v:
    """
         Helpers to tokenize the speeches in a dataframe and to train a
         gensim word2vec model on the resulting tokens.
    """

    def tokenize(self, df):
        """
            Tokenize the 'speech_processed' column with NLTK's word_tokenize.

            df - Dataframe with a 'speech_processed' column holding one string per row

            Returns the SAME dataframe object: a 'speech_tokens' column containing
            one list of tokens per row is added (or overwritten) in place.
        """
        # Map over the single column instead of applying row-wise over the whole
        # frame: it avoids building a Series per row, and it always yields a
        # Series, whereas a row-wise DataFrame.apply on an empty frame can hand
        # back a DataFrame that cannot be assigned to one column.
        df['speech_tokens'] = df['speech_processed'].apply(word_tokenize)

        return df

    def word2vec_model(self, data, vector_size=300, min_count=5, workers=4):
        """
            Train and return a gensim word2vec model.

            data - Iterable of tokenized documents, i.e. one list of string tokens
                   per document (for example the 'speech_tokens' column produced by
                   tokenize). Do not pass the dataframe itself: iterating a
                   dataframe yields its column labels, not its rows.
            vector_size - Dimension of the vector
            min_count - Minimum number of times a word must appear in the corpus to be included in the model
            workers - Number of workers to use for training
        """
        # `sentences` is Word2Vec's first positional parameter; naming it makes
        # the expected input explicit without changing the call.
        model = Word2Vec(sentences=data, vector_size=vector_size, min_count=min_count, workers=workers)

        return model

In [1]:
class HOCOrthoPros():
    """
         A class used to measure the semantic change of a word between two word2vec
         models after aligning their embedding spaces with Orthogonal Procrustes.

         Code adapted from https://gist.github.com/zhicongchen/9e23d5c3f1e5b1293b16133485cd17d8, ported from HistWords <https://github.com/williamleif/histwords>.

         Workflow:
             1. intersection_align_gensim     - keep only the shared vocabulary, in the same row order
             2. smart_procrustes_align_gensim - rotate one model onto the other
             3. semantic_change               - compare a word's vectors across the two models

         NOTE: steps 1 and 2 modify the models they are given in place.
    """

    def intersection_align_gensim(self, m1, m2, words=None):
        """
        Intersect two gensim word2vec models, m1 and m2.
        Only the shared vocabulary between them is kept.
        If 'words' is set (as list or set), then the vocabulary is intersected with this list as well.
        Indices are re-organized from 0..N in order of descending frequency (=sum of counts from both m1 and m2).
        After the call, for both models:
            -- Row i of m1.wv.vectors is for the same word as Row i of m2.wv.vectors
            -- m.wv.index_to_key / m.wv.key_to_index describe the new row order
            -- per-key attribute arrays (m.wv.expandos, e.g. "count") follow the new row order,
               so m.wv.get_vecattr(word, "count") still returns that word's count
            -- the cached vector norms are cleared so they are recomputed on next use

        Both models are modified IN PLACE and returned as the tuple (m1, m2).
        If the two models already have the same vocabulary in the same order, and
        'words' does not remove anything, they are returned untouched.
        """

        # Get the vocab for each model
        vocab_m1 = set(m1.wv.index_to_key)
        vocab_m2 = set(m2.wv.index_to_key)

        # Find the common vocabulary
        common_vocab = vocab_m1 & vocab_m2
        if words:
            common_vocab &= set(words)

        # No alignment is necessary only when nothing has to be dropped AND the
        # rows are already in the same order. Equal vocabularies as *sets* are
        # not enough: each model orders its rows independently, and the
        # Procrustes step pairs rows by position.
        nothing_to_drop = not vocab_m1 - common_vocab and not vocab_m2 - common_vocab
        if nothing_to_drop and list(m1.wv.index_to_key) == list(m2.wv.index_to_key):
            return (m1, m2)

        # Otherwise sort by frequency (summed for both). The counts are read here,
        # before any re-indexing, while key -> index still matches the count arrays.
        common_vocab = sorted(
            common_vocab,
            key=lambda w: m1.wv.get_vecattr(w, "count") + m2.wv.get_vecattr(w, "count"),
            reverse=True,
        )

        # Then for each model...
        for m in (m1, m2):
            wv = m.wv

            # Old row index of every kept word, listed in the new row order.
            # An explicit integer dtype keeps the indexing valid when nothing is shared.
            indices = np.asarray([wv.key_to_index[w] for w in common_vocab], dtype=np.intp)

            # Replace the old vectors array with one restricted to the common vocab
            # (fancy indexing copies the selected rows and keeps the 2-D shape).
            wv.vectors = wv.vectors[indices]

            # Permute the per-key attribute arrays (counts etc.) the same way,
            # otherwise attribute lookups would return another word's value.
            expandos = getattr(wv, "expandos", None)
            if expandos:
                for attr in list(expandos):
                    expandos[attr] = expandos[attr][indices]

            # Any cached norms describe the old rows; drop them.
            wv.norms = None

            # Replace the old key <-> index mappings with the new ones
            wv.key_to_index = {key: new_index for new_index, key in enumerate(common_vocab)}
            wv.index_to_key = list(common_vocab)

            print(len(m.wv.key_to_index), len(m.wv.vectors))

        return (m1, m2)

    # Then, a function for aligning two spaces with Orthogonal Procrustes
    # (https://simonensemble.github.io/2018-10/orthogonal-procrustes.html):

    def smart_procrustes_align_gensim(self, base_embed, other_embed, words=None):
        """
        Original script: https://gist.github.com/quadrismegistus/09a93e219a6ffc4f216fb85235535faf
        Procrustes align two gensim word2vec models (to allow for comparison between same word across models).
        Code ported from HistWords <https://github.com/williamleif/histwords> by William Hamilton <wleif@stanford.edu>.

        First, intersect the vocabularies (see `intersection_align_gensim` documentation);
        this step modifies BOTH base_embed and other_embed in place.
        Then compute, from the unit-normalized vectors, the orthogonal matrix that best
        maps other_embed onto base_embed, and replace other_embed.wv.vectors with the
        rotated vectors. base_embed's vectors are not rotated.
        Return other_embed.
        If `words` is set, intersect the two models' vocabulary with the vocabulary in words (see `intersection_align_gensim` documentation).
        """

        # make sure vocabulary and indices are aligned
        in_base_embed, in_other_embed = self.intersection_align_gensim(base_embed, other_embed, words=words)

        # re-filling the normed vectors (the vector arrays may just have been replaced)
        in_base_embed.wv.fill_norms(force=True)
        in_other_embed.wv.fill_norms(force=True)

        # get the (normalized) embedding matrices
        base_vecs = in_base_embed.wv.get_normed_vectors()
        other_vecs = in_other_embed.wv.get_normed_vectors()

        # cross-covariance between the two spaces, shape (dim, dim)
        m = other_vecs.T.dot(base_vecs)
        # numpy returns the right factor already transposed, so u.dot(vt) is the
        # orthogonal matrix of the Procrustes solution
        u, _, vt = np.linalg.svd(m)
        ortho = u.dot(vt)
        # Replace original array with modified one, i.e. multiplying the embedding matrix by "ortho"
        other_embed.wv.vectors = (other_embed.wv.vectors).dot(ortho)

        return other_embed

    # Measure change
    # Now we measure the cosine similarity between the embedding of a word in the
    # first time period and the embedding of the same word in the second time period.

    def semantic_change(self, word, model_prev, model_cur):
        """
            Compare the vectors of `word` in two (already aligned) word2vec models.

            word - Word to look up in both models
            model_prev - Model of the earlier period
            model_cur - Model of the later period

            Returns the cosine SIMILARITY of the two vectors (1 - cosine distance):
            values close to 1 mean the word kept its meaning, lower values mean
            more change.
            NOTE(review): a word absent from either model is expected to raise
            KeyError from the vector lookup - confirm against the gensim version in use.
        """
        sc = 1 - spatial.distance.cosine(model_prev.wv[word], model_cur.wv[word])

        return sc