## Make W2V embeddings over time

In [1]:
import numpy as np
import json
from pprint import pprint
from scipy.spatial.distance import pdist
# word2vec
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

In [4]:
# Calendar years covered by the analysis: 2013 through 2022 (the stop value is exclusive)
years = range(2013, 2023)

In [3]:
# Load the pickled article table. The context manager guarantees the file
# handle is closed even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code -- only load files that
# come from a trusted source.
with open("./data/drug_articles.pkl", "rb") as pickle_file:
    drug_articles = pickle.load(pickle_file)
# drop the articles dated 2023; every other year is kept
drug_articles = drug_articles[drug_articles["Date"].dt.year != 2023]

In [4]:
# Train one Word2Vec model per year; `models` ends up in the same order as `years`.
# Hyper-parameters shared by every yearly model (sg=0 -> CBOW, hs=0 -> no
# hierarchical softmax, per the gensim Word2Vec documentation).
w2v_settings = dict(
    window=15, min_count=1,
    workers=4, vector_size=300, sg=0,
    epochs=5, hs=0, sorted_vocab=1,
)
# year of each article, computed once instead of once per loop iteration
publication_year = drug_articles["Date"].dt.year

models = []
for year in years:
    # the `clean_text` entries of the articles dated in this year
    articles = drug_articles[publication_year == year]["clean_text"]
    # fit the model on this year's articles only
    model = Word2Vec(articles, **w2v_settings)
    models.append(model)

In [5]:
# Truncate every yearly model to its first 10000 vocabulary entries.
# NOTE(review): the models were built with sorted_vocab=1, so the first entries
# are presumably the most frequent words -- confirm against the gensim docs.
vocab_limit = 10000
for idx, year in enumerate(years):
    keyed_vectors = models[idx].wv
    # keep the leading rows of the embedding matrix ...
    keyed_vectors.vectors = keyed_vectors.vectors[:vocab_limit, :]
    # ... and the matching leading entries of the index -> word list
    kept_words = keyed_vectors.index_to_key[:vocab_limit]
    keyed_vectors.index_to_key = kept_words
    # rebuild the word -> index lookup so it only contains the kept words
    previous_lookup = keyed_vectors.key_to_index
    keyed_vectors.key_to_index = {word: previous_lookup[word] for word in kept_words}
    print(year, "top words: \n", kept_words[:10])

2013 top words: 
 ['police', 'new', 'drug', 'staff', 'years', 'philadelphia', 'inquirer', 'credit', 'email', 'county']
2014 top words: 
 ['police', 'new', 'drug', 'marijuana', 'county', 'staff', 'state', 'years', 'philadelphia', 'school']
2015 top words: 
 ['police', 'years', 'new', 'drug', 'philadelphia', 'staff', 'marijuana', 'city', 'credit', 'time']
2016 top words: 
 ['new', 'police', 'philadelphia', 'marijuana', 'drug', 'years', 'staff', 'state', 'medical', 'year']
2017 top words: 
 ['philadelphia', 'new', 'drug', 'state', 'city', 'marijuana', 'years', 'police', 'staff', 'like']
2018 top words: 
 ['marijuana', 'city', 'new', 'state', 'philadelphia', 'drug', 'year', 'like', 'years', 'staff']
2019 top words: 
 ['marijuana', 'new', 'drug', 'philadelphia', 'state', 'police', 'years', 'year', 'like', 'city']
2020 top words: 
 ['new', 'police', 'marijuana', 'philadelphia', 'drug', 'city', 'state', 'years', 'like', 'patients']
2021 top words: 
 ['philadelphia', 'new', 'state', 'drug', 'y

In [6]:
# Write each yearly model to output/embeddings/, one file per year
for idx, year in enumerate(years):
    model_path = "output/embeddings/word2vec_drug_" + str(year) + ".model"
    models[idx].save(model_path)

----

## TEMPORAL EMBEDDING FROM PAPER

In [6]:
from gensim.corpora import Dictionary
from gensim.matutils import corpus2csc
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import svds

In [7]:
# Build the vocabulary from every article, then drop tokens that appear in
# fewer than 100 documents (the other filter_extremes arguments keep their defaults).
dictionary = Dictionary(drug_articles["clean_text"])
dictionary.filter_extremes(no_below=100)
# 2,776 unique tokens remained after filtering in the recorded run

# Bag-of-words matrix for the whole corpus: corpus2csc output is transposed
# and then stored as a CSC sparse matrix.
corpus_full = csc_matrix(
    corpus2csc(
        [dictionary.doc2bow(tokens) for tokens in drug_articles["clean_text"]]
    ).transpose()
)


In [18]:
# One bag-of-words corpus per year, in the same order as `years`.
# Each corpus is a list holding one doc2bow result per article of that year.
year_of_article = drug_articles["Date"].dt.year
corpora_year = [
    [
        dictionary.doc2bow(text)
        for text in drug_articles[year_of_article == year]["clean_text"]
    ]
    for year in years
]


In [19]:
# Document-level co-occurrence counts: one dense (vocab x vocab) matrix per year.
# Entry [a, b] is incremented once for every pair of bag-of-words entries
# (a, b) that appear in the same document, including a == b.
vocab_size = len(dictionary)
cooc_year = []
for i in range(len(years)):
    cooc = np.zeros((vocab_size, vocab_size))
    for doc in corpora_year[i]:
        # token ids present in this document (the bag-of-words counts are ignored)
        token_ids = np.fromiter((token_id for token_id, _ in doc), dtype=np.intp)
        if token_ids.size == 0:
            continue
        # Increment the whole token_ids x token_ids block in one call instead
        # of a nested Python loop. np.add.at accumulates repeated indices, so
        # the result matches the pairwise loop exactly.
        np.add.at(cooc, np.ix_(token_ids, token_ids), 1)
    cooc_year.append(cooc)

In [34]:
# Sanity check: shape of the co-occurrence matrix at index 1 (second entry of `years`)
cooc_year[1].shape

(2776, 2776)

In [35]:
# Sanity check: container type of the co-occurrence matrix at index 1
type(cooc_year[1])

numpy.ndarray

PPMI stands for Positive Pointwise Mutual Information. It is a statistical measure used in natural language processing and information retrieval to capture the association between words in a text corpus. PPMI is a variant of the pointwise mutual information (PMI) measure, which quantifies the extent to which two words co-occur more or less frequently than would be expected by chance.

The Positive Pointwise Mutual Information (PPMI) measure enhances the PMI by addressing the issue of negative PMI values. In standard PMI, if the co-occurrence of two words is less frequent than expected, the PMI value can become negative. PPMI resolves this by setting negative PMI values to zero, thus only considering positive associations between words.

The PPMI measure is calculated based on the co-occurrence matrix of words in a corpus. Here's the formula for calculating PPMI:

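$$\mathrm{PPMI}(w_1, w_2) = \max\left(\log_2 \frac{P(w_1, w_2)}{P(w_1)\,P(w_2)},\ 0\right)$$

(The code in this notebook uses the natural logarithm instead of $\log_2$; the choice of base only rescales every value by a constant factor.)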

where:

P(w1, w2) is the probability of observing the co-occurrence of words w1 and w2 in the corpus.
P(w1) and P(w2) are the probabilities of observing words w1 and w2 individually in the corpus.
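The PPMI value reflects how strongly the co-occurrence of two words exceeds what would be expected by chance. *Higher PPMI values indicate a stronger association between words*, while a value of zero means the pair co-occurs no more often than chance would predict (this includes pairs that never co-occur and pairs that are negatively associated).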

PPMI is often used in the context of building word embeddings or word representations. It can be used to construct a co-occurrence matrix and then apply dimensionality reduction techniques like singular value decomposition (SVD) to obtain dense, low-dimensional word vectors that capture semantic relationships between words.

In [13]:
def get_ppmi(coocur):
  """
  Compute the Positive Pointwise Mutual Information matrix of a
  co-occurrence count matrix.

  PPMI[i, j] = max(0, log(P(i, j) / (P(i) * P(j)))), where P(i, j) is
  coocur[i, j] divided by the total count, and P(i) / P(j) are the row /
  column totals divided by the total count. The natural logarithm is used.

  Args:
    coocur: 2-D array-like of non-negative co-occurrence counts.

  Returns:
    A float ndarray with the same shape as `coocur`. Cells whose count is
    zero (PMI undefined or -inf) and cells with negative PMI are set to 0.
  """
  counts = np.asarray(coocur, dtype=float)
  total = counts.sum()
  row_totals = counts.sum(axis=1, keepdims=True)
  col_totals = counts.sum(axis=0, keepdims=True)

  # log(P(i,j) / (P(i) P(j))) == log(count * total) - log(row_total * col_total);
  # zero counts give -inf / nan here, which are mapped to 0 just below.
  with np.errstate(divide="ignore", invalid="ignore"):
    pmi = np.log(counts * total) - np.log(row_totals * col_totals)

  # Keep only finite, strictly positive PMI values.
  ppmi_matrix = np.where(np.isfinite(pmi) & (pmi > 0), pmi, 0.0)

  return ppmi_matrix

In [36]:
def get_ppmi(coocur):
    """
    Convert a co-occurrence count matrix into a PPMI matrix.

    For every non-zero cell (i, j) the value becomes
    max(0, log(c_ij) + log(total) - log(row_sum_i) - log(col_sum_j)),
    using the natural logarithm. Cells that are zero stay zero.

    Args:
        coocur: 2-D matrix of co-occurrence counts exposing ``sum``,
            ``astype``, ``nonzero`` and fancy indexing (a NumPy ndarray in
            this notebook). The input is not modified.

    Returns:
        A float copy of `coocur` whose non-zero cells hold the PPMI values.
    """
    # marginal totals per row / column, flattened to 1-D
    sum_rows = np.asarray(coocur.sum(axis=1)).ravel()
    sum_columns = np.asarray(coocur.sum(axis=0)).ravel()
    total_sum = sum_rows.sum()

    # Work on a float copy: copying an integer matrix would silently truncate
    # every PPMI value when it is written back.
    ppmi_matrix = coocur.astype(float)

    rows, cols = ppmi_matrix.nonzero()
    if rows.size == 0:
        # nothing co-occurs; avoid evaluating log(0) on the total
        return ppmi_matrix

    counts = np.asarray(ppmi_matrix[rows, cols]).ravel()
    # PMI for all non-zero cells at once instead of a Python loop per cell
    pmi = (
        np.log(counts)
        + np.log(total_sum)
        - np.log(sum_rows[rows])
        - np.log(sum_columns[cols])
    )
    # clip at zero; anything that is not strictly positive (including NaN) becomes 0
    ppmi_matrix[rows, cols] = np.where(pmi > 0, pmi, 0.0)
    return ppmi_matrix

In [37]:
# PPMI matrix for every year, derived from that year's co-occurrence matrix
# (same order as `years`)
ppmi_year = [get_ppmi(cooc_year[idx]) for idx in range(len(years))]

In [38]:
# Sanity check: shape of the PPMI matrix at index 1 (second entry of `years`)
ppmi_year[1].shape

(2776, 2776)

In [41]:
from scipy.sparse import linalg
def ppmi_svd(ppmi_matrix, k=300):
  """
  Compute a rank-k truncated SVD of a shifted, sum-normalised PPMI matrix.

  The matrix is shifted so that its minimum is zero and scaled so that its
  entries sum to one before the decomposition. The caller's matrix is never
  modified.

  Args:
    ppmi_matrix: 2-D PPMI matrix, either array-like or an object exposing
      ``toarray()`` (e.g. a SciPy sparse matrix).
    k: Number of singular values / vectors to compute; passed straight to
      ``scipy.sparse.linalg.svds``, which restricts the allowed range.

  Returns:
    The tuple ``(u, s, vh)`` returned by ``svds``: left singular vectors,
    singular values and right singular vectors of the normalised matrix.

  Raises:
    ValueError: if the shifted matrix sums to zero (every entry is equal),
      because it cannot be normalised.
  """
  dense = ppmi_matrix.toarray() if hasattr(ppmi_matrix, "toarray") else ppmi_matrix
  # np.array copies, so the in-place arithmetic below leaves the input untouched
  scaled = np.array(dense, dtype=float)

  # Subtract the minimum value so the smallest entry becomes zero.
  scaled -= scaled.min()

  # Normalize so the entries sum to one.
  total = scaled.sum()
  if total == 0:
    raise ValueError("ppmi_matrix has no variation: it cannot be normalised before SVD")
  scaled /= total

  # Perform the truncated SVD on the normalized matrix.
  u, s, vh = linalg.svds(scaled, k=k)

  return u, s, vh

This function takes an already-computed PPMI matrix, subtracts its minimum value, and normalizes it so that its entries sum to one. It then performs a truncated SVD on the normalized matrix and returns the singular vectors and singular values.

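The left singular vectors represent the word embeddings: each row of `u` is the vector for one vocabulary word. The singular vectors associated with the largest singular values capture the most important structure of the PPMI matrix. These vectors represent words in a dense, low-dimensional space (`k` dimensions, 300 by default).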

In [42]:
# One embedding matrix per year (same order as `years`), taken from the
# truncated SVD of that year's PPMI matrix.
embed_year = []
for i in range(len(years)):
    u, s, vh = ppmi_svd(ppmi_year[i])
    # The word vectors are the rows of `u` (one row per vocabulary entry);
    # `s` only holds the k singular values and carries no per-word information,
    # so storing it would not give embeddings.
    # svds does not guarantee the order of the singular values, so order the
    # components from largest to smallest singular value.
    component_order = np.argsort(s)[::-1]
    embed_year.append(u[:, component_order])


In [43]:
# Write each year's array to output/embeddings/ as a .npy file
for idx, year in enumerate(years):
    embedding_path = "output/embeddings/svd_drug_" + str(year) + ".npy"
    np.save(embedding_path, embed_year[idx])