Get Dynamic Word Embeddings

In [2]:
import numpy as np
import json
from pprint import pprint
from scipy.spatial.distance import pdist
# word2vec
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle

In [3]:
years = range(2013,2023)
dir = "output/models/word2vec_drug_"

models = []
for year in years:
    models.append(Word2Vec.load(dir+str(year)+".model"))

### From "Diachronic Word Embeddings Reveal Statistical Lawso f Semantic Change" by Jurafsky et al. to explore semantic changes of target words over time

`Low-dimensional embeddings will not be naturally aligned due to the non-unique nature of the SVD and the stochastic nature of SGNS. In particular, both these methods may result in arbitrary orthogonal transformations, which do not affect pairwise cosine-similarities within-years but will preclude comparison of the same word across time.`

We use orthogonal Procrustes to align the learned low-dimensional embeddings. Defining W(t) ∈ Rd×|V| as the matrix of word embeddings learned at year t, we align across time-periods while preserving cosine similarities by optimizing:

In [4]:
drug_articles = pickle.load(open("./data/drug_articles.pkl", "rb"))
# filter articles that are 2023
drug_articles = drug_articles[drug_articles["Date"].dt.year != 2023]
# get the years
years = drug_articles["Date"].dt.year.unique()

In [5]:
models = []

for year in years:
    # get the articles for the year
    articles = drug_articles[drug_articles["Date"].dt.year == year]["clean_text"]
    # train the model
    model = Word2Vec(
        articles, window=15, min_count=1, 
        workers=4, vector_size=300, sg=0, 
        epochs=5, hs=0, sorted_vocab=1)
    # save the model
    model.save("output/models/word2vec_drug_" + str(year) + ".model")
    # append the model to the list
    models.append(model)

In [38]:
extra_stopwords = ["said", "drug"]

# remove stopwords before training
for model in models:
    for stopword in extra_stopwords:
        model.wv.key_to_index.pop(stopword, None)

In [6]:
# reduce the vocabulary to top 10000 words per year
for i in range(len(years)):
    models[i].wv.vectors = models[i].wv.vectors[:10000,:]
    models[i].wv.index_to_key = models[i].wv.index_to_key[:10000]
    models[i].wv.key_to_index = {k: models[i].wv.key_to_index[k] for k in models[i].wv.index_to_key[:10000]}
    print(years[i], "top words: \n", models[i].wv.index_to_key[:10])

2013 top words: 
 ['said', '-', 'said.', 'police', 'new', '.', 'drug', '"i', 'staff', 'inquirer']
2014 top words: 
 ['said', '-', 'said.', 'new', 'drug', 'police', '"i', '.', 'state', 'staff']
2015 top words: 
 ['said', '-', 'said.', 'new', 'police', 'drug', '"i', 'staff', 'marijuana', 'philadelphia']
2016 top words: 
 ['said', '-', 'said.', 'new', 'police', 'drug', 'marijuana', 'people', 'staff', '"i']
2017 top words: 
 ['said', '-', 'said.', 'people', 'new', 'drug', 'staff', 'state', 'philadelphia', 'marijuana']
2018 top words: 
 ['said', '-', 'said.', 'people', 'marijuana', 'new', 'drug', 'city', 'state', 'medical']
2019 top words: 
 ['said', '-', 'said.', 'people', 'new', 'marijuana', 'drug', 'philadelphia', 'state', 'like']
2020 top words: 
 ['said', '-', 'people', 'said.', 'new', 'police', 'marijuana', 'drug', 'state', 'like']
2021 top words: 
 ['said', '—', 'said.', 'people', 'new', 'drug', 'like', 'state', 'marijuana', 'medical']
2022 top words: 
 ['said', '—', 'said.', 'people

2. Alignment

In [9]:
# align the embeddings using Procrustes
from scipy.linalg import orthogonal_procrustes
# def align_embeddings(embeddings):
#     # Get the embedding matrices
#     matrices = [embedding.wv.vectors for embedding in embeddings]

#     # Align the embeddings
#     aligned = orthogonal_procrustes(matrices[0], matrices[1])
#     aligned_matrices = [aligned[1]]  # Initialize with the first aligned matrix

#     # Iterate over the remaining embeddings and align them with the previous aligned matrix
#     for matrix in matrices[2:]:
#         aligned = orthogonal_procrustes(aligned_matrices[-1], matrix)
#         aligned_matrices.append(aligned[1])

#     # Return the aligned embedding matrices
#     return aligned_matrices

# # models have different dimensions
# align them
print(models[0].wv.vectors.shape,
models[1].wv.vectors.shape, models[2].wv.vectors.shape)


(10000, 300) (10000, 300) (10000, 300)


In [17]:
matrices = [embedding.wv.vectors for embedding in models]

# align all 10 embeddings

for i in range(1, len(matrices)):
    aligned = orthogonal_procrustes(matrices[0], matrices[i])
    matrices[i] = aligned[1]

problem: aligning the matrices using orthogonal Procrustes requires the matrices to be of the same size. However, the number of words in the vocabulary changes over time. We solve this problem by aligning the matrices of the top 10,000 words in each time period. We could also remove words that are not present in at least 3 time periods.

In [10]:
aligned = align_embeddings(models)

ValueError: expected ndim to be 2, but observed 0

1. compute cosine distance between word vectors for each year 
2. compute average cosine distance between word vectors for each year
3. compute average cosine distance between word vectors for each year and the previous year
4. compute average cosine distance between word vectors for each year and the first year

Other stuff
----

In [20]:
# use UMAP to reduce the dimensionality
import umap

reducer = umap.UMAP(n_neighbors=15, metric='cosine', random_state=42)
# fit the reducer
red_embedding = reducer.fit_transform(models[0].wv.vectors)

In [21]:
red_embedding.shape

(40458, 2)

In [23]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=42)
tsne_embedding = tsne.fit_transform(models[0].wv.vectors)



In [24]:
tsne_embedding.shape

(40458, 2)

In [17]:
from sklearn.decomposition import PCA

def reduce_embeddings(embeddings):
    # Determine the maximum dimension among the word embeddings
    max_dim = max(embedding.vector_size for embedding in embeddings)
    
    # Concatenate the embeddings into a single matrix with aligned dimensions
    aligned_embeddings = []
    for embedding in embeddings:
        # Pad or truncate the embedding to match the maximum dimension
        padded_embedding = np.pad(embedding.wv.vectors, ((0, 0), (0, max_dim - embedding.vector_size)), mode='constant')
        aligned_embeddings.append(padded_embedding)
    
    combined_embeddings = np.concatenate(aligned_embeddings, axis=1)
    
    # Apply PCA with n_components set to the desired dimension (here, 10,000)
    pca = PCA(n_components=10000)
    reduced_embeddings = pca.fit_transform(combined_embeddings)
    
    return reduced_embeddings


In [None]:
pca = PCA(n_components=10000)
reduced_embeddings = pca.fit_transform(models[0].wv.vectors)

the reduce_embeddings function first determines the maximum dimension among the word embeddings in the input list. Then, each word embedding is either padded or truncated to match the maximum dimension using np.pad. This ensures that all embeddings have the same shape along the feature axis.

The padded or truncated embeddings are stored in the aligned_embeddings list, which is then concatenated along the feature axis to create the combined_embeddings matrix. The rest of the code remains the same as before, applying PCA to reduce the dimensionality.