For citation information, please see the "Source Information" section listed in the associated README file: https://github.com/stephbuon/digital-history/tree/master/hist3368-week12-word-context-vectors

# Hist 3368 - Week 12: Word Context Vectors with Gensim

## Teaching Version

#### By Jo Guldi

#### Load Software

In [1]:
import pandas as pd
import gensim
import string
import csv
import glob
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import scipy.spatial.distance
import matplotlib
import matplotlib.pyplot as plt
import itertools
import multiprocessing
from multiprocessing import Pool
from nltk.tokenize import sent_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer
from nltk.corpus import wordnet as wn

In [80]:
def lemmatize_list(sentence):
    """Look up every token of ``sentence`` with WordNet's ``morphy``.

    Returns a list with one entry per input token, holding whatever
    ``wn.morphy`` returned for it.  Entries may be ``None``; the caller in
    ``structure_data`` filters those out.
    """
    return list(map(wn.morphy, sentence))

In [2]:
def structure_data(sentences, lemma, stopwords, stemmed):
    """Turn raw texts into a list of sentences, each a list of word tokens.

    Parameters
    ----------
    sentences : iterable of str
        The raw texts (for example a column of speeches).
    lemma : bool
        If true, replace each token with its WordNet base form and drop
        tokens for which no base form is found.
    stopwords : bool
        If true, remove gensim's stopwords.
    stemmed : bool
        If true, stem the tokens with gensim's ``stem_text``.

    Returns
    -------
    list
        One entry per sentence, with the bigram model applied to its tokens.
    """
    # smoosh everything together
    one_string = ' '.join(sentences)

    # break it into sentences
    sentences = sent_tokenize(one_string)

    # remove punctuation and lowercase, one pass per sentence
    strip_punctuation = str.maketrans('', '', string.punctuation)
    sentences = [sent.translate(strip_punctuation).lower() for sent in sentences]

    # tokenize each sentence by splitting on whitespace
    sentences_in_words = [sent.split() for sent in sentences]

    # build bigram model
    # NOTE(review): the model is trained on the raw tokens but applied below to
    # the stopworded/stemmed tokens -- confirm that this mismatch is intended.
    bigram_mdl = gensim.models.phrases.Phrases(sentences_in_words, min_count=1, threshold=2)

    # lemmatize the tokens
    if lemma:
        # the context manager shuts the worker processes down again, so
        # repeated calls do not pile up idle pools
        with multiprocessing.Pool() as pool:
            sentences_in_words = pool.map(lemmatize_list, sentences_in_words)
        sentences_in_words = [[item for item in sentence if item is not None] for sentence in sentences_in_words]

    # remove stopwords and/or do stemming
    from gensim.parsing.preprocessing import preprocess_string
    CUSTOM_FILTERS = []
    if stopwords:
        from gensim.parsing.preprocessing import remove_stopwords
        CUSTOM_FILTERS.append(remove_stopwords)
    if stemmed:
        from gensim.parsing.preprocessing import stem_text
        CUSTOM_FILTERS.append(stem_text)

    processed = [preprocess_string(" ".join(sentence), CUSTOM_FILTERS) for sentence in sentences_in_words]

    # apply bigram model to list
    result = [bigram_mdl[item] for item in processed]

    return result
   

In [3]:
n = multiprocessing.cpu_count()

In [4]:
def parallelize_operation(df, func, n_cores = n):
    """Apply ``func`` to ``df`` in parallel and stitch the pieces back together.

    Parameters
    ----------
    df : the data to process; it is split with ``np.array_split``.
    func : callable applied to each piece; the results are combined with
        ``pd.concat``.
    n_cores : int
        Number of pieces and of worker processes (defaults to the
        notebook-level ``n``).

    Returns
    -------
    The concatenation of ``func``'s results.
    """
    df_split = np.array_split(df, n_cores)
    # size the pool from the argument, so that a caller-supplied n_cores is honored
    pool = Pool(n_cores)
    try:
        result = pd.concat(pool.map(func, df_split))
    finally:
        # release the worker processes even when func raises
        pool.close()
        pool.join()
    return result

#### Load some Data

In [5]:
cd /scratch/group/history/hist_3368-jguldi

/scratch/group/history/hist_3368-jguldi


In [6]:
congress = pd.read_csv("congress1967-2010.csv")

In [7]:
all_data = congress.copy()

In [8]:
all_data[:5]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,speech,date,speaker,word_count,year,month,month_year
0,0,0,Those who do not enjoy the privilege of the fl...,1967-01-10,The VICE PRESIDENT,16,1967,1,1967-01-01
1,1,1,Mr. President. on the basis of an agreement re...,1967-01-10,Mr. MANSFIELD,35,1967,1,1967-01-01
2,2,2,The Members of the Senate have heard the remar...,1967-01-10,The VICE PRESIDENT,40,1967,1,1967-01-01
3,3,3,The Chair lays before the Senate the following...,1967-01-10,The VICE PRESIDENT,151,1967,1,1967-01-01
4,4,4,Secretary of State.,1967-01-10,Mrs. AGNES BAGGETT,3,1967,1,1967-01-01


In [9]:
# label each speech with the first year of its 5-year period -- divide the year by 5,
# "floor" to the lowest integer, then multiply by 5 again (1967 becomes 1965)
all_data['5yrperiod'] = np.floor(all_data['year'] / 5) * 5
# drop the columns we no longer need; naming them via columns= avoids the positional axis argument
all_data = all_data.drop(columns=['date', 'year', 'speaker', 'Unnamed: 0', 'Unnamed: 0.1', 'word_count', 'month'])

In [10]:
all_data['index'] = np.arange(len(all_data)) # create an 'index' column

In [11]:
all_data.head()

Unnamed: 0,speech,month_year,5yrperiod,index
0,Those who do not enjoy the privilege of the fl...,1967-01-01,1965.0,0
1,Mr. President. on the basis of an agreement re...,1967-01-01,1965.0,1
2,The Members of the Senate have heard the remar...,1967-01-01,1965.0,2
3,The Chair lays before the Senate the following...,1967-01-01,1965.0,3
4,Secretary of State.,1967-01-01,1965.0,4


#### Downsample

In [12]:
sample_l = all_data.sample(500000)
sample_m = sample_l.sample(50000)
sample = sample_m.sample(5000)

## Introducing Gensim, a Tool for Studying Word Embeddings

#### Preprocessing your data into a list of sentences, each sentence a list of words

In [None]:
sentences = structure_data(all_data['speech'], lemma = False, stopwords = True, stemmed = True) # <---- swap all_data for sample_l, sample_m or sample here to run on a smaller subset

In [None]:
sentences[:5]

#### Use Gensim to create a vector model from sentences

In [None]:
congress_model = gensim.models.Word2Vec(
    sentences = sentences,
    workers = n, # one worker thread per CPU core counted above
    min_count = 10, # ignore words that appear fewer than 10 times
    )

#### Use the lines below to save the model for later use or to load a saved model

In [None]:
filename = 'congress-1967-2010-full-stopworded-bigrammed-stemmed'

In [None]:
congress_model.save(filename) #### save the model you just made

#### Load a Saved Model

In [None]:
filename = 'congress-1967-2010-full-stopworded-bigrammed-stemmed'

In [None]:
congress_model = gensim.models.Word2Vec.load(filename) #### load a saved model

### Explore the Contents of Your Vector Model

#### Find the CONTEXT for One Word

In [None]:
man_vector = congress_model.wv['man']
congress_model.wv.similar_by_vector(man_vector)

In [None]:
woman_vector = congress_model.wv['woman']
congress_model.wv.similar_by_vector(woman_vector)

In [None]:
individual_vector = congress_model.wv['person']
congress_model.wv.similar_by_vector(individual_vector)

In [None]:
soldier_vector = congress_model.wv['soldier']
congress_model.wv.similar_by_vector(soldier_vector)

In [None]:
congress_model.wv.most_similar("women", topn = 20)

#### Interpreting vector similarity

Try your own hand at interpreting these outputs. 

How do you interpret these similarities?

In [None]:
congress_model.wv.most_similar("iraq", topn = 20)

In [None]:
congress_model.wv.most_similar("america", topn = 20)

In [None]:
congress_model.wv.most_similar("britain", topn = 20)

## Subtracting Vectors

In [None]:
diff = congress_model.wv['man'] - congress_model.wv['woman']
congress_model.wv.similar_by_vector(diff)

In [None]:
diff = congress_model.wv['woman'] - congress_model.wv['boy']
congress_model.wv.similar_by_vector(diff)

In [None]:
diff = congress_model.wv['peopl'] - congress_model.wv['person']
congress_model.wv.similar_by_vector(diff)

In [None]:
diff = congress_model.wv['person'] - congress_model.wv['peopl']
congress_model.wv.similar_by_vector(diff)

In [None]:
diff = congress_model.wv['think'] - congress_model.wv['heart']
congress_model.wv.similar_by_vector(diff)

In [None]:
diff = congress_model.wv['feel'] - congress_model.wv['think']
congress_model.wv.similar_by_vector(diff)

### Adding vectors to find synonyms

In [None]:
# the 100 words whose vectors are closest to "women"
keyword_context = [word[0] for word in congress_model.wv.most_similar("women", topn = 100)]

# add all of their vectors together in one step; using a name other than
# `sum` keeps Python's built-in sum() available to later cells
context_sum = congress_model.wv[keyword_context].sum(axis=0)

congress_model.wv.similar_by_vector(context_sum)

In [None]:
# the 100 words whose vectors are closest to "soldier"
keyword_context = [word[0] for word in congress_model.wv.most_similar("soldier", topn = 100)]
# add all of their vectors together in one step; using a name other than
# `sum` keeps Python's built-in sum() available to later cells
context_sum = congress_model.wv[keyword_context].sum(axis=0)
congress_model.wv.similar_by_vector(context_sum)

In [None]:
# the 100 words whose vectors are closest to "happi"
keyword_context = [word[0] for word in congress_model.wv.most_similar("happi", topn = 100)]
# add all of their vectors together in one step; using a name other than
# `sum` keeps Python's built-in sum() available to later cells
context_sum = congress_model.wv[keyword_context].sum(axis=0)
congress_model.wv.similar_by_vector(context_sum)

In [None]:
# the 100 words whose vectors are closest to "american"
keyword_context = [word[0] for word in congress_model.wv.most_similar("american", topn = 100)]
# add all of their vectors together in one step; using a name other than
# `sum` keeps Python's built-in sum() available to later cells
context_sum = congress_model.wv[keyword_context].sum(axis=0)
congress_model.wv.similar_by_vector(context_sum)

### Distance and Similarity with Vectors in GENSIM

With similarity, the higher the number, the more alike two terms are in the context in which they are used. 

In [None]:
congress_model.wv.similarity('women', 'men')

In [None]:
congress_model.wv.similarity('soldier', 'men')

In [None]:
congress_model.wv.similarity('women', 'person')

#### Visualize the similarities as a Dendrogram

In [None]:
keywords = ['dream',  'war',  'wealth', 'happi',  'tomorrow', 'past', 'present', 'futur', 'america',  'britain', 'china', 'democrat', 'welfar', 'commun', 'russia', 'congress', 'protest']

In [None]:
keyword_vectors = congress_model.wv[keywords]

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage
links = linkage(keyword_vectors, method='complete', metric='seuclidean')

In [None]:
from matplotlib import pyplot as plt

l = links

# calculate full dendrogram
plt.figure(figsize=(25, 10))
plt.title('Hierarchical Clustering Dendrogram')
plt.ylabel('word')
plt.xlabel('distance')

dendrogram(
    l,
    leaf_rotation=0,  # rotates the x axis labels
    leaf_font_size=16,  # font size for the x axis labels
    orientation='left',
    leaf_label_func=lambda v: str(keywords[v])
)
plt.show()


*Note: if you get a `KeyError` above, one of your keywords is not in the model's vocabulary. Delete that word from the `keywords` list and run the cells again.*

### Visualizing Abstract Relatedness

In [None]:
import numpy as np

import matplotlib.pyplot as plt
plt.style.use('ggplot')

from sklearn.decomposition import PCA
from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors

In [None]:
#%matplotlib inline

def display_pca_scatterplot(model, words=None, sample=0):
    """Scatter-plot word vectors on their first two principal components.

    Parameters
    ----------
    model : a Word2Vec model or its word vectors (``model.wv``).
    words : optional sequence of words to plot.  When omitted, the whole
        vocabulary is plotted, or a random draw of ``sample`` words from it
        if ``sample`` is greater than 0.
    sample : int
        Number of vocabulary words to draw when ``words`` is omitted.
    """
    # work with the word vectors whether we were handed the full model or model.wv
    vectors = getattr(model, 'wv', model)

    # `is None` also behaves when words is a numpy array
    if words is None:
        if sample > 0:
            words = np.random.choice(list(vectors.key_to_index.keys()), sample)
        else:
            words = [ word for word in vectors.key_to_index ]

    word_vectors = np.array([vectors[w] for w in words])

    # keep only the first two principal components for the 2-D plot
    twodim = PCA().fit_transform(word_vectors)[:,:2]

    plt.figure(figsize=(6,6))
    plt.scatter(twodim[:,0], twodim[:,1], edgecolors='k', c='r')
    # label each dot, nudged slightly so the text does not sit on the marker
    for word, (x,y) in zip(words, twodim):
        plt.text(x+0.05, y+0.05, word)

In [None]:
display_pca_scatterplot(congress_model.wv, keywords)

## Research the Changing Context of One Keyword Over Time with GENSIM

#### Make a word embedding model per period

In [None]:
dataname = 'sample-m'

In [None]:
periodnames = all_data['5yrperiod'].unique()

In [None]:
cd '/scratch/group/history/hist_3368-jguldi/congress-embeddings'

In [None]:
# Train and save one Word2Vec model for each 5-year period
for period1 in periodnames:
    print('working on ', period1)

    # grab the data from period1
    period_data = sample_m[sample_m['5yrperiod'] == period1] # select one period at a time

    # structure the data for Gensim
    period_sentences = structure_data(period_data['speech'], lemma = False, stopwords = True, stemmed = True)

    # make the Gensim model
    period_model = gensim.models.Word2Vec( # make a gensim model for that data
        sentences = period_sentences,
        min_count = 2,
        vector_size = 100) # gensim 4 renamed the old `size` argument to `vector_size`

    # save it
    period_model.save(dataname + '-model-' + str(period1)) # save the model with the name of the period


#### Search each 5-year model for a keyword.

In [None]:
keyword1 = 'black'

In [None]:
# Reload each saved period model and, wherever keyword1 is in its vocabulary,
# record the words most similar to it together with the period
keyword_context = []
dates_found = []

for period1 in periodnames:
    print('working on ', period1)

    # reload the model that was saved for this period
    period_model = gensim.models.Word2Vec.load(dataname + '-model-' + str(period1))

    # nothing to record when this period's vocabulary lacks the keyword
    if keyword1 not in period_model.wv.key_to_index:
        continue

    print('found ', keyword1)

    # up to 5000 (word, similarity) pairs for keyword1 in this period
    keyword_context_period = period_model.wv.most_similar(keyword1, topn = 5000)

    # keep the pairs and the period they belong to, for plotting later
    keyword_context.append(keyword_context_period)
    dates_found.append(period1)

#### Visualize it

In [None]:
# helper function to abstract only unique values while keeping the list in the same order -- the order of first appearance
def unique2(seq):
    """Return the distinct items of ``seq`` as a list, in order of first appearance."""
    already_seen = set()
    ordered = []
    for item in seq:
        if item in already_seen:
            continue
        already_seen.add(item)
        ordered.append(item)
    return ordered

In [None]:
numwords = 10   # how many of the most similar words to keep for each period

# the top words for every period in which the keyword was found
all_words = [
    [item[0] for item in context[:numwords]]
    for context in keyword_context[:len(dates_found)]
]

# the same words flattened into one list (the loop variables deliberately
# avoid the name `list`, so the built-in stays usable in later cells)
all_words2 = [word for period_words in all_words for word in period_words]


In [None]:
%matplotlib inline
#from matplotlib.colors import ListedColormap, LinearSegmentedColormap

from adjustText import adjust_text
from numpy import linspace
from matplotlib import cm

colors = [ cm.viridis(x) for x in linspace(0, 1, len(unique2(all_words2))+10) ]

# change the figure's size here
plt.figure(figsize=(10,10), dpi = 200)

texts = []

# plt.annotate only plots one label per iteration, so we have to use a for loop 
for i in range(len(dates_found)):    # cycle through the period names
    
    #yyy = int(keyword_per_year[keyword_per_year['5yrperiod'] == int(xx)]['count'])   # how many times was the keyword used that year?
                     
    for j in range(10):     # cycle through the first ten words (you can change this variable)
        
        xx = dates_found[i]        # on the x axis, plot the period name
        yy = [item[1] for item in keyword_context[i]][j]         # on the y axis, plot the distance -- how closely the word is related to the keyword
        txt = [item[0] for item in keyword_context[i]][j]        # grab the name of each collocated word
        colorindex = unique2(all_words2).index(txt)   # this command keeps all dots for the same word the same color
        
        plt.scatter(                                             # plot dots
            xx, #x axis
            yy, # y axis
            linewidth=1, 
            color = colors[colorindex],
            edgecolors = 'darkgray',
            s = 100, # dot size
            alpha=0.8)  # dot transparency

        # make a label for each word
        texts.append(plt.text(xx, yy, txt))

# Code to help with overlapping labels -- may take a minute to run
adjust_text(texts, force_points=0.2, force_text=.7, 
                    expand_points=(1, 1), expand_text=(1, 1),
                    arrowprops=dict(arrowstyle="-", color='black', lw=0.5))

plt.xticks(rotation=90)

# Add titles
plt.title("What words were most similar to ''" + keyword1 + "' in Congress?", fontsize=20, fontweight=0, color='Red')
plt.xlabel("period")
plt.ylabel("similarity to " + keyword1)


filename = 'words-similar-to-' + keyword1 + '-' + dataname
plt.savefig(filename)