# Word Embeddings

Word embeddings can capture the "context" of a word. A well-trained set of word vectors will place words with similar meanings close to each other. For example, "New York," "California," and "Texas" will cluster in one corner, while "red," "yellow," and "blue" cluster together in another.

In [1]:
import os
import sys
import csv
import re
import gensim

In [33]:
# Directory that is scanned for the per-period CSV files; the trained
# "<name>_model" files are written to and loaded from the same place.
dir_path = '/home/stephbuon/data/hansard_decades/'
# Passed as the `workers` argument when the Word2Vec model is created.
n_cores = 24

### Create Word Embedding Models by Period

In [26]:
def data_import(dir_path, fname):
    """Read one period's CSV file and return it as tokenised sentences.

    The first row is treated as a header and skipped. Every remaining row
    becomes one "sentence": its cells are joined with a space, cleaned and
    split on whitespace.

    Cleaning steps, in order:
      1. replace literal backslash-n / backslash-t sequences with a space;
      2. drop runs of all-upper-case words (mostly speaker names);
      3. drop the possessive "'s";
      4. drop punctuation and underscores;
      5. drop every token that contains a digit;
      6. lower-case.

    Args:
        dir_path: Directory containing the CSV file (with or without a
            trailing path separator).
        fname: Name of the CSV file inside ``dir_path``.

    Returns:
        A list with one entry per data row; each entry is a list of
        lower-case word tokens (possibly empty).
    """
    with open(os.path.join(dir_path, fname), newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row; harmless on an empty file
        # Work on the cell text itself. Cleaning the repr of the row list
        # (str(row)) exposed the list's own quotes to the regexes, so the
        # possessive pattern ate the first letter of any cell starting with
        # "s", and real newlines showed up as an "n" glued to the next word.
        rows = [' '.join(row) for row in reader]

    substitutions = (
        (r'\\[nt]', ' '),                       # literal "\n" / "\t" text
        (r'\b[A-Z]+(?:\s+[A-Z]+)*\b', ''),      # all-upper-case words
        (r"'s\b", ''),                          # possessive "s"
        (r'[^\w\s]|_', ''),                     # punctuation and underscore
        # Any token containing a digit. This also covers bare numbers, which
        # is why the former r'\d{1, 3}' step is gone: the space inside the
        # braces made it a literal that could never match.
        (r'\w*\d\w*', ''),
    )

    sentences = []
    for text in rows:
        for pattern, replacement in substitutions:
            text = re.sub(pattern, replacement, text)
        sentences.append(text.lower().split())

    return sentences


def export_model_by_period(dir_path):
    """Train and save one cumulative word-embedding model per CSV file.

    The CSV files in ``dir_path`` are processed in sorted file-name order.
    The first file creates the model; every later file extends the model's
    vocabulary and continues training on that file's sentences. After each
    file the current state of the model is saved next to the CSV as
    ``<file name without extension>_model``, so each saved model has seen
    its own file plus all files sorted before it.

    Uses the module-level ``n_cores`` for the number of gensim workers.

    Args:
        dir_path: Directory containing the per-period ``.csv`` files; the
            models are written to the same directory.
    """
    congress_model = None

    # os.listdir() returns entries in arbitrary order; training is
    # incremental, so fix the order to make the saved models reproducible.
    for fname in sorted(os.listdir(dir_path)):
        if not fname.endswith('.csv'):
            continue

        data = data_import(dir_path, fname)

        if congress_model is None:
            congress_model = gensim.models.Word2Vec(
                sentences=data,
                workers=n_cores,
                min_count=20,     # ignore words stated fewer than 20 times
                vector_size=100)  # embedding size; go higher for larger corpora
        else:
            # Continue training the existing model directly. Building a
            # second, throwaway model for each file only to read its
            # corpus_count and epochs doubled the training time.
            congress_model.build_vocab(data, update=True)
            congress_model.train(data,
                                 total_examples=len(data),
                                 epochs=congress_model.epochs)

        # splitext() strips only the final extension, so names containing
        # additional dots keep their full stem.
        period_name = os.path.splitext(fname)[0]
        congress_model.save(os.path.join(dir_path, period_name + '_model'))

CPU times: user 6 µs, sys: 0 ns, total: 6 µs
Wall time: 10.5 µs


In [None]:
%%time

export_model_by_period(dir_path)

### Similarity Vectors

Load a decade of the corpus to see which words are "most similar" to "coal," "crime," "disease," "man," or "woman." Feel free to load a different decade of the corpus or to search your own word!

In [35]:
congress_model_1860 = gensim.models.Word2Vec.load(dir_path + 'hansard_1860_model')

In [36]:
congress_model_1860.wv.most_similar('coal', topn = 10)

[('timber', 0.792468249797821),
 ('coals', 0.783724844455719),
 ('cotton', 0.7810822129249573),
 ('wool', 0.7700701951980591),
 ('ore', 0.7629643082618713),
 ('yarn', 0.7509469985961914),
 ('salt', 0.7386558651924133),
 ('copper', 0.7379512190818787),
 ('flour', 0.7337479591369629),
 ('grain', 0.7298588752746582)]

In [38]:
congress_model_1860.wv.most_similar('crime', topn = 10)

[('drunkenness', 0.7556180357933044),
 ('crimes', 0.7345277070999146),
 ('boycotting', 0.7077376842498779),
 ('insubordination', 0.694895327091217),
 ('outrages', 0.6794412136077881),
 ('outrage', 0.6774588823318481),
 ('conspiracy', 0.675053596496582),
 ('murder', 0.671721875667572),
 ('immorality', 0.6593766808509827),
 ('offences', 0.6577752828598022)]

In [44]:
congress_model_1860.wv.most_similar('disease', topn = 10)

[('infection', 0.7841604351997375),
 ('cholera', 0.7486963272094727),
 ('pleuropneumonia', 0.7468662261962891),
 ('smallpox', 0.7328136563301086),
 ('plague', 0.7072739601135254),
 ('fever', 0.6894288659095764),
 ('sickness', 0.682775616645813),
 ('diseases', 0.6741213202476501),
 ('epidemic', 0.6731758117675781),
 ('contagion', 0.66523277759552)]

In [39]:
congress_model_1860.wv.most_similar('london', topn = 10)

[('dublin', 0.8127764463424683),
 ('glasgow', 0.8059346079826355),
 ('edinburgh', 0.7964843511581421),
 ('belfast', 0.713414192199707),
 ('manchester', 0.7014787197113037),
 ('liverpool', 0.7004226446151733),
 ('loudon', 0.6925938129425049),
 ('bristol', 0.691468358039856),
 ('leeds', 0.6817807555198669),
 ('birmingham', 0.6801958084106445)]

In [40]:
congress_model_1860.wv.most_similar('man', topn = 10)

[('person', 0.8021567463874817),
 ('boy', 0.7457062005996704),
 ('woman', 0.7360450625419617),
 ('policeman', 0.7186694145202637),
 ('lawyer', 0.6999873518943787),
 ('workman', 0.6976069808006287),
 ('nobleman', 0.6952993273735046),
 ('soldier', 0.6944471597671509),
 ('fool', 0.6806564927101135),
 ('lad', 0.6732933521270752)]

In [43]:
congress_model_1860.wv.most_similar('woman', topn = 10)

[('girl', 0.8324806690216064),
 ('boy', 0.7969318628311157),
 ('child', 0.7723746299743652),
 ('husband', 0.7638962268829346),
 ('policeman', 0.7538986802101135),
 ('daughter', 0.7533218860626221),
 ('widow', 0.7492982149124146),
 ('lady', 0.7480725049972534),
 ('man', 0.7360450625419617),
 ('wife', 0.7273796200752258)]

In [45]:
congress_model_1860.wv.most_similar('corn', topn = 10)

[('sugar', 0.8062563538551331),
 ('wheat', 0.7994242906570435),
 ('barley', 0.7566043734550476),
 ('grain', 0.7208844423294067),
 ('wool', 0.7201630473136902),
 ('malt', 0.7100582122802734),
 ('flour', 0.7097944617271423),
 ('hops', 0.6989511251449585),
 ('tobacco', 0.6885563731193542),
 ('maize', 0.6872325539588928)]

### Subtracting Vectors
(e.g.: "first word" - "second word" = ?)

In [46]:
# Which words are similar to woman and not man? 

diff = congress_model_1860.wv['woman'] - congress_model_1860.wv['man']
congress_model_1860.wv.similar_by_vector(diff)

[('illegitimate', 0.3977678418159485),
 ('underfed', 0.3500392735004425),
 ('marriages', 0.34574127197265625),
 ('vaccine', 0.3394405245780945),
 ('grandchildren', 0.33773303031921387),
 ('dying', 0.33594730496406555),
 ('sickness', 0.3323615789413452),
 ('infant', 0.3310065269470215),
 ('deportation', 0.3309376537799835),
 ('zones', 0.3264090120792389)]

In [47]:
# Which words are similar to man and not woman? 

diff = congress_model_1860.wv['man'] - congress_model_1860.wv['woman']
congress_model_1860.wv.similar_by_vector(diff)

[('man', 0.6192151308059692),
 ('statesman', 0.48818546533584595),
 ('lawyer', 0.4785446226596832),
 ('critic', 0.43419334292411804),
 ('person', 0.42894574999809265),
 ('nobleman', 0.4061669707298279),
 ('civilian', 0.40244725346565247),
 ('member', 0.4014107286930084),
 ('anybody', 0.39986884593963623),
 ('anyone', 0.38719943165779114)]

### Find Similarity Score
(e.g.: how similar is "first word" to "second word")

In [51]:
congress_model_1860.wv.similarity('soldiers', 'men')

0.7297269

In [50]:
congress_model_1860.wv.similarity('women', 'men')

0.5561376

In [55]:
congress_model_1860.wv.similarity('prostitute', 'woman')

0.59991306

### Visualize Word Embeddings By Decade

In [None]:
from matplotlib import cm
from numpy import linspace
from adjustText import adjust_text
from matplotlib import pyplot as plt

In [32]:
class w2v_embeddings:
    """Namespace of helpers that query every saved model in a directory.

    Both public helpers return one list per directory entry, in sorted
    file-name order. An entry is the list of ``(word, score)`` pairs that
    gensim returned for that model, or ``[]`` when the file is not a model
    or the model does not know the keyword.
    """

    @staticmethod
    def _collect_per_file(dir_path, query):
        """Run ``query(model.wv)`` on each saved model found in ``dir_path``.

        NOTE(review): files that are not models also get an empty
        placeholder, so positions correspond to directory entries rather
        than to periods. This is kept because the plotting code indexes the
        result by position; confirm that this alignment is what is wanted.
        """
        keyword_context = []
        for fname in sorted(os.listdir(dir_path)):
            # gensim may store large arrays beside a model in "*.npy" files
            # whose names also contain "_model"; those cannot be loaded as
            # models, so treat them like any other non-model file.
            if '_model' not in fname or fname.endswith('.npy'):
                keyword_context.append([])
                continue

            congress_model = gensim.models.Word2Vec.load(os.path.join(dir_path, fname))
            try:
                keyword_context.append(query(congress_model.wv))
            except KeyError:
                # The keyword is not in this model's vocabulary: record an
                # empty period instead of aborting the whole scan.
                keyword_context.append([])

        return keyword_context

    @staticmethod
    def keyword_context_find_difference(dir_path, keyword_, keyword2):
        """Return, per file, the 100 words nearest to ``keyword_ - keyword2``."""

        def query(wv):
            diff = wv[keyword_] - wv[keyword2]  # close to the first word but not the second
            return wv.similar_by_vector(diff, topn=100)

        return w2v_embeddings._collect_per_file(dir_path, query)

    @staticmethod
    def keyword_context_find_most_similar(dir_path, keyword_):
        """Return, per file, the 100 words most similar to ``keyword_``."""

        def query(wv):
            return wv.most_similar(keyword_, topn=100)

        return w2v_embeddings._collect_per_file(dir_path, query)

    
    
class w2v_visualize_scatter_plot:
    """Namespace of helpers that plot per-period word-similarity lists."""

    @staticmethod
    def label_periods(start, end, interval):
        """Return the labels ``'<year>.0'`` for ``range(start, end, interval)``."""
        return [str(period) + '.0' for period in range(start, end, interval)]

    @staticmethod
    def collect_text_values(keyword_context, period_names):
        """Return the words (scores dropped) of each period's context list.

        Only the first ``len(period_names)`` entries of ``keyword_context``
        are read; a shorter ``keyword_context`` simply yields fewer lists.
        """
        return [[value[0] for value in period_context]
                for period_context in keyword_context[:len(period_names)]]

    @staticmethod
    def make_1D_list(period_words):
        """Flatten a list of word lists into one list, keeping the order."""
        return [word for words in period_words for word in words]

    @staticmethod
    def w2v_scatter_plot(period_names, keyword_context, flat_list, keyword,
                         top_n=15, corpus_name='Hansard debates',
                         figsize=(30, 30), dpi=1000):
        """Plot the closest words of every period and save the figure as PDF.

        Each period contributes up to ``top_n`` points: x is the period
        label, y is the similarity score, and the point is labelled with the
        word. A word keeps the same colour in every period.

        Args:
            period_names: Period labels for the x axis.
            keyword_context: Per-period lists of ``(word, score)`` pairs,
                matched to ``period_names`` by position. Empty lists are
                skipped.
            flat_list: Every word that may be plotted; a word's first
                position in this list selects its colour.
            keyword: The word the similarities refer to (used in the title,
                the y label and the file name).
            top_n: Maximum number of words plotted per period.
            corpus_name: Corpus description used in the title.
            figsize: Figure size in inches.
            dpi: Figure resolution.

        Raises:
            ValueError: If a plotted word is missing from ``flat_list``.
        """
        colors = [cm.gnuplot(x) for x in linspace(0, 1, len(flat_list))]

        # First position of each word, so that the colour lookup does not
        # rescan flat_list for every plotted point.
        first_position = {}
        for position, word in enumerate(flat_list):
            first_position.setdefault(word, position)

        plt.figure(figsize=figsize, dpi=dpi)

        texts = []

        # zip() stops at the shorter input and the slice stops at the end of
        # the list, so short inputs plot what is there instead of raising
        # IndexError.
        for xx, period_context in zip(period_names, keyword_context):
            for item in period_context[:top_n]:
                txt, yy = item[0], item[1]  # word, and how closely it relates to the keyword
                try:
                    colorindex = first_position[txt]
                except KeyError:
                    raise ValueError(repr(txt) + ' is not in flat_list') from None

                plt.scatter(  # plot dots
                    xx,  # x axis
                    yy,  # y axis
                    linewidth=1,
                    color=colors[colorindex],
                    s=300,  # dot size
                    alpha=0.5)  # dot transparency

                texts.append(plt.text(xx, yy, txt))  # make a label for each word

        # Nudge overlapping labels apart -- may take a minute to run.
        adjust_text(texts, force_points=0.0001, force_text=0.0035,
                    expand_points=(2, 2), expand_text=(2, 2),
                    arrowprops=dict(arrowstyle="-", color='black', lw=0.5))

        plt.xticks(rotation=90)
        plt.title("What words were most associated with '" + keyword + "' in the " + corpus_name + "?",
                  fontsize=20, fontweight=0, color='Red')
        plt.xlabel("period")
        plt.ylabel("similarity to " + keyword)
        # NOTE(review): the file name uses period_names[1], i.e. the second
        # label, as the start of the range -- confirm whether period_names[0]
        # was intended. Left unchanged so existing output names stay the same.
        plt.savefig(keyword + '_' + period_names[1] + '_' + period_names[-1] + '.pdf')
        plt.show()

In [None]:
%%time 

# For every entry in dir_path: the 100 words nearest to ("woman" - "man").
# NOTE(review): the result is ordered by directory listing, not by period --
# confirm that its positions line up with period_names below.
keyword_context = w2v_embeddings.keyword_context_find_difference(dir_path, 'woman', 'man')
# x-axis labels '1800.0', '1805.0', ..., '1895.0'
period_names = w2v_visualize_scatter_plot.label_periods(1800, 1900, 5)
# The words alone (scores dropped), per period and then flattened; the
# flattened list decides which colour each word gets in the plot.
period_words = w2v_visualize_scatter_plot.collect_text_values(keyword_context, period_names)
flat_list = w2v_visualize_scatter_plot.make_1D_list(period_words)    

# Draws the scatter plot and saves it as a PDF named after the keyword.
w2v_visualize_scatter_plot.w2v_scatter_plot(period_names, keyword_context, flat_list, 'woman')

In [None]:
%%time 

# Keywords to plot, one figure each.
keywords_list = ['education', 'coal', 'railway', 'corn']

# The period labels do not depend on the keyword, so build them once.
period_names = w2v_visualize_scatter_plot.label_periods(1800, 1900, 5)

for keyword in keywords_list:
    keyword_context = w2v_embeddings.keyword_context_find_most_similar(dir_path, keyword)
    period_words = w2v_visualize_scatter_plot.collect_text_values(keyword_context, period_names)
    flat_list = w2v_visualize_scatter_plot.make_1D_list(period_words)

    try:
        w2v_visualize_scatter_plot.w2v_scatter_plot(period_names, keyword_context, flat_list, keyword)
    except Exception as error:
        # Still move on to the next keyword, but report why this plot was
        # skipped. A bare `except` hid every failure and also swallowed
        # KeyboardInterrupt, so a long run could not be stopped.
        print('skipped plot for', repr(keyword), '-', repr(error))

In [None]:
# NOTE(review): in the visible cells `congress_model` is only assigned inside
# functions, never at notebook scope -- a model has to be loaded into that
# name before this cell can run.
# Look up the vector stored for "woman", then list the words nearest to it.
woman_vector = congress_model.wv['woman']
congress_model.wv.similar_by_vector(woman_vector)

In [107]:
congress_model.wv.most_similar("iraq", topn = 20)

KeyError: "Key 'iraq' not present"

In [None]:
congress_model.wv.most_similar("america", topn = 20)

In [None]:
congress_model.wv.most_similar("britain", topn = 20)

In [None]:
diff = congress_model.wv['man'] - congress_model.wv['woman']
congress_model.wv.similar_by_vector(diff)

In [None]:
diff = congress_model.wv['woman'] - congress_model.wv['boy']
congress_model.wv.similar_by_vector(diff)

In [89]:
# The 100 words most similar to "the" (scores dropped).
keyword_context = [word[0] for word in congress_model.wv.most_similar("the", topn = 100)]

# Add their vectors together. The running total deliberately is not called
# `sum`: rebinding that name at notebook scope would hide the builtin sum()
# from every cell that runs afterwards.
vector_total = congress_model.wv[keyword_context[0]]

for word in keyword_context[1:]:
    vector_total = vector_total + congress_model.wv[word]

# Words nearest to the combined vector.
congress_model.wv.similar_by_vector(vector_total)

KeyError: "Key 'the' not present"

In [None]:
congress_model.wv.similarity('women', 'men')

In [None]:
congress_model.wv.similarity('soldier', 'men')

In [None]:
congress_model.wv.similarity('women', 'person')

In [None]:
periodnames = all_data['5yrperiod'].unique()

In [None]:
# Train and save one independent model per period.
# NOTE(review): `sample_m`, `structure_data` and `dataname` are not defined in
# the visible cells -- they must exist before this cell is run.
for period1 in periodnames:
    print('working on ', period1)

    # grab the data from period1
    period_data = sample_m[sample_m['5yrperiod'] == period1] # select one period at a time

    # structure the data for Gensim
    period_sentences = structure_data(period_data['speech'], lemma = False, stopwords = True, stemmed = True)

    # make the Gensim model
    # `vector_size` is the gensim 4 name of the former `size` argument; the
    # rest of this notebook already relies on the gensim 4 API
    # (`vector_size`, `key_to_index`), where `size=` is rejected.
    period_model = gensim.models.Word2Vec( # make a gensim model for that data
        sentences = period_sentences,
        min_count = 2,
        vector_size = 100)

    # save it
    period_model.save(dataname + '-model-' + str(period1)) # save the model with the name of the period


In [None]:
keyword1 = 'black'

In [None]:
# Rebuild the keyword's context from the models saved earlier -- nothing is
# retrained here, the per-period models are only loaded from disk.
keyword_context = []   # one list of (word, score) pairs per period that has the keyword
dates_found = []       # the matching periods, in the same order

for period1 in periodnames:
    print('working on ', period1)

    # load the model that was saved for this period
    model_path = dataname + '-model-' + str(period1)
    period_model = gensim.models.Word2Vec.load(model_path)

    # nothing to collect when this period's vocabulary lacks the keyword
    if keyword1 not in period_model.wv.key_to_index:
        continue

    print('found ', keyword1)

    # the 5000 words closest to the keyword in this period
    keyword_context_period = period_model.wv.most_similar(keyword1, topn = 5000)

    # keep the context and remember which period it belongs to
    keyword_context.append(keyword_context_period)
    dates_found.append(period1)

In [None]:
# helper function to abstract only unique values while keeping the list in the same order -- the order of first appearance
def unique2(seq):
    """Return the items of *seq* without duplicates, in order of first appearance."""
    # Dictionary keys are unique and keep their insertion order, so building
    # a dict from the items removes repeats while preserving the order.
    return list(dict.fromkeys(seq))

In [None]:
# How many of the closest words to keep for each period.
numwords = 10

# One list per period found: that period's `numwords` closest words.
all_words = [
    [item[0] for item in period_context[:numwords]]
    for _, period_context in zip(dates_found, keyword_context)
]

# The same words as one flat list. Comprehensions are used so that no loop
# variable named `list` is left behind at notebook scope, where it would
# hide the builtin list() from every cell that runs afterwards.
all_words2 = [word for period_words in all_words for word in period_words]


In [6]:
%matplotlib inline
#from matplotlib.colors import ListedColormap, LinearSegmentedColormap

from adjustText import adjust_text
from numpy import linspace
from matplotlib import cm

# Number of words plotted per period. Keep this in step with the number of
# words per period that went into all_words2, otherwise a plotted word has
# no colour assigned.
top_n = 10

# Give every distinct word one colour index, computed once up front instead
# of re-running unique2() and list.index() for every plotted point.
unique_words = unique2(all_words2)
color_index = {word: position for position, word in enumerate(unique_words)}
colors = [ cm.viridis(x) for x in linspace(0, 1, len(unique_words)+10) ]

# change the figure's size here
plt.figure(figsize=(10,10), dpi = 200)

texts = []

# plt.text only places one label per call, so we have to use a for loop
for xx, period_context in zip(dates_found, keyword_context):    # x axis: the period name

    # the slice stops at the end of the list, so a period with fewer than
    # top_n words is plotted as far as it goes instead of raising IndexError
    for item in period_context[:top_n]:

        txt = item[0]        # the collocated word
        yy = item[1]         # y axis: how closely the word is related to the keyword
        colorindex = color_index[txt]   # keeps all dots for the same word the same color

        plt.scatter(                                             # plot dots
            xx, #x axis
            yy, # y axis
            linewidth=1,
            color = colors[colorindex],
            edgecolors = 'darkgray',
            s = 100, # dot size
            alpha=0.8)  # dot transparency

        # make a label for each word
        texts.append(plt.text(xx, yy, txt))

# Code to help with overlapping labels -- may take a minute to run
adjust_text(texts, force_points=0.2, force_text=.7,
                    expand_points=(1, 1), expand_text=(1, 1),
                    arrowprops=dict(arrowstyle="-", color='black', lw=0.5))

plt.xticks(rotation=90)

# Add titles
plt.title("What words were most similar to '" + keyword1 + "' in Congress?", fontsize=20, fontweight=0, color='Red')
plt.xlabel("period")
plt.ylabel("similarity to " + keyword1)


# no extension is given, so matplotlib picks its default output format
filename = 'words-similar-to-' + keyword1 + '-' + dataname
plt.savefig(filename)

NameError: name 'unique2' is not defined

### Citation
This Notebook was developed by Steph Buongiorno. Code to visualize the word embeddings was taken from Jo Guldi's course on Digital History (see: https://github.com/stephbuon/digital-history/tree/master/hist3368-week12-word-context-vectors)