# Word Embeddings

In [None]:
import os
import sys
import csv
import re
import gensim

In [None]:
def data_import(dir_path, fname):
    """Read a CSV file and return its rows as cleaned, lowercase token lists.

    The first row is treated as a header and skipped. The fields of every
    remaining row are joined with single spaces and cleaned in this order:

    1. runs of all-uppercase words (treated as names) are removed;
    2. literal backslash-n / backslash-t markers become a space;
    3. the possessive "'s" at the end of a word is removed;
    4. punctuation and underscores are removed;
    5. every token that contains a digit is removed;
    6. the text is lowercased and split on whitespace.

    Args:
        dir_path: Directory prefix. It is concatenated directly with
            ``fname``, so it must end with a path separator. A leading
            "~" is expanded to the user's home directory.
        fname: Name of the CSV file inside ``dir_path``.

    Returns:
        A list with one entry per data row; each entry is the list of
        tokens that survived cleaning (possibly empty).

    Raises:
        OSError: If the file cannot be opened.
    """
    # open() does not understand "~", so expand it explicitly.
    path = os.path.expanduser(dir_path + fname)

    with open(path, newline='') as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row (no error on an empty file)
        # Join the fields instead of taking str(row): the list repr adds
        # quotes, brackets and escape sequences that the regexes below would
        # mistake for text (e.g. "'soldiers" lost its "s" to the 's rule).
        rows = [' '.join(row) for row in reader]

    cleaned = []
    for text in rows:
        # Words that are all upper case - so names.
        text = re.sub(r'\b[A-Z]+(?:\s+[A-Z]+)*\b', '', text)
        # Literal "\n" / "\t" markers separate words, so replace them with a
        # space rather than fusing the neighbouring words together.
        text = re.sub(r'\\n|\\t', ' ', text)
        # Possessive "s"; the word boundary keeps a quoted word that merely
        # starts with "s" intact.
        text = re.sub(r"'s\b", '', text)
        # Punctuation and underscore.
        text = re.sub(r'[^\w\s]|_', '', text)
        # Any token containing a digit. This also covers bare numbers, which
        # is why the old `\d{1, 3}` step is gone: the space inside the braces
        # made it a literal that never matched anything.
        text = re.sub(r'\w*\d\w*', '', text)
        cleaned.append(text.lower().split())

    return cleaned

# Load the corpus as one list of lowercase tokens per CSV row.
# NOTE(review): `fname` is not assigned in any visible cell and `dir_path`
# is assigned in a later cell - both must exist before this line runs.
data = data_import(dir_path, fname)

# Train the embeddings.
#   min_count   - remove words stated less than 20 times
#   vector_size - size of neural net layers; default is 100, go higher for
#                 larger corpora
period_model = gensim.models.Word2Vec(
    sentences=data,
    vector_size=100,
    min_count=20,
    workers=8,
)

# Keep the trained model on disk.
period_model.save('congress_model_keyword_women_2001')

In [None]:
# Base directory for the corpus and saved models. "~" is shell shorthand
# that Python's open() does not expand, so resolve it to the real home
# directory here; the trailing slash is kept because callers build paths
# with plain string concatenation.
dir_path = os.path.expanduser('~/faha/')

In [None]:
%%time

# Train the embeddings on the tokenised corpus.
#   min_count   - remove words stated less than 20 times
#   vector_size - size of neural net layers; default is 100, go higher for
#                 larger corpora
period_model = gensim.models.Word2Vec(
    sentences=data,
    vector_size=100,
    min_count=20,
    workers=8,
)


That sure takes a while to run. Now imagine if we had an even larger data set! In an ideal situation, we would only run that code once -- not every time we want to analyze word embeddings. We are in luck: a trained model can be saved to disk and loaded back later.

We can save our model 

In [None]:
# Write the trained model to disk under this file name so it can be
# reloaded later instead of being retrained.
period_model.save('congress_women_model')

And we can load our model

In [None]:
# Load a previously saved Word2Vec model from dir_path.
# NOTE(review): 'hansard_1860_model' matches neither file name passed to
# save() earlier in this notebook - confirm this file exists in dir_path.
congress_model_1860 = gensim.models.Word2Vec.load(dir_path + 'hansard_1860_model')

In [None]:
# Show the 10 vocabulary words whose vectors are closest to 'crime'.
congress_model_1860.wv.most_similar('crime', topn = 10)

In [None]:
# NOTE(review): identical to the 'crime' query in the previous cell -
# presumably a placeholder for probing a different word; confirm.
congress_model_1860.wv.most_similar('crime', topn = 10)

In [None]:
# NOTE(review): identical to the 'crime' query in the previous cell -
# presumably a placeholder for probing a different word; confirm.
congress_model_1860.wv.most_similar('crime', topn = 10)

In [None]:
# NOTE(review): identical to the 'crime' query in the previous cell -
# presumably a placeholder for probing a different word; confirm.
congress_model_1860.wv.most_similar('crime', topn = 10)

In [None]:
# NOTE(review): identical to the 'crime' query in the previous cell -
# presumably a placeholder for probing a different word; confirm.
congress_model_1860.wv.most_similar('crime', topn = 10)

### Subtracting Vectors

In [None]:
# Which words are similar to woman and not man?
# Subtract the 'man' vector from the 'woman' vector and look up the words
# whose vectors lie closest to the difference.
# NOTE(review): indexing wv by a word that is not in the model's vocabulary
# raises KeyError - confirm both words survived training.

diff = congress_model_1860.wv['woman'] - congress_model_1860.wv['man']
congress_model_1860.wv.similar_by_vector(diff)

### Find Similarity Score

In [None]:
# Cosine similarity between the vectors for 'soldiers' and 'men'.
congress_model_1860.wv.similarity('soldiers', 'men')