# Parallelization with Word Context Vectors

#### By Jo Guldi - 11/2021

Workhorse script that loads the Congressional speeches from disk, cleans and tokenizes them in parallel, and trains a word2vec model for each 5-year period (plus a model of the whole corpus) to show change over time.

## Setup

In [20]:
import multiprocessing
from multiprocessing import Pool

In [21]:
# Date range used when naming the saved whole-corpus model.
startdate, enddate = 1870, 2010

# Number of CPU cores; used as the default pool size and as the gensim worker count.
n = multiprocessing.cpu_count()
print(n)

24


In [22]:
def parallelize_operation(df, func, n_cores = n):
    """Apply `func` to row-wise chunks of `df` in parallel and concatenate the results.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is cut into at most `n_cores` contiguous row chunks.
    func : callable
        Takes one chunk and returns a DataFrame. It is sent to worker processes,
        so it must be picklable (e.g. a module-level function).
    n_cores : int
        Number of chunks and of worker processes. Defaults to the CPU count
        stored in `n`.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in chunk order.
    """
    # Split row *positions* rather than the frame itself: np.array_split on a
    # DataFrame depends on DataFrame.swapaxes, which newer pandas deprecates.
    # The chunk sizes are the same as np.array_split(df, n_cores) would give.
    position_groups = np.array_split(np.arange(len(df)), n_cores)

    # Skip empty chunks (they occur when df has fewer rows than n_cores) so the
    # workers never receive a zero-row frame.
    chunks = [df.iloc[rows[0]:rows[-1] + 1] for rows in position_groups if len(rows)]
    if not chunks:
        # Nothing to distribute; let func handle the empty frame directly.
        return func(df)

    # Honour n_cores (the original always used the global n) and make sure the
    # pool is released even if func raises in a worker.
    with Pool(n_cores) as pool:
        pieces = pool.map(func, chunks)
    return pd.concat(pieces)

## Loading data

In [23]:
import pandas as pd
import gensim 
import csv
import glob
import numpy as np
import multiprocessing
from sklearn.feature_extraction.text import CountVectorizer
import scipy.spatial.distance
import matplotlib
import matplotlib.pyplot as plt
import itertools
from nltk.tokenize import sent_tokenize
from nltk.tokenize.treebank import TreebankWordTokenizer
#!pip install wordsegment --user
from wordsegment import load, segment, clean
import string
load()
import re

The following lines load some data from Congress. 

In [24]:
# Every speeches_NNN.txt file in the hein-bound directory.
all_speech_files = glob.glob('/scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_*.txt')

# Inclusive range of Congress numbers to keep.
CONGRESS_MIN_THRESHOLD = 1
CONGRESS_MAX_THRESHOLD = 115

# The Congress number is the digits between the last '_' and the first '.' that
# follows it; keep the files inside the range, in sorted order.
speech_files = sorted(
    path
    for path in all_speech_files
    if CONGRESS_MIN_THRESHOLD <= int(path.rsplit('_', 1)[-1].split('.')[0]) <= CONGRESS_MAX_THRESHOLD
)
        
def parse_one(fn):
    """Read one pipe-delimited file into a DataFrame, silently skipping malformed rows.

    Parameters
    ----------
    fn : str
        Path of the file to read. The path is printed as a progress message.

    Returns
    -------
    pandas.DataFrame
        The parsed rows. Rows whose field count does not match the header are dropped.
    """
    import inspect

    print(f'Reading {fn}...')

    # QUOTE_NONE: quote characters in the text are treated as ordinary characters.
    read_options = dict(sep='|', encoding="ISO-8859-1", quoting=csv.QUOTE_NONE)

    # pandas 1.3 introduced on_bad_lines and pandas 2.0 removed the older
    # error_bad_lines / warn_bad_lines flags, so use whichever this version has.
    if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
        read_options['on_bad_lines'] = 'skip'
    else:
        read_options.update(error_bad_lines=False, warn_bad_lines=False)

    return pd.read_csv(fn, **read_options)

# Read every selected speech file and drop rows that have any missing value.
speeches_df = pd.concat((parse_one(fn) for fn in speech_files))
speeches_df.dropna(how='any', inplace=True)

all_description_files = glob.glob('/scratch/group/oit_research_data/stanford_congress/hein-bound/descr_*.txt')

description_files = []

# Keep the description files whose Congress number (the digits after the last
# '_') lies inside the configured range.
for fn in all_description_files:
    number = int(fn.rsplit('_', 1)[-1].split('.')[0])
    if CONGRESS_MIN_THRESHOLD <= number <= CONGRESS_MAX_THRESHOLD:
        description_files.append(fn)

# Sort once, after the list is complete (the sort used to run on every append).
description_files.sort()

description_df = pd.concat((parse_one(fn) for fn in description_files))

# Attach the metadata to each speech; rows without a match on either side are
# dropped (pd.merge defaults to an inner join).
all_data = pd.merge(speeches_df, description_df, on = 'speech_id')
all_data.fillna(0, inplace=True)

# Discard the metadata columns that are not needed below. `columns=` replaces
# the positional axis argument, which newer pandas versions no longer accept.
all_data = all_data.drop(columns=['chamber', 'speech_id', 'number_within_file', 'speaker', 'first_name'])
all_data = all_data.drop(columns=['last_name', 'state', 'gender', 'line_start', 'line_end', 'file', 'char_count', 'word_count'])

# Parse the YYYYMMDD date once and take the year from the parsed column.
all_data['date'] = pd.to_datetime(all_data['date'], format='%Y%m%d')
all_data['year'] = all_data['date'].dt.year

# Floor each year down to the start of its 5-year period (e.g. 1873 -> 1870.0).
# The result is a float.
all_data['5yrperiod'] = np.floor(all_data['year'] / 5) * 5
all_data = all_data.drop(columns=['date', 'year'])

all_data['index'] = np.arange(len(all_data)) # create an 'index' column
all_data.head()

Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_043.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_044.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_045.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_046.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_047.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_048.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_049.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_050.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_051.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_052.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_053.txt...
Reading /s

Unnamed: 0,speech,5yrperiod,index
0,The Secretary will read the names of the newly...,1870.0,0
1,said: Mr. President. owing to some inadvertenc...,1870.0,1
2,The question is on the motion of the Senator f...,1870.0,2
3,The order of proceedings will now be formed. f...,1870.0,3
4,of the United States. the PRESIDENTELEcT. The ...,1870.0,4


In [25]:
# Nested random samples of decreasing size for quick experiments: each smaller
# sample is drawn from the next larger one.
# A fixed seed makes a fresh "Restart & Run All" draw the same rows every time.
SAMPLE_SEED = 42
sample_l = all_data.sample(50000, random_state=SAMPLE_SEED)
sample_m = sample_l.sample(5000, random_state=SAMPLE_SEED)
sample = sample_m.sample(500, random_state=SAMPLE_SEED)

In [26]:
# Show the merged frame; the saved output displays only the first and last rows.
all_data

Unnamed: 0,speech,5yrperiod,index
0,The Secretary will read the names of the newly...,1870.0,0
1,said: Mr. President. owing to some inadvertenc...,1870.0,1
2,The question is on the motion of the Senator f...,1870.0,2
3,The order of proceedings will now be formed. f...,1870.0,3
4,of the United States. the PRESIDENTELEcT. The ...,1870.0,4
...,...,...,...
17394636,Madam Speaker. on rollcall Nos. 662 and 661. I...,2010.0,17394636
17394637,Madam Speaker. as I leave Congress as the peop...,2010.0,17394637
17394638,Madam Speaker. on rolicall No. 658. I was unav...,2010.0,17394638
17394639,Madam Speaker. on rollcall No. 658 my flight w...,2010.0,17394639


In [27]:
# Show the 500-row sample taken from all_data.
sample

Unnamed: 0,speech,5yrperiod,index
4493644,After five and onehalf hours have been consume...,1915.0,4493644
10129556,If this amendment is defeated and the original...,1955.0,10129556
15073002,I yield to the gentleman from New York.,1985.0,15073002
12840264,Mr. President. will the Senator yield?,1975.0,12840264
1675097,To be accurate. $1.38. and that saving is stil...,1890.0,1675097
...,...,...,...
15965567,Who yields time?,1995.0,15965567
6520483,Mr. President. I send to the desk a resolution...,1930.0,6520483
12467993,As the gentleman knows. while we will have pas...,1970.0,12467993
1610205,That offer has nevcr come to the Committee on ...,1890.0,1610205


## Create function for cleaning & structuring the data in parallel

In this section and the next, we will create a function, then launch that function with parallelize_operation.

In [28]:
def split_strings_into_sentences(data): # WORKING 4-16
    """Explode each speech into one row per '.'-delimited fragment.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a string column 'speech' and a column 'index'. The frame
        is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'speech' and 'index', plus a 'sentence' column. Each
        input row is repeated once per fragment and keeps its original index label.

    Notes
    -----
    The split is on every full stop, so abbreviations such as "Mr." also end a
    fragment, and a trailing full stop yields an empty-string fragment.
    """
    # One list of fragments per speech, then one row per fragment. explode()
    # (pandas >= 0.25) keeps the source row's index label on every fragment,
    # and unlike apply(pd.Series).stack() it does not build a wide padded
    # table and also works on a zero-row chunk.
    sentences = data['speech'].str.split('.').explode().rename('sentence')

    # drop() returns a new frame, so the caller's columns are left intact
    # (the original deleted them from the caller's frame).
    remaining = data.drop(columns=['speech', 'index'])

    # Index-aligned join: each original row is repeated once per fragment.
    return remaining.join(sentences)


In [29]:
def split_sentences_into_words(data): #  works 11-12-21
    """Replace each string in the 'sentence' column with its whitespace-split tokens.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'sentence' column of strings. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns, where 'sentence' holds a list of
        tokens per row.
    """
    tokens = [row.split() for row in data['sentence']]

    # assign() builds a new frame instead of writing into the caller's frame,
    # which may be a slice of a larger one.
    return data.assign(sentence=tokens)

In [30]:
def cleanup(df):
    """Normalise tokenised sentences and drop the short ones.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'sentence' column whose values are lists of string
        tokens. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with its index reset (the old index becomes an 'index'
        column). Every token is lower-cased with all non-word characters
        removed, tokens left empty are dropped, and only sentences with more
        than 10 remaining tokens are kept.
    """
    non_word = re.compile(r'\W')

    def _normalise(tokens):
        # Strip punctuation and lower-case, then discard tokens that end up empty.
        stripped = (non_word.sub('', token).lower() for token in tokens)
        return [token for token in stripped if token]

    # Work on the frame that was passed in. The original read the global name
    # `sentences_df2`, which only exists inside structure_data.
    cleaned = df.reset_index()
    cleaned['sentence'] = cleaned['sentence'].apply(_normalise)

    # Keep only sentences with more than 10 tokens (10 or fewer are removed).
    long_enough = cleaned['sentence'].apply(len) > 10
    return cleaned[long_enough]

In [31]:
def structure_data(period_data):
    """Turn a frame of speeches into a frame of cleaned, tokenised sentences.

    Runs three stages in order: split speeches into sentences (in parallel),
    split sentences into words (in parallel), then clean up punctuation and
    remove short sentences (in this process).
    """
    by_sentence = parallelize_operation(period_data, split_strings_into_sentences)
    by_word = parallelize_operation(by_sentence, split_sentences_into_words)
    return cleanup(by_word)

## Making GENSIM Word Embeddings for all Congress

In [46]:
# Change the working directory; the model files below are saved with relative names.
cd '/scratch/group/history/hist_3368-jguldi/congress-embeddings'

/scratch/group/history/hist_3368-jguldi/congress-embeddings


In [None]:
# Split every speech into sentences, split sentences into words, then clean up
# punctuation and drop short sentences.
structured_data = structure_data(all_data)

# Train a single word2vec model on the whole corpus.
# NOTE(review): `iter` and `size` are the gensim 3.x parameter names; gensim 4
# renamed them to `epochs` and `vector_size` - confirm the installed version.
congress_model = gensim.models.Word2Vec(
    sentences = structured_data['sentence'],
    workers = n,
    iter = 15,
    min_count = 20,
    size = 100)

# Save under a name built from the configured date range. This is the same
# file name the per-period cell writes its cumulative model to.
congress_model.save('congress_model-' + str(startdate) + '-' + str(enddate))

## Making GENSIM Word Embeddings for every 5yr period

In [17]:
# Distinct 5-year period labels in chronological order. Sorting matters because
# the training loop treats periodnames[0] as the base of the cumulative model,
# and unique() only returns values in order of first appearance.
periodnames = sorted(all_data['5yrperiod'].unique().tolist())

In [45]:
# Show the period labels that the training loop iterates over.
# NOTE(review): the saved output lists only 1975.0-2010.0, while the all_data
# preview spans 1870.0-2010.0, and the execution counts are out of order
# (In [17] before In [45]). Re-run from a fresh kernel to confirm the list.
periodnames

[1975.0, 1980.0, 1985.0, 1990.0, 1995.0, 2000.0, 2005.0, 2010.0]

In [40]:
# Change the working directory; the model files below are saved with relative names.
cd '/scratch/group/history/hist_3368-jguldi/congress-embeddings'

/scratch/group/history/hist_3368-jguldi/congress-embeddings


In [None]:
keyword_context = [] # create an empty dummy variable

for period1 in periodnames:

    # get just the data in the period in question
    period_data = all_data[all_data['5yrperiod'] == period1]

    # split speech into sentences, split sentences into words, cleanup punctuation and short sentences
    structured_data = structure_data(period_data)
    period_sentences = structured_data['sentence']

    # make a gensim model for that period's data
    # NOTE(review): `iter` and `size` are the gensim 3.x parameter names; gensim 4
    # renamed them to `epochs` and `vector_size` - confirm the installed version.
    period_model = gensim.models.Word2Vec(
        sentences = period_sentences,
        workers = n,
        iter = 15,
        min_count = 20,
        size = 100)

    # save the model with the name of the period (period1 is a float, so the
    # file is named e.g. 'model-1975.0')
    period_model.save('model-' + str(period1))

    # load model for each 5 yr period - one period per cycle of the for loop
    #period_model = gensim.models.Word2Vec.load('model-' + str(period1)) # to load a saved model

    # fold each period into a cumulative model of all congress
    if period1 == periodnames[0]:
        # first period: its model becomes the starting point of the cumulative model
        congress_model = period_model
    else:
        # later periods: extend the vocabulary with this period's sentences and
        # keep training on them. The original passed `sentences_df3`, which is
        # a local of structure_data and is not defined in this cell.
        congress_model.build_vocab(period_sentences, update = True)
        congress_model.train(period_sentences, total_examples=period_model.corpus_count, epochs=period_model.epochs)

    # checkpoint the cumulative model after every period, under the date-range name
    congress_model.save('congress_model-' + str(startdate) + '-' + str(enddate))