In [5]:
import glob
import os
import csv
import pandas as pd
import numpy as np 

def read_speeches(data_dir='/scratch/group/oit_research_data/stanford_congress/hein-bound',
                  min_congress=1, max_congress=115):
    """Return the sorted paths of the per-Congress speech files to load.

    Files are matched with the glob pattern ``speeches_*.txt`` inside
    ``data_dir``. The Congress number is parsed from the text between the last
    underscore of the path and the following dot (``speeches_043.txt`` -> 43).

    Parameters
    ----------
    data_dir : str
        Directory holding the ``speeches_*.txt`` files.
    min_congress, max_congress : int
        Inclusive range of Congress numbers to keep.

    Returns
    -------
    list of str
        Matching file paths in ascending (lexicographic) order.

    Raises
    ------
    ValueError
        If a matched file name does not carry an integer suffix.
    """
    pattern = os.path.join(data_dir, 'speeches_*.txt')

    speech_files = []
    for fn in glob.glob(pattern):
        number = int(fn.rsplit('_', 1)[-1].split('.')[0])
        if min_congress <= number <= max_congress:
            speech_files.append(fn)

    # Sort once after filtering instead of re-sorting on every append.
    speech_files.sort()
    return speech_files

def read_descriptions(data_dir='/scratch/group/oit_research_data/stanford_congress/hein-bound',
                      min_congress=1, max_congress=115):
    """Return the sorted paths of the per-Congress description files to load.

    Files are matched with the glob pattern ``descr_*.txt`` inside
    ``data_dir``. The Congress number is parsed from the text between the last
    underscore of the path and the following dot (``descr_043.txt`` -> 43).

    Parameters
    ----------
    data_dir : str
        Directory holding the ``descr_*.txt`` files.
    min_congress, max_congress : int
        Inclusive range of Congress numbers to keep.

    Returns
    -------
    list of str
        Matching file paths in ascending (lexicographic) order.

    Raises
    ------
    ValueError
        If a matched file name does not carry an integer suffix.
    """
    pattern = os.path.join(data_dir, 'descr_*.txt')

    description_files = []
    for fn in glob.glob(pattern):
        number = int(fn.rsplit('_', 1)[-1].split('.')[0])
        if min_congress <= number <= max_congress:
            description_files.append(fn)

    # Sort once after filtering instead of re-sorting on every append.
    description_files.sort()
    return description_files
        
def reader(fn):
    """Read one pipe-delimited, ISO-8859-1 encoded file into a DataFrame.

    Quote characters are treated as ordinary text (``csv.QUOTE_NONE``) and
    rows with the wrong number of fields are skipped without a warning.

    Parameters
    ----------
    fn : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
    """
    print(f'Reading {fn}...')
    common = dict(sep='|', encoding="ISO-8859-1", quoting=csv.QUOTE_NONE)
    try:
        # pandas >= 1.3 spelling; error_bad_lines/warn_bad_lines were removed in pandas 2.0.
        return pd.read_csv(fn, on_bad_lines='skip', **common)
    except TypeError:
        # Older pandas does not know on_bad_lines; fall back to the original keywords.
        return pd.read_csv(fn, error_bad_lines=False, warn_bad_lines=False, **common)

def clean_data(all_data):
    """Reduce the merged speech/description frame to the columns used downstream.

    Drops the speaker/file metadata columns, derives ``year`` from the
    ``date`` column (parsed with format ``%Y%m%d``), drops ``date`` and adds
    an ``index`` column numbering the rows from 0. The input frame is not
    modified.

    Parameters
    ----------
    all_data : pandas.DataFrame
        Must contain ``date`` and every column listed in ``unused_columns``.

    Returns
    -------
    pandas.DataFrame
        The remaining columns followed by ``year`` and ``index``.
    """
    unused_columns = ['chamber', 'speech_id', 'number_within_file', 'speaker', 'first_name',
                      'last_name', 'state', 'gender', 'line_start', 'line_end', 'file',
                      'char_count', 'word_count']
    # Keyword form: the positional axis argument of drop() is gone in pandas 2.0.
    all_data = all_data.drop(columns=unused_columns)

    dates = pd.to_datetime(all_data['date'], format='%Y%m%d')
    all_data['year'] = dates.dt.year
    # The previous '5yrperiod' column was computed and dropped again right away, so it is omitted.
    all_data = all_data.drop(columns=['date'])

    all_data['index'] = np.arange(len(all_data)) # create an 'index' column
    return all_data
    
def import_congressional_data(*args, **kwargs):
    """Load, merge and optionally clean the speech and description files.

    Speech files and description files are each read with ``reader`` and
    concatenated. Speech rows containing any missing value are dropped, the
    two frames are inner-joined on ``speech_id`` and remaining missing values
    are replaced with 0.

    Parameters
    ----------
    *args
        Ignored; accepted for backward compatibility.
    **kwargs
        ``clean_data`` : if truthy, the merged frame is passed through
        ``clean_data()`` before being returned. It is read from ``kwargs``
        because a parameter of that name would shadow the function.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If no speech files or no description files were found.
    """
    cd = kwargs.get('clean_data', None)
    speech_files = read_speeches()
    description_files = read_descriptions()

    # pd.concat raises a bare "No objects to concatenate" on empty input; say what is missing.
    if not speech_files:
        raise ValueError('No speech files found; check the data directory.')
    if not description_files:
        raise ValueError('No description files found; check the data directory.')

    speeches_df = pd.concat(reader(fn) for fn in speech_files).dropna(how='any')
    description_df = pd.concat(reader(fn) for fn in description_files)

    all_data = pd.merge(speeches_df, description_df, on='speech_id').fillna(0)

    if cd:
        all_data = clean_data(all_data)

    return all_data

# Load and merge every speech/description file, then reduce the result with clean_data().
congressional_data = import_congressional_data(clean_data = True)

Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_043.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_044.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_045.txt...


KeyboardInterrupt: 

In [8]:
import os 
import pandas as pd 
from varname import nameof

def set_dir(data):
    """Make ``data_subsets`` the working directory, creating it if necessary.

    Does nothing when the current working directory is already named
    ``data_subsets``. Otherwise the folder is created inside the current
    directory (an existing folder is reused) and becomes the new working
    directory for the whole process.

    Parameters
    ----------
    data : object
        Unused; kept so existing callers keep working.
    """
    # The former nameof(data) always produced the parameter's own name, 'data',
    # so the folder name is a constant and is spelled out here.
    target_folder = 'data_subsets'
    current_folder = os.path.basename(os.getcwd())

    if current_folder != target_folder:
        # exist_ok: a re-run from the parent directory must not fail on the existing folder.
        os.makedirs(target_folder, exist_ok=True)
        os.chdir(target_folder)
        
def interval_subset(data, col_name, start, end, intv):
    """Write one CSV per ``intv``-wide window of ``col_name``.

    Windows are ``[start, start + intv - 1]``, ``[start + intv, ...]`` and so
    on, for every window whose lower bound is ``<= end`` (so the last window
    may extend past ``end``). Each non-empty window is written to
    ``stanford_congressional_records_<first>_<last>.csv``, where ``<first>``
    and ``<last>`` are the ``col_name`` values of the first and last row of
    the window. Files are written to the directory selected by ``set_dir``.

    Parameters
    ----------
    data : pandas.DataFrame
    col_name : str
        Numeric column to window on.
    start, end : number
        First lower bound and largest allowed lower bound.
    intv : number
        Window width; must be positive.

    Raises
    ------
    ValueError
        If ``intv`` is not positive.
    """
    if intv <= 0:
        raise ValueError('intv must be positive')

    set_dir(data)

    lower = start
    while lower <= end:
        upper = lower + intv - 1
        subset = data[(data[col_name] >= lower) & (data[col_name] <= upper)]
        lower = lower + intv

        # A window with no rows has no first/last value to name the file after; skip it.
        if subset.empty:
            continue

        descr = str(subset[col_name].iloc[0])
        descr_2 = str(subset[col_name].iloc[-1])

        file_name = "stanford_congressional_records_" + descr + "_" + descr_2

        subset.to_csv(file_name + ".csv", index = False)
        
# Write one CSV per 5-year window of 'year', for windows starting at 1870, 1875, ..., 2010.
interval_subset(congressional_data, 'year', 1870, 2010, 5)

In [68]:
### starting here

In [1]:
#import nltk
from multiprocessing import Process, Queue, cpu_count, Pool

n = 1  # default number of worker processes (also used as Word2Vec `workers`)


def parallelize_operation(df, function, n_cores = n):
    """Apply ``function`` to row-wise chunks of ``df`` in a process pool.

    ``df`` is split into ``n_cores`` consecutive chunks of near-equal size,
    ``function`` is mapped over the chunks by ``n_cores`` worker processes,
    and the results are concatenated in chunk order.

    Parameters
    ----------
    df : pandas.DataFrame
    function : callable
        Takes a DataFrame chunk and returns a pandas object; it must be
        picklable so it can be sent to the workers.
    n_cores : int
        Number of chunks and of worker processes.

    Returns
    -------
    pandas.DataFrame
        ``pd.concat`` of the per-chunk results.
    """
    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through a pandas code path that is deprecated.
    chunk_positions = np.array_split(np.arange(len(df)), n_cores)
    split_df = [df.iloc[positions] for positions in chunk_positions]

    # Size the pool from n_cores (it previously always used the global n).
    pool = Pool(n_cores)
    try:
        results = pool.map(function, split_df)
    finally:
        # Release the workers even when function raises.
        pool.close()
        pool.join()
    return pd.concat(results)

In [2]:
def split_sentences(df):
    """Explode the ``speech`` column into one row per sentence.

    A sentence boundary is a whitespace character that follows ``.`` or
    ``?``, unless the preceding text looks like a dotted abbreviation
    (``\\w.\\w.``) or a capitalised two-letter title such as ``Mr.``.

    The input frame is left untouched. The result has a fresh 0..n-1 index
    and the ``speech`` column is renamed to ``sentence``; all other columns
    are repeated for every sentence of the speech they came from.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column named ``speech``.

    Returns
    -------
    pandas.DataFrame
    """
    # Local import: this cell is defined before the cell that imports `re`.
    import re

    split_rule = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"
    splitter = re.compile(split_rule)

    # assign() builds a new frame, so the caller's 'speech' column is not overwritten.
    sentences = df.assign(speech=df['speech'].apply(splitter.split))

    # The result of reset_index() used to be discarded, leaving duplicate index labels.
    sentences = sentences.explode('speech').reset_index(drop=True)

    return sentences.rename(columns={'speech': 'sentence'})

In [3]:
# The gensim Word2Vec model does not expect strings as its text examples (sentences), 
# but lists-of-tokens. Thus, it's up to your code to tokenize your text, 
# before passing it to Word2Vec. 

In [6]:
import pandas as pd
import numpy as np
import os
import re
import gensim

def export_congressional_gensim_models(dir_path):
    """Train a Word2Vec model incrementally over every CSV in ``dir_path``.

    Files are processed in sorted file-name order. For each file the
    ``speech`` column is split into sentences and whitespace-tokenised, and a
    Word2Vec model is trained on that period. The first period's model
    becomes the cumulative model; for every later period the cumulative
    model's vocabulary is extended and it is trained further on the new
    sentences. After each file the cumulative model is saved in the current
    working directory as ``<file name up to the first dot>_model``.

    Parameters
    ----------
    dir_path : str
        Directory containing the per-period CSV files; each needs a
        ``speech`` column.

    Notes
    -----
    ``size=`` is the gensim < 4 keyword (renamed ``vector_size`` in gensim 4).
    """
    # os.listdir order is arbitrary; sort so periods are trained in a stable,
    # file-name order, and skip sub-directories.
    file_names = sorted(
        fname for fname in os.listdir(dir_path)
        if os.path.isfile(os.path.join(dir_path, fname))
    )

    congress_model = None

    for fname in file_names:
        # os.path.join works whether or not dir_path ends with a separator.
        imported_data = pd.read_csv(os.path.join(dir_path, fname))

        sentences_df = parallelize_operation(imported_data, split_sentences)

        # Word2Vec expects each sentence as a list of tokens, not a string.
        sentences_df['sentence'] = sentences_df['sentence'].str.split()

        period_model = gensim.models.Word2Vec(sentences = sentences_df['sentence'],
                                             workers = n,
                                             min_count = 20, # remove words stated less than 20 times
                                             size = 100) # size of neuralnet layers; default is 100 - go higher for larger corpora

        # Text before the first dot; unlike str.index this does not raise when there is no dot.
        base_name = fname.partition('.')[0]

        # TODO: also save period_model under a period-specific name.

        if congress_model is None:
            congress_model = period_model
        else:
            congress_model.build_vocab(sentences_df['sentence'], update = True)
            congress_model.train(sentences_df['sentence'], total_examples = period_model.corpus_count, epochs = period_model.epochs)

        congress_model.save(base_name + '_model')
                      

# Train and save the cumulative model over every subset file in data_subsets/.
export_congressional_gensim_models('data_subsets/')  

                                                sentence  year  index
0      [The, Secretary, will, read, the, names, of, t...  1873      0
0            [The, list, was, read, as, follows:, lion.]  1873      0
0                                [Bainbridge, Wadleigh.]  1873      0
0                                  [of, New, Hampshire.]  1873      0
0                                                 [Hon.]  1873      0
...                                                  ...   ...    ...
83131  [for, the, expenses, of, said, coninittee, whi...  1874  83131
83131                                            [1874.]  1874  83131
83131  [and, the, receipts, of, the, SergeantatArms, ...  1874  83131
83132  [I, would, like, to, ask, the, gentleman, from...  1874  83132
83132  [[Mr., ]IUFFINTON.], the, chairman, of, the, C...  1874  83132

[906218 rows x 3 columns]


KeyboardInterrupt: 

In [7]:
# Reload the model saved after the 1873-1874 subset file was processed.
congress_model = gensim.models.Word2Vec.load('stanford_congressional_records_1873_1874_model')

In [8]:
# Load Google news vectors
#word2vec_path = "path_to_the_vectors/GoogleNews-vectors-negative300.bin"
#word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Same model file as loaded in the previous cell.
congress_model = gensim.models.Word2Vec.load('stanford_congressional_records_1873_1874_model')
# Vocabulary of the loaded congressional model (not of the pre-trained Google vectors).
# NOTE(review): `wv.vocab` is the gensim < 4 attribute - confirm the installed version.
congress_w2v_vocabulary = congress_model.wv.vocab

In [9]:
# First 25 vocabulary entries; the output suggests most-frequent-first ordering - TODO confirm.
congress_model.wv.index2word[:25]

['the',
 'of',
 'to',
 'and',
 'that',
 'I',
 'in',
 'a',
 'is',
 'it',
 'be',
 'for',
 'this',
 'not',
 'as',
 'by',
 'have',
 'was',
 'from',
 'on',
 'The',
 'which',
 'will',
 'or',
 'at']

In [30]:
# Number of words in the model's vocabulary.
len(congress_model.wv.index2word)

12786

In [31]:
# Raw float32 embedding matrix; row i appears to be the vector of index2word[i] (checked for i = 3 in the cells below).
congress_model.wv.vectors

array([[ 0.60460347,  0.30577746,  0.04732674, ..., -0.1637509 ,
         0.8122656 ,  0.91422427],
       [ 1.8640128 , -0.4627957 , -0.00588125, ..., -0.17580304,
         1.1726897 ,  3.2119944 ],
       [ 1.7400634 ,  0.27179623, -0.41134694, ...,  0.5012224 ,
         1.3753427 ,  2.5045655 ],
       ...,
       [-0.36400697,  0.21801034,  0.12883958, ..., -0.03926761,
         0.19388047, -0.12139821],
       [ 0.00443427,  0.26798034,  0.11281913, ...,  0.02487435,
         0.21857229, -0.00399325],
       [-0.14769058,  0.15812978, -0.0885343 , ..., -0.09518862,
         0.12002692, -0.07296944]], dtype=float32)

In [32]:
# Word stored at vocabulary position 3.
congress_model.wv.index2word[3]

'to'

In [34]:
# Embedding looked up by word.
congress_model.wv['to']

array([ 1.5259914 , -0.06501204,  1.5409822 ,  1.3083979 , -0.09762071,
       -0.3784739 , -0.65320337, -0.45095688, -0.20189238,  0.47988546,
       -0.79922223, -0.70096254, -0.55282915,  0.67890483,  0.28057677,
        3.623499  ,  0.24380912, -0.6631685 , -1.2019287 ,  1.8694769 ,
        0.26237366,  2.2759106 ,  2.0318587 , -0.02667681, -0.83172095,
        0.81786513,  3.2583039 ,  0.12102979, -0.23105001, -0.20997049,
        0.11903524,  1.3914397 , -1.15164   , -0.63356674,  1.4139063 ,
       -0.34143248,  0.6979728 , -0.21829842, -0.43481293, -1.7108552 ,
        0.39861017,  0.5982425 ,  0.858112  ,  0.37887698,  2.215464  ,
       -0.06052481, -1.7100105 , -0.980291  , -1.0175716 , -1.2064841 ,
       -0.8480001 ,  1.4861001 , -0.06477726, -0.13914345, -1.2290816 ,
        0.92969143,  0.09116568, -0.5941392 , -0.8047061 ,  0.8558362 ,
       -0.3209721 ,  3.0910513 ,  0.46904764,  0.93448955, -1.3295171 ,
        1.48655   , -1.2432182 ,  1.239149  ,  1.1468074 , -0.11

In [35]:
# Embedding looked up by row position; the recorded output matches wv['to'].
congress_model.wv.vectors[3]

array([ 1.5259914 , -0.06501204,  1.5409822 ,  1.3083979 , -0.09762071,
       -0.3784739 , -0.65320337, -0.45095688, -0.20189238,  0.47988546,
       -0.79922223, -0.70096254, -0.55282915,  0.67890483,  0.28057677,
        3.623499  ,  0.24380912, -0.6631685 , -1.2019287 ,  1.8694769 ,
        0.26237366,  2.2759106 ,  2.0318587 , -0.02667681, -0.83172095,
        0.81786513,  3.2583039 ,  0.12102979, -0.23105001, -0.20997049,
        0.11903524,  1.3914397 , -1.15164   , -0.63356674,  1.4139063 ,
       -0.34143248,  0.6979728 , -0.21829842, -0.43481293, -1.7108552 ,
        0.39861017,  0.5982425 ,  0.858112  ,  0.37887698,  2.215464  ,
       -0.06052481, -1.7100105 , -0.980291  , -1.0175716 , -1.2064841 ,
       -0.8480001 ,  1.4861001 , -0.06477726, -0.13914345, -1.2290816 ,
        0.92969143,  0.09116568, -0.5941392 , -0.8047061 ,  0.8558362 ,
       -0.3209721 ,  3.0910513 ,  0.46904764,  0.93448955, -1.3295171 ,
        1.48655   , -1.2432182 ,  1.239149  ,  1.1468074 , -0.11

In [36]:
# Nearest neighbours of the 'man' vector; querying by raw vector returns 'man' itself first (similarity 1.0).
man_vector = congress_model.wv['man']
congress_model.wv.similar_by_vector(man_vector)

[('man', 1.0),
 ('mai', 0.7953096032142639),
 ('lawyer', 0.794982373714447),
 ('mian', 0.7246367931365967),
 ('merchant', 0.7235336303710938),
 ('person', 0.7084944248199463),
 ('citizen', 0.7060525417327881),
 ('woman', 0.6985549926757812),
 ('witness', 0.6958960890769958),
 ('mani', 0.6927763819694519)]

In [37]:
# Nearest neighbours of the 'woman' vector; the query word itself is returned first.
woman_vector = congress_model.wv['woman']
congress_model.wv.similar_by_vector(woman_vector)

[('woman', 1.0),
 ('mau', 0.7448790073394775),
 ('nan', 0.7285056710243225),
 ('citizen', 0.721676230430603),
 ('foreigner', 0.7166454792022705),
 ('lawyer', 0.711015522480011),
 ('man', 0.698555052280426),
 ('mal', 0.6902548670768738),
 ('statesman', 0.6847317814826965),
 ('child', 0.682125985622406)]

In [38]:
# Nearest neighbours of the 'individual' vector; the query word itself is returned first.
individual_vector = congress_model.wv['individual']
congress_model.wv.similar_by_vector(individual_vector)

[('individual', 1.0),
 ('invasion', 0.6480956077575684),
 ('honest', 0.6223812103271484),
 ('citizen', 0.5964864492416382),
 ('enemy', 0.5819460153579712),
 ('eminent', 0.5773656368255615),
 ('absolute', 0.5759495496749878),
 ('creditor', 0.5704253911972046),
 ('impartial', 0.5657567977905273),
 ('institution', 0.562224268913269)]

In [39]:
# Nearest neighbours of the 'soldier' vector; the query word itself is returned first.
soldier_vector = congress_model.wv['soldier']
congress_model.wv.similar_by_vector(soldier_vector)

[('soldier', 1.0),
 ('banker', 0.735933244228363),
 ('merchant', 0.7165051698684692),
 ('pensioner', 0.7065895795822144),
 ('child', 0.7055156826972961),
 ('captain', 0.7017030119895935),
 ('citizen', 0.6945666670799255),
 ('master', 0.684952437877655),
 ('widow', 0.6798447370529175),
 ('man', 0.678787350654602)]

In [40]:
# Similarity score between the 'women' and 'men' vectors.
congress_model.wv.similarity('women', 'men')

0.660766

In [41]:
# 50 words closest to 'women' (the query word itself is not included in the result).
congress_model.wv.most_similar("women", topn = 50)

[('children', 0.7390205264091492),
 ('mechanics', 0.696312665939331),
 ('merchants', 0.6934593915939331),
 ('soldiers', 0.662136971950531),
 ('men', 0.6607660055160522),
 ('rich', 0.6575027704238892),
 ('poor', 0.6511082649230957),
 ('white', 0.6481199264526367),
 ('wealthy', 0.6472270488739014),
 ('farmers', 0.6381540298461914),
 ('capitalists', 0.6340653300285339),
 ('colored', 0.6272273659706116),
 ('widows', 0.6266400218009949),
 ('whites', 0.6213538646697998),
 ('race', 0.6209286451339722),
 ('laboring', 0.6203826069831848),
 ('skilled', 0.6200368404388428),
 ('brave', 0.6170817017555237),
 ('noble', 0.6160266399383545),
 ('sailors', 0.6157126426696777),
 ('families', 0.6148433089256287),
 ('politicians', 0.611767053604126),
 ('struggling', 0.6080377101898193),
 ('pursuits', 0.5966960191726685),
 ('fields', 0.5959955453872681),
 ('laborers', 0.5951179265975952),
 ('foreigners', 0.5948503017425537),
 ('miners', 0.590263843536377),
 ('orphans', 0.5888699293136597),
 ('destitute', 0.

In [42]:
# 50 words closest to 'men'; the recorded output includes OCR-like variants such as 'mcn' and 'nen'.
congress_model.wv.most_similar("men", topn = 50)

[('persons', 0.716057538986206),
 ('mcn', 0.6782583594322205),
 ('merchants', 0.6716645956039429),
 ('lawyers', 0.6679767370223999),
 ('women', 0.6607660055160522),
 ('soldiers', 0.6498055458068848),
 ('nen', 0.6392128467559814),
 ('gentlemen', 0.6223151683807373),
 ('people', 0.6207624673843384),
 ('statesmen', 0.6085804104804993),
 ('mechanics', 0.6075679063796997),
 ('Men', 0.6060754060745239),
 ('farmers', 0.6058502793312073),
 ('parties', 0.5998297333717346),
 ('capitalists', 0.5988258123397827),
 ('mei', 0.5976186990737915),
 ('claimants', 0.5947614312171936),
 ('mon', 0.5866626501083374),
 ('boys', 0.584881067276001),
 ('ladies', 0.5812183618545532),
 ('children', 0.577019453048706),
 ('laborers', 0.563194751739502),
 ('citizens', 0.562316358089447),
 ('voters', 0.5585533380508423),
 ('man', 0.5551823377609253),
 ('mien', 0.5535681247711182),
 ('republicans', 0.5527560710906982),
 ('ones', 0.549206018447876),
 ('officers', 0.5481641888618469),
 ('others', 0.545403242111206),
 ('

In [45]:
# Offset vector 'man' - 'woman'; in the recorded output 'man' is still the closest word.
diff = congress_model.wv['man'] - congress_model.wv['woman']
congress_model.wv.similar_by_vector(diff, topn=40)

[('man', 0.9851939678192139),
 ('mai', 0.7532967925071716),
 ('lawyer', 0.7459124326705933),
 ('person', 0.6866796612739563),
 ('mian', 0.6854915618896484),
 ('merchant', 0.6839532852172852),
 ('witness', 0.667205810546875),
 ('citizen', 0.6408608555793762),
 ('mani', 0.6379607915878296),
 ('farmer', 0.628496527671814),
 ('soldier', 0.623019278049469),
 ('mal', 0.6229250431060791),
 ('mau', 0.5995610356330872),
 ('banker', 0.5969903469085693),
 ('statesman', 0.5964738130569458),
 ('claimant', 0.5928665399551392),
 ('mans', 0.5873139500617981),
 ('nian', 0.5828565955162048),
 ('member', 0.5819664001464844),
 ('child', 0.5807590484619141),
 ('officer', 0.5775875449180603),
 ('woman', 0.5655345916748047),
 ('creditor', 0.5430001020431519),
 ('party', 0.5425457954406738),
 ('men', 0.5381265878677368),
 ('pensioner', 0.5282629728317261),
 ('importer', 0.5280255079269409),
 ('inventor', 0.5202513933181763),
 ('crime', 0.5167615413665771),
 ('nan', 0.5114675760269165),
 ('everybody', 0.495893

In [46]:
# Reverse offset 'woman' - 'man' (rebinds `diff`). The recorded neighbours are mostly run-together or
# misspelled tokens ('bythe', 'thc', ...), presumably OCR noise - TODO confirm before interpreting.
diff = congress_model.wv['woman'] - congress_model.wv['man']
congress_model.wv.similar_by_vector(diff, topn = 40)

[('bythe', 0.44771862030029297),
 ('thc', 0.42590487003326416),
 ('seventeenth', 0.42080157995224),
 ('regulating', 0.42070892453193665),
 ('enlarging', 0.4206809997558594),
 ('limiting', 0.4198291599750519),
 ('thie', 0.4162326455116272),
 ('rite', 0.4155820608139038),
 ('tlte', 0.4118894934654236),
 ('rhe', 0.41102635860443115),
 ('bytho', 0.4047302305698395),
 ('thatthe', 0.3940526843070984),
 ('fromthe', 0.3919954299926758),
 ('repealing', 0.3863179385662079),
 ('samo', 0.3858068883419037),
 ('onthe', 0.3821149170398712),
 ('ibis', 0.3818950951099396),
 ('che', 0.3814570903778076),
 ('thme', 0.3813951313495636),
 ('tlhis', 0.38039737939834595),
 ('3', 0.3798547685146332),
 ('Tite', 0.379599928855896),
 ('Revised', 0.3795681595802307),
 ('tb', 0.37943193316459656),
 ('adhering', 0.37870073318481445),
 ('otter', 0.3784489035606384),
 ('instructing', 0.37661123275756836),
 ('thrat', 0.37167686223983765),
 ('flom', 0.3710923194885254),
 ('tihe', 0.36981987953186035),
 ('tbo', 0.3668579

In [None]:
# come back to rest: https://github.com/stephbuon/democracy-lab/blob/main/word-embeddings/workhorse-parallel-context-vectors.ipynb