In [1]:
import glob
import os
import csv
import pandas as pd
import numpy as np 

def read_speeches():
    """Return the sorted paths of the Hein-bound speech files to load.

    Files are discovered with a glob on the shared scratch directory. The
    Congress number is parsed from the ``speeches_<number>.txt`` file name and
    only files whose number lies within the inclusive thresholds below are
    kept. Names whose suffix is not an integer are skipped.

    Returns:
        list[str]: matching paths in ascending lexicographic order (empty
        when nothing matches).
    """
    all_speech_files = glob.glob('/scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_*.txt')
    CONGRESS_MIN_THRESHOLD = 1
    CONGRESS_MAX_THRESHOLD = 115

    speech_files = []

    for fn in all_speech_files:
        # e.g. '.../speeches_043.txt' -> '043' -> 43
        try:
            number = int(fn.rsplit('_', 1)[-1].split('.')[0])
        except ValueError:
            # The glob also matches names such as 'speeches_notes.txt'; ignore them.
            continue

        if CONGRESS_MIN_THRESHOLD <= number <= CONGRESS_MAX_THRESHOLD:
            speech_files.append(fn)

    # Sort once after collecting instead of re-sorting after every append.
    speech_files.sort()
    return speech_files

def read_descriptions():
    """Return the sorted paths of the Hein-bound description files to load.

    Files are discovered with a glob on the shared scratch directory. The
    Congress number is parsed from the ``descr_<number>.txt`` file name and
    only files whose number lies within the inclusive thresholds below are
    kept. Names whose suffix is not an integer are skipped.

    Returns:
        list[str]: matching paths in ascending lexicographic order (empty
        when nothing matches).
    """
    all_description_files = glob.glob('/scratch/group/oit_research_data/stanford_congress/hein-bound/descr_*.txt')
    CONGRESS_MIN_THRESHOLD = 1
    CONGRESS_MAX_THRESHOLD = 115

    description_files = []

    for fn in all_description_files:
        # e.g. '.../descr_043.txt' -> '043' -> 43
        try:
            number = int(fn.rsplit('_', 1)[-1].split('.')[0])
        except ValueError:
            # The glob also matches names such as 'descr_notes.txt'; ignore them.
            continue

        if CONGRESS_MIN_THRESHOLD <= number <= CONGRESS_MAX_THRESHOLD:
            description_files.append(fn)

    # Sort once after collecting instead of re-sorting after every append.
    description_files.sort()
    return description_files
        
def reader(fn):
    """Read one pipe-delimited, ISO-8859-1 encoded text file into a DataFrame.

    Quote characters are treated as ordinary text (``csv.QUOTE_NONE``) and
    lines with too many fields are skipped without a warning.

    Args:
        fn: path of the file to read.

    Returns:
        pandas.DataFrame: the parsed file.
    """
    print(f'Reading {fn}...')
    read_options = dict(sep='|', encoding="ISO-8859-1", quoting=csv.QUOTE_NONE)
    try:
        # pandas >= 1.3 spelling; error_bad_lines/warn_bad_lines were removed in pandas 2.0.
        return pd.read_csv(fn, on_bad_lines='skip', **read_options)
    except TypeError:
        # Older pandas does not know `on_bad_lines`; fall back to the legacy flags.
        return pd.read_csv(fn, error_bad_lines=False, warn_bad_lines=False, **read_options)

def clean_data(all_data):
    """Reduce the merged speech/description frame to the columns used downstream.

    Drops the speaker/file metadata columns, replaces ``date`` (``YYYYMMDD``)
    with an integer ``year`` column and appends a running ``index`` column.
    The input frame is not modified.

    Args:
        all_data: DataFrame containing ``date`` and every metadata column
            listed below.

    Returns:
        pandas.DataFrame: the remaining columns followed by ``year`` and ``index``.

    Raises:
        KeyError: if one of the columns to drop is missing.
    """
    unused_columns = ['chamber', 'speech_id', 'number_within_file', 'speaker', 'first_name',
                      'last_name', 'state', 'gender', 'line_start', 'line_end', 'file',
                      'char_count', 'word_count']
    # `columns=` instead of the positional axis argument, which pandas 2.0 no longer accepts.
    all_data = all_data.drop(columns=unused_columns)

    # Parse the date once and keep only the year.
    all_data['year'] = pd.to_datetime(all_data['date'], format='%Y%m%d').dt.year

    # The former '5yrperiod' column was computed and then dropped immediately, so it
    # never reached the result; the dead computation is omitted here.
    all_data = all_data.drop(columns=['date'])

    all_data['index'] = np.arange(len(all_data)) # create an 'index' column
    return all_data
    
def import_congressional_data(*args, **kwargs):
    """Load every speech and description file and join them on ``speech_id``.

    Speech rows containing any missing value are discarded before the join;
    missing values remaining after the join are replaced with 0.

    Args:
        *args: accepted but unused.
        **kwargs: ``clean_data=True`` additionally passes the merged frame
            through ``clean_data``.

    Returns:
        pandas.DataFrame: the merged (and optionally cleaned) data.
    """
    should_clean = kwargs.get('clean_data', None)

    speech_paths = read_speeches()
    description_paths = read_descriptions()

    # Speeches are read (and filtered) before any description file is opened.
    speech_frames = (reader(path) for path in speech_paths)
    speeches_df = pd.concat(speech_frames).dropna(how='any')

    description_frames = (reader(path) for path in description_paths)
    description_df = pd.concat(description_frames)

    merged = pd.merge(speeches_df, description_df, on = 'speech_id').fillna(0)

    if should_clean == True:
        return clean_data(merged)

    return merged

# Build the merged and cleaned congressional DataFrame that later cells subset and search.
congressional_data = import_congressional_data(clean_data = True)

Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_043.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_044.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_045.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_046.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_047.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_048.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_049.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_050.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_051.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_052.txt...
Reading /scratch/group/oit_research_data/stanford_congress/hein-bound/speeches_053.txt...
Reading /s

In [8]:
import os 
import pandas as pd 
from varname import nameof

def set_dir(data): 
    """Make ``<name>_subsets`` the working directory, creating it if needed.

    Nothing happens when the current working directory already has the target
    name. Otherwise the folder is created below the current directory (an
    already existing folder is reused) and the process changes into it.

    Args:
        data: object whose variable name, as reported by ``varname.nameof``,
            prefixes the folder name.
            NOTE(review): ``nameof(data)`` presumably resolves to the local
            parameter name ``'data'`` (a later cell reads from
            ``'data_subsets/'``) -- confirm.
    """
    path = os.getcwd()
    current_folder = os.path.basename(path)
    target_folder = nameof(data) + '_subsets'

    if current_folder != target_folder:
        # exist_ok: re-running the cell from the parent directory must not fail
        # with FileExistsError once the folder has been created.
        os.makedirs(target_folder, exist_ok=True)
        os.chdir(target_folder)
        
def interval_subset(data, col_name, start, end, intv):
    """Write one CSV per ``intv``-wide window of ``data[col_name]``.

    Windows begin at ``start``, ``start + intv``, ... for as long as the
    window start is ``<= end``; each window covers
    ``[window_start, window_start + intv - 1]``. Files are written to the
    directory selected by ``set_dir`` and are named
    ``stanford_congressional_records_<first>_<last>.csv`` after the first and
    last ``col_name`` value found in the window. Windows without any rows are
    skipped.

    Args:
        data: DataFrame to split.
        col_name: name of the numeric column to window on.
        start: first window start.
        end: last admissible window start (inclusive).
        intv: window width; must be positive.

    Raises:
        ValueError: if ``intv`` is not positive (the loop would never end).
    """
    if intv <= 0:
        raise ValueError("intv must be a positive number")

    set_dir(data)

    while start <= end:
        lower = start
        upper = start + intv - 1
        start = start + intv

        subset = data[(data[col_name] >= lower) & (data[col_name] <= upper)]

        # An empty window has no first/last value to name the file after
        # (iloc[0] would raise IndexError), so there is nothing to write.
        if subset.empty:
            continue

        descr = str(subset[col_name].iloc[0])
        descr_2 = str(subset[col_name].iloc[-1])

        file_name = "stanford_congressional_records_" + descr + "_" + descr_2

        subset.to_csv(file_name + ".csv", index = False)
        
# Write one CSV per 5-year window of the 'year' column, for window starts 1870 through 2010.
interval_subset(congressional_data, 'year', 1870, 2010, 5)

In [1]:
#import nltk
from multiprocessing import Process, Queue, cpu_count, Pool

n = 1

def parallelize_operation(df, function, n_cores = n):
    """Apply ``function`` to row-wise chunks of ``df`` in a process pool.

    Args:
        df: DataFrame to process.
        function: picklable callable that takes a DataFrame chunk and returns
            a DataFrame.
        n_cores: number of chunks and of worker processes (defaults to the
            module-level ``n``).

    Returns:
        pandas.DataFrame: the per-chunk results concatenated in chunk order.
    """
    # Split row positions rather than the frame itself: np.array_split on a
    # DataFrame relies on a deprecated pandas code path, while the chunk
    # boundaries are the same either way.
    row_groups = np.array_split(np.arange(len(df)), n_cores)
    split_df = [df.iloc[rows] for rows in row_groups]

    # Size the pool from the argument (it previously always used the global `n`).
    pool = Pool(n_cores)
    try:
        df = pd.concat(pool.map(function, split_df))
    finally:
        # Release the workers even when `function` raises in a worker.
        pool.close()
        pool.join()
    return df

In [2]:
def split_sentences(df):
    """Explode the ``speech`` column into one row per sentence.

    A sentence boundary is whitespace that follows ``.`` or ``?`` unless the
    preceding text matches one of the two negative look-behinds in
    ``split_rule`` (e.g. ``Mr.`` or ``e.g.``). All other columns are repeated
    for every sentence of a speech.

    Args:
        df: DataFrame with a string column ``speech``; it is left unmodified.

    Returns:
        pandas.DataFrame: a new frame in which ``speech`` is replaced by
        ``sentence``, with a fresh 0..n-1 index.
    """
    split_rule = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s"

    sentences = df['speech'].apply(lambda x: re.split(split_rule, x))

    return (
        df.assign(speech=sentences)      # new frame: the caller's column is not overwritten
          .explode('speech')
          # The original called reset_index() and discarded the result; drop=True
          # avoids adding the old (duplicated) index as an extra column.
          .reset_index(drop=True)
          .rename(columns = {'speech': 'sentence'})
    )

In [3]:
# The gensim Word2Vec model does not expect strings as its text examples (sentences), 
# but lists-of-tokens. Thus, it's up to your code to tokenize your text, 
# before passing it to Word2Vec. 

In [3]:
import pandas as pd
import numpy as np
import os
import re
import gensim

def export_congressional_gensim_models(dir_path):
    """Train Word2Vec on every period CSV in ``dir_path`` and save the models.

    The first file yields the initial model. For every later file the model's
    vocabulary is extended and training continues on that file's sentences.
    After each file the cumulative model is saved in the current working
    directory as ``<csv name without extension>_model``.

    Args:
        dir_path: directory containing the period CSV files (with or without
            a trailing separator). Each CSV needs a ``speech`` column.
    """
    # Only CSV files, in sorted order: os.listdir returns every entry in
    # arbitrary order, and the incremental training below depends on the order.
    # Lexicographic order is chronological for the 4-digit-year file names
    # written by interval_subset.
    file_names = sorted(fname for fname in os.listdir(dir_path) if fname.endswith('.csv'))

    congress_model = None

    for fname in file_names:
        imported_data = pd.read_csv(os.path.join(dir_path, fname))

        sentences_df = parallelize_operation(imported_data, split_sentences)

        # Word2Vec expects lists of tokens, not raw strings.
        sentences_df['sentence'] = sentences_df['sentence'].str.split()

        period_model = gensim.models.Word2Vec(sentences = sentences_df['sentence'],
                                             workers = n, 
                                             min_count = 20, # remove words stated fewer than 20 times
                                             size = 100) # size of neuralnet layers; default is 100 - go higher for larger corpora 

        # Strip only the extension; cutting at the first '.' would also drop
        # anything following an earlier dot in the name.
        model_name = os.path.splitext(fname)[0]

        #period_model.save(add something about period + '_model')

        if congress_model is None:
            congress_model = period_model
        else:
            congress_model.build_vocab(sentences_df['sentence'], update = True)
            congress_model.train(sentences_df['sentence'], total_examples = period_model.corpus_count, epochs = period_model.epochs)

        congress_model.save(model_name + '_model')
                      

# NOTE: the cells defining parallelize_operation and split_sentences must be run
# before this call; the NameError recorded below comes from running it first.
export_congressional_gensim_models('data_subsets/')  

NameError: name 'parallelize_operation' is not defined

In [4]:
congress_model = gensim.models.Word2Vec.load('stanford_congressional_records_1873_1874_model')

In [5]:
# Load Google news vectors
#word2vec_path = "path_to_the_vectors/GoogleNews-vectors-negative300.bin"
#word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

# Load the saved congressional model instead of the Google News vectors commented out above.
congress_model = gensim.models.Word2Vec.load('stanford_congressional_records_1873_1874_model')
# Vocabulary of the loaded model, i.e. the unique words it holds vectors for.
congress_w2v_vocabulary = congress_model.wv.vocab

In [6]:
congress_model.wv.index2word[:25]

['the',
 'of',
 'to',
 'and',
 'that',
 'I',
 'in',
 'a',
 'is',
 'it',
 'be',
 'for',
 'this',
 'not',
 'as',
 'by',
 'have',
 'was',
 'from',
 'on',
 'The',
 'which',
 'will',
 'or',
 'at']

In [7]:
len(congress_model.wv.index2word)

15327

In [8]:
congress_model.wv.vectors

array([[-1.2137219 ,  1.3901031 ,  1.6810625 , ...,  0.49358177,
        -0.534212  ,  0.26035666],
       [-0.33640045,  0.0874925 ,  0.90505594, ...,  1.7964767 ,
         0.469082  , -0.7950482 ],
       [-0.11258058,  0.15636025,  1.1120806 , ...,  0.12656668,
        -0.36181334, -1.1143477 ],
       ...,
       [ 0.07897078,  0.00347791,  0.02401185, ..., -0.08176875,
        -0.10715071,  0.03626795],
       [-0.0440694 , -0.06141869,  0.22473088, ..., -0.05709316,
        -0.03517441, -0.05756548],
       [-0.03267128, -0.10327776,  0.15210785, ..., -0.01559925,
         0.08148099,  0.15885909]], dtype=float32)

In [9]:
congress_model.wv.index2word[3]

'and'

In [10]:
congress_model.wv['to']

array([-1.12580575e-01,  1.56360254e-01,  1.11208057e+00, -1.63694394e+00,
       -1.96732950e+00, -9.60842311e-01,  2.29880214e+00, -2.83676028e+00,
       -1.08457066e-01,  2.01339412e+00,  5.86272538e-01,  1.07936954e+00,
        6.83029294e-02, -3.49307925e-01,  6.25689387e-01, -1.29493311e-01,
       -5.87438643e-01, -3.05765662e-02,  1.52911568e+00, -7.10536778e-01,
       -1.39310700e-03, -1.02884495e+00, -2.01690340e+00,  1.61613047e-01,
       -3.70174587e-01, -8.63505006e-01, -4.49686497e-01,  1.22544158e+00,
        3.75140369e-01,  9.08234239e-01,  2.79250693e+00,  2.99032950e+00,
       -1.03324664e+00,  1.56622362e+00, -6.45567298e-01, -1.08611798e+00,
        3.35500926e-01, -1.13525748e+00,  6.65919900e-01, -1.99482679e-01,
       -7.95930326e-01, -1.40282226e+00, -4.30072755e-01,  4.89553243e-01,
       -1.32687807e+00, -1.02504277e+00,  8.35403919e-01,  4.09150362e-01,
        1.36140645e+00, -5.99500835e-01, -2.03080368e+00, -1.65794754e+00,
       -1.89322144e-01,  

In [11]:
congress_model.wv.vectors[3]

array([ 0.4981839 , -0.07244259, -0.30060878, -1.3876089 , -0.7268536 ,
        0.2225371 ,  1.3775698 ,  0.0778371 , -1.0745714 ,  0.70276153,
        0.08910023, -0.6716979 , -0.37361118, -0.2846855 ,  0.66448456,
        0.3170037 , -0.07228577,  1.2633101 ,  0.35974845, -0.10054018,
        0.69000363, -0.03926645, -2.575601  ,  0.06809434, -0.78677446,
       -2.218929  , -0.24234462,  0.69691366, -0.17579308,  0.90760595,
        0.215567  ,  1.2573235 ,  0.57957834, -0.01310704, -0.22670114,
       -1.0446719 , -0.44389293,  0.35116285, -0.34281754, -0.5387338 ,
       -0.79898363, -2.0082116 ,  0.9730359 , -0.45276877, -0.7648684 ,
       -0.9282157 ,  0.81879985,  0.15924817,  1.2325851 ,  0.7370203 ,
        0.75480956,  0.11558668,  0.08720962,  0.0324798 ,  0.94672304,
       -1.1605217 , -0.82023114,  1.3042344 ,  1.7554764 , -0.71666425,
       -0.34761587, -1.6350881 ,  0.80026555, -0.1075739 ,  0.43415645,
       -0.44709942,  0.46373656,  0.0563006 , -0.2595159 , -0.36

In [12]:
man_vector = congress_model.wv['man']
congress_model.wv.similar_by_vector(man_vector)

[('man', 1.0),
 ('lawyer', 0.784612774848938),
 ('mai', 0.7743773460388184),
 ('merchant', 0.7623230218887329),
 ('mian', 0.7392277717590332),
 ('soldier', 0.7370642423629761),
 ('mani', 0.7255149483680725),
 ('man.', 0.7229173183441162),
 ('witness', 0.7200114727020264),
 ('citizen', 0.7081575393676758)]

In [13]:
woman_vector = congress_model.wv['woman']
congress_model.wv.similar_by_vector(woman_vector)

[('woman', 1.0),
 ('mau', 0.7833964228630066),
 ('citizen', 0.7581619620323181),
 ('soldier', 0.7527076601982117),
 ('nian', 0.7463874816894531),
 ('man.', 0.7352272868156433),
 ('child', 0.7283610105514526),
 ('mian', 0.7250927090644836),
 ('foreigner', 0.7204998731613159),
 ('nan', 0.7189040184020996)]

In [14]:
individual_vector = congress_model.wv['individual']
congress_model.wv.similar_by_vector(individual_vector)

[('individual', 1.0),
 ('honest', 0.6170173287391663),
 ('creditor', 0.6117383241653442),
 ('individual.', 0.607198178768158),
 ('superior', 0.5966612100601196),
 ('fraudulent', 0.5930872559547424),
 ('citizen.', 0.5888493061065674),
 ('society', 0.5841152667999268),
 ('mans', 0.5802187919616699),
 ('equality', 0.5768246054649353)]

In [15]:
soldier_vector = congress_model.wv['soldier']
congress_model.wv.similar_by_vector(soldier_vector)

[('soldier', 1.0),
 ('mian', 0.7534946799278259),
 ('woman', 0.7527076005935669),
 ('merchant', 0.7440078258514404),
 ('pensioner', 0.7385962009429932),
 ('man', 0.7370642423629761),
 ('nian', 0.720649003982544),
 ('master', 0.7148724794387817),
 ('soldier.', 0.7008633017539978),
 ('captain', 0.7007896304130554)]

In [16]:
congress_model.wv.similarity('women', 'men')

0.629509

In [17]:
congress_model.wv.most_similar("women", topn = 50)

[('children', 0.7644491195678711),
 ('merchants', 0.7066177129745483),
 ('rich', 0.6883387565612793),
 ('children.', 0.6830962896347046),
 ('colored', 0.6641597747802734),
 ('soldiers', 0.6595255136489868),
 ('wealthy', 0.6543371677398682),
 ('destitute', 0.6537994742393494),
 ('struggling', 0.6519593000411987),
 ('poor', 0.6401433348655701),
 ('traders', 0.6373857259750366),
 ('farmers', 0.6362103223800659),
 ('noble', 0.6332833766937256),
 ('capitalists', 0.6331146955490112),
 ('men', 0.6295090913772583),
 ('white', 0.6290037631988525),
 ('widows', 0.6285388469696045),
 ('skilled', 0.6273373365402222),
 ('brave', 0.6266660690307617),
 ('women.', 0.6257854104042053),
 ('fields', 0.6237730979919434),
 ('mechanics', 0.6204156279563904),
 ('bankers', 0.6146213412284851),
 ('orphans', 0.6089208722114563),
 ('families', 0.6071287393569946),
 ('laborers', 0.6053229570388794),
 ('black', 0.6006523370742798),
 ('bread', 0.6006162166595459),
 ('men.', 0.599524736404419),
 ('patriotic', 0.59881

In [18]:
congress_model.wv.most_similar("men", topn = 50)

[('men.', 0.7377591729164124),
 ('lawyers', 0.6918026208877563),
 ('persons', 0.6915714144706726),
 ('people', 0.6704407930374146),
 ('merchants', 0.6628286242485046),
 ('gentlemen', 0.6529461145401001),
 ('Men', 0.6475223302841187),
 ('ones', 0.6338933110237122),
 ('women', 0.6295089721679688),
 ('parties', 0.6241623163223267),
 ('nen', 0.6230642795562744),
 ('farmers', 0.6145846843719482),
 ('capitalists', 0.6073158383369446),
 ('others', 0.6071662902832031),
 ('claimants', 0.606265664100647),
 ('children', 0.605920672416687),
 ('foreigners', 0.6052873134613037),
 ('soldiers', 0.5934457778930664),
 ('ladies', 0.5914616584777832),
 ('statesmen', 0.5890417695045471),
 ('witnesses', 0.5890220403671265),
 ('politicians', 0.5853266716003418),
 ('mcn', 0.584814190864563),
 ('laborers', 0.5819270610809326),
 ('boys', 0.569283127784729),
 ('man', 0.5680898427963257),
 ('bankers', 0.5622221827507019),
 ('officers', 0.5527613162994385),
 ('Indians', 0.5470938682556152),
 ('negroes', 0.54686820

In [19]:
# Offset vector 'man' - 'woman', then the 40 vocabulary words most similar to that offset.
diff = congress_model.wv['man'] - congress_model.wv['woman']
congress_model.wv.similar_by_vector(diff, topn=40)

[('man', 0.9883699417114258),
 ('lawyer', 0.7484763264656067),
 ('mai', 0.7310010194778442),
 ('merchant', 0.727809727191925),
 ('witness', 0.6992734670639038),
 ('mian', 0.6851624250411987),
 ('mani', 0.6778616905212402),
 ('soldier', 0.6769345998764038),
 ('person', 0.6657634973526001),
 ('man.', 0.6645764112472534),
 ('mal', 0.6608470678329468),
 ('farmer', 0.6439114809036255),
 ('citizen', 0.6430673599243164),
 ('nian', 0.6202700138092041),
 ('mans', 0.6109440326690674),
 ('mau', 0.6074048280715942),
 ('creditor', 0.5955703258514404),
 ('foreigner', 0.5867884159088135),
 ('manufacturer', 0.5841833353042603),
 ('claimant', 0.5834363698959351),
 ('officer', 0.5812118053436279),
 ('party', 0.5800226926803589),
 ('everybody', 0.5799200534820557),
 ('child', 0.5793112516403198),
 ('contractor', 0.5698662996292114),
 ('anybody', 0.5691004991531372),
 ('woman', 0.5684463977813721),
 ('men', 0.5620081424713135),
 ('importer', 0.559678316116333),
 ('inventor', 0.552960216999054),
 ('banker'

In [20]:
# Reverse offset 'woman' - 'man' (overwrites `diff` from the previous cell), same top-40 lookup.
diff = congress_model.wv['woman'] - congress_model.wv['man']
congress_model.wv.similar_by_vector(diff, topn = 40)

[('Whereas', 0.4274500608444214),
 ('limiting', 0.41570499539375305),
 ('numbered', 0.41557809710502625),
 ('instructing', 0.4072474241256714),
 ('following', 0.4035813808441162),
 ('adopted:', 0.3961528539657593),
 ('regulating', 0.39190608263015747),
 ('thatthe', 0.3797934949398041),
 ('Tite', 0.3781622052192688),
 ('joint', 0.3772182762622833),
 ('revised', 0.37709760665893555),
 ('tse', 0.37491974234580994),
 ('thie', 0.37036949396133423),
 ('bythe', 0.3683358430862427),
 ('Tile', 0.36373409628868103),
 ('recommend', 0.3621600866317749),
 ('thme', 0.35904407501220703),
 ('concurrent', 0.35872596502304077),
 ('approved', 0.35697197914123535),
 ('construing', 0.3568660318851471),
 ('directing', 0.35644006729125977),
 ('By', 0.3561759889125824),
 ('thc', 0.3554948568344116),
 ('relating', 0.35503607988357544),
 ('refers.', 0.3539797365665436),
 ('embraced', 0.3507007360458374),
 ('changing', 0.34920966625213623),
 ('askc', 0.3470391631126404),
 ('fifteenth', 0.34639692306518555),
 ('a

In [21]:
# TODO: come back to the rest of this workflow: https://github.com/stephbuon/democracy-lab/blob/main/word-embeddings/workhorse-parallel-context-vectors.ipynb

In [None]:
#keyword1 = 'feminine'  # define the keyword you're looking for. you can change this variable as many times as you want.
#enddate = 1950
#########  after the first run, use this line to call the old data without generating it again
#keyword_context = []
#dates_found = []

#for p in range(0, 18) :

#    period1 = periodnames[p]
#    print('working on ', period1)
#    period_model = gensim.models.Word2Vec.load('model-' + str(period1)) # to load a saved model

    ## analysis
#    if keyword1 in period_model.wv.vocab:
#        print('found ', keyword1)
#        keyword_context_period = period_model.wv.most_similar(keyword1, topn = 5000) # extract the context of how women were talked about in that period
#        keyword_context.append(keyword_context_period) # save the context of how women were talked about for later
#        dates_found.append(period1)



In [48]:
keyword = 'woman'

In [49]:
# For every saved period model, collect the 1000 words most similar to `keyword`.
keyword_context = []

dir_path = '/users/sbuongiorno/diagnostics'

# Sorted so the position of each period in `keyword_context` is stable between
# runs (os.listdir returns entries in arbitrary order).
for fname in sorted(os.listdir(dir_path)):
    # Models are saved as '<name>_model'; an endswith test does not match
    # side-car files such as '<name>_model.wv.vectors.npy'.
    if not fname.endswith('_model'):
        continue

    # os.listdir yields bare names, so load relative to dir_path, not the cwd.
    congress_model = gensim.models.Word2Vec.load(os.path.join(dir_path, fname))

    # Periods whose vocabulary lacks the keyword contribute nothing.
    if keyword in congress_model.wv.vocab:
        keyword_context_period = congress_model.wv.most_similar(keyword, topn = 1000)
        keyword_context.append(keyword_context_period)


In [53]:
keyword_context[0][:15]

[('mau', 0.7833964228630066),
 ('citizen', 0.7581620216369629),
 ('soldier', 0.7527076601982117),
 ('nian', 0.7463874816894531),
 ('man.', 0.7352272868156433),
 ('child', 0.7283610105514526),
 ('mian', 0.7250927090644836),
 ('foreigner', 0.7204998731613159),
 ('nan', 0.7189040184020996),
 ('lady', 0.7056132555007935),
 ('citizen.', 0.7028462290763855),
 ('mai', 0.696204662322998),
 ('man', 0.6869451999664307),
 ('mani', 0.6857932209968567),
 ('statesman', 0.6771718263626099)]

In [59]:
[elem[0] for elem in keyword_context[0]][:5]

['mau', 'citizen', 'soldier', 'nian', 'man.']

In [62]:
[elem[1] for elem in keyword_context[0]][:5]

[0.7833964228630066,
 0.7581620216369629,
 0.7527076601982117,
 0.7463874816894531,
 0.7352272868156433]

In [None]:
# maybe I want a df with 10 words per period and their scores 

In [32]:
pd.set_option('display.max_columns', None)  

In [7]:
# unrelated: 
# you are too sensitive
test = congressional_data[congressional_data['speech'].str.contains('you made me')]

In [29]:
test = congressional_data[congressional_data['speech'].str.contains('that never happened')]

In [40]:
test.to_csv('test.csv', index=False)

In [39]:
test

Unnamed: 0,speech,year,index
251740,The Senator from Vermont has fancied a case th...,1878,251740
1028929,Perhaps it may. I will not undertake to say in...,1887,1028929
1164184,Why bring this forthj ust on this particular o...,1888,1164184
1425529,There are a great many things happening nowada...,1891,1425529
1634636,I am much obliged to the Senator for his state...,1893,1634636
...,...,...,...
17318611,Mr. President. I rise to talk about a very imp...,2009,17318611
17341846,I thank the minority leader for allowing me to...,2010,17341846
17344143,sylvania. I thank the chairman for yielding. I...,2010,17344143
17391401,Madam President. I thank my new colleague from...,2010,17391401


In [5]:
test_2 = congressional_data[congressional_data['speech'].str.contains('I always tell the truth')]

In [6]:
test_2

Unnamed: 0,speech,year,index
2560612,I always tell the truth. here and everywhere. ...,1904,2560612
4752200,I always tell the truth. and there was so much...,1919,4752200
7016836,I thank the Senator from Nevada. he knows that...,1935,7016836
11717223,Mr. Speaker. since my election to Congress in ...,1968,11717223
