In [1]:
import sys
import os
import csv
import gensim
import pandas as pd

from matplotlib import pyplot as plt
from matplotlib import cm
from numpy import linspace
from adjustText import adjust_text

# Switch the working directory to the shared util folder before importing the
# project helpers below -- presumably so that the `pyfunctions` package is
# resolved from the current directory (TODO confirm it is not installed).
os.chdir('/users/sbuongiorno/democracy-lab/util/')
# TODO: also import `import_congressional_data` here. data_process() calls it,
# but nothing in this notebook imports it yet, so a fresh kernel hits a NameError.
from pyfunctions.interval_subsetter import interval_subset
from pyfunctions.parallelize_operation import parallelize_operation
from pyfunctions.preprocess_functions import standardize_spelling_df
from pyfunctions.str_functions import lemmatize_df_text, str_split_df_sentences
from pyfunctions.w2v_gensim_functions import w2v_export_gensim_models, w2v_embeddings, w2v_visualize_scatter_plot
from pyfunctions.import_kw import import_keywords_list

# Move to the word-embeddings folder for the rest of the notebook;
# data_process() hands a relative '<label>_subsets/' path to the model export,
# so that path is resolved against this directory.
os.chdir('/users/sbuongiorno/democracy-lab/word-embeddings/')

In [None]:
def data_process(target_fpath, label, find_replace,
                 start_year=1800, end_year=2010, interval=5, n_cores=36):
    """Create the period subsets (first run only) and export word2vec models.

    The directory ``target_fpath + label + '_subsets'`` acts as the "already
    prepared" marker: when it does not exist the corpus is imported, reduced
    to the ``speech`` and ``year`` columns, the directory is created and the
    data is handed to ``interval_subset``. The model export runs on every call.

    Parameters
    ----------
    target_fpath : str
        Prefix that is concatenated directly with ``label`` (no separator is
        inserted, so it must already end with one).
    label : str
        Corpus label; used for the ``<label>_subsets`` directory name and
        passed to ``interval_subset`` as ``fname``.
    find_replace : str
        Path to a find/replace list. Currently unused because the spelling
        standardisation step below is commented out; kept for callers.
    start_year, end_year, interval : int, optional
        Forwarded positionally to ``interval_subset`` after the ``'year'``
        column name. Defaults match the previously hard-coded 1800, 2010, 5.
    n_cores : int, optional
        Forwarded to ``w2v_export_gensim_models``. Defaults to the previously
        hard-coded 36.

    Raises
    ------
    OSError
        Propagated from ``os.mkdir`` if the subsets directory cannot be created.
    """
    subsets_dir = target_fpath + label + '_subsets'

    # Explicit sentinel instead of probing ``locals()``: stays None when the
    # subsets directory already exists and nothing was imported on this call.
    data = None

    if not os.path.exists(subsets_dir):

        print('Importing data...')
        data = import_congressional_data(clean_data = True)
        print('Selecting columns...')
        # TODO: confirm 'year' is the right date column name; it may need to be
        # derived from a date column first.
        data = data[['speech','year']].copy()
        #print('Standardizing spelling...')
        #data = standardize_spelling_df(data,
        #                               text_col='speech',
        #                               fpath_replace_list=find_replace)
        print('Generating ' + label + '_subsets')
        os.mkdir(subsets_dir)

    # Past this point the directory exists (os.mkdir raises otherwise), so no
    # second existence check is needed.
    if data is not None:
        print('Exporting period subsets to ' + label + '_subsets...')
        interval_subset(data, 'year', start_year, end_year, interval, fname=label)

    # NOTE(review): this path is relative to the current working directory,
    # while the existence check above uses target_fpath. They only agree when
    # the cwd equals target_fpath -- confirm before calling from elsewhere.
    print('Exporting models to ' + label + '_subsets...')
    w2v_export_gensim_models(label + '_subsets/', n_cores=n_cores)

In [None]:
# Run the pipeline for the US Congress corpus. `find_replace` is accepted by
# data_process but not used while its spelling-standardisation step stays
# commented out.
data_process(
    target_fpath='/users/sbuongiorno/democracy-lab/word-embeddings/',
    label='us_congress',
    # Alternative list noted in the original cell:
    # '/users/sbuongiorno/preprocess_propertywords.csv'
    find_replace='/users/sbuongiorno/name.csv',
)

In [None]:
# Load the keywords that the plotting loop below iterates over. The file name
# suggests single-token ("1grams") property terms -- TODO confirm the CSV
# layout that import_keywords_list expects.
keywords_list = import_keywords_list('/users/sbuongiorno/propertywords_cleaned_for_w2v_1grams.csv')

In [None]:
# For every keyword: collect its context words from the exported models and
# draw one scatter plot.
#
# NOTE(review): models are read from 'c19_20_hansard_subsets/', whereas the
# data_process call in this notebook uses the label 'us_congress' -- confirm
# this is the intended model directory.
# NOTE(review): periods here run 1800-2005 while data_process passes 1800-2010
# to interval_subset -- confirm the two helpers use the same end-bound convention.
#
# `keyword_context` from the last iteration stays in the namespace and is
# inspected by cells further down.
for keyword in keywords_list:
    keyword_context = w2v_embeddings.keyword_context('/users/sbuongiorno/democracy-lab/word-embeddings/c19_20_hansard_subsets/', keyword)

    period_names = w2v_visualize_scatter_plot.define_periods(1800, 2005, 5)

    period_words = w2v_visualize_scatter_plot.collect_text_values(keyword_context, period_names)

    flat_list = w2v_visualize_scatter_plot.make_1D_list(period_words)

    try:
        w2v_visualize_scatter_plot.w2v_scatter_plot(period_names, keyword_context, flat_list, keyword)
    except Exception as err:
        # Plotting stays best-effort, but say which keyword was skipped and
        # why. Catching Exception (not a bare except) lets KeyboardInterrupt
        # and SystemExit stop the loop.
        print('Skipping plot for ' + repr(keyword) + ': ' + repr(err))
        continue

## Exploration

In [22]:
#congress_model = gensim.models.Word2Vec.load('/users/sbuongiorno/democracy-lab/word-embeddings/stanford_congressional_records_subsets/stanford_congressional_records1873_1874_model') # contains the list of all unique words in pre-trained word2vec vectors

In [24]:
#congress_w2v_vocabulary = congress_model.wv.vocab

In [25]:
#congress_model.wv.index2word[:25]

['the',
 'of',
 'to',
 'and',
 'that',
 'I',
 'in',
 'a',
 'is',
 'it',
 'be',
 'for',
 'this',
 'not',
 'as',
 'by',
 'have',
 'was',
 'from',
 'on',
 'The',
 'which',
 'will',
 'or',
 'at']

In [26]:
#len(congress_model.wv.index2word)

15327

In [27]:
#congress_model.wv.vectors

array([[ 1.73035562e+00, -6.83970094e-01, -2.69009709e-01, ...,
         3.92869741e-01, -8.13241079e-02, -6.10144734e-01],
       [ 1.76975167e+00, -1.46722076e-02, -4.42213297e-01, ...,
         6.75639212e-01,  1.39195370e-02, -7.84029961e-01],
       [ 9.16211843e-01, -8.73500168e-01,  1.89974701e+00, ...,
         4.91572767e-01,  1.01502645e+00, -1.67135492e-01],
       ...,
       [ 2.01857984e-02, -4.97993231e-02, -2.06858143e-02, ...,
        -1.01056248e-01,  3.14819254e-02, -3.12528685e-02],
       [ 5.89295104e-02, -1.70368829e-03,  1.27697602e-01, ...,
        -2.35911578e-01, -1.18444666e-01, -7.70635530e-02],
       [ 6.27761930e-02, -1.31842718e-01,  1.25894830e-01, ...,
        -1.82212010e-01, -8.32660496e-02,  4.40259390e-02]], dtype=float32)

In [28]:
#congress_model.wv.index2word[3]

'and'

In [29]:
#congress_model.wv['to']

array([ 0.91621184, -0.87350017,  1.899747  ,  1.905248  , -0.13984899,
       -1.0875859 ,  1.6942468 , -0.01040702,  1.8601773 ,  2.709001  ,
       -0.24671529, -2.486799  , -0.9311729 ,  0.28369635,  1.1143824 ,
       -1.2657373 ,  0.58928347, -0.147861  ,  0.11638104, -0.80986816,
        0.9408491 , -2.6817534 , -0.44196448, -2.5782387 ,  0.9019148 ,
        0.04292612,  0.29616672,  1.1208102 , -0.63703114, -0.28486174,
        1.4621372 , -0.74971825, -0.9556688 , -0.32937533,  0.1302492 ,
        1.4080564 ,  1.229292  ,  0.18882783,  1.1709342 , -1.3881642 ,
        1.0525256 ,  0.5111599 ,  0.05191733, -1.5173302 , -0.93335575,
       -0.50936973, -0.20901261,  1.8754948 ,  0.5607795 , -1.0483916 ,
       -0.47771373,  1.0070051 , -0.47526026, -1.0236734 , -1.7460467 ,
       -2.1290226 , -1.4244438 ,  0.11539432,  0.90179926,  1.3103766 ,
        0.512915  , -0.16312021, -1.2417433 ,  1.3376906 , -0.49068815,
       -1.2018003 ,  1.584356  ,  0.31289983, -0.28954217,  1.37

In [30]:
#congress_model.wv.vectors[3]

array([ 0.25987232, -1.0454421 ,  0.55559415, -1.0159976 ,  0.20438798,
       -0.2817539 , -0.444001  ,  0.6758486 ,  0.7846123 ,  1.1649584 ,
       -0.24571837, -0.9608447 ,  0.566566  ,  0.59756935,  1.2640162 ,
        0.01420574,  0.6141216 , -0.6792223 , -0.39184752, -0.32092163,
        0.7508321 , -0.63225496, -1.4306395 ,  0.21366134,  0.2802836 ,
        0.3817243 , -1.0148244 ,  0.15586749,  1.0278411 , -1.3139789 ,
        0.13459414,  0.02274267,  0.99231356,  0.2764427 , -0.5354334 ,
        0.11765855,  0.8595166 , -1.9107814 ,  2.4422991 , -0.6925129 ,
       -0.07926845,  0.586401  ,  0.09067354, -0.6419713 ,  0.3716558 ,
       -1.0815433 ,  0.41559333,  1.065362  , -0.7503217 ,  0.45964354,
        0.87521446,  0.27853864,  0.29807135, -0.18738072,  0.5378493 ,
        0.5734402 , -0.6965925 , -0.19923292, -1.0329779 ,  0.89816874,
        0.49004483, -0.77272165, -1.9950471 ,  0.9752638 , -0.5072653 ,
        0.1952957 ,  0.02216992,  1.2731632 , -0.05519395,  1.17

In [31]:
#man_vector = congress_model.wv['man']
#congress_model.wv.similar_by_vector(man_vector)

[('man', 1.0),
 ('lawyer', 0.7873023748397827),
 ('mai', 0.7688322067260742),
 ('merchant', 0.7556109428405762),
 ('mian', 0.7486206293106079),
 ('soldier', 0.7367701530456543),
 ('mani', 0.7314112186431885),
 ('man.', 0.7275295257568359),
 ('witness', 0.722841739654541),
 ('citizen', 0.7098455429077148)]

In [32]:
#woman_vector = congress_model.wv['woman']
#congress_model.wv.similar_by_vector(woman_vector)

[('woman', 1.0),
 ('mau', 0.7746826410293579),
 ('citizen', 0.7594645023345947),
 ('soldier', 0.7473379373550415),
 ('man.', 0.7440284490585327),
 ('nian', 0.739978551864624),
 ('mian', 0.7210935950279236),
 ('foreigner', 0.7194973230361938),
 ('nan', 0.7192388772964478),
 ('child', 0.7159484624862671)]

In [33]:
#individual_vector = congress_model.wv['individual']
#congress_model.wv.similar_by_vector(individual_vector)

[('individual', 1.0),
 ('honest', 0.6153244376182556),
 ('creditor', 0.6087626814842224),
 ('individual.', 0.6070312857627869),
 ('fraudulent', 0.5945761799812317),
 ('superior', 0.5931428670883179),
 ('mans', 0.5908780097961426),
 ('race', 0.5881892442703247),
 ('society', 0.5822649002075195),
 ('citizen.', 0.5816092491149902)]

In [34]:
#soldier_vector = congress_model.wv['soldier']
#congress_model.wv.similar_by_vector(soldier_vector)

[('soldier', 1.0),
 ('merchant', 0.7489620447158813),
 ('woman', 0.7473379373550415),
 ('mian', 0.742435097694397),
 ('pensioner', 0.7392129898071289),
 ('man', 0.7367701530456543),
 ('master', 0.7333453893661499),
 ('nian', 0.7064769268035889),
 ('soldier.', 0.7063614130020142),
 ('captain', 0.7043235301971436)]

In [35]:
#congress_model.wv.similarity('women', 'men')

0.6369535

In [36]:
#congress_model.wv.most_similar("women", topn = 50)

[('children', 0.7649115324020386),
 ('merchants', 0.7290546298027039),
 ('children.', 0.6868888735771179),
 ('rich', 0.6865991353988647),
 ('wealthy', 0.6686532497406006),
 ('soldiers', 0.6671132445335388),
 ('destitute', 0.6626975536346436),
 ('colored', 0.6594951152801514),
 ('struggling', 0.6561284065246582),
 ('noble', 0.6490404605865479),
 ('poor', 0.6459184288978577),
 ('widows', 0.644420862197876),
 ('traders', 0.6371326446533203),
 ('men', 0.6369534730911255),
 ('skilled', 0.6349139213562012),
 ('capitalists', 0.6344571113586426),
 ('fields', 0.6341192126274109),
 ('farmers', 0.6315756440162659),
 ('mechanics', 0.6282700300216675),
 ('women.', 0.6276668906211853),
 ('brave', 0.6266326904296875),
 ('white', 0.6262197494506836),
 ('laborers', 0.6215879917144775),
 ('families', 0.618699312210083),
 ('orphans', 0.6185200214385986),
 ('bankers', 0.6162791848182678),
 ('habits', 0.6152911186218262),
 ('trained', 0.6091819405555725),
 ('men.', 0.6084530353546143),
 ('miners', 0.608290

In [37]:
#congress_model.wv.most_similar("men", topn = 50)

[('men.', 0.7389687299728394),
 ('persons', 0.691087007522583),
 ('lawyers', 0.6904367804527283),
 ('Men', 0.6618365049362183),
 ('merchants', 0.6598010659217834),
 ('people', 0.6544091701507568),
 ('gentlemen', 0.6472547054290771),
 ('nen', 0.6388335824012756),
 ('women', 0.6369534134864807),
 ('ones', 0.6297272443771362),
 ('farmers', 0.6238963603973389),
 ('capitalists', 0.6147263050079346),
 ('parties', 0.6117362380027771),
 ('others', 0.6092349290847778),
 ('claimants', 0.6037822961807251),
 ('children', 0.6009438037872314),
 ('soldiers', 0.5979388952255249),
 ('foreigners', 0.5976136326789856),
 ('politicians', 0.5932735204696655),
 ('ladies', 0.5877074003219604),
 ('statesmen', 0.5874553918838501),
 ('witnesses', 0.5851725339889526),
 ('mcn', 0.5825263261795044),
 ('man', 0.5695891380310059),
 ('laborers', 0.5683018565177917),
 ('meu', 0.5656101703643799),
 ('mon', 0.5635080337524414),
 ('boys', 0.5634727478027344),
 ('negroes', 0.5616649985313416),
 ('mei', 0.557658314704895),


In [38]:
#diff = congress_model.wv['man'] - congress_model.wv['woman']
#congress_model.wv.similar_by_vector(diff, topn=40)

[('man', 0.9886401295661926),
 ('lawyer', 0.7529689073562622),
 ('mai', 0.726211428642273),
 ('merchant', 0.7248859405517578),
 ('witness', 0.7051510214805603),
 ('mian', 0.6982424259185791),
 ('mani', 0.6860320568084717),
 ('soldier', 0.6793063879013062),
 ('man.', 0.6695171594619751),
 ('person', 0.6633116006851196),
 ('mal', 0.6597539186477661),
 ('citizen', 0.6462264060974121),
 ('farmer', 0.6450856924057007),
 ('nian', 0.631976842880249),
 ('mans', 0.6207333207130432),
 ('mau', 0.6102208495140076),
 ('claimant', 0.60492342710495),
 ('creditor', 0.601830005645752),
 ('foreigner', 0.5999503135681152),
 ('child', 0.59288489818573),
 ('party', 0.5820441842079163),
 ('everybody', 0.5819985866546631),
 ('manufacturer', 0.5808119177818298),
 ('woman', 0.580191969871521),
 ('officer', 0.5729767084121704),
 ('anybody', 0.5680667757987976),
 ('importer', 0.5605961680412292),
 ('men', 0.5604574084281921),
 ('contractor', 0.5591752529144287),
 ('inventor', 0.5537365078926086),
 ('banker', 0.5

In [39]:
#diff = congress_model.wv['woman'] - congress_model.wv['man']
#congress_model.wv.similar_by_vector(diff, topn = 40)

[('instructing', 0.42829954624176025),
 ('Whereas', 0.42052528262138367),
 ('limiting', 0.4200958013534546),
 ('regulating', 0.41683223843574524),
 ('numbered', 0.4070498049259186),
 ('following', 0.4062387943267822),
 ('revised', 0.3852912187576294),
 ('adopted:', 0.38496482372283936),
 ('thie', 0.38468122482299805),
 ('Tite', 0.38423842191696167),
 ('recommend', 0.38337522745132446),
 ('thatthe', 0.3787490129470825),
 ('tse', 0.37820374965667725),
 ('joint', 0.376299649477005),
 ('bythe', 0.37561750411987305),
 ('changing', 0.3745558261871338),
 ('fifteenth', 0.3718433380126953),
 ('refers.', 0.3667224049568176),
 ('By', 0.36655232310295105),
 ('relating', 0.36471980810165405),
 ('Tile', 0.3620380759239197),
 ('directing', 0.3603741526603699),
 ('approved', 0.357888400554657),
 ('embraced', 0.3556845784187317),
 ('rejecting', 0.35543444752693176),
 ('fixing', 0.3542177677154541),
 ('concurrent', 0.3537256121635437),
 ('requiring', 0.3499775528907776),
 ('refers', 0.3488360345363617),

In [40]:
# TODO: come back to the rest of this exploration; see https://github.com/stephbuon/democracy-lab/blob/main/word-embeddings/workhorse-parallel-context-vectors.ipynb

In [34]:
# Peek at the first 15 (word, score) pairs in entry 0 of keyword_context.
# keyword_context is whatever the last iteration of the keyword loop left
# behind; entry 0 is presumably the first period -- TODO confirm.
keyword_context[0][:15]

[('mau', 0.7809287905693054),
 ('citizen', 0.7573187351226807),
 ('soldier', 0.7517971992492676),
 ('man.', 0.7454029321670532),
 ('nian', 0.7337642908096313),
 ('child', 0.7282170057296753),
 ('foreigner', 0.7275236248970032),
 ('nan', 0.7158505320549011),
 ('mian', 0.7087287902832031),
 ('lady', 0.70368492603302),
 ('citizen.', 0.7031761407852173),
 ('man', 0.6877122521400452),
 ('mai', 0.6828106641769409),
 ('mani', 0.6816163063049316),
 ('lawyer', 0.6712002158164978)]

In [35]:
# Same peek for entry 2 of keyword_context (presumably the third period --
# TODO confirm), to compare against entry 0.
keyword_context[2][:15]

[('child', 0.8186684846878052),
 ('girl', 0.8168043494224548),
 ('woman.', 0.8089375495910645),
 ('lady', 0.7855381965637207),
 ('boy', 0.7776932716369629),
 ('man', 0.7576967477798462),
 ('soldier', 0.7317655682563782),
 ('sailor', 0.7316795587539673),
 ('child.', 0.7156210541725159),
 ('Chinaman', 0.7138143181800842),
 ('mai', 0.7091273069381714),
 ('mau', 0.6919853687286377),
 ('foreigner', 0.6905137896537781),
 ('preacher', 0.6904029846191406),
 ('wife', 0.686038613319397)]

In [59]:
# Words only (first item of each pair) for the first five pairs of entry 0.
[pair[0] for pair in keyword_context[0][:5]]

['mau', 'citizen', 'soldier', 'nian', 'man.']

In [62]:
# Scores only (second item of each pair) for the first five pairs of entry 0.
[pair[1] for pair in keyword_context[0][:5]]

[0.7833964228630066,
 0.7581620216369629,
 0.7527076601982117,
 0.7463874816894531,
 0.7352272868156433]

In [None]:
# TODO: consider building a DataFrame with the top 10 words per period and their scores