In [4]:
import sys
import os
import csv
import gensim
import pandas as pd

from matplotlib import pyplot as plt
from matplotlib import cm
from numpy import linspace
from adjustText import adjust_text

# Switch into the shared util directory before importing the project-local
# `pyfunctions` helpers.
# NOTE(review): presumably these imports only resolve because the current
# working directory is on sys.path -- confirm before running outside Jupyter.
os.chdir('/users/sbuongiorno/democracy-lab/util/')
from pyfunctions.interval_subsetter import interval_subset
from pyfunctions.parallelize_operation import parallelize_operation
from pyfunctions.preprocess_functions import standardize_spelling_df
from pyfunctions.str_functions import lemmatize_df_text, str_split_df_sentences
from pyfunctions.w2v_gensim_functions import w2v_export_gensim_models, w2v_embeddings, w2v_visualize_scatter_plot
from pyfunctions.import_kw import import_keywords_list

# Move to the word-embeddings directory: data_process() hands a path relative
# to the working directory ('<label>_subsets/') to w2v_export_gensim_models.
os.chdir('/users/sbuongiorno/democracy-lab/word-embeddings/')

In [3]:
def data_process(target_fpath, label, csv_data, find_replace,
                 start_year=1800, end_year=2010, interval=5, n_cores=36):
    """Prepare period subsets of a speech corpus and export word2vec models.

    When ``<target_fpath>/<label>_subsets`` does not exist yet, the CSV at
    ``csv_data`` is read, reduced to its ``text`` and ``year`` columns
    (``text`` is renamed to ``speech``), passed through
    ``standardize_spelling_df``, the directory is created, and
    ``interval_subset`` is called on the resulting frame.  In every case
    ``w2v_export_gensim_models`` is then called on ``'<label>_subsets/'``.

    Parameters
    ----------
    target_fpath : str
        Directory in which ``<label>_subsets`` is looked up / created.  A
        trailing path separator is optional.
    label : str
        Prefix of the subsets directory; also forwarded to
        ``interval_subset`` as ``fname``.
    csv_data : str
        Path of the input CSV; it must contain ``text`` and ``year`` columns.
    find_replace : str
        Forwarded to ``standardize_spelling_df`` as ``fpath_replace_list``.
    start_year, end_year, interval : int, optional
        Forwarded positionally to ``interval_subset`` after the ``'year'``
        column name.  NOTE(review): presumably the first year, the last year
        and the period width -- confirm against ``interval_subset``.
    n_cores : int, optional
        Forwarded to ``w2v_export_gensim_models`` as ``n_cores``.

    Returns
    -------
    None

    Notes
    -----
    The existence check uses ``target_fpath``, but ``w2v_export_gensim_models``
    receives the relative path ``'<label>_subsets/'``.
    NOTE(review): this presumably requires the current working directory to
    be ``target_fpath`` -- confirm where the helpers read and write.
    """
    # os.path.join keeps the path correct whether or not target_fpath ends
    # with a separator (plain concatenation silently built a sibling path).
    subsets_dir = os.path.join(target_fpath, label + '_subsets')

    if not os.path.exists(subsets_dir):
        print('Importing data...')
        data = pd.read_csv(csv_data)
        print('Selecting columns...')
        data = data[['text', 'year']].copy()
        print('Standardizing column names...')
        data = data.rename(columns={'text': 'speech'})
        print('Standardizing spelling...')
        data = standardize_spelling_df(data,
                                       text_col='speech',
                                       fpath_replace_list=find_replace)
        print('Generating ' + label + '_subsets')
        os.mkdir(subsets_dir)

        # Subsets are only exported when the data was freshly processed above;
        # this replaces the former `'data' in locals()` introspection.
        print('Exporting period subsets to ' + label + '_subsets...')
        interval_subset(data, 'year', start_year, end_year, interval, fname=label)

    # At this point the directory either pre-existed or was just created, so
    # the models are exported unconditionally.
    print('Exporting models to ' + label + '_subsets...')
    w2v_export_gensim_models(label + '_subsets/', n_cores=n_cores)

In [None]:
# Build the Hansard period subsets (skipped when the subsets directory already
# exists) and export the models; arguments are spelled out by keyword.
data_process(
    target_fpath='/users/sbuongiorno/democracy-lab/word-embeddings/',
    label='c19_20_hansard',
    csv_data='/users/sbuongiorno/hansard_20191119_w_year.csv',
    find_replace='/users/sbuongiorno/preprocess_propertywords.csv',
)

In [2]:
# Keywords to visualise; the result is iterated once per keyword by the
# plotting loop.
keywords_list = import_keywords_list(
    '/users/sbuongiorno/propertywords_cleaned_for_w2v_1grams.csv'
)

In [None]:
# Directory handed to w2v_embeddings.keyword_context for every keyword.
hansard_subsets_dir = '/users/sbuongiorno/democracy-lab/word-embeddings/c19_20_hansard_subsets/'

# Draw one scatter plot per keyword.
# NOTE(review): define_periods is called with (1800, 2005, 5) while
# data_process passes (1800, 2010, 5) to interval_subset -- confirm that the
# differing end values are intentional.
for keyword in keywords_list:
    keyword_context = w2v_embeddings.keyword_context(hansard_subsets_dir, keyword)

    period_names = w2v_visualize_scatter_plot.define_periods(1800, 2005, 5)

    period_words = w2v_visualize_scatter_plot.collect_text_values(keyword_context, period_names)

    flat_list = w2v_visualize_scatter_plot.make_1D_list(period_words)

    try:
        w2v_visualize_scatter_plot.w2v_scatter_plot(period_names, keyword_context, flat_list, keyword)
    except Exception as exc:
        # Plotting stays best-effort (one bad keyword must not stop the run),
        # but the failure is now reported instead of silently swallowed, and
        # KeyboardInterrupt / SystemExit are no longer caught.
        print('Skipping plot for {!r}: {}: {}'.format(keyword, type(exc).__name__, exc))

In [10]:
#os.chdir('/users/sbuongiorno/democracy-lab/word-embeddings/hansard')
#!jupyter nbconvert --to script hansard_data_word2vec.ipynb

[NbConvertApp] Converting notebook hansard_data_word2vec.ipynb to script
[NbConvertApp] Writing 11561 bytes to hansard_data_word2vec.py
