In [20]:
import json
import os
import glob
import re
import pandas as pd
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta
from pathlib import Path

from fuzzywuzzy import fuzz
from fuzzywuzzy import process

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Make sure the NLTK 'stopwords' corpus is available locally before
# stopwords.words('english') is called further down in the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/rmh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:
def get_election_data(start_date, end_date, election_year, subdir):
    """Load lower-cased headlines for every day in an inclusive date range.

    One file per day is read from
    ``data/guardian/<subdir>/<election_year>/YYYY-MM-DD.json``; each file is
    parsed as JSON and iterated, and every item is lower-cased with
    ``str.lower`` (so items are expected to be strings).

    Args:
        start_date: first day to load (``datetime.date``).
        end_date: last day to load, inclusive (``datetime.date``).
        election_year: year directory name, as a string (e.g. ``'2016'``).
        subdir: topic directory name under ``data/guardian``.

    Returns:
        A list of ``{'headline': <lower-cased text>, 'date': 'YYYY-MM-DD'}``
        dicts in chronological file order. Empty when ``end_date`` precedes
        ``start_date``.

    Raises:
        FileNotFoundError: if the file for any day in the range is missing.
    """
    articles_dir = join('data', 'guardian', subdir, election_year)
    n_days = (end_date - start_date).days + 1

    data = []
    for offset in range(n_days):
        datestr = (start_date + timedelta(days=offset)).strftime('%Y-%m-%d')
        fname = join(articles_dir, datestr + '.json')
        # Read as UTF-8 explicitly: the platform default encoding (e.g.
        # cp1252 on Windows) would garble or reject non-ASCII headlines.
        with open(fname, encoding='utf-8') as f:
            data.extend(
                {'headline': hd.lower(), 'date': datestr}
                for hd in json.load(f)
            )

    return data

In [22]:
def identify_aspects_with_fuzzy_match(aspects, news_hd):
    """Find headline words that fuzzily match any aspect keyword.

    Each headline has newlines and tabs removed, is split on whitespace, and
    every non-stop-word token is scored against every keyword with
    ``fuzz.partial_ratio``. A score above 80 produces one output row.

    Args:
        aspects: iterable of keyword strings to look for.
        news_hd: iterable of dicts with ``'headline'`` and ``'date'`` keys.

    Returns:
        A 5-tuple of parallel lists ``(headlines, terms, labels, positions,
        dates)`` with one entry per (token, keyword) match:
        the cleaned headline text, the matched keyword (not the token), the
        constant label ``'positive'``, the token span as ``'start,end'``
        character offsets into the cleaned headline, and the headline date.
    """
    asp_match, pos, lbls, hdls, dates = [], [], [], [], []

    stop_words = set(stopwords.words('english'))
    for hd in news_hd:
        hd_text = hd['headline'].replace('\n', '').replace('\t', '')
        hd_date = hd['date']

        # Iterate whitespace-delimited tokens together with their real
        # offsets. Looking the word up again with str.find() would return the
        # FIRST occurrence of that substring, which is the wrong span when the
        # word is repeated or also appears inside an earlier word
        # (e.g. "ban" inside "urban").
        for token in re.finditer(r'\S+', hd_text):
            w = token.group()
            if w in stop_words:
                continue

            for kw in aspects:
                # NOTE(review): partial_ratio appears to be case-sensitive, so
                # upper-case keywords would not match lower-cased headlines —
                # confirm against the fuzzywuzzy docs.
                if fuzz.partial_ratio(kw, w) > 80:
                    asp_match.append(kw)
                    pos.append(str(token.start()) + ',' + str(token.end()))
                    lbls.append('positive')
                    hdls.append(hd_text)
                    dates.append(hd_date)

    return hdls, asp_match, lbls, pos, dates

In [23]:
def write_data(year, hds, pos, asp, lbl, dates, subdir):
    """Persist one year's matched headlines under ./data/processed-data.

    Creates ``./data/processed-data/<subdir>/<year>`` (including parents) and
    writes:

    * ``hds.csv``       - columns ``headlines``, ``terms``, ``dates``
    * ``headlines.txt`` - one right-stripped headline per line
    * ``position.txt``  - one entry of ``pos`` per line
    * ``term.txt``      - one entry of ``asp`` per line
    * ``label.txt``     - one entry of ``lbl`` per line
    * ``dates.txt``     - one entry of ``dates`` per line

    Args:
        year: year directory name, as a string.
        hds: list of headline strings.
        pos: list of position values, one per headline row.
        asp: list of matched terms, one per headline row.
        lbl: list of labels, one per headline row.
        dates: list of date values, one per headline row.
        subdir: topic directory name under ``./data/processed-data``.

    Raises:
        ValueError: from pandas if ``hds``, ``asp`` and ``dates`` differ in
            length.
    """
    out_dir = Path('./data/processed-data/' + subdir + '/' + year)
    out_dir.mkdir(parents=True, exist_ok=True)

    frame = pd.DataFrame({'headlines': hds, 'terms': asp, 'dates': dates})
    frame.to_csv(out_dir / 'hds.csv', index=False)

    line_files = (
        ('headlines.txt', (row.rstrip() for row in hds)),
        ('position.txt', pos),
        ('term.txt', asp),
        ('label.txt', lbl),
        ('dates.txt', dates),
    )
    for fname, rows in line_files:
        # Explicit UTF-8 keeps the text files consistent with hds.csv
        # (pandas writes UTF-8 by default) regardless of the platform locale.
        with open(out_dir / fname, 'w', encoding='utf-8') as output:
            output.writelines(str(row) + '\n' for row in rows)

In [24]:
def gen_trainable_data(start_date, end_date, year, fz_kw, subdir):
    """Run load -> fuzzy-match -> write for one year and one topic.

    Prints a banner with the year, loads the headlines for the date range via
    get_election_data, matches them against the keywords ``fz_kw`` via
    identify_aspects_with_fuzzy_match, and hands the result to write_data.
    """
    banner = '----' + year + '----------'
    print(banner)

    raw_headlines = get_election_data(start_date, end_date, year, subdir)
    matched = identify_aspects_with_fuzzy_match(fz_kw, raw_headlines)
    matched_hdls, matched_terms, matched_lbls, matched_spans, matched_dates = matched

    write_data(year, matched_hdls, matched_spans, matched_terms,
               matched_lbls, matched_dates, subdir)

In [12]:
# Candidate-name keywords, one list per year.
aspect_key_words_candidates_2016 = ['trump', 'donald', 'hillary', 'clinton']
aspect_key_words_candidates_2012 = ['barack', 'obama', 'mitt', 'romney']
aspect_key_words_candidates_2008 = ['barack', 'obama', 'john', 'mccain']
aspect_key_words_candidates_2004 = ['george', 'bush', 'john', 'kerry']
aspect_key_words_candidates_2000 = ['george', 'bush', 'gore']
aspect_key_words_candidates_2020 = ['trump', 'donald', 'joe', 'biden']

# (start, end, year, keywords) for each run, processed in the order listed.
# Note the 2020 window is 12 Mar - 12 Jun, unlike the Aug - Nov windows used
# for the earlier years.
for window_start, window_end, year, keywords in (
    (date(2016, 8, 8), date(2016, 11, 8), '2016', aspect_key_words_candidates_2016),
    (date(2012, 8, 6), date(2012, 11, 6), '2012', aspect_key_words_candidates_2012),
    (date(2008, 8, 4), date(2008, 11, 4), '2008', aspect_key_words_candidates_2008),
    (date(2004, 8, 2), date(2004, 11, 2), '2004', aspect_key_words_candidates_2004),
    (date(2000, 8, 7), date(2000, 11, 7), '2000', aspect_key_words_candidates_2000),
    (date(2020, 3, 12), date(2020, 6, 12), '2020', aspect_key_words_candidates_2020),
):
    gen_trainable_data(window_start, window_end, year, keywords, 'candidates')

----2016----------
----2012----------
----2008----------
----2004----------
----2000----------
----2020----------


In [25]:
# Immigration topic keywords.
# NOTE(review): 'ICE' and 'USCIS' are upper-case while get_election_data
# lower-cases every headline; if fuzz.partial_ratio is case-sensitive these
# two keywords never match — confirm before relying on them.
search_key_imm = [
    'immigration', 'undocumented', 'mexico', 'asylum', 'south', 'border',
    'deport', 'ICE', 'USCIS', 'refugee', 'migrant', 'visa', 'green', 'card',
]

# Same (start, end, year) windows as the other topic cells, in the same order.
for window_start, window_end, year in (
    (date(2016, 8, 8), date(2016, 11, 8), '2016'),
    (date(2012, 8, 6), date(2012, 11, 6), '2012'),
    (date(2008, 8, 4), date(2008, 11, 4), '2008'),
    (date(2004, 8, 2), date(2004, 11, 2), '2004'),
    (date(2000, 8, 7), date(2000, 11, 7), '2000'),
    (date(2020, 3, 12), date(2020, 6, 12), '2020'),
):
    gen_trainable_data(window_start, window_end, year, search_key_imm, 'immigration')

----2016----------
----2012----------
----2008----------
----2004----------
----2000----------
----2020----------


In [26]:
# Healthcare topic keywords.
search_key_health = [
    'healthcare', 'insurance', 'coverage', 'prescription', 'preexisting',
    'condition', 'medicare', 'pocket', 'cost', 'aca', 'affordable', 'medicaid',
]

# One run per (start, end, year) window, processed in the order listed.
for window_start, window_end, year in (
    (date(2016, 8, 8), date(2016, 11, 8), '2016'),
    (date(2012, 8, 6), date(2012, 11, 6), '2012'),
    (date(2008, 8, 4), date(2008, 11, 4), '2008'),
    (date(2004, 8, 2), date(2004, 11, 2), '2004'),
    (date(2000, 8, 7), date(2000, 11, 7), '2000'),
    (date(2020, 3, 12), date(2020, 6, 12), '2020'),
):
    gen_trainable_data(window_start, window_end, year, search_key_health, 'health')

----2016----------
----2012----------
----2008----------
----2004----------
----2000----------
----2020----------


In [15]:
# Jobs / race topic keywords.
search_key_race = [
    'job', 'unemployment', 'race', 'black', 'white', 'discrimination',
]

# One run per (start, end, year) window, processed in the order listed.
for window_start, window_end, year in (
    (date(2016, 8, 8), date(2016, 11, 8), '2016'),
    (date(2012, 8, 6), date(2012, 11, 6), '2012'),
    (date(2008, 8, 4), date(2008, 11, 4), '2008'),
    (date(2004, 8, 2), date(2004, 11, 2), '2004'),
    (date(2000, 8, 7), date(2000, 11, 7), '2000'),
    (date(2020, 3, 12), date(2020, 6, 12), '2020'),
):
    gen_trainable_data(window_start, window_end, year, search_key_race, 'jobs-race')

----2016----------
----2012----------
----2008----------
----2004----------
----2000----------
----2020----------


In [16]:
# Environment topic keywords.
search_key_env = [
    'global', 'warming', 'green', 'deal', 'environment',
    'coal', 'job', 'fossil', 'fuel',
]

# One run per (start, end, year) window, processed in the order listed.
for window_start, window_end, year in (
    (date(2016, 8, 8), date(2016, 11, 8), '2016'),
    (date(2012, 8, 6), date(2012, 11, 6), '2012'),
    (date(2008, 8, 4), date(2008, 11, 4), '2008'),
    (date(2004, 8, 2), date(2004, 11, 2), '2004'),
    (date(2000, 8, 7), date(2000, 11, 7), '2000'),
    (date(2020, 3, 12), date(2020, 6, 12), '2020'),
):
    gen_trainable_data(window_start, window_end, year, search_key_env, 'environment')

----2016----------
----2012----------
----2008----------
----2004----------
----2000----------
----2020----------


In [17]:
# Gun-policy topic keywords.
search_key_guns = [
    'gun', 'control', 'law', 'legislation', 'background',
    'check', 'shooting', 'semi', 'automatic',
]

# One run per (start, end, year) window, processed in the order listed.
for window_start, window_end, year in (
    (date(2016, 8, 8), date(2016, 11, 8), '2016'),
    (date(2012, 8, 6), date(2012, 11, 6), '2012'),
    (date(2008, 8, 4), date(2008, 11, 4), '2008'),
    (date(2004, 8, 2), date(2004, 11, 2), '2004'),
    (date(2000, 8, 7), date(2000, 11, 7), '2000'),
    (date(2020, 3, 12), date(2020, 6, 12), '2020'),
):
    gen_trainable_data(window_start, window_end, year, search_key_guns, 'guns')

----2016----------
----2012----------
----2008----------
----2004----------
----2000----------
----2020----------


In [27]:
# Party / campaign topic keywords.
search_key_party = [
    'democrat', 'republican', 'senate', 'house', 'campaign',
    'super', 'pac', 'rallies', 'protest',
]

# One run per (start, end, year) window, processed in the order listed.
for window_start, window_end, year in (
    (date(2016, 8, 8), date(2016, 11, 8), '2016'),
    (date(2012, 8, 6), date(2012, 11, 6), '2012'),
    (date(2008, 8, 4), date(2008, 11, 4), '2008'),
    (date(2004, 8, 2), date(2004, 11, 2), '2004'),
    (date(2000, 8, 7), date(2000, 11, 7), '2000'),
    (date(2020, 3, 12), date(2020, 6, 12), '2020'),
):
    gen_trainable_data(window_start, window_end, year, search_key_party, 'party')

----2016----------
----2012----------
----2008----------
----2004----------
----2000----------
----2020----------


In [28]:
# Economy topic keywords.
# Fixes relative to the previous list:
#   * a missing comma after 'debt' made Python concatenate the adjacent
#     literals into the single bogus keyword 'debtstudent', so neither 'debt'
#     nor 'student' was ever searched for;
#   * 'stock', 'market' and 'retirement' were listed twice, which made every
#     match on them appear twice in the output.
search_key_economy = [
    'economy', 'gdp', 'stock', 'market', '401k', 'retirement', 'national', 'debt',
    'student', 'loan', 'bailout', 'bankruptcy', 'stimulus',
    'trade', 'export', 'import', 'tax', 'manufacture', 'package', 'tariff',
]

# One run per (start, end, year) window, processed in the order listed.
for window_start, window_end, year in (
    (date(2016, 8, 8), date(2016, 11, 8), '2016'),
    (date(2012, 8, 6), date(2012, 11, 6), '2012'),
    (date(2008, 8, 4), date(2008, 11, 4), '2008'),
    (date(2004, 8, 2), date(2004, 11, 2), '2004'),
    (date(2000, 8, 7), date(2000, 11, 7), '2000'),
    (date(2020, 3, 12), date(2020, 6, 12), '2020'),
):
    gen_trainable_data(window_start, window_end, year, search_key_economy, 'economy')

----2016----------
----2012----------
----2008----------
----2004----------
----2000----------
----2020----------


In [70]:
#os.mkdir('./data/processed-data/candidates/2016')
#Path('./data/processed-data/candidates/2016').mkdir(parents=True, exist_ok=True)