In [1]:
import json
import os
import glob
import re
import pandas as pd
from os import makedirs
from os.path import join, exists
from datetime import date, timedelta
from pathlib import Path
from pycorenlp import StanfordCoreNLP
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Make sure the NLTK stopword corpus is available locally; when it is already
# installed the call only reports that the package is up to date.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/rmh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Shared client used by get_sentiment to talk to a Stanford CoreNLP server.
# NOTE(review): presumably the server has to be started separately and be
# reachable at this URL before any scoring cell runs -- confirm.
nlp = StanfordCoreNLP('http://localhost:9000')

In [3]:
def get_sentiment(text):
    """Return the CoreNLP sentiment value of the first sentence in ``text``.

    The text is sent to the CoreNLP server through the module-level ``nlp``
    client with only the ``sentiment`` annotator enabled, JSON output and a
    ``timeout`` property of 1000 (milliseconds, per the CoreNLP server docs).

    Args:
        text: The string to score.

    Returns:
        The ``sentimentValue`` of the first sentence exactly as the server
        reports it (presumably a digit string such as ``'2'``; callers convert
        it with ``int``), or ``-1`` when no sentiment is available -- either
        the response contains no sentences, or it does not have the expected
        JSON structure (for example the server answered with a plain-text
        error such as a timeout message).

    Note:
        Only the first sentence is considered; any further sentences in
        ``text`` are ignored. Errors raised by ``nlp.annotate`` itself are not
        caught and propagate to the caller.
    """
    res = nlp.annotate(text,
                       properties={'annotators': 'sentiment',
                                   'outputFormat': 'json',
                                   'timeout': 1000,
                       })

    try:
        sentences = res['sentences']
        if not sentences:
            return -1
        return sentences[0]['sentimentValue']
    except (TypeError, KeyError, IndexError):
        # The response was not the expected JSON mapping. Report "no score"
        # with the same -1 sentinel used for an empty response instead of
        # False, which callers would coerce to 0 via int(False).
        return -1

In [15]:
def get_and_save_data(start_date, end_date, path):
    """Score the headlines of every day in a date range and save them as CSV.

    For each day from ``start_date`` to ``end_date`` (both inclusive) the file
    ``data/guardian/<path>/<YYYY-MM-DD>.json`` is read. Each element it holds
    is lower-cased and scored with ``get_sentiment``. All rows are then written
    to ``data/processed/<path>/headlines.csv`` with the columns ``headline``,
    ``date`` and ``score``; the output directory is created if needed.

    Args:
        start_date: First day to process (a ``datetime.date``).
        end_date: Last day to process, inclusive. If it lies before
            ``start_date`` no input is read and a header-only CSV is written.
        path: Relative sub-directory shared by the input and output trees,
            e.g. ``join('candidates', '2016', 'trump')``.

    Raises:
        FileNotFoundError: If the JSON file for any day in the range is missing.

    Note:
        Each JSON file is expected to hold an array of headline strings
        (every element must support ``.lower()``).
    """
    neutral_score = 2  # score recorded when no sentiment could be determined
    articles_dir = join('data', 'guardian', path)
    output_dir = join('data', 'processed', path)

    rows = []
    for offset in range((end_date - start_date).days + 1):
        datestr = (start_date + timedelta(days=offset)).strftime('%Y-%m-%d')

        # Read the whole day first so the file is closed before the
        # (comparatively slow) sentiment requests are made.
        with open(join(articles_dir, datestr + '.json'), encoding='utf-8') as f:
            headlines = json.load(f)

        for raw_headline in headlines:
            headline = raw_headline.lower()
            raw_score = get_sentiment(headline)

            # get_sentiment may signal "no usable result" with -1 or with
            # False. int(False) is 0, which would silently record the most
            # negative score, so both signals are mapped to neutral instead.
            score = neutral_score if raw_score is False else int(raw_score)
            if score == -1:
                score = neutral_score

            rows.append({'headline': headline, 'date': datestr, 'score': score})

    os.makedirs(output_dir, exist_ok=True)
    df = pd.DataFrame(rows, columns=['headline', 'date', 'score'])
    df.to_csv(join(output_dir, 'headlines.csv'), index=False)

In [16]:
# 2016 candidates: score headlines from 2016-08-08 through 2016-11-08.
for candidate in ('trump', 'hillary'):
    get_and_save_data(date(2016, 8, 8), date(2016, 11, 8), join('candidates', '2016', candidate))

In [18]:
# 2012 candidates: score headlines from 2012-08-06 through 2012-11-06.
for candidate in ('obama', 'romney'):
    get_and_save_data(date(2012, 8, 6), date(2012, 11, 6), join('candidates', '2012', candidate))

In [19]:
# 2008 candidates: score headlines from 2008-08-04 through 2008-11-04.
for candidate in ('obama', 'mccain'):
    get_and_save_data(date(2008, 8, 4), date(2008, 11, 4), join('candidates', '2008', candidate))

In [20]:
# 2004 candidates: score headlines from 2004-08-02 through 2004-11-02.
# NOTE(review): the 'john' directory presumably holds John Kerry's headlines -- confirm.
for candidate in ('bush', 'john'):
    get_and_save_data(date(2004, 8, 2), date(2004, 11, 2), join('candidates', '2004', candidate))

In [21]:
# 2000 candidates: score headlines from 2000-08-07 through 2000-11-07.
for candidate in ('bush', 'gore'):
    get_and_save_data(date(2000, 8, 7), date(2000, 11, 7), join('candidates', '2000', candidate))

In [22]:
# Inclusive date window shared by all of the 2020 runs in the following cells.
st_date, en_date = date(2020, 4, 2), date(2020, 7, 1)

In [23]:
# 2020 candidate headlines, one run per candidate over the shared date window.
for candidate in ('trump', 'biden'):
    get_and_save_data(st_date, en_date, join('candidates', '2020', candidate))

In [24]:
# 2020 economy headlines, one run per candidate over the shared date window.
for candidate in ('trump', 'biden'):
    get_and_save_data(st_date, en_date, join('economy', '2020', candidate))

In [25]:
# 2020 party headlines, one run per candidate over the shared date window.
for candidate in ('trump', 'biden'):
    get_and_save_data(st_date, en_date, join('party', '2020', candidate))

In [26]:
# 2020 environment headlines, one run per candidate over the shared date window.
for candidate in ('trump', 'biden'):
    get_and_save_data(st_date, en_date, join('environment', '2020', candidate))

In [27]:
# 2020 foreign-policy headlines, one run per candidate over the shared date window.
for candidate in ('trump', 'biden'):
    get_and_save_data(st_date, en_date, join('foreign-policy', '2020', candidate))

In [28]:
# 2020 guns headlines, one run per candidate over the shared date window.
for candidate in ('trump', 'biden'):
    get_and_save_data(st_date, en_date, join('guns', '2020', candidate))

In [29]:
# 2020 health headlines, one run per candidate over the shared date window.
for candidate in ('trump', 'biden'):
    get_and_save_data(st_date, en_date, join('health', '2020', candidate))

In [30]:
# 2020 immigration headlines, one run per candidate over the shared date window.
for candidate in ('trump', 'biden'):
    get_and_save_data(st_date, en_date, join('immigration', '2020', candidate))