In [56]:
from pycorenlp import StanfordCoreNLP
import pandas as pd
from pathlib import Path

In [57]:
# Shared client used by get_sentiment(); assumes a CoreNLP server is already
# listening on localhost:9000 -- TODO confirm the server is started before running.
nlp = StanfordCoreNLP('http://localhost:9000')

In [58]:
def get_sentiment(text):
    """Return the CoreNLP sentiment value of the first sentence of ``text``.

    The text is sent to the module-level ``nlp`` client with the ``sentiment``
    annotator and JSON output. Only the first sentence of the response is
    consulted; any further sentences are ignored.

    Args:
        text: The text (e.g. a headline) to annotate.

    Returns:
        The ``sentimentValue`` of the first sentence exactly as delivered by
        the server (callers convert it with ``int()``; CoreNLP's sentiment
        classes run from 0 = very negative to 4 = very positive), or ``-1``
        when no sentence-level result is available: the response contains no
        sentences, or the reply could not be parsed as JSON (for example a
        timeout message).
    """
    res = nlp.annotate(text,
                       properties={'annotators': 'sentiment',
                                   'outputFormat': 'json',
                                   # Server-side limit; presumably milliseconds -- confirm
                                   # against the CoreNLP server documentation.
                                   'timeout': 1000,
                       })

    # When the response body is not valid JSON (timeout, server error) the
    # client hands back the raw text instead of a dict. Indexing that string
    # with 'sentences' would raise TypeError and abort a whole batch, so it is
    # reported through the same -1 sentinel as "no sentences".
    if not isinstance(res, dict):
        return -1

    sentences = res.get('sentences') or []
    if not sentences:
        return -1
    return sentences[0]['sentimentValue']

In [59]:
#get_sentiment('Mark Singer on Trump: The wheels are starting to come off dramatically')

In [60]:
def get_data_by_topic_year(topic, year):
    """Load the headline, term and date text files for one topic and year.

    Reads ``headlines.txt``, ``term.txt`` and ``dates.txt`` (in that order)
    from ``data/processed-data/<topic>/<year>/``.

    Args:
        topic: Topic directory name (str).
        year: Year directory name (str).

    Returns:
        A tuple ``(headlines, terms, dates)``; each element is a list with one
        string per line of the corresponding file, trailing whitespace removed.
    """
    folder = 'data/processed-data/' + topic + '/' + year

    def read_lines(suffix):
        # One entry per line, with the newline and any trailing blanks stripped.
        with open(folder + suffix) as handle:
            return [raw.rstrip() for raw in handle]

    headlines = read_lines('/headlines.txt')
    terms = read_lines('/term.txt')
    dates = read_lines('/dates.txt')
    return headlines, terms, dates
    

In [61]:
def get_data(topic, year):
    """Read ``data/processed-data/<topic>/<year>/hds.csv`` into a DataFrame.

    Args:
        topic: Topic directory name (str).
        year: Year directory name (str).

    Returns:
        The DataFrame produced by ``pd.read_csv`` with default options.
    """
    csv_path = ''.join(('data/processed-data/', topic, '/', year, '/hds.csv'))
    return pd.read_csv(csv_path)

In [62]:
def get_scores(df, default_score=2):
    """Add a ``scores`` column holding one sentiment score per headline.

    Each string in ``df['headlines']`` is passed to ``get_sentiment`` and the
    result converted with ``int()``. ``default_score`` is recorded instead when
    the cell is not a string (e.g. a missing value) or when ``get_sentiment``
    reports its ``-1`` "no result" sentinel.

    Args:
        df: DataFrame with a ``headlines`` column. It is modified in place.
        default_score: Fallback score for headlines that cannot be scored.
            Defaults to 2, the value this function has always used.

    Returns:
        The same ``df`` object, now carrying the ``scores`` column.
    """
    scores = []
    # Walk the column itself rather than iterrows(): only the headline is
    # needed, and iterrows() builds a full Series for every row.
    for headline in df['headlines']:
        score = default_score
        if isinstance(headline, str):
            value = int(get_sentiment(headline))
            if value != -1:
                score = value
        scores.append(score)

    df['scores'] = scores

    return df

In [63]:
def save_data(df, topic, year):
    """Write ``df`` to ``./data/processed-sent/<topic>/<year>/hdsentiment.csv``.

    The target directory (including missing parents) is created first; an
    already existing directory is not an error. The CSV is written without
    the index column.

    Args:
        df: DataFrame to persist.
        topic: Topic directory name (str).
        year: Year directory name (str).
    """
    out_dir = ''.join(('./data/processed-sent/', topic, '/', year))
    Path(out_dir).mkdir(parents=True, exist_ok=True)

    target = out_dir + '/hdsentiment.csv'
    df.to_csv(target, index=False)

In [64]:
def save_sent_data(topic, year):
    """Run the load -> score -> save pipeline for one topic and year.

    Loads the frame with ``get_data``, adds scores with ``get_scores`` and
    hands the result to ``save_data`` under the same topic and year.
    """
    scored = get_scores(get_data(topic, year))
    save_data(scored, topic, year)

In [65]:
# Score and save the 'candidates' headlines for each listed year.
for year in ('2016', '2012', '2008', '2004', '2000', '2020'):
    save_sent_data('candidates', year)

In [66]:
# Score and save the 'immigration' headlines, printing each year as progress.
for year in ('2016', '2012', '2008', '2004', '2000', '2020'):
    print(year)
    save_sent_data('immigration', year)

2016
2012
2008
2004
2000
2020


In [67]:
# Score and save the 'health' headlines, printing each year as progress.
for year in ('2016', '2012', '2008', '2004', '2000', '2020'):
    print(year)
    save_sent_data('health', year)

2016
2012
2008
2004
2000
2020


In [68]:
# Score and save the 'jobs-race' headlines, printing each year as progress.
for year in ('2016', '2012', '2008', '2004', '2000', '2020'):
    print(year)
    save_sent_data('jobs-race', year)

2016
2012
2008
2004
2000
2020


In [69]:
# Score and save the 'environment' headlines, printing each year as progress.
for year in ('2016', '2012', '2008', '2004', '2000', '2020'):
    print(year)
    save_sent_data('environment', year)

2016
2012
2008
2004
2000
2020


In [70]:
# Score and save the 'guns' headlines, printing each year as progress.
for year in ('2016', '2012', '2008', '2004', '2000', '2020'):
    print(year)
    save_sent_data('guns', year)

2016
2012
2008
2004
2000
2020


In [71]:
# Score and save the 'party' headlines, printing each year as progress.
for year in ('2016', '2012', '2008', '2004', '2000', '2020'):
    print(year)
    save_sent_data('party', year)

2016
2012
2008
2004
2000
2020


In [72]:
# Score and save the 'economy' headlines, printing each year as progress.
for year in ('2016', '2012', '2008', '2004', '2000', '2020'):
    print(year)
    save_sent_data('economy', year)

2016
2012
2008
2004
2000
2020


In [84]:
#candidate_df.head()

Unnamed: 0,headlines,terms,scores
0,gore selects lieberman as running mate,gore,2
1,mini-bounce leaves bush in danger,bush,2
2,gore goes for safety,gore,2
3,gore allies himself to integrity,gore,2
4,gore ditches clinton legacy,gore,2


In [85]:
#candidate_df[['terms']].groupby(['scores']).agg(['mean', 'count'])

#candidate_df.groupby(['terms']).agg({'scores': ['mean', 'count', 'min', 'max']})

Unnamed: 0_level_0,scores,scores,scores,scores
Unnamed: 0_level_1,mean,count,min,max
terms,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
bush,1.9375,80,1,3
george,1.866667,15,1,3
gore,1.857143,91,1,3
