In [1]:
import os
import re
import sys
from datetime import datetime
from time import time
import numpy as np
import pandas as pd
import multiprocessing as mp
import collections
import nltk
from nltk.corpus import stopwords
import pickle
from timeit import default_timer as timer
from operator import itemgetter
import more_itertools as mit
from multiprocessing import Pool

In [2]:
# Directory holding both the input pickles and the outputs written by this notebook.
path_to_data = '/scratch/spf248/news/data/'
# The input file name carries the current year-month, so the file that gets
# loaded depends on the date on which the notebook is executed.
input_file = ''.join(['reuters-econ-fin-mkt-', datetime.today().strftime('%Y-%m'), '.pkl.xz'])

In [3]:
# Load the pickled tone word lists (a mapping keyed by list name).
# NOTE: pickle.load can execute arbitrary code; only point this at a trusted file.
with open(path_to_data+'tone2keywords.pkl', mode='rb') as lexicon_file:
    tone2keywords = pickle.load(lexicon_file)

In [4]:
# Collapse each tone word table to a plain {word: IDF} dict, using the table's
# 'word' column as keys and its 'IDF' column as values.
# Entries that are already dicts are skipped so that re-running this cell does
# not raise AttributeError ('dict' object has no attribute 'set_index').
for name in tone2keywords:
    if isinstance(tone2keywords[name], dict):
        continue
    tone2keywords[name] = tone2keywords[name].set_index('word')['IDF'].to_dict()

In [5]:
# Iterating the mapping yields its keys, i.e. the names of the word lists.
print("Word Lists:", ', '.join(tone2keywords))

Word Lists: strong, positive, negative, uncertainty, weak


In [6]:
print("Import News...")
start = timer()
# Read the xz-compressed pickle and keep an independent copy of only the
# columns listed below.
columns_to_keep = ['date', 'regions', 'subjects', 'title', 'snippet', 'body']
articles = pd.read_pickle(path_to_data+input_file,compression='xz')[columns_to_keep].copy()
elapsed = round(timer() - start)
print("Done in", elapsed,"Sec")

Import News...
Done in 244 Sec


In [None]:
def get_counts(idx):
    """Compute the word count and tone scores of a single article.

    Reads the module-level ``articles`` frame and ``tone2keywords`` mapping,
    which lets the function be used as a one-argument worker (it is passed to
    ``Pool.map`` over ``articles.index``).

    Parameters
    ----------
    idx : index label
        Label of the article row in ``articles``.
        NOTE(review): assumes the label is unique in ``articles.index``; a
        duplicated label would make ``.loc`` return a frame instead of a row.

    Returns
    -------
    pandas.Series
        Named ``idx``, with the entries
        ``'# words'``            number of alphabetic tokens in
                                 title + snippet + body;
        ``'% <name>'``           for every word list (in sorted name order),
                                 occurrences of the list's words divided by
                                 the number of tokens;
        ``'% <name> tfidf'``     the same ratio with every occurrence weighted
                                 by the word's IDF value from the list.
        Both scores are 0 when no word of the list occurs (which includes
        articles with no tokens at all).
    """
    # Missing fields become empty strings so the three parts can be joined.
    text = ' '.join(list(articles.loc[idx, ['title','snippet','body']].replace(np.nan,'')))

    # Lower-case, turn every non-letter character into a space, split on whitespace.
    tokens = re.sub("[^a-zA-Z]"," ", text.lower()).split()

    # Occurrences of each distinct token.
    counter = collections.Counter(tokens)

    # Total number of tokens (equal to the sum of all per-token counts).
    T = len(tokens)

    values = [T]
    index  = ['# words']

    for name in sorted(tone2keywords):

        idf_by_word = tone2keywords[name]

        # Intersect the two key views directly instead of materialising a set
        # of the whole word list for every article. Sorting fixes the order of
        # the floating-point summation below, which would otherwise follow the
        # hash-randomised set iteration order and could vary between runs.
        words = sorted(counter.keys() & idf_by_word.keys())

        if words:

            # Per-word occurrence counts and matching IDF weights, aligned.
            counts = [counter[word] for word in words]
            idfs   = [idf_by_word[word] for word in words]

            tf    = sum(counts)/T
            tfidf = np.dot(counts,idfs)/T

        else:

            # Also covers T == 0, so no division by zero can happen above.
            tf = 0
            tfidf  = 0

        values.append(tf)
        index.append('% '+name)

        values.append(tfidf)
        index.append('% '+name+' tfidf')

    return pd.Series(values,index=index, name=idx)

In [None]:
print("Compute Sentiment...\n")
start = timer()

# Score every article in a pool of worker processes (Pool() with no argument
# starts one worker per CPU). map() preserves the order of articles.index, and
# each returned Series becomes one row of the frame.
with Pool() as pool:
    sentiments = pd.DataFrame(pool.map(get_counts, articles.index))

end = timer()
print("Done In", round(end - start),"Sec")

In [None]:
print("Save...\n")
start = timer()

# The output sits in the data directory and reuses the input file name with a
# 'sentiment-' prefix, xz-compressed like the input.
output_path = path_to_data+'sentiment-'+input_file
sentiments.to_pickle(output_path, compression='xz')

end = timer()
print("Done In", round(end - start),"Sec")