In [124]:
import csv
from datetime import date, timedelta
from pickle import dump, load
import calendar
import numpy as np


STOCK_INDEX_FILES = ['data/DJIA.csv', 'data/NASDAQCOM.csv', 'data/SP500.csv']

# Labels
NO_CURRENT_VALUE = 'No current value'
NO_NEXT_VALUE = 'No next value'
UP = 'Up'
DOWN = 'Down'

def generate_date_dicts():
    for name in STOCK_INDEX_FILES:
        with open(name, 'r') as f:
            date_dict = {}
            reader = csv.reader(f)
            reader.__next__() # get rid of header
            for row in reader:
                if row[1] == '.': # no entry in stock file
                    continue
                d = date(*list(map(int, row[0].split('-')))) # format date into datetime object
                value = float(row[1])
                date_dict[d] = value

        with open(name.replace('.csv', NAME_EXTENSION), 'wb') as f:
            dump(date_dict, f, protocol=2)


def get_date_dict(path):
    date_dict = {}
    with open(path, 'r') as f:
            reader = csv.reader(f)
            reader.__next__() # get rid of header
            for row in reader:
                if row[1] == '.': # no entry in stock file
                    continue
                d = date(*list(map(int, row[0].split('-')))) # format date into datetime object
                value = float(row[1])
                date_dict[d] = value
    return date_dict

def get_date_window(date_dict, center_date, window_size=4):
    values = []
    i = 0
    while len(values) < window_size:
        time_delta = timedelta(days=i, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=0, weeks=0)
        d = center_date - time_delta
        if d in date_dict:
            values.append(date_dict[d])
        i += 1
    values.reverse()
    return values

def get_next_date_value(date_dict, center_date):
    time_delta = timedelta(days=1, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=0, weeks=0)
    d = center_date + time_delta
    if d not in date_dict:
        return None
    return date_dict[d]

def get_wh_data():
    month_to_int = {name: num for num, name in enumerate(calendar.month_name) if num}
    dates = []
    titles = []
    bodies = []
    with open('data/WH_posts.csv', 'r') as f:
        reader = csv.reader(f)
        reader.__next__()
        for row in reader:
            d = row[0].replace(',','').split()
            d = date(*[int(d[2]), month_to_int[d[0]], int(d[1])])
            dates.append(d)
            titles.append(row[1])
            bodies.append(row[2])

    data = {'dates':dates, 'titles':titles, 'bodies':bodies}
    with open('data/WH_posts_structured', 'wb') as f:
        dump(data, f, protocol=2)
        
    return dates, titles, bodies


def generate_labels(date_dict, wh_dates):
    labels = []
    for d in wh_dates:
        if d in test_dict:
            cur_value = date_dict[d]
        else:
            labels.append(NO_CURRENT_VALUE)
            continue

        next_value = get_next_date_value(test_dict, d)
        if next_value:
            labels.append(UP if (next_value - cur_value) >= 0 else DOWN)
        else:
            labels.append(NO_NEXT_VALUE)
    return labels

def filter_labels(labels):
    labels = np.array(labels)
    labels = labels[np.where(labels != NO_CURRENT_VALUE)[0]]
    labels = labels[np.where(labels != NO_NEXT_VALUE)[0]]
    return labels


In [125]:
generate_wh_data()

How do we want to handle labeling? Do we want to average the three indexes together or generate 3 different labels?

In [126]:
wh_dates, wh_titles, wh_bodies = get_wh_data()
dj_dict = get_date_dict('data/DJIA.csv')
nd_dict = get_date_dict('data/NASDAQCOM.csv')
sp_dict = get_date_dict('data/SP500.csv')

dj_labels = filter_labels(generate_labels(dj_dict, wh_dates))
nd_labels = filter_labels(generate_labels(nd_dict, wh_dates))
sp_labels = filter_labels(generate_labels(sp_dict, wh_dates))

In [130]:

print("There are %d labels total" %len(dj_labels))
print("No label: %d" % (len(np.where(dj_labels == NO_CURRENT_VALUE)[0]) + len(np.where(dj_labels == NO_NEXT_VALUE)[0])))
print("There are %d positive labels" % (499-159))

index_differences = np.where(dj_labels != nd_labels)[0]
print(len(np.where(dj_labels != nd_labels)[0]))
print(len(np.where(dj_labels != sp_labels)[0]))
print(len(np.where(nd_labels != sp_labels)[0]))
print(dj_labels[index_differences])
print(sp_labels[index_differences])

There are 340 labels total
No label: 0
There are 340 positive labels
169
169
0
['Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down