In [301]:
import csv
from datetime import date, timedelta
from pickle import dump, load
import numpy as np
import pandas as pd
from yahoo_finance import Share
import requests
import bs4


STOCK_INDEX_FILES = ['data/DJIA.csv', 'data/NASDAQCOM.csv', 'data/SP500.csv']
INDEX_NAMES = {'Nasdaq': '^IXIC', 'Dow Jones': '^DJI', 'S&P 500': '^GSPC'}


# Labels
NO_CURRENT_VALUE = 'No current value'
NO_NEXT_VALUE = 'No next value'
UP = 'Up'
DOWN = 'Down'

def get_date_window(date_dict, center_date, window_size=4):
    values = []
    i = 0
    while len(values) < window_size:
        time_delta = timedelta(days=i, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=0, weeks=0)
        d = center_date - time_delta
        if d in date_dict:
            values.append(date_dict[d])
        i += 1
    values.reverse()
    return values

def get_next_date_value(date_dict, center_date):
    time_delta = timedelta(days=1, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=0, weeks=0)
    d = center_date + time_delta
    if d not in date_dict:
        return None
    return date_dict[d]


def generate_labels(date_dict, wh_dates):
    labels = []
    for d in wh_dates:
        if d in date_dict:
            cur_value = date_dict[d]
        else:
            labels.append(NO_CURRENT_VALUE)
            continue

        next_value = get_next_date_value(date_dict, d)
        if next_value:
            labels.append(UP if (next_value - cur_value) >= 0 else DOWN)
        else:
            labels.append(NO_NEXT_VALUE)
    return labels

def filter_labels(labels):
    labels = np.array(labels)
    labels = labels[np.where(labels != NO_CURRENT_VALUE)[0]]
    labels = labels[np.where(labels != NO_NEXT_VALUE)[0]]
    return labels

def get_wh_data():
    # new list to append urls
    post_urls = []
    url = "https://www.whitehouse.gov"

    # 50 pages of posts
    for i in np.arange(50):
        #grab url and append page number
        page_url = url + "/blog?page=" + str(i)
        r = requests.get(page_url)
        soup = bs4.BeautifulSoup(r.content.decode('utf-8'), "html.parser")
        # h3 field-content is the tag to get post urls
        page_posts = soup.find_all("h3", "field-content")
        # dates for the posts
        post_dates = soup.find_all("span", "field-content")
        # append page's post urls to our list
        post_urls.append([(post.text, post.find('a')['href'], date.text) for post, date in zip(page_posts, post_dates)])

    # flatten list
    all_posts = [p for post in post_urls for p in post]

    posts = []

    # loop through every post and get text in post
    for post_data in all_posts:
        req = requests.get(url + post_data[1])
        req_soup = bs4.BeautifulSoup(req.text, "html.parser")
        post_body = req_soup.find("div","pane-entity-field").text\
        .encode('ascii', 'ignore').decode('UTF-8').replace("\n", "").replace("\t", "")
        posts.append(post_body)

    df_out = pd.DataFrame({'a': [post[2].strip() for post in all_posts], 'b': [post[0] for post in all_posts], 'c': [post.strip() for post in posts]})
    df_out.columns = ['Date', 'Title', 'Body']

    df_out.to_csv('data/WH_posts.csv', index=False)
    return df_out

def get_stock_values(stock_abbrv):
    """
    Given the stock abbrevation, this function
    will pull from Yahoo Finance the history of that 
    stock from 2016-12-25 to present day
    """
    share = Share(stock_abbrv)
    share_history = share.get_historical('2016-12-25', date.isoformat(date.today()))
    df = pd.DataFrame([[s['Date'], float(s['Close'])] for s in share_history], columns=['Date', 'Value'])
    df['Date'] = pd.to_datetime(df['Date'])
    return df
    
def merge_texts(wh_df):
    """
    Merges the text fields of posts
    that are on the same date.
    """
    date_to_bodies = {}
    date_to_titles = {}
    dates = pd.to_datetime(wh_df['Date'])
    titles = wh_df['Title']
    bodies = wh_df['Body']
    for i in range(len(dates)):
        if dates[i] not in date_to_titles:
            date_to_titles[dates[i]] = ''
            date_to_bodies[dates[i]] = ''
        date_to_titles[dates[i]] += titles[i] + ' '
        date_to_bodies[dates[i]] += bodies[i] + ' '
        
    title_df = pd.DataFrame(list(date_to_titles.items()), columns=['Date', 'Title'])
    body_df = pd.DataFrame(list(date_to_bodies.items()), columns=['Date', 'Body'])
    wh_df = pd.merge(title_df, body_df, how='inner', on=['Date'])
    
    return wh_df

def create_dataset():
    """
    Pulls closing values from Yahoo finance and
    matches those values to white house posts using
    the date field. Returns a dataframe for easy
    manipulation.
    """
    wh_df = get_wh_data()
    wh_df['Date'] = pd.to_datetime(wh_df['Date'])
        
    stock_dfs = {name:get_stock_values(INDEX_NAMES[name]) for name in INDEX_NAMES}
    processed_stock_dfs = []
    for name in stock_dfs:
        stock_df = stock_dfs[name]
        dates = stock_df['Date']
        values = stock_df['Value'].rename(name + ' Value')
        stock_df = pd.concat([dates, values], axis=1, join_axes=[dates.index])
        stock_df.sort_values(by='Date')
        
        stock_df[name + ' Delta'] = stock_df[name + ' Value'].diff(periods=1)
        processed_stock_dfs.append(stock_df)     

        
    wh_df['Date'] = pd.to_datetime(wh_df['Date'])
    stock_df = processed_stock_dfs[0]
    for i in range(1, len(stock_dfs)):
        stock_df = pd.merge(stock_df, processed_stock_dfs[i], how='inner', on=['Date'])
    
    dataset = pd.merge(wh_df, stock_df, how='inner', on=['Date'])
    
    return dataset.sort_values(by='Date').reset_index(drop=True)
    
    
    

How do we want to handle labeling? Do we want to average the three indexes together or generate 3 different labels?

In [22]:
wh_dates, wh_titles, wh_bodies = get_wh_data()
dj_dict = get_date_dict('data/DJIA.csv')
nd_dict = get_date_dict('data/NASDAQCOM.csv')
sp_dict = get_date_dict('data/SP500.csv')

dj_labels = filter_labels(generate_labels(dj_dict, wh_dates))
nd_labels = filter_labels(generate_labels(nd_dict, wh_dates))
sp_labels = filter_labels(generate_labels(sp_dict, wh_dates))

In [24]:
print("There are %d labels total" %len(dj_labels))
print("No label: %d" % (len(np.where(dj_labels == NO_CURRENT_VALUE)[0]) + len(np.where(dj_labels == NO_NEXT_VALUE)[0])))
print("There are %d positive labels" % (499-159))

index_differences = np.where(dj_labels != nd_labels)[0]
print(len(np.where(dj_labels != nd_labels)[0]))
print(len(np.where(dj_labels != sp_labels)[0]))
print(len(np.where(nd_labels != sp_labels)[0]))
print(dj_labels[index_differences])


There are 340 labels total
No label: 0
There are 340 positive labels
121
64
77
['Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up'
 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Down'
 'Down' 'Down' 'Down' 'Down' 'Down' 'Down' 'Up' 'Up' 'Up' 'Up' 'Up' 'Up'
 'Up' 'Up' 'Up' 'Up']


In [302]:
dataset = create_dataset()
dataset

Unnamed: 0,Date,Title,Body,S&P 500 Value,S&P 500 Delta,Nasdaq Value,Nasdaq Delta,Dow Jones Value,Dow Jones Delta
0,2017-02-08,Remarks by President Trump at MCCA Winter Conf...,"J.W. MarriottWashington, D.C.9:18 A.M. ESTTHE ...",2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
1,2017-02-08,Statement from the Press Secretary,President Donald J. Trump today provided a let...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
2,2017-02-08,First Lady Melania Trump Announces the Appoint...,First Lady Melania Trumpis pleased to announce...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
3,2017-02-08,Widespread Support for Judge Neil Gorsuch,All across the country editorial boards are pr...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
4,2017-02-08,Judge Neil Gorsuch is the Mainstream Choice fo...,Gorsuch in the MainstreamHe was upheld at the ...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
5,2017-02-08,Remarks by President Trump and Intel CEO Brian...,Oval Office 12:52 P.M. ESTTHE PRESIDENT: You'v...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
6,2017-02-08,Intel Announces $7 Billion Investment Creating...,The past 8 years have taken a toll on American...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
7,2017-02-08,"Press Briefing by Press Secretary Sean Spicer,...",James S. Brady Press Briefing Room1:48 P.M. ES...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
8,2017-02-09,Readout of the President’s Call with President...,President Donald J. Trump and President Xi Jin...,2307.870117,-8.229981,5715.180176,-18.949707,20172.400391,-96.968750
9,2017-02-09,Remarks by the Vice President at the Henry O. ...,"The United States Military AcademyWest Point, ...",2307.870117,-8.229981,5715.180176,-18.949707,20172.400391,-96.968750


In [284]:
test = pd.DataFrame(['January 5, 2017'])

In [303]:
dataset.dropna()

Unnamed: 0,Date,Title,Body,S&P 500 Value,S&P 500 Delta,Nasdaq Value,Nasdaq Delta,Dow Jones Value,Dow Jones Delta
0,2017-02-08,Remarks by President Trump at MCCA Winter Conf...,"J.W. MarriottWashington, D.C.9:18 A.M. ESTTHE ...",2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
1,2017-02-08,Statement from the Press Secretary,President Donald J. Trump today provided a let...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
2,2017-02-08,First Lady Melania Trump Announces the Appoint...,First Lady Melania Trumpis pleased to announce...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
3,2017-02-08,Widespread Support for Judge Neil Gorsuch,All across the country editorial boards are pr...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
4,2017-02-08,Judge Neil Gorsuch is the Mainstream Choice fo...,Gorsuch in the MainstreamHe was upheld at the ...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
5,2017-02-08,Remarks by President Trump and Intel CEO Brian...,Oval Office 12:52 P.M. ESTTHE PRESIDENT: You'v...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
6,2017-02-08,Intel Announces $7 Billion Investment Creating...,The past 8 years have taken a toll on American...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
7,2017-02-08,"Press Briefing by Press Secretary Sean Spicer,...",James S. Brady Press Briefing Room1:48 P.M. ES...,2294.669922,-13.200195,5682.450195,-32.729981,20054.339844,-118.060547
8,2017-02-09,Readout of the President’s Call with President...,President Donald J. Trump and President Xi Jin...,2307.870117,-8.229981,5715.180176,-18.949707,20172.400391,-96.968750
9,2017-02-09,Remarks by the Vice President at the Henry O. ...,"The United States Military AcademyWest Point, ...",2307.870117,-8.229981,5715.180176,-18.949707,20172.400391,-96.968750


In [None]:
d