## Prepare training data
Training data will be a pandas dataframe grouped by dates. Each date will correspond to all the words of all the articles that day. 
1. TODO: How do we handle weekends? (May cause problems in continuous data)
2. TODO: Drop the days that do not have any articles.

In [1]:
import os
import pickle
import sys

import numpy as np
import pandas as pd

sys.path.insert(0, '../')
import stock_data

In [2]:
import datetime

In [3]:
# Read the pickled article records from disk into `raw`
with open("nyt_1800.pkl", mode="rb") as pickle_file:
    raw = pickle.load(pickle_file)

In [4]:
# Tabulate the unpickled records under explicit column names
article_columns = ["link", "time", "words"]
df = pd.DataFrame(data=raw, columns=article_columns)

In [5]:
# Parse the "time" column as datetimes, order the rows chronologically,
# and use the timestamp as the index.
df = (
    df.assign(time=pd.to_datetime(df["time"]))
      .sort_values("time")
      .set_index("time")
)

In [6]:
# Preview the first rows of the time-indexed article frame
df.head()

Unnamed: 0_level_0,link,words
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-09-27 00:31:19,https://www.nytimes.com/2018/09/26/sports/manc...,"[MANCHESTER, England, It, was, past, midnight,..."
2018-09-27 04:03:26,https://www.nytimes.com/2018/09/27/briefing/eu...,"[Want, to, get, this, briefing, by, email, Her..."
2018-09-27 04:41:51,https://www.nytimes.com/2018/09/27/world/austr...,"[The, Australia, Letter, is, a, weekly, newsle..."
2018-09-27 09:00:05,https://www.nytimes.com/2018/09/27/travel/how-...,"[When, looking, for, restaurant, recommendatio..."
2018-09-27 09:44:58,https://www.nytimes.com/2018/09/27/briefing/br...,"[Want, to, get, this, briefing, by, email, Her..."


In [7]:
# Flatten each article's word list into one comma-separated string.
# An article whose word list is None becomes the empty string.
def _join_words(words):
    """Return the items of ``words`` joined by commas, or "" when ``words`` is None."""
    if words is None:
        return ""
    return ','.join(str(word) for word in words)

df['liststring'] = [_join_words(words) for words in df['words']]
df = df.drop(columns=['words'])

In [8]:
# Concatenate all article strings of a single day into one comma-separated string.
# Joining with ',' (instead of summing the strings) keeps the last word of one
# article from fusing with the first word of the next article. Articles with no
# words (empty strings) are skipped so they do not produce empty tokens.
post_df = df.groupby(pd.Grouper(freq='D'))['liststring'].apply(
    lambda day: ','.join(text for text in day if text)
)

In [9]:
# Date range of the daily article series: first and last index entries
print(post_df.head(1).index, post_df.tail(1).index, sep="\n")

DatetimeIndex(['2018-09-27'], dtype='datetime64[ns]', name='time', freq='D')
DatetimeIndex(['2019-05-17'], dtype='datetime64[ns]', name='time', freq='D')


## Prepare labels (which is the stock data) 

In [64]:
#Get the stock data to use as our labels.
# The API key is read from the environment so that the credential is not
# stored in (and shared with) the notebook.
api_key = os.environ.get('STOCK_DATA_API_KEY')
if not api_key:
    raise RuntimeError(
        "Set the STOCK_DATA_API_KEY environment variable to your stock data API key"
    )
label = stock_data.get_stock_data('GOOG', api_key)

In [65]:
# Date range of the stock data: first and last index entries
print(label.head(1).index, label.tail(1).index, sep="\n")

DatetimeIndex(['2014-03-27'], dtype='datetime64[ns]', name='date', freq=None)
DatetimeIndex(['2019-05-21'], dtype='datetime64[ns]', name='date', freq=None)


In [66]:
#Truncate the label date range to the span covered by the post_df series.
# The bounds are taken from post_df's index rather than hard-coded dates, so
# they stay correct if the article data is regenerated for a different period.
label = label.truncate(before=post_df.index.min(), after=post_df.index.max())

In [67]:
# Remove the 'ds' column. Reassigning (instead of dropping in place) together
# with errors='ignore' lets this cell be re-run without raising a KeyError
# once the column is already gone.
label = label.drop(columns=['ds'], errors='ignore')
label.head(3)

Unnamed: 0_level_0,open,high,low,close,volume
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-09-27,1186.73,1202.1,1183.63,1194.64,1260754
2018-09-28,1191.87,1195.41,1184.5,1193.47,1380629
2018-10-01,1199.89,1209.9,1190.3,1195.31,1357604


The stock data has no rows for weekends, whereas the article data does. TODO: decide how to handle this gap.

# Applying labels to article DataFrame

In [68]:
# Wrap the daily word strings in a frame and record each date's day of the week
sdf = post_df.to_frame(name='words')
sdf['dayofweek'] = sdf.index.dayofweek
sdf.head()

Unnamed: 0_level_0,words,dayofweek
time,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-09-27,"MANCHESTER,England,It,was,past,midnight,when,J...",3
2018-09-28,"You,may,have,noticed,a,bold,advertisement,in,T...",4
2018-09-29,"LONDON,The,British,telephone,box,is,not,dead,y...",5
2018-09-30,"Stephanie,Audrey,Friedman,and,Stanley,Gartshei...",6
2018-10-01,"A,new,sitcom,airs,on,CBS,while,a,new,documenta...",0


In [69]:
# Add a "last" column holding each row's date minus one calendar day,
# then keep only the "open" and "last" columns.
one_day = pd.Timedelta(days=1)
label = (
    label.assign(last=label.index - one_day)
         .drop(columns=['high', 'low', 'close', 'volume'])
)
label.head()

Unnamed: 0_level_0,open,last
date,Unnamed: 1_level_1,Unnamed: 2_level_1
2018-09-27,1186.73,2018-09-26
2018-09-28,1191.87,2018-09-27
2018-10-01,1199.89,2018-09-30
2018-10-02,1190.96,2018-10-01
2018-10-03,1205.0,2018-10-02


In [83]:
# Pair each row of `label` with the row whose date equals its "last" value
# (the previous calendar day) and subtract the two open prices.
# The result is indexed by "last_left", i.e. the earlier of the two dates.
# NOTE: this is an inner merge, so a day whose previous calendar day has no row
# in `label` (e.g. the first trading day after a weekend) gets no delta.
# The former `rename({'open_left': 'date'})` call was removed: without
# `columns=` it targets index labels, so it never changed anything.
deltas = label.merge(label, left_on='last', right_on='date', suffixes=('_left', '_right'))
deltas = deltas.set_index('last_left', drop=True)
deltas['delta'] = pd.to_numeric(deltas['open_left']) - pd.to_numeric(deltas['open_right'])
deltas = deltas.drop(columns=['open_left', 'open_right', 'last_right'])
display(deltas.shape)
deltas.head()

(122, 1)

Unnamed: 0_level_0,delta
last_left,Unnamed: 1_level_1
2018-09-27,5.14
2018-10-01,-8.93
2018-10-02,14.04
2018-10-03,-9.67
2018-10-04,-27.83


In [86]:
# Left-join the "open" prices from `label` onto the per-day article frame by
# date index; dates without a row in `label` get a missing "open" value.
# (The `deltas` frame is not used in this join.)
open_prices = label.drop(columns=['last'])
proc_df = pd.merge(sdf, open_prices, how='left', left_index=True, right_index=True)
display(proc_df.head())
display(proc_df.shape)
proc_df.dtypes

Unnamed: 0_level_0,words,dayofweek,open
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-09-27,"MANCHESTER,England,It,was,past,midnight,when,J...",3,1186.73
2018-09-28,"You,may,have,noticed,a,bold,advertisement,in,T...",4,1191.87
2018-09-29,"LONDON,The,British,telephone,box,is,not,dead,y...",5,
2018-09-30,"Stephanie,Audrey,Friedman,and,Stanley,Gartshei...",6,
2018-10-01,"A,new,sitcom,airs,on,CBS,while,a,new,documenta...",0,1199.89


(233, 3)

words        object
dayofweek     int64
open         object
dtype: object

In [None]:
# Show the merged article/open-price frame
proc_df