In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tokenize import word_tokenize




In [6]:
df = pd.read_csv('data/dataset.csv').dropna()

In [7]:
df

Unnamed: 0,Date,Title,Body,Dow Jones Value,Dow Jones Delta,Nasdaq Value,Nasdaq Delta,S&P 500 Value,S&P 500 Delta
0,2017-04-06,Notice Regarding the Continuation of the Natio...,NOTICE- - - - - - -CONTINUATION OF THE NATIONA...,20662.949219,-6.849610,5878.950195,-1.140136,2357.489990,-1.949951
1,2017-04-06,Message to the Congress Regarding the Continua...,TO THE CONGRESS OF THE UNITED STATES:Section20...,20662.949219,-6.849610,5878.950195,-1.140136,2357.489990,-1.949951
2,2017-04-06,1600 Daily: Everything White House for 4/6/17,"Summary:Get news, events and updates from the ...",20662.949219,-6.849610,5878.950195,-1.140136,2357.489990,-1.949951
3,2017-04-06,Vice President Mike Pence to Travel to the Rep...,"WASHINGTON, DC - Vice President Mike Pence wil...",20662.949219,-6.849610,5878.950195,-1.140136,2357.489990,-1.949951
4,2017-04-06,White House History: White House Easter Egg Roll,Continuing the timeless tradition of the White...,20662.949219,-6.849610,5878.950195,-1.140136,2357.489990,-1.949951
5,2017-04-06,Remarks by President Trump and Vice President ...,East Room9:28 A.M. EDTTHE VICE PRESIDENT: To a...,20662.949219,-6.849610,5878.950195,-1.140136,2357.489990,-1.949951
6,2017-04-06,"Photo of the Day: April 6, 2017",President Donald Trump and King Abdullah II of...,20662.949219,-6.849610,5878.950195,-1.140136,2357.489990,-1.949951
7,2017-04-05,Readout of President Donald J. Trump’s Call wi...,President Donald J. Trump spoke today with Pri...,20648.150391,14.798828,5864.479980,14.470215,2352.949951,4.540039
8,2017-04-05,Statement from the Press Secretary on Gold Sta...,"On this day in 1945, nearing the end of World ...",20648.150391,14.798828,5864.479980,14.470215,2352.949951,4.540039
9,2017-04-05,Memorandum: Implementing Executive Order 13771...,M-17-21MEMORANDUM FOR:REGULATORY POLICY OFFICE...,20648.150391,14.798828,5864.479980,14.470215,2352.949951,4.540039


In [42]:
dates = set(df['Date'])

In [43]:
posts = {}
titles = {}
for index,row in df.iterrows():
    date = row['Date']
    if date not in posts:
        posts[date] = []
        titles[date] = []
    posts[date].append(row['Body'])
    titles[date].append(row['Title'])
    
posts = {date:' '.join(posts[date]) for date in posts}
titles = {date: ' '.join(titles[date]) for date in titles}

posts = pd.DataFrame(list(posts.items()))
posts.columns = ['Date', 'Body']

titles = pd.DataFrame(list(titles.items()))
titles.columns = ['Date', 'Title']

dj_deltas = df[['Date', 'Dow Jones Delta']].drop_duplicates()
nd_deltas = df[['Date', 'Nasdaq Delta']].drop_duplicates()
sp_deltas = df[['Date', 'S&P 500 Delta']].drop_duplicates()

In [45]:
dataset = pd.merge(posts, titles, how='inner', on=['Date'])
dataset = pd.merge(dataset, dj_deltas, how='inner', on=['Date'])
dataset = pd.merge(dataset, nd_deltas, how='inner', on=['Date'])
dataset = pd.merge(dataset, sp_deltas, how='inner', on=['Date'])

dataset['Body'] = dataset['Body'].str.replace('\d+', '')
dataset['Title'] = dataset['Title'].str.replace('\d+', '')
dataset['Mean'] = (dataset['Dow Jones Delta'] + dataset['Nasdaq Delta'] + dataset['S&P 500 Delta']) / 3
dataset['Labels'] = (dataset['Mean'] >= 0)


dataset

Unnamed: 0,Date,Body,Title,Dow Jones Delta,Nasdaq Delta,S&P 500 Delta,Mean
0,2017-03-02,Aboard Air Force OneEn Route Joint Base Andrew...,Press Gaggle by Press Secretary Sean Spicer en...,-2.740235,-9.529785,-1.200195,-4.490072
1,2017-04-03,President Donald J. Trump today welcomed Presi...,Readout of President Donald J. Trump’s Meeting...,-39.029296,-3.929687,-1.319824,-14.759602
2,2017-03-27,James S. Brady Press Briefing Room: P.M. EDTSE...,Background Briefing on the President's Energy ...,-150.519531,-34.77002,-16.97998,-67.423177
3,2017-03-20,"Today, President Donald J. Trump welcomed Iraq...",Joint Readout of Meeting Between President Don...,237.849609,107.699707,29.449951,124.999756
4,2017-03-22,The Vice President met today with Secretary of...,Readout of the Vice President's Meeting with U...,4.720703,3.950196,2.48999,3.720296
5,2017-03-06,President Donald J. Trump spoke separately tod...,Readout of the President's Calls with Prime Mi...,29.580078,15.25,6.920166,17.250081
6,2017-03-03,Its been just six weeks since President Donald...,President Donald J. Trump Delivers the Weekly ...,51.371094,21.569824,7.810058,26.916992
7,2017-02-08,President Donald J. Trump today provided a let...,Statement from the Press Secretary First Lady ...,-118.060547,-32.729981,-13.200195,-54.663574
8,2017-02-15,"Today, President Donald J. Trump welcomed Isra...",Joint Readout of Meeting Between President Don...,-7.910156,4.540039,2.030029,-0.446696
9,2017-03-30,The President today welcomed Prime Minister La...,Joint Readout of Meeting Between President Don...,65.269531,2.59961,5.340088,24.403076


In [81]:
vectorizer = CountVectorizer(input=u'content', encoding=u'utf-8', 
                             decode_error=u'strict', strip_accents=None, 
                             lowercase=True, preprocessor=None, 
                             tokenizer=word_tokenize, stop_words='english', 
                             token_pattern=u'(?u)\b\w\w+\b', 
                             ngram_range=(1, 1), analyzer=u'word', 
                             max_df=1.0, min_df=1, max_features=None)

dataset['Body Tokenized'] = vectorizer.fit_transform(dataset['Body'])
dataset['Title Tokenized'] = vectorizer.transform(dataset['Title'])


In [82]:
dataset

Unnamed: 0,Date,Body,Title,Dow Jones Delta,Nasdaq Delta,S&P 500 Delta,Mean,Labels,Body Tokenized,Title Tokenized
0,2017-03-02,Aboard Air Force OneEn Route Joint Base Andrew...,Press Gaggle by Press Secretary Sean Spicer en...,-2.740235,-9.529785,-1.200195,-4.490072,False,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
1,2017-04-03,President Donald J. Trump today welcomed Presi...,Readout of President Donald J. Trump’s Meeting...,-39.029296,-3.929687,-1.319824,-14.759602,False,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
2,2017-03-27,James S. Brady Press Briefing Room: P.M. EDTSE...,Background Briefing on the President's Energy ...,-150.519531,-34.77002,-16.97998,-67.423177,False,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
3,2017-03-20,"Today, President Donald J. Trump welcomed Iraq...",Joint Readout of Meeting Between President Don...,237.849609,107.699707,29.449951,124.999756,True,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
4,2017-03-22,The Vice President met today with Secretary of...,Readout of the Vice President's Meeting with U...,4.720703,3.950196,2.48999,3.720296,True,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
5,2017-03-06,President Donald J. Trump spoke separately tod...,Readout of the President's Calls with Prime Mi...,29.580078,15.25,6.920166,17.250081,True,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
6,2017-03-03,Its been just six weeks since President Donald...,President Donald J. Trump Delivers the Weekly ...,51.371094,21.569824,7.810058,26.916992,True,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
7,2017-02-08,President Donald J. Trump today provided a let...,Statement from the Press Secretary First Lady ...,-118.060547,-32.729981,-13.200195,-54.663574,False,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
8,2017-02-15,"Today, President Donald J. Trump welcomed Isra...",Joint Readout of Meeting Between President Don...,-7.910156,4.540039,2.030029,-0.446696,False,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
9,2017-03-30,The President today welcomed Prime Minister La...,Joint Readout of Meeting Between President Don...,65.269531,2.59961,5.340088,24.403076,True,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."


### Preprocessing
- ~~Label~~ 
- ~~Remove Stopwords~~
- Given a label, which words are most common
- What were the sentiments of the most common words per category
- Name-Entities mentioned
- Cluster topics where we saw the difference in movements, cluster then color

### Featurizing

### Models

In [46]:
dataset['Labels'] = (dataset['Mean'] >= 0)
dataset

Unnamed: 0,Date,Body,Title,Dow Jones Delta,Nasdaq Delta,S&P 500 Delta,Mean,Labels
0,2017-03-02,Aboard Air Force OneEn Route Joint Base Andrew...,Press Gaggle by Press Secretary Sean Spicer en...,-2.740235,-9.529785,-1.200195,-4.490072,False
1,2017-04-03,President Donald J. Trump today welcomed Presi...,Readout of President Donald J. Trump’s Meeting...,-39.029296,-3.929687,-1.319824,-14.759602,False
2,2017-03-27,James S. Brady Press Briefing Room: P.M. EDTSE...,Background Briefing on the President's Energy ...,-150.519531,-34.77002,-16.97998,-67.423177,False
3,2017-03-20,"Today, President Donald J. Trump welcomed Iraq...",Joint Readout of Meeting Between President Don...,237.849609,107.699707,29.449951,124.999756,True
4,2017-03-22,The Vice President met today with Secretary of...,Readout of the Vice President's Meeting with U...,4.720703,3.950196,2.48999,3.720296,True
5,2017-03-06,President Donald J. Trump spoke separately tod...,Readout of the President's Calls with Prime Mi...,29.580078,15.25,6.920166,17.250081,True
6,2017-03-03,Its been just six weeks since President Donald...,President Donald J. Trump Delivers the Weekly ...,51.371094,21.569824,7.810058,26.916992,True
7,2017-02-08,President Donald J. Trump today provided a let...,Statement from the Press Secretary First Lady ...,-118.060547,-32.729981,-13.200195,-54.663574,False
8,2017-02-15,"Today, President Donald J. Trump welcomed Isra...",Joint Readout of Meeting Between President Don...,-7.910156,4.540039,2.030029,-0.446696,False
9,2017-03-30,The President today welcomed Prime Minister La...,Joint Readout of Meeting Between President Don...,65.269531,2.59961,5.340088,24.403076,True


In [61]:
body_list = dataset['Body'].tolist()

In [75]:
tokenized_body = vectorizer.fit_transform(body_list)

In [76]:
tokenized_title = vectorizer.fit_transform(dataset['Title'])

In [77]:
tokenized_title

<40x935 sparse matrix of type '<class 'numpy.int64'>'
	with 2558 stored elements in Compressed Sparse Row format>

In [78]:
tokenized_body.

<40x20221 sparse matrix of type '<class 'numpy.int64'>'
	with 86400 stored elements in Compressed Sparse Row format>

In [80]:
vectorizer.get_feature_names()

['#',
 '$',
 '&',
 "'",
 "''",
 "'s",
 '(',
 ')',
 ',',
 '-',
 '--',
 '.',
 '//',
 '/faqs/talking',
 '/interim',
 ':',
 '?',
 '``',
 'abbas',
 'abdel',
 'abdulaziz',
 'abdullah',
 'abe',
 'aboard',
 'abuse',
 'access',
 'accountability',
 'accurate',
 'achieving',
 'acosta',
 'act',
 'acting',
 'action',
 'actions',
 'activities',
 'addiction',
 'additional',
 'additions',
 'address',
 'administration',
 'administrator',
 'advisor',
 'advocates',
 'aeronautics',
 'affairs',
 'afghanistan',
 'africa',
 'african',
 'agencies”',
 'agenda',
 'agriculture',
 'aircraft',
 'al',
 'al-abadi',
 'al-ahmad',
 'al-jabir',
 'al-sabah',
 'al-sisi',
 'al-thani',
 'alan',
 'alexander',
 'ally',
 'ambassador',
 'america',
 'american',
 'americans',
 'amir',
 'amul',
 'analysis',
 'andrew',
 'andrews',
 'angela',
 'anna',
 'anniversary',
 'announce',
 'announcement',
 'announces',
 'anti-crime',
 'antidumping',
 'ap',
 'appeals',
 'appointment',
 'appointments',
 'approach',
 'appropriations',
 'approve