In [1]:
import numpy as np
import pandas as pd
import seaborn as sb
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tokenize import word_tokenize


In [37]:
datatypes = {'Dow Jones Value': np.float32,
            'Dow Jones Delta': np.float32,
            'Dow Jones Proportion': np.float32,
            'Nasdaq Value': np.float32,
            'Nasdaq Delta': np.float32,
            'Nasdaq Proportion': np.float32,
            'S&P 500 Value': np.float32,
            'S&P 500 Delta': np.float32,
            'S&P 500 Proportion': np.float32 
            }

df = pd.read_csv('data/dataset.csv', dtype=datatypes).dropna()

In [38]:
df

Unnamed: 0,Date,Title,Body,Nasdaq Value,Nasdaq Delta,Nasdaq Proportion,Dow Jones Value,Dow Jones Delta,Dow Jones Proportion,S&P 500 Value,S&P 500 Delta,S&P 500 Proportion
0,2017-04-06,Notice Regarding the Continuation of the Natio...,NOTICE- - - - - - -CONTINUATION OF THE NATIONA...,5878.950195,-1.140136,-0.000194,20662.949219,-6.849610,-0.000331,2357.489990,-1.949951,-0.000827
1,2017-04-06,Message to the Congress Regarding the Continua...,TO THE CONGRESS OF THE UNITED STATES:Section20...,5878.950195,-1.140136,-0.000194,20662.949219,-6.849610,-0.000331,2357.489990,-1.949951,-0.000827
2,2017-04-06,1600 Daily: Everything White House for 4/6/17,"Summary:Get news, events and updates from the ...",5878.950195,-1.140136,-0.000194,20662.949219,-6.849610,-0.000331,2357.489990,-1.949951,-0.000827
3,2017-04-06,Vice President Mike Pence to Travel to the Rep...,"WASHINGTON, DC - Vice President Mike Pence wil...",5878.950195,-1.140136,-0.000194,20662.949219,-6.849610,-0.000331,2357.489990,-1.949951,-0.000827
4,2017-04-06,White House History: White House Easter Egg Roll,Continuing the timeless tradition of the White...,5878.950195,-1.140136,-0.000194,20662.949219,-6.849610,-0.000331,2357.489990,-1.949951,-0.000827
5,2017-04-06,Remarks by President Trump and Vice President ...,East Room9:28 A.M. EDTTHE VICE PRESIDENT: To a...,5878.950195,-1.140136,-0.000194,20662.949219,-6.849610,-0.000331,2357.489990,-1.949951,-0.000827
6,2017-04-06,"Photo of the Day: April 6, 2017",President Donald Trump and King Abdullah II of...,5878.950195,-1.140136,-0.000194,20662.949219,-6.849610,-0.000331,2357.489990,-1.949951,-0.000827
7,2017-04-05,Readout of President Donald J. Trump’s Call wi...,President Donald J. Trump spoke today with Pri...,5864.479980,14.470215,0.002467,20648.150391,14.798828,0.000717,2352.949951,4.540039,0.001930
8,2017-04-05,Statement from the Press Secretary on Gold Sta...,"On this day in 1945, nearing the end of World ...",5864.479980,14.470215,0.002467,20648.150391,14.798828,0.000717,2352.949951,4.540039,0.001930
9,2017-04-05,Memorandum: Implementing Executive Order 13771...,M-17-21MEMORANDUM FOR:REGULATORY POLICY OFFICE...,5864.479980,14.470215,0.002467,20648.150391,14.798828,0.000717,2352.949951,4.540039,0.001930


In [20]:
dates = set(df['Date'])

In [39]:
def join_blog_posts_on_date(df):
    posts = {}
    titles = {}
    for index,row in df.iterrows():
        date = row['Date']
        if date not in posts:
            posts[date] = []
            titles[date] = []
        posts[date].append(row['Body'])
        titles[date].append(row['Title'])

    posts = {date:' '.join(posts[date]) for date in posts}
    titles = {date: ' '.join(titles[date]) for date in titles}

    posts = pd.DataFrame(list(posts.items()))
    posts.columns = ['Date', 'Body']

    titles = pd.DataFrame(list(titles.items()))
    titles.columns = ['Date', 'Title']

    dj_deltas = df[['Date', 'Dow Jones Delta']].drop_duplicates()
    nd_deltas = df[['Date', 'Nasdaq Delta']].drop_duplicates()
    sp_deltas = df[['Date', 'S&P 500 Delta']].drop_duplicates()

    dj_delta_prop = (df['Dow Jones Delta'] / df['Dow Jones Value']).drop_duplicates()
    nd_delta_prop = (df['Nasdaq Delta'] / df['Nasdaq Value']).drop_duplicates()
    sp_delta_prop = (df['S&P 500 Delta'] / df['S&P 500 Value']).drop_duplicates()

    dj_deltas['Dow Jones Proportion'] = dj_delta_prop
    nd_deltas['Nasdaq Proportion'] = nd_delta_prop
    sp_deltas['S&P 500 Proportion'] = sp_delta_prop
    
    dataset = pd.merge(posts, titles, how='inner', on=['Date'])
    dataset = pd.merge(dataset, dj_deltas, how='inner', on=['Date'])
    dataset = pd.merge(dataset, nd_deltas, how='inner', on=['Date'])
    dataset = pd.merge(dataset, sp_deltas, how='inner', on=['Date'])

    dataset['Body'] = dataset['Body'].str.replace('\d+', '')
    dataset['Title'] = dataset['Title'].str.replace('\d+', '')
    dataset['Mean'] = (dataset['Dow Jones Delta'] + dataset['Nasdaq Delta'] + dataset['S&P 500 Delta']) / 3
    dataset['Mean Proportion'] = (dataset['Dow Jones Proportion'] + 
                                  dataset['Nasdaq Proportion'] + 
                                  dataset['S&P 500 Proportion']) / 3
    dataset['Labels'] = (dataset['Mean'] >= 0)

    return dataset

Unnamed: 0,Date,Body,Title,Dow Jones Delta,Dow Jones Proportion,Nasdaq Delta,Nasdaq Proportion,S&P 500 Delta,S&P 500 Proportion,Mean,Mean Proportion,Labels
0,2017-03-20,"Today, President Donald J. Trump welcomed Iraq...",Joint Readout of Meeting Between President Don...,-237.849609,-0.011377,-107.699707,-0.018249,-29.449951,-0.012408,-124.999756,-0.014012,False
1,2017-03-07,President Donald J. Trump welcomed the House R...,Readout of President Trump's Meeting with the ...,-69.029297,-0.003299,3.619629,0.00062,-5.409913,-0.002284,-23.606527,-0.001654,False
2,2017-03-14,President Donald J. Trump today announced his ...,President Donald J. Trump Announces Intent to ...,112.730469,0.00541,43.22998,0.007381,19.810059,0.008375,58.590168,0.007055,True
3,2017-03-21,President Donald J. Trump will travel to Bruss...,Statement by the Press Secretary on President ...,-6.708985,-0.000325,27.810059,0.0048,4.429931,0.00189,8.510335,0.002122,True
4,2017-02-27,The President and First Lady are pleased to an...,President Donald J. Trump and First Lady Melan...,-25.199219,-0.001209,-36.459961,-0.00622,-6.090087,-0.00257,-22.58309,-0.003333,False
5,2017-02-28,"U.S. CapitolWashington, D.C.: P.M. ESTTHE PRES...",Remarks by President Trump in Joint Address to...,303.310547,0.014574,78.589844,0.013491,32.320068,0.013674,138.073486,0.013913,True
6,2017-03-30,The President today welcomed Prime Minister La...,Joint Readout of Meeting Between President Don...,-65.269531,-0.003149,-2.59961,-0.00044,-5.340088,-0.002255,-24.403076,-0.001948,False
7,2017-02-07,President Donald J. Trump today spoke with Pri...,Readout of the President's Call with Prime Min...,-35.949219,-0.001789,8.22998,0.00145,1.589844,0.000693,-8.709798,0.000118,False
8,2017-03-06,President Donald J. Trump spoke separately tod...,Readout of the President's Calls with Prime Mi...,-29.580078,-0.001412,-15.25,-0.002607,-6.920166,-0.002913,-17.250082,-0.002311,False
9,2017-04-04,NOMINATIONS SENT TO THE SENATE:Sigal Mandelker...,Two Nominations Delivered to the Senate Today ...,-41.089844,-0.001986,-34.129883,-0.005786,-7.209961,-0.003055,-27.476562,-0.003609,False


In [81]:
vectorizer = CountVectorizer(input=u'content', encoding=u'utf-8', 
                             decode_error=u'strict', strip_accents=None, 
                             lowercase=True, preprocessor=None, 
                             tokenizer=word_tokenize, stop_words='english', 
                             token_pattern=u'(?u)\b\w\w+\b', 
                             ngram_range=(1, 1), analyzer=u'word', 
                             max_df=1.0, min_df=1, max_features=None)

dataset['Body Tokenized'] = vectorizer.fit_transform(dataset['Body'])
dataset['Title Tokenized'] = vectorizer.transform(dataset['Title'])


In [82]:
dataset

Unnamed: 0,Date,Body,Title,Dow Jones Delta,Nasdaq Delta,S&P 500 Delta,Mean,Labels,Body Tokenized,Title Tokenized
0,2017-03-02,Aboard Air Force OneEn Route Joint Base Andrew...,Press Gaggle by Press Secretary Sean Spicer en...,-2.740235,-9.529785,-1.200195,-4.490072,False,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
1,2017-04-03,President Donald J. Trump today welcomed Presi...,Readout of President Donald J. Trump’s Meeting...,-39.029296,-3.929687,-1.319824,-14.759602,False,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
2,2017-03-27,James S. Brady Press Briefing Room: P.M. EDTSE...,Background Briefing on the President's Energy ...,-150.519531,-34.77002,-16.97998,-67.423177,False,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
3,2017-03-20,"Today, President Donald J. Trump welcomed Iraq...",Joint Readout of Meeting Between President Don...,237.849609,107.699707,29.449951,124.999756,True,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
4,2017-03-22,The Vice President met today with Secretary of...,Readout of the Vice President's Meeting with U...,4.720703,3.950196,2.48999,3.720296,True,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
5,2017-03-06,President Donald J. Trump spoke separately tod...,Readout of the President's Calls with Prime Mi...,29.580078,15.25,6.920166,17.250081,True,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
6,2017-03-03,Its been just six weeks since President Donald...,President Donald J. Trump Delivers the Weekly ...,51.371094,21.569824,7.810058,26.916992,True,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
7,2017-02-08,President Donald J. Trump today provided a let...,Statement from the Press Secretary First Lady ...,-118.060547,-32.729981,-13.200195,-54.663574,False,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
8,2017-02-15,"Today, President Donald J. Trump welcomed Isra...",Joint Readout of Meeting Between President Don...,-7.910156,4.540039,2.030029,-0.446696,False,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."
9,2017-03-30,The President today welcomed Prime Minister La...,Joint Readout of Meeting Between President Don...,65.269531,2.59961,5.340088,24.403076,True,"(0, 12252)\t1\n (0, 8970)\t1\n (0, 17836)\...","(0, 24)\t1\n (0, 130)\t1\n (0, 189)\t1\n ..."


### Preprocessing
- ~~Label~~ 
- ~~Remove Stopwords~~
- Given a label, which words are most common
- What were the sentiments of the most common words per category
- Name-Entities mentioned
- Cluster topics where we saw the difference in movements, cluster then color

### Featurizing

### Models

In [46]:
dataset['Labels'] = (dataset['Mean'] >= 0)
dataset

Unnamed: 0,Date,Body,Title,Dow Jones Delta,Nasdaq Delta,S&P 500 Delta,Mean,Labels
0,2017-03-02,Aboard Air Force OneEn Route Joint Base Andrew...,Press Gaggle by Press Secretary Sean Spicer en...,-2.740235,-9.529785,-1.200195,-4.490072,False
1,2017-04-03,President Donald J. Trump today welcomed Presi...,Readout of President Donald J. Trump’s Meeting...,-39.029296,-3.929687,-1.319824,-14.759602,False
2,2017-03-27,James S. Brady Press Briefing Room: P.M. EDTSE...,Background Briefing on the President's Energy ...,-150.519531,-34.77002,-16.97998,-67.423177,False
3,2017-03-20,"Today, President Donald J. Trump welcomed Iraq...",Joint Readout of Meeting Between President Don...,237.849609,107.699707,29.449951,124.999756,True
4,2017-03-22,The Vice President met today with Secretary of...,Readout of the Vice President's Meeting with U...,4.720703,3.950196,2.48999,3.720296,True
5,2017-03-06,President Donald J. Trump spoke separately tod...,Readout of the President's Calls with Prime Mi...,29.580078,15.25,6.920166,17.250081,True
6,2017-03-03,Its been just six weeks since President Donald...,President Donald J. Trump Delivers the Weekly ...,51.371094,21.569824,7.810058,26.916992,True
7,2017-02-08,President Donald J. Trump today provided a let...,Statement from the Press Secretary First Lady ...,-118.060547,-32.729981,-13.200195,-54.663574,False
8,2017-02-15,"Today, President Donald J. Trump welcomed Isra...",Joint Readout of Meeting Between President Don...,-7.910156,4.540039,2.030029,-0.446696,False
9,2017-03-30,The President today welcomed Prime Minister La...,Joint Readout of Meeting Between President Don...,65.269531,2.59961,5.340088,24.403076,True


In [61]:
body_list = dataset['Body'].tolist()

In [75]:
tokenized_body = vectorizer.fit_transform(body_list)

In [76]:
tokenized_title = vectorizer.fit_transform(dataset['Title'])

In [77]:
tokenized_title

<40x935 sparse matrix of type '<class 'numpy.int64'>'
	with 2558 stored elements in Compressed Sparse Row format>

In [78]:
tokenized_body.

<40x20221 sparse matrix of type '<class 'numpy.int64'>'
	with 86400 stored elements in Compressed Sparse Row format>

In [80]:
vectorizer.get_feature_names()

['#',
 '$',
 '&',
 "'",
 "''",
 "'s",
 '(',
 ')',
 ',',
 '-',
 '--',
 '.',
 '//',
 '/faqs/talking',
 '/interim',
 ':',
 '?',
 '``',
 'abbas',
 'abdel',
 'abdulaziz',
 'abdullah',
 'abe',
 'aboard',
 'abuse',
 'access',
 'accountability',
 'accurate',
 'achieving',
 'acosta',
 'act',
 'acting',
 'action',
 'actions',
 'activities',
 'addiction',
 'additional',
 'additions',
 'address',
 'administration',
 'administrator',
 'advisor',
 'advocates',
 'aeronautics',
 'affairs',
 'afghanistan',
 'africa',
 'african',
 'agencies”',
 'agenda',
 'agriculture',
 'aircraft',
 'al',
 'al-abadi',
 'al-ahmad',
 'al-jabir',
 'al-sabah',
 'al-sisi',
 'al-thani',
 'alan',
 'alexander',
 'ally',
 'ambassador',
 'america',
 'american',
 'americans',
 'amir',
 'amul',
 'analysis',
 'andrew',
 'andrews',
 'angela',
 'anna',
 'anniversary',
 'announce',
 'announcement',
 'announces',
 'anti-crime',
 'antidumping',
 'ap',
 'appeals',
 'appointment',
 'appointments',
 'approach',
 'appropriations',
 'approve