In [38]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import nltk
import sklearn

In [22]:
# Load the raw dataset from a path relative to the notebook's working directory.
# Later cells read its 'Date', 'Title', 'Body' and per-index '... Delta' columns.
df = pd.read_csv('data/dataset.csv')

In [3]:
# Discard every row that has a missing value, then renumber the index from 0
# so positional and label-based row access agree in the cells that follow.
df = (
    df
    .dropna()
    .reset_index(drop=True)
)

In [4]:
# 'Mean' is the unweighted average of the three index deltas for each row.
df['Mean'] = sum(
    df[delta_column]
    for delta_column in ('Dow Jones Delta', 'Nasdaq Delta', 'S&P 500 Delta')
) / 3

In [5]:
# Append a trailing space to every body (later cells concatenate bodies per
# date with .sum(); presumably the space keeps adjacent documents from fusing
# into one token), then strip every run of digits.
# regex=True is explicit because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave all digits in place. The raw
# string avoids the invalid '\d' escape warning.
df['Body'] = (df['Body'] + ' ').str.replace(r'\d+', '', regex=True)

In [6]:
# Partition documents by the sign of the day's average index move.
# Rows whose 'Mean' is exactly 0 fall into neither frame.
df_upward = df.loc[df['Mean'].gt(0)]
df_downward = df.loc[df['Mean'].lt(0)]

In [7]:
# Group each direction's documents by publication date.
df_upward_grouped, df_downward_grouped = (
    direction_frame.groupby('Date')
    for direction_frame in (df_upward, df_downward)
)

In [8]:
# All documents, regardless of market direction, grouped by publication date.
grouped = df.groupby(by='Date')

In [9]:
# Preview the first rows only; rendering the entire frame bloats the notebook
# and buries the narrative.
df.head(10)

Unnamed: 0,Date,Title,Body,Dow Jones Value,Dow Jones Delta,Nasdaq Value,Nasdaq Delta,S&P 500 Value,S&P 500 Delta,Mean
0,2017-04-04,Two Nominations Delivered to the Senate Today,NOMINATIONS SENT TO THE SENATE:Sigal Mandelker...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
1,2017-04-04,Background Briefing by Senior Administration O...,James S. Brady Press Briefing Room: P.M. EDTSE...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
2,2017-04-04,Statement from President Donald J. Trump,Todays chemical attack in Syria against innoce...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
3,2017-04-04,UPDATE: Confirming Judge Neil Gorsuch to the S...,"On Monday, Judge Neil Gorsuch was confirmed by...",20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
4,2017-04-04,President Donald J. Trump Announces Intent to ...,President Donald J. Trump today announced his ...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
5,2017-04-04,Remarks by President Trump at 2017 North Ameri...,"Washington HiltonWashington, D.C.: P.M. EDTTHE...",20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
6,2017-04-04,"Press Gaggle by Press Secretary Sean Spicer, 4...",James S. Brady Press Briefing Room: A.M. EDTMR...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
7,2017-04-04,Remarks by President Trump and Vice President ...,South Court Auditorium: A.M. EDTTHE VICE PRESI...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
8,2017-04-04,1600 Daily: Everything White House for 4/4/17,"Summary:Get news, events and updates from the ...",20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
9,2017-04-04,"Photo of the Day: April 4, 2017",Moments before Egyptian President Abdel Fattah...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562


In [10]:
# Per-date average of each index delta.
(
    df
    .groupby('Date')[['Dow Jones Delta', 'Nasdaq Delta', 'S&P 500 Delta']]
    .mean()
)

Unnamed: 0_level_0,Dow Jones Delta,Nasdaq Delta,S&P 500 Delta
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-02-07,35.949218,-8.22998,-1.589844
2017-02-08,-118.060547,-32.729981,-13.200195
2017-02-09,-96.96875,-18.949707,-8.229981
2017-02-10,-142.791015,-29.830078,-12.149902
2017-02-13,-92.25,-18.609863,-9.330078
2017-02-14,-107.449219,-36.870117,-11.669922
2017-02-15,-7.910156,4.540039,2.030029
2017-02-16,-4.28125,-23.680176,-3.939941
2017-02-17,-118.949219,-27.370117,-14.219971
2017-02-21,-32.599609,5.320312,2.559815


In [11]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# LDA settings: value passed as total_samples, number of topics to fit,
# and how many top words to print per topic.
n_samples, n_topics, n_top_words = 20000, 4, 25

def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        Iterated row by row; each row is treated as one topic's per-term
        weights and must support ``argsort()``.
    feature_names : sequence
        Maps a column position in ``components_`` to the term it stands for.
    n_top_words : int
        How many terms to print per topic, strongest first.

    Returns
    -------
    None
        Output goes to stdout: a blank line and ``Topic #k:`` header, then the
        space-separated terms, for each topic, followed by one final blank line.
    """
    for topic_number, term_weights in enumerate(model.components_):
        print("\nTopic #%d:" % topic_number)
        # argsort is ascending, so reverse it and keep the leading entries.
        strongest_first = term_weights.argsort()[::-1][:n_top_words]
        top_terms = [feature_names[position] for position in strongest_first]
        print(" ".join(top_terms))
    print()

In [12]:
# Build the raw term-count matrix that the LDA model is fitted on.
print("Extracting tf features for LDA...")

# Vocabulary pruning: drop English stop words, terms appearing in more than
# 80% of the documents, and terms appearing in fewer than 50 documents.
tf_vectorizer = CountVectorizer(
    stop_words='english',
    max_df=0.80,
    min_df=50,
    max_features=None,
)
tf = tf_vectorizer.fit_transform(df['Body'])

Extracting tf features for LDA...


In [13]:
print("Fitting LDA models with tf features, "
      "n_samples=%d and n_topics=%d..."
      % (n_samples, n_topics))

# Configure LDA with the online learning method and a fixed seed so the
# fitted topics are reproducible across runs.
# The topic count must be passed as `n_components`: the old `n_topics`
# keyword was deprecated in scikit-learn 0.19 and later removed, so passing it
# raises TypeError on current releases.
# NOTE(review): per the scikit-learn docs `total_samples` is only consulted by
# partial_fit(), so it should have no effect on the fit() call below - confirm.
lda = LatentDirichletAllocation(
    n_components=n_topics,
    max_iter=20,
    learning_method='online',
    learning_offset=80.,
    total_samples=n_samples,
    random_state=0,
)
# Fit the model on the document-term count matrix.
lda.fit(tf)

Fitting LDA models with tf features, n_samples=20000 and n_topics=4...


LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=80.0,
             max_doc_update_iter=100, max_iter=20, mean_change_tol=0.001,
             n_jobs=1, n_topics=4, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=20000, verbose=0)

In [14]:
print("\nTopics in LDA model:")
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Use its replacement when available and fall back to the old
# accessor only on installs that predate it.
tf_feature_names = (
    tf_vectorizer.get_feature_names_out()
    if hasattr(tf_vectorizer, 'get_feature_names_out')
    else tf_vectorizer.get_feature_names()
)
print_top_words(lda, tf_feature_names, n_top_words)


Topics in LDA model:

Topic #0:
going thank great people applause know want just good right really like country jobs mr american lot think im laughter time say job said years

Topic #1:
united states trump american country america security people minister prime vice world leaders economic today countries women work national jobs great americans donald years trade

Topic #2:
think mr going people just said know house question im administration want right make dont white look theres thats did sure say does lot time

Topic #3:
federal law order house trump executive department agency states state white mr secretary general court agencies rule government regulatory act office administration director attorney regulations



In [16]:
# Per-document topic mixture: one row per document in `tf`, one column per topic.
topic_dist = lda.transform(tf)
# `tf` was built from df['Body'], so row i of topic_dist belongs to row i of df.
# Reusing df's index makes the join below pair them correctly even when df's
# index is not the default 0..n-1 range. Topic columns keep their integer
# labels (0, 1, ...), which later cells select by.
topic_dist_df = pd.DataFrame(topic_dist, index=df.index)
df_w_topics = topic_dist_df.join(df)
# Preview only; rendering the whole frame bloats the notebook.
df_w_topics.head(10)

Unnamed: 0,0,1,2,3,Date,Title,Body,Dow Jones Value,Dow Jones Delta,Nasdaq Value,Nasdaq Delta,S&P 500 Value,S&P 500 Delta,Mean
0,0.031925,0.224886,0.032363,0.710827,2017-04-04,Two Nominations Delivered to the Senate Today,NOMINATIONS SENT TO THE SENATE:Sigal Mandelker...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
1,0.224085,0.242026,0.533543,0.000347,2017-04-04,Background Briefing by Senior Administration O...,James S. Brady Press Briefing Room: P.M. EDTSE...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
2,0.018809,0.595005,0.366558,0.019628,2017-04-04,Statement from President Donald J. Trump,Todays chemical attack in Syria against innoce...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
3,0.217232,0.001856,0.239209,0.541703,2017-04-04,UPDATE: Confirming Judge Neil Gorsuch to the S...,"On Monday, Judge Neil Gorsuch was confirmed by...",20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
4,0.006123,0.006248,0.006181,0.981448,2017-04-04,President Donald J. Trump Announces Intent to ...,President Donald J. Trump today announced his ...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
5,0.892648,0.106647,0.000351,0.000355,2017-04-04,Remarks by President Trump at 2017 North Ameri...,"Washington HiltonWashington, D.C.: P.M. EDTTHE...",20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
6,0.000260,0.041112,0.958368,0.000259,2017-04-04,"Press Gaggle by Press Secretary Sean Spicer, 4...",James S. Brady Press Briefing Room: A.M. EDTMR...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
7,0.823436,0.000207,0.159544,0.016814,2017-04-04,Remarks by President Trump and Vice President ...,South Court Auditorium: A.M. EDTTHE VICE PRESI...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
8,0.063956,0.063133,0.064921,0.807991,2017-04-04,1600 Daily: Everything White House for 4/4/17,"Summary:Get news, events and updates from the ...",20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562
9,0.021311,0.021667,0.021488,0.935534,2017-04-04,"Photo of the Day: April 4, 2017",Moments before Egyptian President Abdel Fattah...,20689.240234,41.089843,5898.609863,34.129883,2360.159912,7.209961,27.476562


In [20]:
# Documents ordered by their weight on the topic stored in column 0, heaviest first.
sorted_topic_1 = (
    df_w_topics
    .loc[:, ['Title', 'Date', 'Mean', 0]]
    .sort_values(by=[0], ascending=False)
)
# NOTE(review): this averages 'Mean' over every row, so the sort above cannot
# change the result - it is the overall average, not a topic-specific figure.
# Confirm whether a top-N slice or a topic-weighted mean was intended.
sorted_topic_1['Mean'].mean()

-10.714099406156206

In [None]:
# Term counts for up days: all bodies published on the same date are
# concatenated into a single document before vectorising.
vectorizer = CountVectorizer(stop_words='english', encoding='utf-8')
dtm_upward = vectorizer.fit_transform(
    df_upward_grouped['Body'].sum()
)

In [None]:
# Dense document-term count table for down days (one row per date, one column
# per term).
# NOTE: this refits the shared `vectorizer`, so afterwards its vocabulary
# describes the down-day corpus and no longer matches dtm_upward's columns.
# Column labels use get_feature_names_out() when available because
# get_feature_names() was removed in scikit-learn 1.2; the fallback keeps
# older installs working.
dtm_df = pd.DataFrame(
    vectorizer.fit_transform(df_downward_grouped['Body'].sum()).toarray(),
    columns=(
        vectorizer.get_feature_names_out()
        if hasattr(vectorizer, 'get_feature_names_out')
        else vectorizer.get_feature_names()
    ),
)

In [None]:
# Total count of each term across all down-day documents, most frequent first.
print(
    dtm_df
    .sum(axis=0)
    .sort_values(ascending=False)
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidfvec = TfidfVectorizer(stop_words='english')

# Create the document-term matrix for down days, with cells weighted by the
# tf-idf score instead of raw counts. Column labels use get_feature_names_out()
# when available because get_feature_names() was removed in scikit-learn 1.2;
# the fallback keeps older installs working.
dtm_tfidf_df = pd.DataFrame(
    tfidfvec.fit_transform(df_downward_grouped['Body'].sum()).toarray(),
    columns=(
        tfidfvec.get_feature_names_out()
        if hasattr(tfidfvec, 'get_feature_names_out')
        else tfidfvec.get_feature_names()
    ),
)

# View a preview of the results rather than the entire (very wide) frame.
dtm_tfidf_df.head()

In [None]:
# The 20 terms with the highest tf-idf score reached in any single down-day document.
print(
    dtm_tfidf_df
    .max(axis=0)
    .sort_values(ascending=False)
    .head(20)
)