# Authors: Jon Khaykin and Richard Doan

Goals: 
- To create a vector space that will be effective for predictive modeling
- Create said predictive model

Analysis: 
- Chi-squared (chi2) feature selection to create a viable vector space for predictive modeling using single terms
- LDA topic modeling with bigrams
- Chi-squared (chi2) feature selection to create a viable vector space using bigrams


# Imports and Helper Functions

In [85]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_selection import chi2

from itertools import combinations
from scipy.stats import ttest_ind
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
from sklearn.manifold import MDS
from sklearn.metrics.pairwise import cosine_similarity

from bokeh.io import output_notebook, show
from bokeh.charts import Scatter, Histogram

# Send bokeh output to the notebook.
output_notebook()

# Apply the 'ggplot' style to matplotlib figures.
plt.style.use('ggplot')

# Topic count and number of top words per topic.
# NOTE(review): the visible cells pass literals (4, 20, 50) instead of these
# names — confirm they are still needed.
n_topics = 4
n_top_words = 25

def get_top_words(model, feature_names, n_top_words):
    """Collect the highest-weighted features of every topic into one flat list.

    For each row of ``model.components_`` the ``n_top_words`` largest weights
    are looked up in ``feature_names`` (largest first). Topics are visited in
    order and their words are concatenated, so a word that ranks highly in
    several topics appears several times.
    """
    return [
        feature_names[position]
        for weights in model.components_
        # argsort is ascending; the reversed slice takes the top entries.
        for position in weights.argsort()[:-n_top_words - 1:-1]
    ]



def print_top_words(model, feature_names, n_top_words):
    """Print the ``n_top_words`` highest-weighted features of each topic.

    Topics are numbered from 1 in the printout. Each topic gets a heading
    line preceded by a blank line, then its features joined with " - ".
    One trailing blank line is printed after the last topic.
    """
    for number, weights in enumerate(model.components_, start=1):
        # argsort is ascending; the reversed slice takes the top entries.
        ranked = weights.argsort()[:-n_top_words - 1:-1]
        heading = "\nTopic #{}:".format(number)
        print(heading)
        print(" - ".join(feature_names[position] for position in ranked))
    print()
    
def label_data_topics(df, num_topics=4, text='Body', ngram=1):
    """Fit an LDA topic model on a text column and tag each row with its dominant topic.

    Parameters
    ----------
    df : pandas.DataFrame
        Documents to model. A 'Topic' column is added (or overwritten) in place.
    num_topics : int
        Number of LDA topics to fit.
    text : str
        Name of the column that holds the document text.
    ngram : int
        Largest n-gram size; features are word n-grams of length 1..ngram.

    Returns
    -------
    tuple
        ``(df, lda, tf_vectorizer)``: the same frame with the 0-based 'Topic'
        column, the fitted LDA model, and the fitted count vectorizer.
    """
    # Honour the `text` argument instead of always reading df['Body'].
    documents = df[text]

    tf_vectorizer = CountVectorizer(max_df=0.80, min_df=50,
                                    max_features=None,
                                    stop_words='english',
                                    ngram_range=(1, ngram))
    tf = tf_vectorizer.fit_transform(documents)

    # NOTE(review): `n_topics` is the keyword this notebook was written
    # against; later scikit-learn releases name it `n_components` — confirm
    # against the installed version before upgrading.
    lda = LatentDirichletAllocation(n_topics=num_topics, max_iter=20,
                                    learning_method='online',
                                    learning_offset=80.,
                                    total_samples=len(documents),
                                    random_state=0)

    # Rows are documents, columns are topics; the label is the column with
    # the largest weight.
    doc_topic = lda.fit_transform(tf)
    df['Topic'] = np.argmax(doc_topic, axis=1)
    return df, lda, tf_vectorizer

def significance_labeller(df, delta='Dow Jones Delta'):
    """Label each row by how far its ``delta`` value sits from the column mean.

    A value more than one standard deviation below the mean gets -1, a value
    within one standard deviation (inclusive) gets 0, and anything else gets 1.
    The labels are written to ``df['Label']`` in place and ``df`` is returned.
    """
    moves = df[delta]
    center = moves.mean()
    spread = moves.std()

    def classify(move):
        if move < center - spread:
            return -1
        if center - spread <= move <= center + spread:
            return 0
        return 1

    df['Label'] = [classify(move) for move in moves]
    return df


def featurize(df, k=100, text='Body', ngram=1):
    """Select the ``k`` features most associated with 'Label' and count them per document.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``text`` column and a 'Label' column.
    k : int
        Number of top chi2-scored features to keep.
    text : str
        Name of the column that holds the document text.
    ngram : int
        Largest n-gram size; candidate features are word n-grams of
        length 1..ngram with English stop words removed.

    Returns
    -------
    tuple
        ``(features, chi_vectorizer)``: a DataFrame with one count column per
        selected feature plus a trailing 'label' column, and a CountVectorizer
        restricted to the selected features.

    Raises
    ------
    ValueError
        If no feature receives a finite chi2 score.
    """
    vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, ngram))
    counts = vectorizer.fit_transform(df[text])
    words = np.array(vectorizer.get_feature_names())
    labels = df['Label']

    print("There are %d datapoints that have a upward label" % (labels == 1).sum())
    print("There are %d datapoints that have a neutral label" % (labels == 0).sum())
    print("There are %d datapoints that have a downward label" % (labels == -1).sum())

    chi_scores, _ = chi2(counts, labels)

    # Drop features whose chi2 score is NaN so they cannot be selected.
    scored = ~np.isnan(chi_scores)
    if not scored.any():
        raise ValueError("chi2 produced no finite scores; cannot select features")
    words = words[scored]
    chi_scores = chi_scores[scored]

    # Highest score first, keep the first k.
    top_words = words[np.argsort(chi_scores)[::-1][:k]].tolist()

    # The restricted vectorizer must tokenize the same way as the one that
    # produced the vocabulary. With the default unigram tokenizer, every
    # selected n-gram of more than one word would be counted as 0.
    chi_vectorizer = CountVectorizer(vocabulary=top_words,
                                     stop_words='english',
                                     ngram_range=(1, ngram))
    selected = chi_vectorizer.fit_transform(df[text])

    features = pd.DataFrame(
        np.hstack((selected.toarray(), labels.values.reshape(-1, 1))),
        columns=top_words + ['label'])

    return features, chi_vectorizer

def plot_2d_scatter(df, vectorizer, text='Body', to_plot='Label'):
    """Project the documents to 2-D with MDS on cosine distance and show a scatter plot.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the ``text`` column and the ``to_plot`` column.
    vectorizer : object with a ``transform`` method
        Turns ``df[text]`` into a sparse document-feature matrix.
    text : str
        Name of the column that holds the document text.
    to_plot : str
        Name of the column used to colour the points.
    """
    X = vectorizer.transform(df[text])
    # toarray() yields a plain ndarray rather than the np.matrix todense() returns.
    cos_dist = 1 - cosine_similarity(X.toarray())
    mds = MDS(n_components=2, dissimilarity="precomputed", random_state=0)
    pos = mds.fit_transform(cos_dist)  # shape (n_samples, n_components)
    distance_df = pd.DataFrame(pos, columns=['x', 'y'])
    # distance_df has a fresh 0..n-1 index. Assigning the Series itself would
    # align on df's index and misplace values (or insert NaN) whenever df is
    # not indexed 0..n-1, so copy the values positionally.
    distance_df[to_plot] = df[to_plot].values
    p = Scatter(distance_df, x='x', y='y', title="MDS: White House Posts", color=to_plot,
                legend="top_right")

    show(p)

# Data Import and Processing

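To label the data, we use three labels instead of two: -1 (the Dow Jones delta is more than one standard deviation below the mean), 0 (within one standard deviation of the mean), and 1 (more than one standard deviation above the mean). The neutral label helps the algorithm tell whether a stock movement was just noise or was large enough to plausibly correlate with some news.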

In [86]:
# import complete CSV
df = pd.read_csv('data/dataset.csv')
# replace every non-letter character with a space and convert to lowercase
regex = re.compile('[^a-zA-Z]')
df['Body'] = df['Body'].apply(lambda post: regex.sub(' ', post).lower())
# collapse runs of whitespace into single spaces and trim both ends
df['Body'] = df['Body'].apply(lambda post: " ".join(post.split()))
# .copy() gives an independent frame, so that adding the 'Label' column in
# significance_labeller does not trigger pandas' SettingWithCopyWarning
df = df[['Date', 'Title', 'Body', 'Dow Jones Delta']].copy()
df = significance_labeller(df)
df.groupby('Label').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,Dow Jones Delta
Label,Unnamed: 1_level_1,Unnamed: 2_level_1
-1,count,14.0
-1,mean,-193.110491
-1,std,62.289668
-1,min,-237.849609
-1,25%,-237.849609
-1,50%,-237.849609
-1,75%,-112.580078
-1,max,-112.580078
0,count,384.0
0,mean,-0.609528


In [87]:
# Summary statistics of the whole 'Dow Jones Delta' column (all labels together).
df['Dow Jones Delta'].describe()

count    463.000000
mean      21.410873
std       95.336532
min     -237.849609
25%      -35.949218
50%       -6.708985
75%       69.169922
max      303.310547
Name: Dow Jones Delta, dtype: float64

In [88]:
# Fit a 4-topic LDA with the default ngram=1 (unigrams); adds the 0-based 'Topic' column to df.
df, lda, tf_vectorizer = label_data_topics(df, num_topics=4, text='Body')

In [89]:
# Per-topic mean, median and standard deviation of the Dow Jones delta,
# followed by the top 20 features of every topic.
topic_groups = df.groupby('Topic')
topic_deltas = topic_groups['Dow Jones Delta']
# The trailing '' adds the blank line that separates one summary from the next.
print('Means', topic_deltas.mean(), '', sep='\n')
print('Median', topic_deltas.median(), '', sep='\n')
print('Std Dev', topic_deltas.std(), sep='\n')
print_top_words(lda, tf_vectorizer.get_feature_names(), 20)

Means
Topic
0    51.594757
1    13.281057
2    27.055982
3    15.828750
Name: Dow Jones Delta, dtype: float64

Median
Topic
0    15.679687
1    -6.708985
2    -0.990234
3    -5.714844
Name: Dow Jones Delta, dtype: float64

Std Dev
Topic
0    115.165926
1     92.536817
2     89.243914
3     86.078948
Name: Dow Jones Delta, dtype: float64

Topic #1:
order - federal - regulatory - executive - agency - agencies - actions - regulations - rule - law - action - act - cost - state - costs - review - states - department - agenda - government

Topic #2:
trump - states - united - security - house - american - law - national - secretary - white - donald - vice - minister - mr - prime - economic - administration - america - court - office

Topic #3:
going - thank - people - great - applause - know - just - want - mr - country - good - right - american - like - jobs - really - im - think - laughter - lot

Topic #4:
think - mr - going - people - just - said - know - house - question - administration 

### What does meaningful motion look like?

In [90]:
# Distribution of the 'Dow Jones Delta' values, coloured by 'Label'.
# The title previously held a pasted argument string that named 'Topic'
# although the plot is coloured by 'Label'.
hist = Histogram(df, values='Dow Jones Delta', color='Label',
                 title="Dow Jones Delta by Label", legend='top_right', bins=20)
show(hist)

### LDA Clusters 

There seems to be good clustering in this space, so we can train a classifier to classify posts into topics. We believe that Topic 0 (printed above as "Topic #1", since the printout numbers topics from 1) is meaningful in predicting market movements because its mean market movement is much higher than the rest. Also, this topic seems to deal with economics and regulations, so it makes sense that the content of these posts correlates with the market.

In [91]:
# Take the 50 highest-weighted features of each LDA topic; set() removes
# features that rank highly in more than one topic.
feature_words = set(get_top_words(lda, tf_vectorizer.get_feature_names(), 50))
# Count only those features, then plot the posts in that space coloured by topic.
word_vectorizer = CountVectorizer(vocabulary=feature_words)
plot_2d_scatter(df, word_vectorizer, text='Body', to_plot='Topic')

The labels for these posts in this space aren't well clustered so we will need to create a better feature space for this later.

In [92]:
# Same LDA-feature space, coloured by 'Label' instead of 'Topic'.
plot_2d_scatter(df, word_vectorizer, text='Body', to_plot='Label')

Here is our proposed feature space for separating out the labeled data into their proper labels. It muddles Topic 0 and Topic 1 quite a bit. It also doesn't cluster our data very well, so we'll need to keep investigating.

In [93]:
# Keep the 50 features with the highest chi2 score against 'Label' (default ngram=1),
# then plot the posts in that space coloured by topic.
X_df, chi_vectorizer = featurize(df, k=50, text='Body')
plot_2d_scatter(df, chi_vectorizer, text='Body', to_plot='Topic')

There are 65 datapoints that have a upward label
There are 384 datapoints that have a neutral label
There are 14 datapoints that have a downward label


In [94]:
# Same chi2-selected space, coloured by 'Label'.
plot_2d_scatter(df, chi_vectorizer, text='Body', to_plot='Label')

Below, we repeat our methods above but using a bigram model instead. The results weren't great because adding bigrams dramatically increases the number of features. With this many features, everything looks equally distant. Maybe feature reduction methods will help us out here.

In [95]:
# Refit the LDA with ngram=2 (unigrams and bigrams); this rebinds lda and
# tf_vectorizer and overwrites df['Topic'].
df, lda, tf_vectorizer = label_data_topics(df, num_topics=4, text='Body', ngram=2)

In [96]:
# Top 20 features per topic for the refitted (ngram=2) model.
print_top_words(lda, tf_vectorizer.get_feature_names(), 20)


Topic #1:
order - federal - states - executive - regulatory - united - united states - agency - law - secretary - national - house - agencies - security - actions - state - executive order - department - general - white

Topic #2:
mr - judge - law - budget - court - director - department - senate - served - school - justice - committee - house - programs - money - trump - serve - said - tax - special

Topic #3:
trump - states - united - united states - president trump - applause - american - great - america - thank - country - jobs - people - today - new - work - years - minister - women - americans

Topic #4:
think - mr - going - people - just - know - said - want - right - im - thank - house - lot - question - say - like - make - thats - dont - look



In [97]:
# Take the 50 highest-weighted features of each topic of the ngram=2 LDA;
# set() removes features that rank highly in more than one topic.
feature_words = set(get_top_words(lda, tf_vectorizer.get_feature_names(), 50))
# This vocabulary contains bigrams, so the vectorizer has to tokenize the way
# tf_vectorizer did (English stop words removed, 1- and 2-word n-grams).
# With the default unigram tokenizer every bigram column would stay at 0.
word_vectorizer = CountVectorizer(vocabulary=feature_words,
                                  stop_words='english',
                                  ngram_range=(1, 2))

In [98]:
# Plot the posts in the ngram=2 LDA-feature space, coloured by topic.
plot_2d_scatter(df, word_vectorizer, text='Body', to_plot='Topic')

In [99]:
# Keep the 50 unigram/bigram features with the highest chi2 score against 'Label'.
X_df, chi_vectorizer = featurize(df, k=50, text='Body', ngram=2)

There are 65 datapoints that have a upward label
There are 384 datapoints that have a neutral label
There are 14 datapoints that have a downward label


In [100]:
# Plot the posts in the chi2-selected ngram=2 space, coloured by 'Label'.
plot_2d_scatter(df, chi_vectorizer, text='Body', to_plot='Label')

In [101]:
# Number of features the chi2-restricted vectorizer holds; featurize was called with k=50.
len(chi_vectorizer.get_feature_names())

50

In [102]:
# List the selected features.
chi_vectorizer.get_feature_names()

['red cross',
 'collusion',
 'ms sanders',
 'sanders',
 'unitedstates',
 'gates',
 'isis daesh',
 'speech night',
 'american red',
 'headlines',
 'japan',
 'mr',
 'daesh',
 'glawe',
 'spicer',
 'mr spicer',
 'conclusion',
 'russian operatives',
 'minister al',
 'cross',
 'comey',
 'operatives',
 'states iraq',
 'mr glawe',
 'irish american',
 'russia election',
 'wto',
 'ms',
 'speech',
 'hbcus',
 'congressional leaders',
 'senior administrative',
 'states japan',
 'administrative official',
 'iraq',
 'aware contacts',
 'iraqs',
 'dnc',
 'night address',
 'confession',
 'firstday',
 'investigation mean',
 'heritage month',
 'intelligence analysis',
 'hampton roads',
 'firstday march',
 'met bipartisan',
 'dr emanuel',
 'gates foundation',
 'know ongoing']

# Peer Reviews

## Review 1 - Julian Park

Did they match techniques to question?
- The techniques used were mostly clustering ones. DBSCAN was a good choice because it determines the number of clusters on its own, which helps answer Julian's question of 'Is there a correlation between the number of clusters in an essay and the score it receives?' The dendrogram technique was relevant as well, at least for an initial exploration.

Other suggestions?
- Incorporate word2vec model for the clustering threshold base instead of TFIDF.