In [85]:
import pandas as pd
import numpy as np

import nltk.data
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class
LabeledSentence = gensim.models.doc2vec.LabeledSentence

from nltk.corpus import stopwords
stop = stopwords.words('english')

from numpy import asarray
from collections import Counter
import string
from string import punctuation

In [100]:
data = pd.read_csv("Project6500.csv",encoding= 'unicode_escape')
data.drop(['datetime', 'ticker'], axis=1, inplace=True)
data = data[data.sentiment.isnull() == False]
data['sentiment'] = data['sentiment'].map(int)
data = data[data['headline'].isnull() == False]
#data['headline'] = data['headline'].apply(lambda x: ' '.join([item for item in x.split() if item not in stop]))
data['headline']=data['headline'].str.split()
table = str.maketrans('', '', punctuation)
data['headline']= [str(w).translate(table) for w in data['headline']]

data

Unnamed: 0,headline,sentiment
0,MMM fell on hard times but could be set to reb...,0
1,Wolfe Research Upgrades 3M MMM to ¡§Peer Perfo...,1
2,3M MMM Upgraded to ¡§Peer Perform¡¨ by Wolfe R...,1
3,MMM insideday follow up as it also opened up w...,1
4,MMM is best dividend stock out there and down ...,0
5,MMM 3M The Fallen Dividend King Will Be Back ...,1
6,MMMcelebrates New Year with 7 month high close...,1
7,MMM above 180 baby is going higher,1
8,MMMhasnt really done much this year but it loo...,1
9,3M MMM Rating Increased to Neutral at JPMorgan...,1


In [80]:
def tokenize(headline):
    headline = headline.lower()
    tokens = tokenizer.tokenize(headline)
    return tokens

In [81]:
#def postprocess(data, n=1000000):
#    data = data.head(n)
def postprocess(data):
    data['tokens'] = data['headline'].progress_map(tokenize)  ## progress_map is a variant of the map function plus a progress bar. Handy to monitor DataFrame creations.
  #  data = data[data.tokens != 'NC']
 #   data.reset_index(inplace=True)
 #   data.drop('index', inplace=True, axis=1)
    return data

data = postprocess(data)


progress-bar:   0%|          | 0/9470 [00:00<?, ?it/s][A
progress-bar:  31%|███▏      | 2965/9470 [00:00<00:00, 29619.73it/s][A
progress-bar:  63%|██████▎   | 6012/9470 [00:00<00:00, 29866.70it/s][A
progress-bar: 100%|██████████| 9470/9470 [00:00<00:00, 30117.40it/s][A


In [101]:
vocab = Counter()

for item in data['headline']:
    # split into tokens by white space
    tokens = str(item).split()
    # remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    # filter out short tokens
    tokens = [word for word in tokens if len(word) > 1]
    vocab.update(tokens)

In [134]:
print(len(vocab))

10473


In [103]:
print(vocab.most_common(50))

[('stocks', 1400), ('Co', 1140), ('Stock', 1001), ('Shares', 909), ('Sells', 776), ('Inc', 771), ('Earnings', 649), ('Price', 645), ('Raised', 588), ('Target', 586), ('Group', 559), ('EPS', 551), ('IBM', 544), ('markets', 484), ('PT', 462), ('MSFT', 428), ('Dividend', 427), ('Johnson', 417), ('INTC', 413), ('Lowered', 405), ('higher', 391), ('Rating', 387), ('AAPL', 386), ('Buy', 378), ('JPM', 374), ('The', 369), ('MMM', 361), ('CSCO', 360), ('UNH', 353), ('Insider', 352), ('WMT', 342), ('PG', 330), ('United', 328), ('Technologies', 328), ('Estimates', 323), ('Quarterly', 310), ('PFE', 306), ('HD', 306), ('UTX', 305), ('MCD', 303), ('Research', 298), ('WBA', 296), ('Verizon', 293), ('AXP', 291), ('Gamble', 289), ('Investment', 288), ('JNJ', 287), ('Procter', 285), ('RT', 283), ('CAT', 282)]


In [104]:
min_occurane = 2
tokens = [k for k,c in vocab.items() if c >= min_occurane]
print(len(tokens))

4772


In [105]:
def save_list(lines, filename):
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # open file
    file = open(filename, 'w')
    # write text
    file.write(data)
    # close file
    file.close()

# save tokens to a vocabulary file
save_list(tokens, 'vocab.txt')

In [107]:
X_train, X_test, y_train, y_test = train_test_split(np.array(data.headline),
                                                    np.array(data.sentiment), test_size=0.2)

In [109]:
len(X_train)

7576

In [112]:
model = Word2Vec(X_train, size=100, window=5, workers=5, min_count=1)
# summarize vocabulary size in model
words = list(model.wv.vocab)
print('Vocabulary size: %d' % len(words))

Vocabulary size: 70


In [113]:
filename = 'embedding_word2vec.txt.word2vec'
model.wv.save_word2vec_format(filename, binary=False)

In [114]:
from string import punctuation
from os import listdir
from numpy import array
from numpy import asarray
from numpy import zeros
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [115]:
tokenizer = Tokenizer()
# fit the tokenizer on the training documents
tokenizer.fit_on_texts(X_train)

In [118]:
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(X_train)
# get the max length
max_length = max([len(s.split()) for s in X_train])

In [119]:
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

In [120]:
# sequence encode
encoded_docs = tokenizer.texts_to_sequences(X_test)
# pad sequences
Xtest = pad_sequences(encoded_docs, maxlen=max_length, padding='post')


In [121]:
vocab_size = len(tokenizer.word_index) + 1

In [122]:
word2vec_embedding = load_embedding('embedding_word2vec.txt.word2vec')
# get vectors in the right order
word2vec_embedding_vectors = get_weight_matrix(word2vec_embedding, tokenizer.word_index)

In [123]:
word2vec_embedding_layer = Embedding(vocab_size, 100, weights=[word2vec_embedding_vectors],
                            input_length=max_length, trainable=False)

In [124]:
model_word2vec = Sequential()
model_word2vec.add(word2vec_embedding_layer)
model_word2vec.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model_word2vec.add(MaxPooling1D(pool_size=2))
model_word2vec.add(Flatten())
model_word2vec.add(Dense(1, activation='sigmoid'))
print(model_word2vec.summary())

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 30, 100)           1074200   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 26, 128)           64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 13, 128)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 1664)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 1665      
Total params: 1,139,993
Trainable params: 65,793
Non-trainable params: 1,074,200
_________________________________________________________________
None


In [125]:
model_word2vec.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [127]:
model_word2vec.fit(Xtrain, y_train, epochs=10, verbose=2)

Epoch 1/10
 - 2s - loss: 0.7859 - accuracy: 0.5945
Epoch 2/10
 - 1s - loss: 0.6731 - accuracy: 0.6080
Epoch 3/10
 - 1s - loss: 0.6871 - accuracy: 0.6109
Epoch 4/10
 - 1s - loss: 0.6850 - accuracy: 0.6125
Epoch 5/10
 - 1s - loss: 0.6722 - accuracy: 0.6135
Epoch 6/10
 - 2s - loss: 0.6573 - accuracy: 0.6131
Epoch 7/10
 - 2s - loss: 0.6706 - accuracy: 0.6136
Epoch 8/10
 - 2s - loss: 0.6730 - accuracy: 0.6140
Epoch 9/10
 - 1s - loss: 0.6577 - accuracy: 0.6144
Epoch 10/10
 - 2s - loss: 0.6895 - accuracy: 0.6147


<keras.callbacks.callbacks.History at 0x1a4ace0fd0>

In [129]:
#### evaluation
loss, acc = model_word2vec.evaluate(Xtest, y_test, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 61.298841


In [135]:
def get_weight_matrix(embedding, vocab):
    # total vocabulary size plus 0 for unknown words
    vocab_size = len(vocab) + 1
    # define weight matrix dimensions with all 0
    weight_matrix = zeros((vocab_size, 100))
    # step vocab, store vectors using the Tokenizer's integer mapping
    for word, i in vocab.items():
        vector = embedding.get(word)
        if vector is not None:
            weight_matrix[i] = vector
    return weight_matrix

In [136]:
def load_embedding(filename):
    # load embedding into memory, skip first line
    file = open(filename, 'r')
    lines = file.readlines()
    file.close()
    # create a map of words to vectors
    embedding = dict()
    for line in lines:
        parts = line.split()
        # key is string word, value is numpy array for vector
        embedding[parts[0]] = asarray(parts[1:], dtype='float32')
    return embedding

In [138]:
# load embedding from file
raw_embedding = load_embedding('glove.6B.100d.txt')
# get vectors in the right order
embedding_vectors = get_weight_matrix(raw_embedding, tokenizer.word_index)
# create the embedding layer
embedding_layer = Embedding(vocab_size, 100, weights=[embedding_vectors], input_length=max_length, trainable=False)

In [139]:
# define model
model_glove = Sequential()
model_glove.add(embedding_layer)
model_glove.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model_glove.add(MaxPooling1D(pool_size=2))
model_glove.add(Flatten())
model_glove.add(Dense(1, activation='sigmoid'))
print(model_glove.summary())

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 30, 100)           1074200   
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 26, 128)           64128     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 13, 128)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 1664)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 1665      
Total params: 1,139,993
Trainable params: 65,793
Non-trainable params: 1,074,200
_________________________________________________________________
None


In [141]:
# compile network
model_glove.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# fit network
model_glove.fit(Xtrain, y_train, epochs=10, verbose=2)

Epoch 1/10
 - 1s - loss: 0.4643 - accuracy: 0.7746
Epoch 2/10
 - 1s - loss: 0.2538 - accuracy: 0.8996
Epoch 3/10
 - 1s - loss: 0.1478 - accuracy: 0.9530
Epoch 4/10
 - 1s - loss: 0.0895 - accuracy: 0.9772
Epoch 5/10
 - 1s - loss: 0.0483 - accuracy: 0.9931
Epoch 6/10
 - 1s - loss: 0.0258 - accuracy: 0.9985
Epoch 7/10
 - 1s - loss: 0.0173 - accuracy: 0.9989
Epoch 8/10
 - 1s - loss: 0.0125 - accuracy: 0.9992
Epoch 9/10
 - 1s - loss: 0.0085 - accuracy: 0.9992
Epoch 10/10
 - 1s - loss: 0.0084 - accuracy: 0.9989


<keras.callbacks.callbacks.History at 0x1a4bbbff98>

In [143]:
# evaluation
loss, acc = model_glove.evaluate(Xtest, y_test, verbose=0)
print('Test Accuracy: %f' % (acc*100))

Test Accuracy: 90.760297


In [144]:
import re
import fasttext

In [145]:
# Write the test file.
with open("./test.txt", "w") as test_file_handler:
    for X_test_entry, y_test_entry in zip(x_test, y_test):
        line_to_write = "__label__" + str(y_test_entry) + "\t" + str(X_test_entry) + "\n"
        try:
            test_file_handler.write(line_to_write)
        except:
            print(line_to_write)
            break

In [146]:
# Write the train file.
with open("./train.txt", "w") as train_file_handler:
    for X_train_entry, y_train_entry in zip(x_train, y_train):
        line_to_write = "__label__" + str(y_train_entry) + "\t" + str(X_train_entry) + "\n"
        try:
            train_file_handler.write(line_to_write)
        except:
            print(line_to_write)
            break

In [147]:
model = fasttext.train_supervised(input="./train.txt")

In [148]:
def print_results(N, p, r):
    print("N\t" + str(N))
    print("P@{}\t{:.3f}".format(1, p))
    print("R@{}\t{:.3f}".format(1, r))

results = model.test("./test.txt")
print_results(*results)

N	1894
P@1	0.970
R@1	0.970
