In [1]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

In [2]:
def split_document_to_chunks(row, field_name, chunk_len):
    text = row[field_name]
    if(text!='[]'):
        label = row['label']
        sentences = sent_tokenize(text)
        output = []
        for i in range(0,len(sentences), chunk_len):
            if(i+chunk_len < len(sentences)):
                chunk = ''.join(sentences[i:i+chunk_len])
            else:
                chunk = ''.join(sentences[i:len(sentences)])
            output.append((chunk,label))
        return output
def prepare_data_set(data_set,field_name, chunk_size=3):
    chunked_text_labels = data_set.apply(split_document_to_chunks, args=(field_name, chunk_size), axis=1)
    X=[]
    y=[]
    #print("Size before chunking: ", len(chunked_text_labels))
    for chunks in chunked_text_labels:
        if(chunks is not None):
            for chunk in chunks:
                X.append(chunk[0])
                y.append(chunk[1])
    return X,y

In [3]:
data_set = pd.read_csv('../Data/final_dataset_joined_aapl240_onlyMentions.csv')
data_set.loc[data_set['label']==-1,'label'] = 0
train_df = data_set[data_set.stock_time <= "2018-12-01 00:00:00"]
test_df = data_set[data_set.stock_time > "2018-12-01 00:00:00"]

In [4]:
X_train_text,y_train_labels = prepare_data_set(train_df,'filteredtext_aapl')
X_test_text,y_test_labels = prepare_data_set(test_df,'filteredtext_aapl')
print("Train Shape: ",len(X_train_text),len(y_train_labels))
print("Test Shape: ",len(X_test_text),len(y_test_labels))

Train Shape:  59278 59278
Test Shape:  13349 13349


In [5]:
from __future__ import print_function

import os
import sys
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense, Input, Bidirectional,LSTM, Embedding, Dropout
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.initializers import Constant
from tensorflow.keras.optimizers import Adam

In [6]:
MAX_SEQUENCE_LENGTH = 100
MAX_NUM_WORDS = 30000
EMBEDDING_DIM = 100

X_total = X_train_text+ X_test_text
y_total = y_train_labels + y_test_labels

print('Processing text dataset')
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(X_total)
sequences = tokenizer.texts_to_sequences(X_total)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

labels = to_categorical(np.asarray(y_total))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Processing text dataset
Found 132808 unique tokens.
Shape of data tensor: (72627, 100)
Shape of label tensor: (72627, 2)


In [7]:
X_train = data[:len(X_train_text)]
y_train = labels[:len(y_train_labels)]
X_test = data[len(X_train_text):]
y_test = labels[len(y_train_labels):]
print('Shape of train data tensor:', X_train.shape)
print('Shape of train label tensor:', y_train.shape)
print('Shape of test data tensor:', X_test.shape)
print('Shape of test label tensor:', y_test.shape)

Shape of train data tensor: (59278, 100)
Shape of train label tensor: (59278, 2)
Shape of test data tensor: (13349, 100)
Shape of test label tensor: (13349, 2)


In [8]:
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'glove')
print('Indexing word vectors.')
embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


print('Preparing embedding matrix.')
# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed

Indexing word vectors.
Found 400000 word vectors.
Preparing embedding matrix.


In [None]:
model = Sequential()
model.add(Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

# try using different optimizers and different optimizer configs
adam = Adam(0.01)
model.compile(loss='categorical_crossentropy', metrics=['accuracy'], optimizer=adam)

print('Train...')
model.fit(X_train, y_train,
          batch_size=128,
          epochs=50,
          validation_data=[X_test, y_test])

Train...
Train on 59278 samples, validate on 13349 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50