In [79]:
import os
import sys
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
import tqdm
import re
from nltk.corpus import stopwords

In [26]:
news = pd.read_csv("News.csv")
dj = pd.read_csv('DowJones.csv')

In [27]:
news = news[news.Date.isin(dj.Date)]

In [28]:
dj = dj.set_index('Date').diff(periods=1)

In [29]:
dj['Date'] = dj.index

In [30]:
dj = dj.reset_index(drop=True)

In [31]:
dj = dj.drop(['High', 'Low', 'Close', 'Volume', 'Adj Close'], 1)

In [32]:
dj = dj[dj.Open.notnull()]

In [57]:
price = []
headlines = []
for i, dj_row  in dj.iterrows():
    dj_date = dj_row['Date']
    price.append(dj_row['Open'])
    news_selected_date = news[news.Date==dj_date]
    
    temp=''
    for str_headline in news_selected_date['News'].values:
        temp+=str_headline  

    headlines.append(temp)

In [64]:
headlines_df = pd.DataFrame(data=headlines, index=dj.Date,columns=['headlines'])

In [73]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [86]:
def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'0,0', '00', text) 
    text = re.sub(r'[_"\-;%()|.,+&=*%.,!?:#@\[\]]', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'\$', ' $ ', text)
    text = re.sub(r'u s ', ' united states ', text)
    text = re.sub(r'u n ', ' united nations ', text)
    text = re.sub(r'u k ', ' united kingdom ', text)
    text = re.sub(r'j k ', ' jk ', text)
    text = re.sub(r' s ', ' ', text)
    text = re.sub(r' yr ', ' year ', text)
    text = re.sub(r' l g b t ', ' lgbt ', text)
    text = re.sub(r'0km ', '0 km ', text)
    text = re.sub(r'b ', ' ', text)
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text

In [87]:
headlines_df['cleaned_headlines']=headlines_df['headlines'].map(clean_text)

In [101]:
all_words = set()
for headline in headlines_df['cleaned_headlines'].values:
    all_words.update(headline.split(" "))

In [102]:
len(all_words)

54895

In [145]:
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
MAX_SEQUENCE_LENGTH = 150
MAX_NUM_WORDS = 55000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [146]:
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt')) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [147]:
texts = list(headlines_df['cleaned_headlines'].values)


In [148]:
len(texts), len(price)

(1988, 1988)

In [149]:
# finally, vectorize the text samples into a 2D integer tensor
#MAX_NUM_WORDS=180000
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 54389 unique tokens.


In [150]:
print(max(len(s) for s in sequences))
print(min(len(s) for s in sequences))

465
170


In [151]:
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [157]:
data.shape

(1988, 150)

In [153]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [154]:
embedding_matrix.shape

(54390, 100)

In [160]:
from keras.layers import Embedding


In [161]:
print('Preparing embedding matrix.')

# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i > MAX_NUM_WORDS:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

Preparing embedding matrix.


In [163]:
embedding_matrix.shape, MAX_SEQUENCE_LENGTH

((54390, 100), 150)

In [208]:
print('Training model.')

from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras import optimizers
from keras.optimizers import Adam, SGD, RMSprop


model = Sequential()
model.add(Embedding(num_words, EMBEDDING_DIM , input_length=MAX_SEQUENCE_LENGTH, weights= [embedding_matrix], trainable=False))

#model.add(Dropout(0.2))
model.add(Conv1D(64, 3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(LSTM(128))
#model.add(Dense(600))
model.add(Dense(1))

model.compile(loss='mean_squared_error', optimizer=Adam(lr=0.001,clipvalue=1.0), metrics=['mse'])

Training model.


In [209]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_30 (Embedding)     (None, 150, 100)          5439000   
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 148, 64)           19264     
_________________________________________________________________
max_pooling1d_23 (MaxPooling (None, 74, 64)            0         
_________________________________________________________________
conv1d_30 (Conv1D)           (None, 70, 64)            20544     
_________________________________________________________________
lstm_20 (LSTM)               (None, 128)               98816     
_________________________________________________________________
dense_19 (Dense)             (None, 1)                 129       
Total params: 5,577,753
Trainable params: 138,753
Non-trainable params: 5,439,000
____________________________________________________________

In [210]:
# Normalize opening prices (target values)
max_price = max(price)
min_price = min(price)
mean_price = np.mean(price)
def normalize(price):
    return ((price-min_price)/(max_price-min_price))
norm_price = []
for p in price:
    norm_price.append(normalize(p))

In [None]:
history = model.fit(data, norm_price, batch_size=100, epochs=10, validation_split=0.15)

Train on 1689 samples, validate on 299 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
news = pd.read_csv("News.csv")
import os
import sys
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant
news = pd.read_csv("News.csv")
news.head()
news = pd.read_csv("News.csv")
dj = pd.read_csv('DowJones.csv')
news = news[news.Date.isin(dj.Date)]
news.head()
dj = dj.set_index('Date').diff(periods=1)
dj
dj['Date'] = d