In [207]:
# https://github.com/Currie32/Predicting-the-Dow-Jones-with-Headlines/blob/master/Predict_Dow_with_News.ipynb
#https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.
#https://www.kaggle.com/stacykurnikova/using-glove-embedding
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.metrics import median_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import accuracy_score as acc
import matplotlib.pyplot as plt

from keras.models import Sequential
from keras import initializers
from keras.layers import Dropout, Activation, Embedding, Convolution1D, MaxPooling1D, Input, Dense, \
                         BatchNormalization, Flatten, Reshape, Concatenate
# add Merge layer later
from keras.layers.recurrent import LSTM, GRU
from keras.callbacks import Callback, ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
from keras.models import Model
from keras.optimizers import Adam, SGD, RMSprop
from keras import regularizers

from tqdm import tqdm


In [208]:
dj = pd.read_csv('DowJones.csv')

In [209]:
dj.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2016-07-01,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141
1,2016-06-30,17712.759766,17930.609375,17711.800781,17929.990234,133030000,17929.990234
2,2016-06-29,17456.019531,17704.509766,17456.019531,17694.679688,106380000,17694.679688
3,2016-06-28,17190.509766,17409.720703,17190.509766,17409.720703,112190000,17409.720703
4,2016-06-27,17355.210938,17355.210938,17063.080078,17140.240234,138740000,17140.240234


In [9]:
news = pd.read_csv("News.csv")

In [10]:
news.head()

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host
2,2016-07-01,"The president of France says if Brexit won, so..."
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...


In [12]:
dj.isnull().sum()

Date         0
Open         0
High         0
Low          0
Close        0
Volume       0
Adj Close    0
dtype: int64

In [14]:
news.isnull().sum()

Date    0
News    0
dtype: int64

In [15]:
dj.shape, news.shape

((1989, 7), (73608, 2))

In [21]:
dj['Date'].unique().shape, news['Date'].unique().shape

((1989,), (2943,))

In [26]:
news = news[news.Date.isin(dj.Date)]

In [27]:
dj['Date'].unique().shape, news['Date'].unique().shape

((1989,), (1989,))

In [33]:
dj.Date.values

array(['2016-07-01', '2016-06-30', '2016-06-29', ..., '2008-08-12',
       '2008-08-11', '2008-08-08'], dtype=object)

In [35]:
news.Date.unique()

array(['2016-07-01', '2016-06-30', '2016-06-29', ..., '2008-08-12',
       '2008-08-11', '2008-08-08'], dtype=object)

In [37]:
np.array_equal(dj.Date.values,news.Date.unique())

True

In [40]:
dj[0:15]

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2016-07-01,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141
1,2016-06-30,17712.759766,17930.609375,17711.800781,17929.990234,133030000,17929.990234
2,2016-06-29,17456.019531,17704.509766,17456.019531,17694.679688,106380000,17694.679688
3,2016-06-28,17190.509766,17409.720703,17190.509766,17409.720703,112190000,17409.720703
4,2016-06-27,17355.210938,17355.210938,17063.080078,17140.240234,138740000,17140.240234
5,2016-06-24,17946.630859,17946.630859,17356.339844,17400.75,239000000,17400.75
6,2016-06-23,17844.109375,18011.070312,17844.109375,18011.070312,98070000,18011.070312
7,2016-06-22,17832.669922,17920.160156,17770.359375,17780.830078,89440000,17780.830078
8,2016-06-21,17827.330078,17877.839844,17799.800781,17829.730469,85130000,17829.730469
9,2016-06-20,17736.869141,17946.359375,17736.869141,17804.869141,99380000,17804.869141


In [42]:
dj = dj.set_index('Date').diff(periods=1)

In [43]:
dj['Date'] = dj.index

In [44]:
dj.head()

Unnamed: 0_level_0,Open,High,Low,Close,Volume,Adj Close,Date
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-07-01,,,,,,,2016-07-01
2016-06-30,-211.480468,-71.771484,-205.109375,-19.378907,50870000.0,-19.378907,2016-06-30
2016-06-29,-256.740235,-226.099609,-255.78125,-235.310546,-26650000.0,-235.310546,2016-06-29
2016-06-28,-265.509765,-294.789063,-265.509765,-284.958985,5810000.0,-284.958985,2016-06-28
2016-06-27,164.701172,-54.509765,-127.429688,-269.480469,26550000.0,-269.480469,2016-06-27


In [46]:
dj = dj.reset_index(drop=True)

In [47]:
dj = dj.drop(['High', 'Low', 'Close', 'Volume', 'Adj Close'], 1)

In [50]:
dj.sample(10)

Unnamed: 0,Open,Date
1444,-31.620117,2010-10-06
174,-344.228516,2015-10-22
24,-64.650391,2016-05-27
383,-64.220703,2014-12-23
121,160.459961,2016-01-08
1829,149.259765,2009-03-27
1908,-255.389648,2008-12-02
1542,119.75,2010-05-18
722,18.010742,2013-08-20
1331,-82.879883,2011-03-18


In [53]:
dj = dj[dj.Open.notnull()]

In [55]:
dj.sample(10)

Unnamed: 0,Open,Date
835,-50.219727,2013-03-11
1948,367.099609,2008-10-06
796,5.070313,2013-05-06
1851,80.520019,2009-02-25
1204,105.350585,2011-09-19
136,-225.689453,2015-12-16
338,-147.898438,2015-03-02
1795,56.069336,2009-05-15
883,156.889648,2012-12-28
1151,0.830078,2011-12-02


In [56]:
dj.isnull().sum()

Open    0
Date    0
dtype: int64

In [99]:
dj.shape

(1988, 2)

In [96]:
price = []
headlines = []
for i, dj_row  in tqdm(dj.iterrows()):
    dj_date = dj_row['Date']
    price.append(dj_row['Open'])
    news_selected_date = news[news.Date==dj_date]
    headlines.append(list(news_selected_date['News'].values))


1988it [00:05, 386.01it/s]


In [101]:
len(headlines), len(price)

(1988, 1988)

In [111]:
print(max([len(headline) for headline in headlines]))
print(min([len(headline) for headline in headlines]))

25
22


In [115]:
import spacy 
nlp = spacy.load("en")

In [127]:
doc = nlp(headlines[0][0])

In [129]:
headlines[0][0]

'Jamaica proposes marijuana dispensers for tourists at airports following legalisation: The kiosks and desks would give people a license to purchase up to 2 ounces of the drug to use during their stay'

In [132]:
for token in doc:
    print("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\t{7}".format(
        token.text,
        token.idx,
        token.lemma_,
        token.is_punct,
        token.is_space,
        token.shape_,
        token.pos_,
        token.tag_
    ))

Jamaica	0	Jamaica	False	False	Xxxxx	PROPN	NNP
proposes	8	propose	False	False	xxxx	VERB	VBZ
marijuana	17	marijuana	False	False	xxxx	NOUN	NN
dispensers	27	dispenser	False	False	xxxx	NOUN	NNS
for	38	for	False	False	xxx	ADP	IN
tourists	42	tourist	False	False	xxxx	NOUN	NNS
at	51	at	False	False	xx	ADP	IN
airports	54	airport	False	False	xxxx	NOUN	NNS
following	63	follow	False	False	xxxx	VERB	VBG
legalisation	73	legalisation	False	False	xxxx	NOUN	NN
:	85	:	True	False	:	PUNCT	:
The	87	the	False	False	Xxx	DET	DT
kiosks	91	kiosk	False	False	xxxx	NOUN	NNS
and	98	and	False	False	xxx	CCONJ	CC
desks	102	desk	False	False	xxxx	NOUN	NNS
would	108	would	False	False	xxxx	VERB	MD
give	114	give	False	False	xxxx	VERB	VB
people	119	people	False	False	xxxx	NOUN	NNS
a	126	a	False	False	x	DET	DT
license	128	license	False	False	xxxx	NOUN	NN
to	136	to	False	False	xx	PART	TO
purchase	139	purchase	False	False	xxxx	VERB	VB
up	148	up	False	False	xx	PART	RP
to	151	to	False	False	xx	PART	TO
2	154	2	False	False	d	NUM	CD


In [135]:
for ent in doc.ents:
    print(ent.text, ent.label_)

Jamaica GPE
up to 2 ounces QUANTITY


In [137]:
from spacy import displacy

displacy.render(doc, style='ent', jupyter=True)

In [143]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [144]:
def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'0,0', '00', text) 
    text = re.sub(r'[_"\-;%()|.,+&=*%.,!?:#@\[\]]', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'\$', ' $ ', text)
    text = re.sub(r'u s ', ' united states ', text)
    text = re.sub(r'u n ', ' united nations ', text)
    text = re.sub(r'u k ', ' united kingdom ', text)
    text = re.sub(r'j k ', ' jk ', text)
    text = re.sub(r' s ', ' ', text)
    text = re.sub(r' yr ', ' year ', text)
    text = re.sub(r' l g b t ', ' lgbt ', text)
    text = re.sub(r'0km ', '0 km ', text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text

In [145]:
# Clean the headlines
clean_headlines = []

for daily_headlines in headlines:
    clean_daily_headlines = []
    for headline in daily_headlines:
        clean_daily_headlines.append(clean_text(headline))
    clean_headlines.append(clean_daily_headlines)

In [None]:
'''
#https://stackoverflow.com/questions/54396405/how-can-i-preprocess-nlp-text-lowercase-remove-special-characters-remove-numb
# check https://github.com/pemagrg1/Text-Pre-Processing-in-Python as well
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

 def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)


df['cleanText']=df['Text'].map(lambda s:preprocess(s)) 
'''

In [167]:
def word_strings(text):
    ws = set()
    for headline in text:
        ws.update(headline.split(" "))
    return ws

In [181]:
word_counts = set()
for headline in clean_headlines:
   # print(word_strings(headline))
    word_counts.update(word_strings(headline))

In [182]:
len(word_counts)

35190

In [183]:
# Load GloVe's embeddings
embeddings_index = {}
with open('../glove.840B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        embedding = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = embedding

print('Word embeddings:', len(embeddings_index))

Word embeddings: 2196017


In [184]:

# Find the number of times each word was used and the size of the vocabulary
word_counts = {}

for date in clean_headlines:
    for headline in date:
        for word in headline.split():
            if word not in word_counts:
                word_counts[word] = 1
            else:
                word_counts[word] += 1
            
print("Size of Vocabulary:", len(word_counts))

Size of Vocabulary: 35190


In [185]:
# Find the number of words that are missing from GloVe, and are used more than our threshold.
missing_words = 0
threshold = 10

for word, count in word_counts.items():
    if count > threshold:
        if word not in embeddings_index:
            missing_words += 1
            
missing_ratio = round(missing_words/len(word_counts),4)*100
            
print("Number of words missing from GloVe:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

Number of words missing from GloVe: 47
Percent of words that are missing from vocabulary: 0.13%


In [186]:

# Limit the vocab that we will use to words that appear ≥ threshold or are in GloVe

#dictionary to convert words to integers
vocab_to_int = {} 

value = 0
for word, count in word_counts.items():
    if count >= threshold or word in embeddings_index:
        vocab_to_int[word] = value
        value += 1

In [188]:
# Special tokens that will be added to our vocab
codes = ["<UNK>","<PAD>"]   

# Add codes to vocab
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

In [191]:
vocab_to_int['<UNK>'], vocab_to_int['<PAD>']

(31263, 31264)

In [192]:
vocab_to_int

{'jamaica': 0,
 'proposes': 1,
 'marijuana': 2,
 'dispensers': 3,
 'tourists': 4,
 'airports': 5,
 'following': 6,
 'legalisation': 7,
 'kiosks': 8,
 'desks': 9,
 'would': 10,
 'give': 11,
 'people': 12,
 'license': 13,
 'purchase': 14,
 '2': 15,
 'ounces': 16,
 'drug': 17,
 'use': 18,
 'stay': 19,
 'stephen': 20,
 'hawking': 21,
 'says': 22,
 'pollution': 23,
 'stupidity': 24,
 'still': 25,
 'biggest': 26,
 'threats': 27,
 'mankind': 28,
 'certainly': 29,
 'become': 30,
 'less': 31,
 'greedy': 32,
 'stupid': 33,
 'treatment': 34,
 'environment': 35,
 'past': 36,
 'decade': 37,
 'boris': 38,
 'johnson': 39,
 'run': 40,
 'tory': 41,
 'party': 42,
 'leadership': 43,
 'six': 44,
 'gay': 45,
 'men': 46,
 'ivory': 47,
 'coast': 48,
 'abused': 49,
 'forced': 50,
 'flee': 51,
 'homes': 52,
 'pictured': 53,
 'signing': 54,
 'condolence': 55,
 'book': 56,
 'victims': 57,
 'recent': 58,
 'attack': 59,
 'nightclub': 60,
 'florida': 61,
 'switzerland': 62,
 'denies': 63,
 'citizenship': 64,
 'musl

In [193]:
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word

In [194]:
usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100

print("Total Number of Unique Words:", len(word_counts))
print("Number of Words we will use:", len(vocab_to_int))
print("Percent of Words we will use: {}%".format(usage_ratio))

Total Number of Unique Words: 35190
Number of Words we will use: 31265
Percent of Words we will use: 88.85%


In [195]:
int_to_vocab

{0: 'jamaica',
 1: 'proposes',
 2: 'marijuana',
 3: 'dispensers',
 4: 'tourists',
 5: 'airports',
 6: 'following',
 7: 'legalisation',
 8: 'kiosks',
 9: 'desks',
 10: 'would',
 11: 'give',
 12: 'people',
 13: 'license',
 14: 'purchase',
 15: '2',
 16: 'ounces',
 17: 'drug',
 18: 'use',
 19: 'stay',
 20: 'stephen',
 21: 'hawking',
 22: 'says',
 23: 'pollution',
 24: 'stupidity',
 25: 'still',
 26: 'biggest',
 27: 'threats',
 28: 'mankind',
 29: 'certainly',
 30: 'become',
 31: 'less',
 32: 'greedy',
 33: 'stupid',
 34: 'treatment',
 35: 'environment',
 36: 'past',
 37: 'decade',
 38: 'boris',
 39: 'johnson',
 40: 'run',
 41: 'tory',
 42: 'party',
 43: 'leadership',
 44: 'six',
 45: 'gay',
 46: 'men',
 47: 'ivory',
 48: 'coast',
 49: 'abused',
 50: 'forced',
 51: 'flee',
 52: 'homes',
 53: 'pictured',
 54: 'signing',
 55: 'condolence',
 56: 'book',
 57: 'victims',
 58: 'recent',
 59: 'attack',
 60: 'nightclub',
 61: 'florida',
 62: 'switzerland',
 63: 'denies',
 64: 'citizenship',
 65: '

In [196]:

# Need to use 300 for embedding dimensions to match GloVe's vectors.
embedding_dim = 300

nb_words = len(vocab_to_int)
# Create matrix with default values of zero
word_embedding_matrix = np.zeros((nb_words, embedding_dim))
for word, i in vocab_to_int.items():
    if word in embeddings_index:
        word_embedding_matrix[i] = embeddings_index[word]
    else:
        # If word not in GloVe, create a random embedding for it
        new_embedding = np.array(np.random.uniform(-1.0, 1.0, embedding_dim))
        embeddings_index[word] = new_embedding
        word_embedding_matrix[i] = new_embedding

# Check if value matches len(vocab_to_int)
print(len(word_embedding_matrix))

31265


In [200]:
word_embedding_matrix[1].shape

(300,)

In [203]:
clean_headlines[0]

['jamaica proposes marijuana dispensers tourists airports following legalisation kiosks desks would give people license purchase 2 ounces drug use stay',
 'stephen hawking says pollution stupidity still biggest threats mankind certainly become less greedy less stupid treatment environment past decade',
 'boris johnson says run tory party leadership',
 'six gay men ivory coast abused forced flee homes pictured signing condolence book victims recent attack gay nightclub florida',
 'switzerland denies citizenship muslim immigrant girls refused swim boys report',
 'palestinian terrorist stabs israeli teen girl death bedroom',
 'puerto rico default $ 1 billion debt friday',
 'republic ireland fans awarded medal sportsmanship paris mayor',
 'afghan suicide bomber kills 40 bbc news',
 'us airstrikes kill least 250 isis fighters convoy outside fallujah official says',
 'turkish cop took istanbul gunman hailed hero',
 'cannabis compounds could treat alzheimer removing plaque forming proteins br

In [204]:
# Change the text from words to integers
# If word is not in vocab, replace it with <UNK> (unknown)
word_count = 0
unk_count = 0

int_headlines = []

for date in clean_headlines:
    int_daily_headlines = []
    for headline in date:
        int_headline = []
        for word in headline.split():
            word_count += 1
            if word in vocab_to_int:
                int_headline.append(vocab_to_int[word])
            else:
                int_headline.append(vocab_to_int["<UNK>"])
                unk_count += 1
        int_daily_headlines.append(int_headline)
    int_headlines.append(int_daily_headlines)

unk_percent = round(unk_count/word_count,4)*100

print("Total number of words in headlines:", word_count)
print("Total number of UNKs in headlines:", unk_count)
print("Percent of words that are UNK: {}%".format(unk_percent))

Total number of words in headlines: 615989
Total number of UNKs in headlines: 5262
Percent of words that are UNK: 0.8500000000000001%


In [205]:
int_headlines[]

[[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
  [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 31, 33, 34, 35, 36, 37],
  [38, 39, 22, 40, 41, 42, 43],
  [44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 45, 60, 61],
  [62, 63, 64, 65, 66, 67, 68, 69, 70, 71],
  [72, 73, 74, 75, 76, 77, 78, 79],
  [80, 81, 82, 83, 84, 85, 86, 87],
  [88, 89, 90, 91, 92, 93, 94, 95],
  [96, 97, 98, 99, 100, 101, 102],
  [103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 22],
  [114, 115, 116, 117, 118, 119, 120],
  [121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133],
  [134,
   135,
   136,
   137,
   138,
   139,
   140,
   141,
   142,
   103,
   73,
   143,
   144,
   145,
   146,
   22,
   147,
   65,
   148,
   149],
  [150, 151, 152, 153, 154, 155, 156],
  [157, 158, 159, 160, 161, 162, 163, 164, 165],
  [166, 167, 168, 169, 170, 171, 172, 173, 174, 175],
  [176, 177, 133, 178, 179, 180, 181, 182, 183, 184, 185],
  [46, 186, 187, 