In [0]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import datetime
import utils_v2 as utils

In [2]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Using DJIA News headlines for stock prediction ##  

In [3]:
news_df = pd.read_csv('uci-news-aggregator.csv')
news_df.head(2)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698
1,2,Fed's Charles Plosser sees high bar for change...,http://www.livemint.com/Politics/H2EvwJSK2VE6O...,Livemint,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.livemint.com,1394470371207


In [4]:
news_df = news_df[news_df['CATEGORY'] == 't'].reset_index(drop = True)
news_df = news_df[['TIMESTAMP', 'TITLE']]
news_df.columns = ['Date', 'news'] 
news_df['Date'] = news_df['Date'].apply(lambda x: datetime.datetime.fromtimestamp(int(int(x)/1000)).strftime('%Y-%m-%d'))
news_df['Date'] = pd.to_datetime(news_df['Date'])
news_df.head()

Unnamed: 0,Date,news
0,2014-03-10,Titanfall Review Roundup
1,2014-03-10,Titanfall review: my buddy
2,2014-03-10,Respawn: Titanfall's server stability is in Mi...
3,2014-03-10,Titanfall's Resolution 'Likely' to be Increase...
4,2014-03-10,Xbox One Titanfall Bundle Release Date Tomorro...


In [5]:
print("News Time Period, From %s to %s"%(min(news_df['Date']), max(news_df['Date'])))
print("Number of days headlines: %d"%(len(news_df['Date'].unique())))
print("Total News: %d"%len(news_df))

News Time Period, From 2014-03-10 00:00:00 to 2014-08-28 00:00:00
Number of days headlines: 95
Total News: 108344


## Cleaning news

In [0]:
news_df["news"] = news_df["news"].apply(utils.get_clean_text)

In [7]:
print("After cleaning")
news_df.head(5)

After cleaning


Unnamed: 0,Date,news
0,2014-03-10,titanfall review roundup
1,2014-03-10,titanfall review buddy
2,2014-03-10,respawn titanfall server stability microsoft a...
3,2014-03-10,titanfall resolution likely increased patch
4,2014-03-10,xbox one titanfall bundle release date tomorro...


In [0]:
utils.save_pickle('uci_tech_clean_news', news_df)

### Getting stock change labels for time duration we have the news ##

In [0]:
company = "GOOG"
start_date = min(news_df['Date'])
end_date = max(news_df['Date'])
predict_trend = 1

In [0]:
stock_labels_df = utils.get_stock_change_labels(start=start_date,
                                                end=end_date,
                                                company=company,
                                                cutoff=.5,
                                                n_labels=3,
                                                shift=predict_trend)

In [11]:
stock_labels_df.head()

Unnamed: 0,Date,change_1
0,2014-03-10,2
1,2014-03-11,2
2,2014-03-12,2
3,2014-03-13,2
4,2014-03-14,1


In [12]:
stock_labels_df['change_1'].value_counts()

2    53
1    41
0    26
Name: change_1, dtype: int64

### Getting stock features for time duration we have the news

In [0]:
stock_data_df = utils.get_stock_data(start=min(news_df['Date']),
                                     end=max(news_df['Date']),
                                     company="GOOG",
                                     additional_features=True)

In [14]:
stock_data_df.head(7)

Unnamed: 0,Date,High,Low,Open,Close,Volume,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,week_11,week_12,week_13,week_14,week_15,week_16,week_17,week_18,week_19,week_20,week_21,week_22,week_23,week_24,week_25,week_26,week_27,week_28,week_29,week_30,week_31,week_32,week_33,week_34,week_35,month_3,month_4,month_5,month_6,month_7,month_8
0,2014-03-10,606.546509,599.796814,605.575134,603.522827,2438200.0,False,False,False,False,False,False,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
1,2014-03-11,604.8927,596.085754,604.618713,597.754456,3438800.0,False,False,False,False,False,False,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2,2014-03-12,601.6698,589.883972,595.966187,601.395813,3943200.0,False,False,False,False,False,False,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,2014-03-13,602.989868,590.167908,601.719604,592.309875,4708600.0,False,False,False,False,False,False,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
4,2014-03-14,593.211487,584.075745,588.788086,584.210266,4604900.0,False,False,False,False,False,False,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
5,2014-03-17,596.299927,586.855347,587.423218,593.824219,4340300.0,False,False,False,False,False,False,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
6,2014-03-18,603.50293,594.302429,595.104431,603.368408,3643500.0,False,False,False,False,False,False,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


### Preparing sequential for stock data

In [0]:
# combining stock features and labels
stock_data_df = pd.merge(stock_data_df, stock_labels_df, on="Date",how="inner")
stock_data_df = stock_data_df.set_index('Date')

In [16]:
stock_data_df.head(5)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,week_11,week_12,week_13,week_14,week_15,week_16,week_17,week_18,week_19,week_20,week_21,week_22,week_23,week_24,week_25,week_26,week_27,week_28,week_29,week_30,week_31,week_32,week_33,week_34,week_35,month_3,month_4,month_5,month_6,month_7,month_8,change_1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1
2014-03-10,606.546509,599.796814,605.575134,603.522827,2438200.0,False,False,False,False,False,False,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2
2014-03-11,604.8927,596.085754,604.618713,597.754456,3438800.0,False,False,False,False,False,False,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2
2014-03-12,601.6698,589.883972,595.966187,601.395813,3943200.0,False,False,False,False,False,False,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2
2014-03-13,602.989868,590.167908,601.719604,592.309875,4708600.0,False,False,False,False,False,False,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2
2014-03-14,593.211487,584.075745,588.788086,584.210266,4604900.0,False,False,False,False,False,False,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1


In [0]:
prev = 7
X,y = utils.prepare_sequential_stock_data(stock_df=stock_data_df, label='change_' + str(1), prev=prev, scaling=True)

### Checking sequences

In [18]:
X.shape, y.shape

((114, 7, 47), (114,))

In [19]:
i = 0
stock_data_df.iloc[i:i+prev][['High', 'Low', 'Volume', 'change_1']]

Unnamed: 0_level_0,High,Low,Volume,change_1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014-03-10,606.546509,599.796814,2438200.0,2
2014-03-11,604.8927,596.085754,3438800.0,2
2014-03-12,601.6698,589.883972,3943200.0,2
2014-03-13,602.989868,590.167908,4708600.0,2
2014-03-14,593.211487,584.075745,4604900.0,1
2014-03-17,596.299927,586.855347,4340300.0,1
2014-03-18,603.50293,594.302429,3643500.0,0


In [20]:
X[i][:,:3]

array([[1.        , 1.        , 1.        ],
       [0.98188226, 0.96227581, 0.99029086],
       [0.94657487, 0.89923257, 0.90245445],
       [0.96103643, 0.90211888, 0.96086046],
       [0.85391271, 0.84018995, 0.82958573],
       [0.88774706, 0.86844556, 0.81573023],
       [0.9666571 , 0.9441477 , 0.89370632]])

In [21]:
y[i]

0

### Dropping out news for days we don't have stock data

In [22]:
print("Number of days we have headlines: %d" %(len(news_df['Date'].unique())))

Number of days we have headlines: 95


In [0]:
date_index_to_keep = stock_data_df.index[prev-1:]
news_df = news_df[news_df['Date'].isin(date_index_to_keep)].reset_index(drop=True)
news_df = news_df.sort_values('Date').reset_index(drop=True)

In [24]:
news_df.head()

Unnamed: 0,Date,news
0,2014-03-18,long sought evidence spotted universe early gr...
1,2014-03-18,friends improvements coming xbox one april
2,2014-03-18,xbox one april system update brings game/app s...
3,2014-03-18,xbox one april update bring yet user experienc...
4,2014-03-18,microsoft take xbox one 26 new countries septe...


In [25]:
print("Number of same days we have headlines and stock : %d" %(len(news_df['Date'].unique())))

Number of same days we have headlines and stock : 69


In [26]:
news_summary = news_df.groupby('Date').agg({'news':'count'}).reset_index()
print("Max number of headlines for a day: %d, Min number of headlines for a day: %d, \
Mean number of headlines for a day: %d" %(max(news_summary['news']),
                                                min(news_summary['news']), 
                                                np.mean(news_summary['news'])))

Max number of headlines for a day: 2438, Min number of headlines for a day: 413, Mean number of headlines for a day: 1190


### Duplicating stock data label sequences and saving them

In [0]:
stock_data_seq = []
stock_label_seq = []
for date in news_df["Date"]:
    arr_pos = np.where(date_index_to_keep == date)[0][0]
    stock_data_seq.append(X[arr_pos])
    stock_label_seq.append(y[arr_pos])
stock_data_seq = np.array(stock_data_seq)
stock_label_seq = np.array(stock_label_seq)

In [28]:
stock_data_seq.shape

(82144, 7, 47)

In [0]:
utils.save_pickle(filename=company +"_uci_stock_sequences", variable=stock_data_seq)
utils.save_pickle(filename=company +"_uci_stock_labels", variable=stock_label_seq)

### Getting vocabulary from news corpus

In [0]:
vocab = utils.get_corpus_vocab(df=news_df, text_col='news')

In [31]:
print("Size of Vocabulary: %s"%len(vocab))

Size of Vocabulary: 23784


In [32]:
utils.print_n_dict_items(dict_=vocab, n=5)

key: long, key count: 221 
key: sought, key count: 18 
key: evidence, key count: 165 
key: spotted, key count: 157 
key: universe, key count: 231 


## Getting Glove Vector Enconding

In [0]:
## downloading glove word embeddings
from urllib.request import urlretrieve
urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip', 'glove.6B.zip')

from zipfile import ZipFile
with ZipFile('glove.6B.zip', 'r') as zf:
    zf.extractall('')

In [0]:
## getting glove word embeddings
word_embeddings = utils.get_word_embeddings(file='glove.6B.300d.txt')

In [35]:
print('No of word embeddings: %d' %len(word_embeddings))
print('No of categories against which words are classified: %d' %len(word_embeddings['man']))

No of word embeddings: 400000
No of categories against which words are classified: 300


In [36]:
# Find the number of words that are missing from GloVe, and are used more than our threshold.
cnt_missing_words = 0
threshold = 10
"""
checking if news word count is greater than threshold, 
if greater than threshold and not present in word embeddings 
then word will be considered as missing
"""
for word, word_cnt in vocab.items():
    if word_cnt > threshold: 
        if word not in word_embeddings:
            cnt_missing_words += 1
            
ratio_missing_words = round(cnt_missing_words/len(vocab),4)*100
            
print("Number of words missing from GloVe: %d"%cnt_missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(ratio_missing_words))

Number of words missing from GloVe: 164
Percent of words that are missing from vocabulary: 0.69%


### Getting words from news with frequency greater than threshold or present in Glove embeddings

In [0]:
"""
creating news word list with word index with conditions
if either word frequency in corpus greater 
than threshold or present in Glove embeddings
"""
# dictionary to convert words to integers
vocab_to_index = {}
index = 0
for word, word_cnt in vocab.items():
    if word_cnt >= threshold or word in word_embeddings: 
        vocab_to_index[word] = index
        index += 1 # giving index

In [38]:
utils.print_n_dict_items(dict_=vocab_to_index, n=10)

key: long, key count: 0 
key: sought, key count: 1 
key: evidence, key count: 2 
key: spotted, key count: 3 
key: universe, key count: 4 
key: early, key count: 5 
key: growth, key count: 6 
key: spurt, key count: 7 
key: friends, key count: 8 
key: improvements, key count: 9 


In [0]:
#Getting frequency of words thst are missing in word embeddings and frequency greater than threashold
missing_words_in_embeddings = {}
for word in vocab_to_index.keys():
    if word not in word_embeddings:
        missing_words_in_embeddings[word] = vocab_to_index[word]

In [40]:
missing_words_in_embeddings

{'//': 2772,
 '//www': 9627,
 '119m': 12996,
 '13mp': 12940,
 '145m': 15159,
 '15gb': 17093,
 '186f': 10791,
 '1tb': 13320,
 '218000': 14857,
 '240000': 16538,
 '284000': 12609,
 '29999': 9240,
 '2fmusktfhziyo3u0m': 9626,
 '33000': 17666,
 '36000': 15227,
 '36340': 11180,
 '400000': 9100,
 '42000': 8048,
 '435000': 9089,
 '440000': 15114,
 '500000': 151,
 '50mp': 2342,
 '51000': 5368,
 '51500': 5474,
 '555m': 17052,
 '56000': 13568,
 '56214': 13573,
 '5mp': 12487,
 '600000': 8819,
 '60fps': 17610,
 '6999': 14368,
 '700000': 12293,
 '700k': 12298,
 '883000': 19386,
 '883k': 19388,
 '900000': 1071,
 '970m': 19674,
 'amzn': 5557,
 'antutu': 8102,
 'aquarid': 12930,
 'aquarids': 12933,
 'bardarbunga': 19637,
 'batterybox': 3681,
 'bb10': 6337,
 'bbry': 6346,
 'beovision': 13767,
 'blackshades': 14464,
 'blinkfeed': 3950,
 'camelopardalid': 15239,
 'camelopardalids': 15240,
 'cameraphone': 11391,
 'carplay': 7119,
 'chadder': 13191,
 'chariklo': 5575,
 'chromebooks': 13289,
 'chromecast': 1

### Special tokens that will be added to our vocab

In [0]:
codes = ["<UNK>","<PAD>"]   
# Add codes to vocab
for code in codes:
    vocab_to_index[code] = len(vocab_to_index)

In [42]:
print(vocab_to_index["<UNK>"], vocab_to_index["<PAD>"])

19772 19773


### Dictionary to convert index to words

In [0]:
index_to_vocab = {}
for word, index in vocab_to_index.items():
    index_to_vocab[index] = word

In [44]:
utils.print_n_dict_items(dict_=index_to_vocab, n=10)

key: 0, key count: long 
key: 1, key count: sought 
key: 2, key count: evidence 
key: 3, key count: spotted 
key: 4, key count: universe 
key: 5, key count: early 
key: 6, key count: growth 
key: 7, key count: spurt 
key: 8, key count: friends 
key: 9, key count: improvements 


### Percentage of words used for word embeddings

In [45]:
usage_ratio = round((len(vocab_to_index)-2)/ len(vocab),4)*100 # getting how many words we are using

print("Total number of unique words in news corpus: %d" %len(vocab))
print("Number of words we will use: %d" %len(vocab_to_index))
print("Percent of words we will use: {}%".format(usage_ratio))

Total number of unique words in news corpus: 23784
Number of words we will use: 19774
Percent of words we will use: 83.13000000000001%


### Creating embedding matrix for selected words(selected words are one's with frequency greater than threshold or present in Glove's embeddings)

In [0]:
embedding_dimension = len(word_embeddings['the']) # getting embedding dimension
n_words_to_use = len(vocab_to_index) # number of words we will use including padding and unknown
word_embeddings_matrix = np.zeros((n_words_to_use, embedding_dimension)) # create matrix with default values of zero

for word, index in vocab_to_index.items():
    if word in word_embeddings: # if word is present in word embeddings 
        word_embeddings_matrix[index] = word_embeddings[word] # getting embedding from the word embeddings
    else: # if news word not in word embeddings, creating random embedding for it
        new_word_embeddings = np.array(np.random.uniform(-1.0, 1.0, embedding_dimension))
        word_embeddings_matrix[index] = new_word_embeddings # adding random word embeddings for new word 
        word_embeddings[word] = new_word_embeddings # updating word embeddings with new word

In [0]:
del word_embeddings

In [48]:
# Check if value matches len(vocab_to_index)
print("Number of words we are going to use: %d, Number of embeddings for words to be used: %d"%(len(vocab_to_index),
                                                                                               len(word_embeddings_matrix)))

Number of words we are going to use: 19774, Number of embeddings for words to be used: 19774


Note: The embeddings will be updated as the model trains, so our new 'random' embeddings will be more accurate by the end of training. This is also why we want to only use words that appear at least 10 times. By having the model see the word numerous times it will be better able to understand what it means.

### Saving word embeddings 

In [0]:
utils.save_pickle(filename='uci_words_embeddings', variable=word_embeddings_matrix)

### Converting words in text to word indexes

In [0]:
text_indexes_df = pd.DataFrame()
text_indexes_df['Date'] = news_df['Date']
text_indexes_df['news_words_indexes'] = news_df['news'].apply(utils.convert_text_to_indexes, 
                                                                               word_indexes=vocab_to_index)

In [51]:
text_indexes_df.head()

Unnamed: 0,Date,news_words_indexes
0,2014-03-18,"[0, 1, 2, 3, 4, 5, 6, 7]"
1,2014-03-18,"[8, 9, 10, 11, 12, 13]"
2,2014-03-18,"[11, 12, 13, 14, 15, 16, 19772, 17, 18, 19, 20]"
3,2014-03-18,"[11, 12, 13, 15, 21, 22, 23, 24, 9]"
4,2014-03-18,"[25, 26, 11, 12, 27, 28, 29, 30]"


### Total words and total unknown words in news 

In [52]:
word_cnt_tuple = news_df['news'].apply(utils.get_word_unk_cnt, 
                                                  word_indexes=vocab_to_index)
word_cnt = [word_cnt_tuple[i][0] for i in range(len(word_cnt_tuple))]
word_cnt = np.sum(word_cnt)

unk_cnt = [word_cnt_tuple[i][1] for i in range(len(word_cnt_tuple))]
unk_cnt = np.sum(unk_cnt)

unk_percent = round(unk_cnt/word_cnt,4)*100

print("Total number of words in headlines:", word_cnt)
print("Total number of UNKs in headlines:", unk_cnt)
print("Percent of words that are UNK: {}%".format(unk_percent))


Total number of words in headlines: 600782
Total number of UNKs in headlines: 6200
Percent of words that are UNK: 1.03%


### Description about no. of words in news

In [0]:
# Find the length of headlines
lengths = [len(text_indexes_df.iloc[i]['news_words_indexes']) for i in range(len(text_indexes_df))]

# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])

In [54]:
lengths.describe()

Unnamed: 0,counts
count,82144.0
mean,7.313766
std,11.575638
min,0.0
25%,6.0
50%,7.0
75%,9.0
max,2884.0


### Generating sequence of word indexes on daily basis 
Limiting no. of words in a single headline to 15(can be modified) and combining single day headlines.

In [0]:
text_indexes_df['news_sequence'] = text_indexes_df['news_words_indexes'].apply(utils.limiting_text_length, 
                                                                                            limit_length = 15,
                                                                                            pad_index = vocab_to_index["<UNK>"])

In [56]:
text_indexes_df.head()

Unnamed: 0,Date,news_words_indexes,news_sequence
0,2014-03-18,"[0, 1, 2, 3, 4, 5, 6, 7]","[0, 1, 2, 3, 4, 5, 6, 7, 19772, 19772, 19772, ..."
1,2014-03-18,"[8, 9, 10, 11, 12, 13]","[8, 9, 10, 11, 12, 13, 19772, 19772, 19772, 19..."
2,2014-03-18,"[11, 12, 13, 14, 15, 16, 19772, 17, 18, 19, 20]","[11, 12, 13, 14, 15, 16, 19772, 17, 18, 19, 20..."
3,2014-03-18,"[11, 12, 13, 15, 21, 22, 23, 24, 9]","[11, 12, 13, 15, 21, 22, 23, 24, 9, 19772, 197..."
4,2014-03-18,"[25, 26, 11, 12, 27, 28, 29, 30]","[25, 26, 11, 12, 27, 28, 29, 30, 19772, 19772,..."


### Saving news sequences

In [0]:
text_indexes_df = text_indexes_df[['Date', 'news_sequence']]

In [0]:
utils.save_pickle(filename="uci_news_sequences", variable=text_indexes_df)