In [0]:
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
import matplotlib.pyplot as plt
import utils_v2 as utils

In [71]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Using DJIA News headlines for stock prediction ##  

In [72]:
news_df = pd.read_csv('Combined_News_DJIA.csv')
news_df.head(2)

Unnamed: 0,Date,Label,Top1,Top2,Top3,Top4,Top5,Top6,Top7,Top8,Top9,Top10,Top11,Top12,Top13,Top14,Top15,Top16,Top17,Top18,Top19,Top20,Top21,Top22,Top23,Top24,Top25
0,2008-08-08,0,"b""Georgia 'downs two Russian warplanes' as cou...",b'BREAKING: Musharraf to be impeached.',b'Russia Today: Columns of troops roll into So...,b'Russian tanks are moving towards the capital...,"b""Afghan children raped with 'impunity,' U.N. ...",b'150 Russian tanks have entered South Ossetia...,"b""Breaking: Georgia invades South Ossetia, Rus...","b""The 'enemy combatent' trials are nothing but...",b'Georgian troops retreat from S. Osettain cap...,b'Did the U.S. Prep Georgia for War with Russia?',b'Rice Gives Green Light for Israel to Attack ...,b'Announcing:Class Action Lawsuit on Behalf of...,"b""So---Russia and Georgia are at war and the N...","b""China tells Bush to stay out of other countr...",b'Did World War III start today?',b'Georgia Invades South Ossetia - if Russia ge...,b'Al-Qaeda Faces Islamist Backlash',"b'Condoleezza Rice: ""The US would not act to p...",b'This is a busy day: The European Union has ...,"b""Georgia will withdraw 1,000 soldiers from Ir...",b'Why the Pentagon Thinks Attacking Iran is a ...,b'Caucasus in crisis: Georgia invades South Os...,b'Indian shoe manufactory - And again in a se...,b'Visitors Suffering from Mental Illnesses Ban...,"b""No Help for Mexico's Kidnapping Surge"""
1,2008-08-11,1,b'Why wont America and Nato help us? If they w...,b'Bush puts foot down on Georgian conflict',"b""Jewish Georgian minister: Thanks to Israeli ...",b'Georgian army flees in disarray as Russians ...,"b""Olympic opening ceremony fireworks 'faked'""",b'What were the Mossad with fraudulent New Zea...,b'Russia angered by Israeli military sale to G...,b'An American citizen living in S.Ossetia blam...,b'Welcome To World War IV! Now In High Definit...,"b""Georgia's move, a mistake of monumental prop...",b'Russia presses deeper into Georgia; U.S. say...,b'Abhinav Bindra wins first ever Individual Ol...,b' U.S. ship heads for Arctic to define territ...,b'Drivers in a Jerusalem taxi station threaten...,b'The French Team is Stunned by Phelps and the...,b'Israel and the US behind the Georgian aggres...,"b'""Do not believe TV, neither Russian nor Geor...",b'Riots are still going on in Montreal (Canada...,b'China to overtake US as largest manufacturer',b'War in South Ossetia [PICS]',b'Israeli Physicians Group Condemns State Tort...,b' Russia has just beaten the United States ov...,b'Perhaps *the* question about the Georgia - R...,b'Russia is so much better at war',"b""So this is what it's come to: trading sex fo..."


In [73]:
news_cols = [col for col in news_df.columns if col.__contains__("Top")] # getting news cols
news_df = pd.melt(news_df, id_vars= 'Date',value_vars=news_cols, value_name='news') # converting daily news for each day into rows
news_df.drop('variable', axis=1, inplace=True)
news_df = news_df.sort_values(['Date']).reset_index(drop=True)
news_df['Date'] = pd.to_datetime(news_df['Date'])
news_df.head()

Unnamed: 0,Date,news
0,2008-08-08,"b""Georgia 'downs two Russian warplanes' as cou..."
1,2008-08-08,"b""Georgia will withdraw 1,000 soldiers from Ir..."
2,2008-08-08,"b""So---Russia and Georgia are at war and the N..."
3,2008-08-08,"b""China tells Bush to stay out of other countr..."
4,2008-08-08,b'This is a busy day: The European Union has ...


In [74]:
print("News Time Period, From %s to %s"%(min(news_df['Date']), max(news_df['Date'])))
print("Number of days headlines: %d"%(len(news_df['Date'].unique())))
print("Total News: %d"%len(news_df))

News Time Period, From 2008-08-08 00:00:00 to 2016-07-01 00:00:00
Number of days headlines: 1989
Total News: 49725


## Cleaning news

In [75]:
print("Before Cleaning")
news_df.head(5)

Before Cleaning


Unnamed: 0,Date,news
0,2008-08-08,"b""Georgia 'downs two Russian warplanes' as cou..."
1,2008-08-08,"b""Georgia will withdraw 1,000 soldiers from Ir..."
2,2008-08-08,"b""So---Russia and Georgia are at war and the N..."
3,2008-08-08,"b""China tells Bush to stay out of other countr..."
4,2008-08-08,b'This is a busy day: The European Union has ...


In [0]:
news_df["news"] = news_df["news"].apply(utils.get_clean_text)

In [77]:
print("After cleaning")
news_df.head(5)

After cleaning


Unnamed: 0,Date,news
0,2008-08-08,georgia downs two russian warplanes countries ...
1,2008-08-08,georgia withdraw 1 000 soldiers iraq help figh...
2,2008-08-08,russia georgia war nyt top story opening cerem...
3,2008-08-08,china tells bush stay countries affairs
4,2008-08-08,busy day european union approved new sanctions...


In [0]:
utils.save_pickle('djia_clean_news', news_df)

### Getting stock change labels for time duration we have the news ##

In [0]:
company = "GOOG"
start_date = min(news_df['Date'])
end_date = max(news_df['Date'])
predict_trend = 1

In [0]:
stock_labels_df = utils.get_stock_change_labels(start=start_date,
                                                end=end_date,
                                                company=company,
                                                cutoff=.5,
                                                n_labels=3,
                                                shift=predict_trend)

In [81]:
stock_labels_df.head()

Unnamed: 0,Date,change_1
0,2008-08-08,1
1,2008-08-11,1
2,2008-08-12,0
3,2008-08-13,1
4,2008-08-14,1


In [82]:
stock_labels_df['change_1'].value_counts()

1    788
2    741
0    459
Name: change_1, dtype: int64

### Getting stock features for time duration we have the news

In [0]:
stock_data_df = utils.get_stock_data(start=start_date,
                                     end=end_date,
                                     company=company,
                                     additional_features=True)

In [84]:
stock_data_df.head()

Unnamed: 0,Date,High,Low,Open,Close,Volume,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,week_1,week_2,week_3,week_4,week_5,week_6,week_7,week_8,week_9,week_10,week_11,week_12,week_13,week_14,week_15,week_16,week_17,week_18,week_19,week_20,week_21,week_22,week_23,week_24,week_25,week_26,week_27,week_28,week_29,week_30,week_31,week_32,week_33,week_34,week_35,week_36,week_37,week_38,week_39,week_40,week_41,week_42,week_43,week_44,week_45,week_46,week_47,week_48,week_49,week_50,week_51,week_52,week_53,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12
0,2008-08-08,246.949371,236.956833,239.178497,246.58075,7506500.0,False,False,False,False,False,False,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
1,2008-08-11,253.489853,244.971786,245.315491,249.484863,8510300.0,False,False,False,False,False,False,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,2008-08-12,252.119995,248.070175,250.062698,250.366562,5532000.0,False,False,False,False,False,False,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
3,2008-08-13,250.829834,246.017868,249.863449,249.081375,7278100.0,False,False,False,False,False,False,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
4,2008-08-14,252.857224,247.218369,247.920731,251.801193,5859000.0,False,False,False,False,False,False,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0


### Preparing sequential for stock data

In [0]:
# combining stock features and labels
stock_data_df = pd.merge(stock_data_df, stock_labels_df, on="Date",how="inner")
stock_data_df = stock_data_df.set_index('Date')

In [86]:
stock_data_df.head(5)

Unnamed: 0_level_0,High,Low,Open,Close,Volume,Is_month_end,Is_month_start,Is_quarter_end,Is_quarter_start,Is_year_end,Is_year_start,dayofweek_0,dayofweek_1,dayofweek_2,dayofweek_3,dayofweek_4,week_1,week_2,week_3,week_4,week_5,week_6,week_7,week_8,week_9,week_10,week_11,week_12,week_13,week_14,week_15,week_16,week_17,week_18,week_19,week_20,week_21,week_22,week_23,week_24,week_25,week_26,week_27,week_28,week_29,week_30,week_31,week_32,week_33,week_34,week_35,week_36,week_37,week_38,week_39,week_40,week_41,week_42,week_43,week_44,week_45,week_46,week_47,week_48,week_49,week_50,week_51,week_52,week_53,month_1,month_2,month_3,month_4,month_5,month_6,month_7,month_8,month_9,month_10,month_11,month_12,change_1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1
2008-08-08,246.949371,236.956833,239.178497,246.58075,7506500.0,False,False,False,False,False,False,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2008-08-11,253.489853,244.971786,245.315491,249.484863,8510300.0,False,False,False,False,False,False,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2008-08-12,252.119995,248.070175,250.062698,250.366562,5532000.0,False,False,False,False,False,False,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
2008-08-13,250.829834,246.017868,249.863449,249.081375,7278100.0,False,False,False,False,False,False,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
2008-08-14,252.857224,247.218369,247.920731,251.801193,5859000.0,False,False,False,False,False,False,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1


In [0]:
prev = 30
X,y = utils.prepare_sequential_stock_data(stock_df=stock_data_df, label='change_' + str(1), prev=prev, scaling=True)

### Checking sequences

In [88]:
X.shape, y.shape

((1959, 30, 81), (1959,))

In [89]:
i = 0
stock_data_df.iloc[i:i+prev][['High', 'Low', 'Volume', 'change_1']]

Unnamed: 0_level_0,High,Low,Volume,change_1
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2008-08-08,246.949371,236.956833,7506500.0,1
2008-08-11,253.489853,244.971786,8510300.0,1
2008-08-12,252.119995,248.070175,5532000.0,0
2008-08-13,250.829834,246.017868,7278100.0,1
2008-08-14,252.857224,247.218369,5859000.0,1
2008-08-15,254.376541,251.806168,7117800.0,2
2008-08-18,254.04776,246.829819,6692700.0,2
2008-08-19,248.209656,242.406403,6115700.0,2
2008-08-20,247.417618,240.383987,7993900.0,2
2008-08-21,244.035294,238.740143,7054500.0,1


In [90]:
X[i][:,:3]

array([[0.17198321, 0.17673837, 0.16583725],
       [0.1819582 , 0.18918953, 0.17522483],
       [0.17986901, 0.19400284, 0.1824865 ],
       [0.17790137, 0.1908146 , 0.18218171],
       [0.18099337, 0.19267957, 0.17920999],
       [0.18331051, 0.19980667, 0.18628878],
       [0.18280908, 0.19207596, 0.18846042],
       [0.17390529, 0.18520423, 0.17367039],
       [0.17269734, 0.18206243, 0.17693929],
       [0.16753891, 0.17950873, 0.16794794],
       [0.17132226, 0.18740968, 0.17448572],
       [0.17293285, 0.18123441, 0.17037865],
       [0.16264637, 0.17279176, 0.1683594 ],
       [0.15609007, 0.16834216, 0.16094534],
       [0.1573208 , 0.17259056, 0.1600005 ],
       [0.15318798, 0.1663998 , 0.15791267],
       [0.16167395, 0.16569559, 0.16326176],
       [0.15567984, 0.16427172, 0.15713544],
       [0.14728502, 0.15639399, 0.15048337],
       [0.13909537, 0.14918177, 0.13942703],
       [0.13946001, 0.13174703, 0.14440275],
       [0.12384037, 0.12977374, 0.12241965],
       [0.

In [91]:
y[i]

2

### Saving stock data and stock label sequence

In [0]:
utils.save_pickle(filename=company +"_djia_stock_sequences", variable=X)
utils.save_pickle(filename=company +"_djia_stock_labels", variable=y)

### Dropping out news for days we don't have stock data

In [93]:
print("Number of days we have headlines: %d" %(len(news_df['Date'].unique())))

Number of days we have headlines: 1989


In [0]:
date_index_to_keep = stock_data_df.index[prev-1:]
news_df = news_df[news_df['Date'].isin(date_index_to_keep)].reset_index(drop=True)
news_df = news_df.sort_values('Date').reset_index(drop=True)

In [95]:
news_df.head()

Unnamed: 0,Date,news
0,2008-09-19,people worried short selling alex financial ca...
1,2008-09-19,spain media uproar mccain comments regarding p...
2,2008-09-19,complete list world billionaires
3,2008-09-19,venezuela expels united states rights group cr...
4,2008-09-19,chavez expells human rights watch venezuela cr...


In [96]:
print("Number of same days we have headlines and stock : %d" %(len(news_df['Date'].unique())))

Number of same days we have headlines and stock : 1959


In [97]:
news_summary = news_df.groupby('Date').agg({'news':'count'}).reset_index()
print("Max number of headlines for a day: %d, Min number of headlines for a day: %d, \
Mean number of headlines for a day: %d" %(max(news_summary['news']),
                                                min(news_summary['news']), 
                                                np.mean(news_summary['news'])))

Max number of headlines for a day: 25, Min number of headlines for a day: 25, Mean number of headlines for a day: 25


### Getting vocabulary from news corpus

In [0]:
vocab = utils.get_corpus_vocab(df=news_df, text_col='news')

In [99]:
print("Size of Vocabulary: %s"%len(vocab))

Size of Vocabulary: 35184


In [100]:
utils.print_n_dict_items(dict_=vocab, n=5)

key: people, key count: 1864 
key: worried, key count: 33 
key: short, key count: 63 
key: selling, key count: 113 
key: alex, key count: 8 


## Getting Glove Vector Enconding

In [0]:
## downloading glove word embeddings
from urllib.request import urlretrieve
urlretrieve('http://nlp.stanford.edu/data/glove.6B.zip', 'glove.6B.zip')

from zipfile import ZipFile
with ZipFile('glove.6B.zip', 'r') as zf:
    zf.extractall('')

In [0]:
## getting glove word embeddings
word_embeddings = utils.get_word_embeddings(file='glove.6B.300d.txt')

In [103]:
print('No of word embeddings: %d' %len(word_embeddings))
print('No of categories against which words are classified: %d' %len(word_embeddings['man']))

No of word embeddings: 400000
No of categories against which words are classified: 300


In [104]:
# Find the number of words that are missing from GloVe, and are used more than our threshold.
cnt_missing_words = 0
threshold = 10
"""
checking if news word count is greater than threshold, 
if greater than threshold and not present in word embeddings 
then word will be considered as missing
"""
for word, word_cnt in vocab.items():
    if word_cnt > threshold: 
        if word not in word_embeddings:
            cnt_missing_words += 1
            
ratio_missing_words = round(cnt_missing_words/len(vocab),4)*100
            
print("Number of words missing from GloVe: %d"%cnt_missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(ratio_missing_words))

Number of words missing from GloVe: 41
Percent of words that are missing from vocabulary: 0.12%


### Getting words from news with frequency greater than threshold or present in Glove embeddings

In [0]:
"""
creating news word list with word index with conditions
if either word frequency in corpus greater 
than threshold or present in Glove embeddings
"""
# dictionary to convert words to integers
vocab_to_index = {}
index = 0
for word, word_cnt in vocab.items():
    if word_cnt >= threshold or word in word_embeddings: 
        vocab_to_index[word] = index
        index += 1 # giving index

In [106]:
utils.print_n_dict_items(dict_=vocab_to_index, n=10)

key: people, key count: 0 
key: worried, key count: 1 
key: short, key count: 2 
key: selling, key count: 3 
key: alex, key count: 4 
key: financial, key count: 5 
key: cartoon, key count: 6 
key: spain, key count: 7 
key: media, key count: 8 
key: uproar, key count: 9 


In [0]:
#Getting frequency of words thst are missing in word embeddings and frequency greater than threashold
missing_words_in_embeddings = {}
for word in vocab_to_index.keys():
    if word not in word_embeddings:
        missing_words_in_embeddings[word] = vocab_to_index[word]

In [108]:
missing_words_in_embeddings

{'/r/worldnews': 7292,
 '150000': 3760,
 '200000': 12443,
 '250000': 3865,
 '300000': 4389,
 '400000': 7679,
 '500000': 4764,
 '5bn': 2776,
 '600000': 5570,
 '700000': 5571,
 '7bn': 1815,
 '90000': 1707,
 '\\n': 10360,
 '\\r\\n': 10422,
 'brexit': 29998,
 'can\\': 3408,
 'china\\': 2344,
 'daesh': 28576,
 'didn\\': 6076,
 'doesn\\': 7203,
 'don\\': 4491,
 'egypts': 19222,
 'erdoan': 26013,
 'greeces': 14066,
 'guantnamo': 6242,
 'i\\': 1168,
 'iran\\': 6324,
 'isil': 27539,
 'israel\\': 5656,
 'it\\': 1165,
 'jurez': 7557,
 'mh17': 28296,
 'mh370': 27644,
 'netanyah': 8075,
 'nsfw': 10705,
 'nusra': 26484,
 'redditors': 2008,
 'snowdens': 25984,
 'syrias': 14871,
 'there\\': 8786,
 'they\\': 5046,
 'ttip': 27790,
 'ukraines': 13820,
 'we\\': 2964,
 'won\\': 1497,
 'world\\': 2659,
 'xpost': 19369}

### Special tokens that will be added to our vocab

In [0]:
codes = ["<UNK>","<PAD>"]   
# Add codes to vocab
for code in codes:
    vocab_to_index[code] = len(vocab_to_index)

In [110]:
print(vocab_to_index["<UNK>"], vocab_to_index["<PAD>"])

31672 31673


### Dictionary to convert index to words

In [0]:
index_to_vocab = {}
for word, index in vocab_to_index.items():
    index_to_vocab[index] = word

In [112]:
utils.print_n_dict_items(dict_=index_to_vocab, n=10)

key: 0, key count: people 
key: 1, key count: worried 
key: 2, key count: short 
key: 3, key count: selling 
key: 4, key count: alex 
key: 5, key count: financial 
key: 6, key count: cartoon 
key: 7, key count: spain 
key: 8, key count: media 
key: 9, key count: uproar 


### Percentage of words used for word embeddings

In [113]:
usage_ratio = round((len(vocab_to_index)-2)/ len(vocab),4)*100 # getting how many words we are using

print("Total number of unique words in news corpus: %d" %len(vocab))
print("Number of words we will use: %d" %len(vocab_to_index))
print("Percent of words we will use: {}%".format(usage_ratio))

Total number of unique words in news corpus: 35184
Number of words we will use: 31674
Percent of words we will use: 90.02%


### Creating embedding matrix for selected words(selected words are one's with frequency greater than threshold or present in Glove's embeddings)

In [0]:
embedding_dimension = len(word_embeddings['the']) # getting embedding dimension
n_words_to_use = len(vocab_to_index) # number of words we will use including padding and unknown
word_embeddings_matrix = np.zeros((n_words_to_use, embedding_dimension)) # create matrix with default values of zero

for word, index in vocab_to_index.items():
    if word in word_embeddings: # if word is present in word embeddings 
        word_embeddings_matrix[index] = word_embeddings[word] # getting embedding from the word embeddings
    else: # if news word not in word embeddings, creating random embedding for it
        new_word_embeddings = np.array(np.random.uniform(-1.0, 1.0, embedding_dimension))
        word_embeddings_matrix[index] = new_word_embeddings # adding random word embeddings for new word 
        word_embeddings[word] = new_word_embeddings # updating word embeddings with new word

In [0]:
del word_embeddings

In [116]:
# Check if value matches len(vocab_to_index)
print("Number of words we are going to use: %d, Number of embeddings for words to be used: %d"%(len(vocab_to_index),
                                                                                               len(word_embeddings_matrix)))

Number of words we are going to use: 31674, Number of embeddings for words to be used: 31674


Note: The embeddings will be updated as the model trains, so our new 'random' embeddings will be more accurate by the end of training. This is also why we want to only use words that appear at least 10 times. By having the model see the word numerous times it will be better able to understand what it means.

### Saving word embeddings 

In [0]:
utils.save_pickle(filename='djia_words_embeddings', variable=word_embeddings_matrix)

### Converting words in text to word indexes

In [0]:
text_indexes_df = pd.DataFrame()
text_indexes_df['Date'] = news_df['Date']
text_indexes_df['news_words_indexes'] = news_df['news'].apply(utils.convert_text_to_indexes, 
                                                                               word_indexes=vocab_to_index)

In [119]:
text_indexes_df.head()

Unnamed: 0,Date,news_words_indexes
0,2008-09-19,"[0, 1, 2, 3, 4, 5, 6]"
1,2008-09-19,"[7, 8, 9, 10, 11, 12, 13, 14, 15]"
2,2008-09-19,"[16, 17, 18, 19]"
3,2008-09-19,"[20, 21, 22, 23, 24, 25, 26]"
4,2008-09-19,"[27, 31672, 28, 24, 29, 20, 30, 31]"


### Total words and total unknown words in news 

In [120]:
word_cnt_tuple = news_df['news'].apply(utils.get_word_unk_cnt, 
                                                  word_indexes=vocab_to_index)
word_cnt = [word_cnt_tuple[i][0] for i in range(len(word_cnt_tuple))]
word_cnt = np.sum(word_cnt)

unk_cnt = [word_cnt_tuple[i][1] for i in range(len(word_cnt_tuple))]
unk_cnt = np.sum(unk_cnt)

unk_percent = round(unk_cnt/word_cnt,4)*100

print("Total number of words in headlines:", word_cnt)
print("Total number of UNKs in headlines:", unk_cnt)
print("Percent of words that are UNK: {}%".format(unk_percent))


Total number of words in headlines: 597901
Total number of UNKs in headlines: 4415
Percent of words that are UNK: 0.74%


### Description about no. of words in news

In [0]:
# Find the length of headlines
lengths = [len(text_indexes_df.iloc[i]['news_words_indexes']) for i in range(len(text_indexes_df))]

# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])

In [122]:
lengths.describe()

Unnamed: 0,counts
count,48975.0
mean,12.20829
std,6.843015
min,1.0
25%,7.0
50%,10.0
75%,16.0
max,41.0


### Generating sequence of word indexes on daily basis 
Limiting no. of words in a single headline to 16(can be modified) and combining single day headlines and limiting words to 200(can be modified). Padding combined with pad if less than 200. Note- Maintainig most of the sequence of words in news for better prediction

In [0]:
text_indexes_df['news_words_indexes_fix'] = text_indexes_df['news_words_indexes'].apply(utils.add_pad, 
                                                                                        max_length = 16,
                                                                                        pad_index = vocab_to_index["<UNK>"])

In [124]:
text_indexes_df.head()

Unnamed: 0,Date,news_words_indexes,news_words_indexes_fix
0,2008-09-19,"[0, 1, 2, 3, 4, 5, 6]","[0, 1, 2, 3, 4, 5, 6, 31672]"
1,2008-09-19,"[7, 8, 9, 10, 11, 12, 13, 14, 15]","[7, 8, 9, 10, 11, 12, 13, 14, 15, 31672]"
2,2008-09-19,"[16, 17, 18, 19]","[16, 17, 18, 19, 31672]"
3,2008-09-19,"[20, 21, 22, 23, 24, 25, 26]","[20, 21, 22, 23, 24, 25, 26, 31672]"
4,2008-09-19,"[27, 31672, 28, 24, 29, 20, 30, 31]","[27, 31672, 28, 24, 29, 20, 30, 31, 31672]"


In [125]:
# Find the length of headlines
lengths = [len(text_indexes_df.iloc[i]['news_words_indexes_fix']) for i in range(len(text_indexes_df))]

# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])
lengths.describe()

Unnamed: 0,counts
count,48975.0
mean,11.615824
std,4.091085
min,2.0
25%,8.0
50%,11.0
75%,17.0
max,17.0


### Combining single day headlines word indexes

In [0]:
dates = text_indexes_df['Date'].unique()
news_sequences = pd.DataFrame(columns=['Date', 'news_sequence'])
arr= None
for date in dates:
    seq = text_indexes_df[text_indexes_df['Date']==date]['news_words_indexes_fix'].values # getting all news for a day
    long_seq = []
    for arr in seq: # iterating over news sequence for a day
        for i in arr:
            long_seq.append(i) # appending to create a single seq for a day

    news_sequences = news_sequences.append({'Date': date, 'news_sequence': long_seq}, ignore_index=True)
# news_sequences = news_sequences.sort_values('Date').reset_index(drop=True)    

In [127]:
news_sequences.head()

Unnamed: 0,Date,news_sequence
0,2008-09-19,"[0, 1, 2, 3, 4, 5, 6, 31672, 7, 8, 9, 10, 11, ..."
1,2008-09-22,"[159, 160, 161, 162, 163, 164, 165, 166, 167, ..."
2,2008-09-23,"[329, 259, 330, 331, 332, 124, 31672, 333, 334..."
3,2008-09-24,"[458, 281, 459, 460, 461, 462, 31672, 218, 463..."
4,2008-09-25,"[585, 586, 587, 291, 588, 589, 590, 591, 592, ..."


In [128]:
# Find the length of headlines
lengths = [len(news_sequences.iloc[i]['news_sequence']) for i in range(len(news_sequences))]

# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])
lengths.describe()

Unnamed: 0,counts
count,1959.0
mean,290.39561
std,25.8538
min,194.0
25%,274.0
50%,292.0
75%,308.0
max,378.0


### Limiting single day news to 308 words

In [0]:
news_sequences['news_sequence'] = news_sequences['news_sequence'].apply(utils.limiting_text_length, 
                                                                        limit_length = 308,
                                                                        pad_index = vocab_to_index["<UNK>"])

In [130]:
# Find the length of headlines
lengths = [len(news_sequences.iloc[i]['news_sequence']) for i in range(len(news_sequences))]

# Create a dataframe so that the values can be inspected
lengths = pd.DataFrame(lengths, columns=['counts'])
lengths.describe()

Unnamed: 0,counts
count,1959.0
mean,308.0
std,0.0
min,308.0
25%,308.0
50%,308.0
75%,308.0
max,308.0


### Saving news sequences

In [0]:
utils.save_pickle(filename="djia_news_sequences", variable=news_sequences)