# Modeling Stock Market Sentiment with LSTM 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

In [73]:
# importing libraries

import os
import re
import string
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from collections import Counter

In [4]:
# Display

pd.set_option('max_colwidth', 800)
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [5]:
# current directory
os.getcwd()

'/home/siddharth/workspace-python/LSTM_stock_market_sentiment'

## Processing Data


Data used here is from **StockTwits.com** which is a social media network for traders and investors to share their views about the stock market. When a user posts a message, they tag the relevant stock ticker [$SPY in our case which is for S&P 500 index fund] and have option to tag the message with their sentiment - "bullish" or "bearish"

#### Read and view data

In [6]:
# read data from csv file
data = pd.read_csv('StockTwits_SPY_Sentiment_2017.gz',encoding='utf-8',index_col=0)

In [7]:
data.head()

Unnamed: 0,message,sentiment
0,$SPY crazy day so far!,bearish
1,$SPY Will make a new ATH this week. Watch it!,bullish
2,$SPY $DJIA white elephant in room is $AAPL. Up 14% since election. Strong headwinds w/Trump trade & Strong dollar. How many 7's do you see?,bearish
3,$SPY blocks above. We break above them We should push to double top,bullish
4,"$SPY Nothing happening in the market today, guess I'll go to the store and spend some $.",bearish


In [8]:
# Defining text messages and their labels

messages = data.message.values
labels = data.sentiment.values

#### Preprocess messages

Preprocessing the raw text data to normalize for the context. Normalizing for known unique 'entities' that carry similar contextual meaning. 

Therefore replacing the references to 
* specific stock ticker ($SPY), 
* user names, 
* url links,
* numbers with special tokenidentifying the entity 

Converting text into lower case and removing punctuations.               

In [9]:
def preprocess_messages(text):
    
    
    # SAVING REGEX PATTERNS
    REGEX_PRICE_SIGN = re.compile(r'\$(?!\d*\.?\d+%)\d*\.?\d+|(?!\d*\.?\d+%)\d*\.?\d+\$')
    REGEX_PRICE_NOSIGN = re.compile(r'(?!\d*\.?\d+%)(?!\d*\.?\d+k)\d*\.?\d+')
    REGEX_TICKER = re.compile('\$[a-zA-Z]+')
    REGEX_USER = re.compile('\@\w+')
    REGEX_LINK = re.compile('https?:\/\/[^\s]+')
    REGEX_HTML_ENTITY = re.compile('\&\w+')
    REGEX_NON_ACSII = re.compile('[^\x00-\x7f]')
    
    #string.punctuation - '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    #string.punctuation.replace('<', '').replace('>', '')
    #--> '!"#$%&\'()*+,-./:;=?@[\\]^_`{|}~'
    #re.escape(string.punctuation.replace('<', ''))
    #--> '\\!\\"\\#\\$\\%\\&\\\'\\(\\)\\*\\+\\,\\-\\.\\/\\:\\;\\=\\>\\?\\@\\[\\\\\\]\\^_\\`\\{\\|\\}\\~'
    
    REGEX_PUNCTUATION = re.compile('[%s]' % re.escape(string.punctuation.replace('<', '').replace('>', '')))
    REGEX_NUMBER = re.compile(r'[-+]?[0-9]+')
    
    
    # CONVERTING TO LOWERCASE
    text = text.lower()
    
    # REPLACE ST "ENTITITES" WITH A UNIQUE TOKEN
    text = re.sub(REGEX_TICKER, ' <TICKER> ', text)
    text = re.sub(REGEX_USER, ' <USER> ', text)
    text = re.sub(REGEX_LINK, ' <LINK> ', text)
    text = re.sub(REGEX_PRICE_SIGN, ' <PRICE> ', text)
    text = re.sub(REGEX_PRICE_NOSIGN, ' <NUMBER> ', text)
    text = re.sub(REGEX_NUMBER, ' <NUMBER> ', text)
    # REMOVE EXTRANEOUS TEXT DATA
    text = re.sub(REGEX_HTML_ENTITY, "", text)
    text = re.sub(REGEX_NON_ACSII, "", text)
    text = re.sub(REGEX_PUNCTUATION, "", text)
    
    # Tokenizing and removing < and > that are not in special tokens
    words = " ".join(token.replace("<", "").replace(">", "")
                     if token not in ['<TICKER>', '<USER>', '<LINK>', '<PRICE>', '<NUMBER>']
                     else token
                     for token in text.split())

    return words

In [10]:
messages = np.array([preprocess_messages(msg) for msg in messages])

#### Generate Vocab to index mapping

Encoding words to numbers for the alogrithm to work with inputs by encoding each word to a unique index.

In [11]:
vocab = " ".join(messages).split()

In [12]:
len(vocab)

1267980

In [13]:
len(set(vocab))

31980

In [14]:
word_idx = {word:idx for idx,word in enumerate(set(vocab),1)}
idx_word = {idx:word for word,idx in word_idx.items()}    

#### Checking messages length

In [15]:
message_len = [len(msg) for msg in messages]

print('Minimum length : ',min(message_len))
print('Maximum length : ',max(message_len))
print('Mean length : ',np.mean(message_len))

Minimum length :  0
Maximum length :  244
Mean length :  78.21856920395598


In [16]:
min_idx = [i  for i in range(len(message_len)) if message_len[i]==0]
print("Indexes where message length is 0 :",min_idx)

Indexes where message length is 0 : [88808]


In [17]:
print('messages length: ',len(messages))
print('no of labels: ',len(labels))

messages length:  96967
no of labels:  96967


In [18]:
# dropping zero message length message

messages = np.delete(messages,min_idx)
labels = np.delete(labels,min_idx)

In [20]:
print('messages length after removing of zero length messages: ',len(messages))
print('no of labels after removing of zero length messages: ',len(labels))

messages length after removing of zero length messages:  96966
no of labels after removing of zero length messages:  96966


#### Encoding Messages and Labels to the indexes

In [24]:
def encode_messages(messages,word_idx):
    encoded_msg = [] 
    for msg in messages:
        encoded_msg.append([word_idx[word] for word in msg.split()])
    
    return np.array(encoded_msg)

In [25]:
encoded_msg = encode_messages(messages,word_idx)
encoded_msg

array([list([8687, 9942, 9264, 19861, 11725]),
       list([8687, 18485, 1068, 18440, 29932, 26382, 12761, 20319, 2173, 23849]),
       list([8687, 8687, 24655, 3609, 31956, 12807, 25511, 8687, 3662, 17562, 14481, 30034, 7145, 23443, 14792, 1396, 7145, 1347, 9849, 10646, 17562, 3842, 28742, 8580, 5145]),
       ..., list([8687, 19220, 7237, 31787, 30939, 13572, 21394]),
       list([8687, 17562, 13945, 18867, 28430, 25511, 11267, 21081, 8947, 21042]),
       list([8687, 8442, 25511, 18440, 9264, 11739, 28717, 29840, 16168, 8580, 30862, 18440, 17562, 29291, 29972, 11715, 8580, 23352, 21542, 1898, 1106, 25370, 12761, 180, 8687, 8687, 8687])],
      dtype=object)

In [30]:
data.sentiment.nunique()

2

In [31]:
data.sentiment.value_counts()

bullish    53704
bearish    43263
Name: sentiment, dtype: int64

In [32]:
def encode_labels(labels):
    return np.array([0 if label=='bullish' else 1 for label in labels ])

In [35]:
encoded_label = encode_labels(labels)
encoded_label

array([1, 0, 1, ..., 1, 0, 0])

#### Zero Padding the messages

In [62]:
# finding the maximum sentence

len_encoded_msg = [len(i) for i in encoded_msg]
seq_len = max(len_encoded_msg)

In [63]:
# padding the encoded_messages

padd_msg = np.zeros((len(encoded_msg),seq_len))

for i,message in enumerate(encoded_msg):
    padd_msg[i,seq_len-len(message):] = message

In [71]:
padd_msg.shape

(96966, 39)