In [1]:
import pandas as pd
import re
import string
import numpy as np
from numpy import asarray, zeros
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout

## Data Processing
This step further cleans the text data by converting text to lowercase, removing punctuation and extra spaces.

Certain stopwords affected the sentiment of the text and hence removing them would decrease the accuracy of the model. Furthermore, this project implements a neural network approach and thus
1. directional words such as ['above', 'below', 'up', 'down', 'over', 'under']
2. words that serves as a negation such as 'can't'

In [2]:
df = pd.read_csv('stock_data.csv')
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
# Check the number of positive and negative sentiments
df['Sentiment'].value_counts() # more positive than negative

Sentiment
 1    3685
-1    2106
Name: count, dtype: int64

In [4]:
# Download stopwords
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gareth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
dir_words = ['above', 'below', 'up', 'down', 'over', 'under']
neg_words = ['but', 'no', 'nor', 'not', 'don', "don't", 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
exception_words = dir_words + neg_words

In [6]:
# Define function to process the sentences
def process_text(text):
    
    cleaned = text.lower()

    # Remove punctuations
    cleaned = re.sub('[^a-zA-Z]', ' ', cleaned)

    # Remove multiple spaces
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Remove stopwords
    '''
    remove words that affect sentiment from the list of stopwords such as:
    ['above', 'below', 'up', 'down', 'over', 'under', 'no', 'nor']
    '''
    stop_words = [word for word in stopwords.words('english') if word not in exception_words] 
    pattern = re.compile(r'\b(' + r'|'.join(stop_words) + r')\b\s*') 
    cleaned = pattern.sub('', cleaned)
    
    return cleaned

In [7]:
tx = "user I'd be afraid to short AMZN - and they are looking like a near-monopoly in eBooks and infrastructure-as-a-service"
process_text(tx)

'user afraid short amzn looking like near monopoly ebooks infrastructure service'

In [8]:
df['Text'] = df['Text'].apply(lambda x: process_text(x))
df.head()

Unnamed: 0,Text,Sentiment
0,kickers watchlist xide tit soq pnk cpw bpz aj ...,1
1,user aap movie return fea geed indicator trade...,1
2,user afraid short amzn looking like near monop...,1
3,mnta over,1
4,oi over,1


In [9]:
# Check if the exceptions affects the sentiment
df.loc[df['Text'].str.contains(' | '.join(exception_words)),:]

Unnamed: 0,Text,Sentiment
3,mnta over,1
4,oi over,1
5,pgnx over,1
10,assuming fcx opens tomorrow above trigger buy ...,1
14,momentum coming back etfc broke ma resistance ...,1
...,...,...
5779,investors lose rs lakh crore worst day markets...,-1
5785,tcs share price jumps no layoffs dividend ann...,1
5787,gold prices slip below rs investors book prof...,-1
5789,sharemarket live sensex day high up points ni...,1


In [10]:
X = df['Text']
y = df['Sentiment'].replace(-1, 0)

### Word Frequency

In [11]:
# Create word freq dict
all_words = []

for sentence in X:
    words = sentence.split()
    all_words.extend(words)

freq_dist = FreqDist(all_words)

In [12]:
freq_dist

FreqDist({'aap': 929, 'co': 711, 'https': 695, 'user': 646, 'short': 457, 'up': 432, 'over': 352, 'today': 343, 'day': 324, 'volume': 307, ...})

In [13]:
# Create a list of words with freq <= threshold
'''
Attempted tweaking the threshold but did not find any significant improvements
'''
threshold = 0 # threshold for low freq words 
low_freq_list = []

for word, freq in freq_dist.items():
    if freq <= threshold: 
        low_freq_list.append(word)

low_freq_list

[]

In [14]:
# Define function to remove words with low freq
def remove_low_freq(text):
    words = text.split()
    removed = [word for word in words if word not in low_freq_list]
    return ' '.join(removed)

In [15]:
X = X.apply(lambda x: remove_low_freq(x))
X

0       kickers watchlist xide tit soq pnk cpw bpz aj ...
1       user aap movie return fea geed indicator trade...
2       user afraid short amzn looking like near monop...
3                                               mnta over
4                                                 oi over
                              ...                        
5786    industry body cii said discoms likely suffer n...
5787    gold prices slip below rs investors book profi...
5788    workers bajaj auto agreed wage cut period apri...
5789    sharemarket live sensex day high up points nif...
5790    sensex nifty climb day highs still up key fact...
Name: Text, Length: 5791, dtype: object

In [16]:
# 80 20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4632,), (1159,), (4632,), (1159,))

### Vectorizing

convert every sentence into an sequences of indexes which represent the words in the corpus.

In [17]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Create word to index dictionary
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [18]:
maxlen = 0
for i in X_train:
    if len(i) > maxlen:
        maxlen = len(i)
for i in X_test:
    if len(i) > maxlen:
        maxlen = len(i)
print(f'The maximum sentence length is {maxlen}')

The maximum sentence length is 22


In [19]:
tokenizer.word_index

{'aap': 1,
 'co': 2,
 'https': 3,
 'user': 4,
 'short': 5,
 'up': 6,
 'over': 7,
 'today': 8,
 'volume': 9,
 'day': 10,
 'long': 11,
 'like': 12,
 'but': 13,
 'not': 14,
 'good': 15,
 'stock': 16,
 'goog': 17,
 'watch': 18,
 'new': 19,
 'still': 20,
 'above': 21,
 'down': 22,
 'stop': 23,
 'nice': 24,
 'back': 25,
 'bac': 26,
 'next': 27,
 'move': 28,
 'market': 29,
 'buy': 30,
 'time': 31,
 'coronavirus': 32,
 'see': 33,
 'one': 34,
 'ong': 35,
 'higher': 36,
 'no': 37,
 'trade': 38,
 'week': 39,
 'triangle': 40,
 'stocks': 41,
 'could': 42,
 'close': 43,
 'weekly': 44,
 'sensex': 45,
 'looking': 46,
 'breakout': 47,
 'big': 48,
 'nifty': 49,
 'support': 50,
 'p': 51,
 'go': 52,
 'break': 53,
 'bullish': 54,
 'going': 55,
 'last': 56,
 'looks': 57,
 'target': 58,
 'nfx': 59,
 'green': 60,
 'amzn': 61,
 'lower': 62,
 'gap': 63,
 'position': 64,
 'highs': 65,
 'price': 66,
 'markets': 67,
 'points': 68,
 'earnings': 69,
 'f': 70,
 'high': 71,
 'rt': 72,
 'under': 73,
 'below': 74,
 'get

In [20]:
# Get number of unique words
vocab_size = len(tokenizer.word_index) + 1 # for glove
vocab_size

8026

In [21]:
# Fill sentences with len < maxlen with 0s
X_train = pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = pad_sequences(X_test, maxlen=maxlen, padding='post')

In [22]:
# GloVe Embeddings
embeddings_dict = dict()
glove_file = open('glove_files/glove.6B.50d.txt', encoding="utf8")

for line in glove_file:
    values = line.split()
    word = values[0]
    vector = asarray(values[1:], dtype='float32')
    embeddings_dict[word] = vector
glove_file.close()

embedding_matrix = zeros((vocab_size, 50))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

embedding_matrix.shape

(8026, 50)

In [23]:
# shape = (num of rows, maxlen)
X_train.shape, X_test.shape

((4632, 22), (1159, 22))

## Model Implementation

An LSTM is used as the words are treated as 'timesteps' where each word affects subsequent inputs.

Avoided overfitting by reducing epochs and layers.

In [24]:
# GLOBAL VARS
EPOCHS = 4 #10

In [25]:
# Define model
model = keras.Sequential([

    keras.layers.Embedding(input_dim=vocab_size, output_dim=50, weights=[embedding_matrix], input_length=maxlen), # GloVe embeddings
    keras.layers.LSTM(units=64, activation='relu'),
    keras.layers.Dense(units=1, activation='sigmoid')
    
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



2023-12-14 20:54:03.062013: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2023-12-14 20:54:03.062046: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-12-14 20:54:03.062053: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-12-14 20:54:03.062330: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-12-14 20:54:03.062785: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 22, 50)            401300    
                                                                 
 lstm (LSTM)                 (None, 64)                29440     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 430805 (1.64 MB)
Trainable params: 430805 (1.64 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [26]:
history = model.fit(X_train, y_train, epochs=EPOCHS, validation_split=0.2)

Epoch 1/4


2023-12-14 20:54:03.848100: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/4
Epoch 3/4
Epoch 4/4


In [27]:
test_eval = model.evaluate(X_test, y_test)
test_loss, test_acc = test_eval[0], test_eval[1]
print(f'Test accuracy: {test_acc}')

Test accuracy: 0.780845582485199
