In [1]:
import pandas as pd
import re
import string
import numpy as np
from numpy import asarray, zeros
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist
from sklearn.model_selection import train_test_split

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout

## Data Processing
This step further cleans the text data by converting text to lowercase, removing punctuation and extra spaces.

Certain stopwords affected the sentiment of the text and hence removing them would decrease the accuracy of the model. Furthermore, this project implements a neural network approach and thus
1. directional words such as ['above', 'below', 'up', 'down', 'over', 'under']
2. words that serves as a negation such as 'can't'

In [2]:
df = pd.read_csv('stock_data.csv')
df.head()

Unnamed: 0,Text,Sentiment
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,1
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,1
2,user I'd be afraid to short AMZN - they are lo...,1
3,MNTA Over 12.00,1
4,OI Over 21.37,1


In [3]:
# Check the number of positive and negative sentiments
df['Sentiment'].value_counts() # more positive than negative

Sentiment
 1    3685
-1    2106
Name: count, dtype: int64

In [4]:
# Download stopwords
nltk.download('stopwords')
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gareth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
dir_words = ['above', 'below', 'up', 'down', 'over', 'under']
neg_words = ['but', 'no', 'nor', 'not', 'don', "don't", 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"]
exception_words = dir_words + neg_words

In [6]:
# Define function to process the sentences
def process_text(text):
    
    cleaned = text.lower()

    # Remove punctuations
    cleaned = re.sub('[^a-zA-Z]', ' ', cleaned)

    # Remove multiple spaces
    cleaned = re.sub(r'\s+', ' ', cleaned)

    # Remove stopwords
    '''
    remove words that affect sentiment from the list of stopwords such as:
    ['above', 'below', 'up', 'down', 'over', 'under', 'no', 'nor']
    '''
    stop_words = [word for word in stopwords.words('english') if word not in exception_words] 
    pattern = re.compile(r'\b(' + r'|'.join(stop_words) + r')\b\s*') 
    cleaned = pattern.sub('', cleaned)
    
    return cleaned

In [7]:
tx = "user I'd be afraid to short AMZN - and they are looking like a near-monopoly in eBooks and infrastructure-as-a-service"
process_text(tx)

'user afraid short amzn looking like near monopoly ebooks infrastructure service'

### Stemming

In [8]:
# Define function to stem the sentences
stemmer = PorterStemmer()

def stem_text(text):
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

In [9]:
tx = 'kickers watchlist xide tit soq pnk cpw bpz aj trade method method see prev posts'
stem_text(tx)

'kicker watchlist xide tit soq pnk cpw bpz aj trade method method see prev post'

In [10]:
df['Text'] = df['Text'].apply(lambda x: stem_text(process_text(x)))
df.head()

Unnamed: 0,Text,Sentiment
0,kicker watchlist xide tit soq pnk cpw bpz aj t...,1
1,user aap movi return fea geed indic trade year...,1
2,user afraid short amzn look like near monopoli...,1
3,mnta over,1
4,oi over,1


In [11]:
# Check if the exceptions affects the sentiment
df.loc[df['Text'].str.contains(' | '.join(exception_words)),:]

Unnamed: 0,Text,Sentiment
14,momentum come back etfc broke ma resist solid ...,1
16,user gameplan shot today but like trend break ...,1
25,vs invert head shoulder play well wasn abl cat...,1
26,red not readi break,-1
28,user bac quick trade late but invest good entr...,1
...,...,...
5779,investor lose rs lakh crore worst day market o...,-1
5785,tc share price jump no layoff dividend announc...,1
5787,gold price slip below rs investor book profit ...,-1
5789,sharemarket live sensex day high up point nift...,1


In [12]:
X = df['Text']
y = df['Sentiment'].replace(-1, 0)

### Word Frequency

In [13]:
# Create word freq dict
all_words = []

for sentence in X:
    words = sentence.split()
    all_words.extend(words)

freq_dist = FreqDist(all_words)

In [14]:
freq_dist

FreqDist({'aap': 929, 'co': 711, 'http': 696, 'user': 648, 'short': 522, 'up': 440, 'day': 385, 'stock': 372, 'over': 352, 'today': 347, ...})

In [15]:
# Create a list of words with freq <= threshold
'''
Attempted tweaking the threshold but did not find any significant improvements
'''
threshold = 0 # threshold for low freq words 
low_freq_list = []

for word, freq in freq_dist.items():
    if freq <= threshold: 
        low_freq_list.append(word)

low_freq_list

[]

In [16]:
# Define function to remove words with low freq
def remove_low_freq(text):
    words = text.split()
    removed = [word for word in words if word not in low_freq_list]
    return ' '.join(removed)

In [17]:
X = X.apply(lambda x: remove_low_freq(x))
X

0       kicker watchlist xide tit soq pnk cpw bpz aj t...
1       user aap movi return fea geed indic trade year...
2       user afraid short amzn look like near monopoli...
3                                               mnta over
4                                                 oi over
                              ...                        
5786    industri bodi cii said discom like suffer net ...
5787    gold price slip below rs investor book profit ...
5788    worker bajaj auto agre wage cut period april t...
5789    sharemarket live sensex day high up point nift...
5790    sensex nifti climb day high still up key facto...
Name: Text, Length: 5791, dtype: object

In [18]:
# 80 20 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4632,), (1159,), (4632,), (1159,))

### Vectorizing

convert every sentence into an sequences of indexes which represent the words in the corpus.

In [19]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# Create word to index dictionary
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [20]:
maxlen = 0
for i in X_train:
    if len(i) > maxlen:
        maxlen = len(i)
for i in X_test:
    if len(i) > maxlen:
        maxlen = len(i)
print(f'The maximum sentence length is {maxlen}')

#maxlen = 50 # for glove 

The maximum sentence length is 22


In [21]:
tokenizer.word_index

{'aap': 1,
 'co': 2,
 'http': 3,
 'user': 4,
 'short': 5,
 'up': 6,
 'stock': 7,
 'day': 8,
 'today': 9,
 'over': 10,
 'like': 11,
 'look': 12,
 'volum': 13,
 'market': 14,
 'buy': 15,
 'long': 16,
 'but': 17,
 'stop': 18,
 'move': 19,
 'watch': 20,
 'go': 21,
 'trade': 22,
 'break': 23,
 'not': 24,
 'good': 25,
 'nice': 26,
 'goog': 27,
 'high': 28,
 'new': 29,
 'still': 30,
 'abov': 31,
 'back': 32,
 'down': 33,
 'close': 34,
 'bac': 35,
 'time': 36,
 'next': 37,
 'week': 38,
 'see': 39,
 'coronaviru': 40,
 'call': 41,
 'get': 42,
 'ong': 43,
 'price': 44,
 'one': 45,
 'posit': 46,
 'higher': 47,
 'hold': 48,
 'no': 49,
 'point': 50,
 'sell': 51,
 'share': 52,
 'triangl': 53,
 'breakout': 54,
 'could': 55,
 'open': 56,
 'take': 57,
 'weekli': 58,
 'sensex': 59,
 'target': 60,
 'big': 61,
 'year': 62,
 'come': 63,
 'nifti': 64,
 'support': 65,
 'bullish': 66,
 'p': 67,
 'gap': 68,
 'low': 69,
 'last': 70,
 'lower': 71,
 'nfx': 72,
 'green': 73,
 'amzn': 74,
 'earn': 75,
 'put': 76,
 '

In [22]:
# Get number of unique words
vocab_size = len(tokenizer.word_index) #+ 1 # for glove
vocab_size

6409

In [23]:
# Fill sentences with len < maxlen with 0s
X_train = pad_sequences(X_train, maxlen=maxlen, padding='post')
X_test = pad_sequences(X_test, maxlen=maxlen, padding='post')

In [24]:
# GloVe Embeddings
'''embeddings_dict = dict()
glove_file = open('glove_files/glove.6B.50d.txt', encoding="utf8")

for line in glove_file:
    values = line.split()
    word = values[0]
    vector = asarray(values[1:], dtype='float32')
    embeddings_dict[word] = vector
glove_file.close()

embedding_matrix = zeros((vocab_size, 50))
for word, index in tokenizer.word_index.items():
    embedding_vector = embeddings_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

embedding_matrix.shape'''

'embeddings_dict = dict()\nglove_file = open(\'glove_files/glove.6B.50d.txt\', encoding="utf8")\n\nfor line in glove_file:\n    values = line.split()\n    word = values[0]\n    vector = asarray(values[1:], dtype=\'float32\')\n    embeddings_dict[word] = vector\nglove_file.close()\n\nembedding_matrix = zeros((vocab_size, 50))\nfor word, index in tokenizer.word_index.items():\n    embedding_vector = embeddings_dict.get(word)\n    if embedding_vector is not None:\n        embedding_matrix[index] = embedding_vector\n\nembedding_matrix.shape'

In [25]:
# shape = (num of rows, maxlen)
X_train.shape, X_test.shape

((4632, 22), (1159, 22))

## Model Implementation

An LSTM is used as the words are treated as 'timesteps' where each word affects subsequent inputs.

Avoided overfitting by reducing epochs and layers

In [26]:
# GLOBAL VARS
EPOCHS = 4 #10

In [27]:
# Define model
model = keras.Sequential([

    #keras.layers.Embedding(input_dim=vocab_size, output_dim=50, weights=[embedding_matrix], input_length=maxlen), # GloVe embeddings
    keras.layers.Embedding(input_dim=vocab_size, output_dim=32, input_length=maxlen),
    keras.layers.LSTM(units=64, activation='relu'),
    keras.layers.Dense(units=1, activation='sigmoid') # softmax has higher accuracy but theoretically sigmoid is better for binary classification
    
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



2023-12-14 10:33:27.966234: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2023-12-14 10:33:27.966261: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2023-12-14 10:33:27.966266: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2023-12-14 10:33:27.966299: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2023-12-14 10:33:27.966313: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 22, 32)            205088    
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 229985 (898.38 KB)
Trainable params: 229985 (898.38 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [28]:
history = model.fit(X_train, y_train, epochs=EPOCHS, validation_split=0.2)

Epoch 1/4


2023-12-14 10:33:28.676504: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Epoch 2/4
Epoch 3/4
Epoch 4/4


In [30]:
test_eval = model.evaluate(X_test, y_test)
test_loss, test_acc = test_eval[0], test_eval[1]
print(f'Test accuracy: {test_acc}')

Test accuracy: 0.7903364896774292


In [36]:
#model.save('final.keras')