In [75]:
#import necessary libraries
import numpy as np
import pandas as pd

In [76]:
data = pd.read_csv('twitter_training.csv')
data.head()

Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...


## Clean Data

In [77]:
data.describe()

Unnamed: 0,2401
count,74681.0
mean,6432.640149
std,3740.423819
min,1.0
25%,3195.0
50%,6422.0
75%,9601.0
max,13200.0


In [78]:
data.isna().sum()

2401                                                       0
Borderlands                                                0
Positive                                                   0
im getting on borderlands and i will murder you all ,    686
dtype: int64

In [79]:
data.dropna(inplace = True)

In [80]:
data.columns = ['ID', 'Game', 'Sentiment', 'Text', ]

## Preprocess data
### Tokenize text

## using LSTM for sentiment analysis in NLP


NOTE: we will use the tensorflow(keras)

In [81]:
#import tensor flow and tokenizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [82]:
#tokenizer text
tokenizer = Tokenizer(num_words = 10000)
tokenizer.fit_on_texts(data['Text']) 
#👆creates a word index (a unique numerical index to each word in the dataset based on its frequency in the text)
#that maps each unique word in the data to a unique integer.

sequences = tokenizer.texts_to_sequences(data['Text']) 
'''
#👆converts text sentences into numerical sequences using the word index created by tokenizer.fit_on_texts().
#It can replace unseen words with a special token (e.g., `<OOV>`).what this means is that words which already occurred 
in the train set, are assigned index based on frequency(how many times they occurred), however words that never occurred are assigned "<oov>" which equals to 1
'''
sequences[:3]

[[2, 120, 404, 3, 1, 6744, 4, 2, 52, 434, 12, 27],
 [307, 174, 13, 140, 4, 2, 52, 434, 12, 27],
 [307, 404, 13, 140, 4, 2, 52, 1793, 12, 27]]

In [84]:
data['Text'][:2]

0    I am coming to the borders and I will kill you...
1    im getting on borderlands and i will kill you ...
Name: Text, dtype: object

In [89]:
tokenizer.fit_on_texts(data['Text']),sequences[:1]

(None, [[2, 120, 404, 3, 1, 6744, 4, 2, 52, 434, 12, 27]])

In [49]:
#padding sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences
x = pad_sequences(sequences,  padding = 'post')  
'''
padding adds zeros to less dense sentences 
to make all sentences equal with regards 
to the number of properties they have.
This is to make them uniform, as neural network
require fixed-length inputs, but the data 
sentences have varying lengths
'''

'\npadding adds zeros to less dense sentences \nto make all sentences equal with regards \nto the number of properties they have.\nThis is to make them uniform, as neural network\nrequire fixed-length inputs, but the data \nsentences have varying lengths\n'

In [50]:
#convert the values in our target to numeric
data[data['Sentiment'] == 'Negative'] = 3 
data[data['Sentiment'] == 'Positive'] = 2  
data[data['Sentiment'] == 'Neutral'] = 1
data[data['Sentiment'] == 'Irrelevant'] = 0 

In [51]:
#assign our dependent variable
y1 = data[['Sentiment']].astype('i1')
y1.value_counts()

Sentiment
3            22358
2            20654
1            18108
0            12875
Name: count, dtype: int64

In [52]:
#convert target to one hot encoding
from tensorflow.keras.utils import to_categorical

# Convert labels to one-hot encoding
y = to_categorical(y1, num_classes=4)

In [53]:
# split data
from sklearn.model_selection import train_test_split
x_train, x_test1, y_train, y_test1 = train_test_split(x, y, train_size = 0.7, random_state = 42)

In [54]:
x_valid, x_test, y_valid, y_test = train_test_split(x_test1, y_test1, train_size = 0.5)

## Build LSTM model

In [55]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

modelLSTM = Sequential()

In [56]:
modelLSTM.add(Embedding(input_dim = 15000, output_dim = 64, input_length = 10))
'''
EMBEDDING LAYER: The Embedding layer helps the model learn relationships between words instead of using raw integers.
input_dim = 15000: (max number of unique words in the dataset).
output_dim=64: shape of each word. (64-dimensional dense vector).
input_length=10: shape of sentences.The input sequences have a fixed length of 10 words.
It ensures that the sentences in the dataset has uniform size.
i.e small-sized sentences would get zeros as values to make its length the same with others)
'''
   



'\nEMBEDDING LAYER: The Embedding layer helps the model learn relationships between words instead of using raw integers.\ninput_dim = 15000: (max number of unique words in the dataset).\noutput_dim=64: shape of each word. (64-dimensional dense vector).\ninput_length=10: shape of sentences.The input sequences have a fixed length of 10 words.\nIt ensures that the sentences in the dataset has uniform size.\ni.e small-sized sentences would get zeros as values to make its length the same with others)\n'

In [57]:
modelLSTM.add(LSTM(128, return_sequences=False))  # First LSTM Layer  

'''   
This LSTM layer processes the sequence of word vectors and compresses it into a single 128D vector.
"return_sequences=False" ensures we return only the final LSTM output, not all timesteps.


 LSTM units[128]", it refers to the number of Long Short-Term Memory (LSTM) 
 cells(neurons) in a particular LSTM layer of a neural network. 
 Each LSTM unit consists of: A cell state (memory)
                             Gates (input, forget, and output) to control information flow
The output dimension of the LSTM layer will be (batch_size, sequence_length, 2000) if return_sequences=True.
'''



'   \nThis LSTM layer processes the sequence of word vectors and compresses it into a single 100D vector.\n"return_sequences=False" ensures we return only the final LSTM output, not all timesteps.\n\n\n LSTM units[128]", it refers to the number of Long Short-Term Memory (LSTM) \n cells(neurons) in a particular LSTM layer of a neural network. \n Each LSTM unit consists of: A cell state (memory)\n                             Gates (input, forget, and output) to control information flow\nThe output dimension of the LSTM layer will be (batch_size, sequence_length, 2000) if return_sequences=True.\n'

In [17]:
modelLSTM.add(Dense(32, activation='relu'))    #The Dense layer with 32 neurons refines the LSTM output.
modelLSTM.add(Dense(4, activation='softmax'))  # 4-class classification output
'''
The final Dense layer (with softmax) predicts probabilities for 4 possible classes.
'''

'\nThe final Dense layer (with softmax) predicts probabilities for 4 possible classes.\n'

In [18]:
#compile model
modelLSTM.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
'''
This sets the loss function (categorical_crossentropy for multi-class classification).
Adam optimizer is used for training efficiency.
'''

'\nThis sets the loss function (categorical_crossentropy for multi-class classification).\nAdam optimizer is used for training efficiency.\n'

In [35]:
y_train.shape, y_valid.shape

((51796, 4), (11099, 4))

In [23]:
#train model
modelLSTM.fit(x_train, y_train, epochs = 4, batch_size = 100, class_weight={0: 1.0, 1: 2.0, 2: 3.0}, validation_data = (x_valid, y_valid))

Epoch 1/4
[1m518/518[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 503ms/step - accuracy: 0.2778 - loss: 2.2284 - val_accuracy: 0.2815 - val_loss: 1.4549
Epoch 2/4
[1m112/518[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m3:06[0m 459ms/step - accuracy: 0.2877 - loss: 2.2299


KeyboardInterrupt



## The above model took forever in training, and the accuracy was stable at 28-29%. so I interrupted the process and created another model(the one below) wholly tuned and fitted by chatGPT

In [25]:
MAX_WORDS = 10000  # Keep only the top 10,000 words
MAX_LEN = 100  # Truncate/pad sentences to 100 tokens
EMBEDDING_DIM = 100  # Word vector size
BATCH_SIZE = 128  # Optimized batch size
EPOCHS = 10  # Train for 10 epochs (Early Stopping prevents overfitting

In [58]:
# Tokenize Text
sequences
padded_sequences = pad_sequences(sequences, maxlen=MAX_LEN, padding="post", truncating="post")

In [59]:
# Build LSTM Model
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.callbacks import EarlyStopping


modelLSTM1 = Sequential([
    Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=MAX_LEN),
    Bidirectional(LSTM(64, return_sequences=True)),  # Bidirectional LSTM
    Dropout(0.5),  # Dropout for regularization
    LSTM(32),  # Smaller LSTM layer for speed
    Dense(32, activation="relu"),
    Dense(4, activation="softmax")  # multinomial classification
])

# Compile Model
modelLSTM1.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Early Stopping (Prevents long training if no improvement)
early_stopping = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)

# Train Model
history = modelLSTM1.fit(
    padded_sequences, y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.2,  # 20% data for validation
    callbacks=[early_stopping]
)




'''
Why This Model Trains Fast?
✔ Uses Precomputed Word Embeddings (No need to learn from scratch)
✔ Bidirectional LSTM (64 units) + Small LSTM (32 units) (Efficient & accurate)
✔ Dropout (0.5) (Prevents overfitting)
✔ Early Stopping (Stops when validation loss stops improving)
✔ Batch Size of 128 (Optimized for faster training)

Would you like help with hyperparameter tuning? 😊
'''

Epoch 1/10
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m210s[0m 426ms/step - accuracy: 0.2842 - loss: 0.5668 - val_accuracy: 0.2513 - val_loss: 0.5530
Epoch 2/10
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m206s[0m 445ms/step - accuracy: 0.2845 - loss: 0.5579 - val_accuracy: 0.3523 - val_loss: 0.5492
Epoch 3/10
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m207s[0m 447ms/step - accuracy: 0.2881 - loss: 0.5578 - val_accuracy: 0.2512 - val_loss: 0.5538
Epoch 4/10
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 436ms/step - accuracy: 0.2868 - loss: 0.5580 - val_accuracy: 0.2512 - val_loss: 0.5508
Epoch 5/10
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 497ms/step - accuracy: 0.2903 - loss: 0.5575 - val_accuracy: 0.3523 - val_loss: 0.5491
Epoch 6/10
[1m463/463[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m230s[0m 427ms/step - accuracy: 0.2914 - loss: 0.5576 - val_accuracy: 0.2512 - val_loss: 0.5509
Epoc

KeyboardInterrupt: 

### also had to interrupt this chatGPT tuned model as it didn't improve, still stable at 28-29% accuracy 