# **Loading data and preprocessing**

In [None]:
import pandas as pd

# Read the tweet dataset and discard every row that has a missing value in
# any column, so the later cells only ever see complete rows.
data = pd.read_csv('Twitter_Data.csv').dropna()

# **Tokenizing the whole input column**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
import itertools

# Fit the tokenizer directly on the text column. `fit_on_texts` accepts an
# iterable of texts, so there is no need to join every row into one giant
# string first (which also used to bind the name `X`, later reused for the
# padded model inputs). Word counts and first-seen order are unchanged, so the
# resulting `word_index` is the same as when fitting on the joined text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['clean_text'])

# Word indices start at 1; index 0 is reserved (it is the value used for
# padding), hence the +1 when sizing the embedding table.
total_words = len(tokenizer.word_index) + 1
print("Vocabulary size (unique words + 1 for the padding index) :", total_words)

# Peek at the first 10 (word -> index) entries of the vocabulary.
print(dict(itertools.islice(tokenizer.word_index.items(), 10)), '...')

# **Converting the input texts into sequences of token ids**

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

# Encode every message as a list of token ids in a single batched call, e.g.
# [modi, and, you, will, not, india] ==> [1, 3, 5, 7, 6, 9]
input_seq = tokenizer.texts_to_sequences(data['clean_text'])

# Left-pad every sequence with zeros up to the length of the longest message.
# pad_sequences already returns a NumPy array, so no extra np.array() copy.
max_sequence_len = max(len(seq) for seq in input_seq)
input_sequences = pad_sequences(input_seq, maxlen=max_sequence_len, padding='pre')

# Data to train on (note: the target 'y' is one-hot encoded over 3 classes).
# NOTE(review): to_categorical uses each label value as a column index, so a
# label of -1 (if the 'category' column contains one) would land in the last
# column (index 2) through negative indexing - confirm against the dataset.
X = input_sequences
y = to_categorical(data['category'], num_classes=3)

# **Creating the model**

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

# Earlier single-direction LSTM baseline, kept for reference only (not run):
#   model = Sequential()
#   model.add(Embedding(total_words, 100, input_length = max_sequence_len))
#   model.add(LSTM(150))
#   model.add(Dense(3, activation='softmax'))
#   model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Bidirectional LSTM model, assembled layer by layer:
#   token ids -> 100-d embeddings -> two stacked bidirectional LSTMs
#   -> 3-way softmax over the sentiment classes.
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_sequence_len))
# The first recurrent layer returns the full sequence so the second one can
# consume one vector per time step.
model.add(Bidirectional(LSTM(150, return_sequences=True)))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(3, activation="softmax"))

# Categorical cross-entropy matches the one-hot encoded targets.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 52, 100)           11367900  
                                                                 
 bidirectional (Bidirection  (None, 52, 300)           301200    
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 300)               541200    
 onal)                                                           
                                                                 
 dense (Dense)               (None, 3)                 903       
                                                                 
Total params: 12211203 (46.58 MB)
Trainable params: 12211203 (46.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


# **Training the model**

In [None]:
# Train for 40 epochs in batches of 1024 samples, holding out 33% of the data
# (validation_split) to monitor validation loss/accuracy after each epoch.
model.fit(
    X,
    y,
    batch_size=1024,
    epochs=40,
    validation_split=0.33,
    verbose=1,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.src.callbacks.History at 0x7ff6fc50d540>

# **Saving and reloading the model**

In [None]:
from tensorflow.keras.saving import load_model
# Persist the freshly trained model before reloading it. With the save step
# disabled, a clean "Restart & Run All" either fails because the file does not
# exist yet or silently loads a stale model left over from an earlier session.
model.save('sentiment_analysis.keras')

# Reload from disk to confirm that the saved file round-trips.
new_model = load_model('sentiment_analysis.keras')

# **Testing the model**

In [None]:
# Maps the arg-max index of the softmax output to a human-readable label.
# (Print 'y' to see how the one-hot columns line up with these indices.)
output_sentiment = {0: 'neutral', 1: 'positive', 2: 'negative'}
seed_text = ["He is the worst", "He is partly good partly bad", "He is the best"]

for message in seed_text:
  # Encode the message as a batch containing a single token-id sequence
  encoded = tokenizer.texts_to_sequences([message])
  # Pad the sequence to the same length used for training
  token_list = pad_sequences(encoded, maxlen=max_sequence_len, padding='pre')
  # Model prediction: class probabilities -> index of the most likely class
  probabilities = model.predict(token_list)
  predicted = np.argmax(probabilities, axis=-1)
  print(predicted, output_sentiment[predicted[0]])

[2] negative
[0] neutral
[1] positive
