In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import neattext.functions as nfx
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

# Text preprocessing
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Model building
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.utils import to_categorical


In [2]:

# Load Dataset
df = pd.read_csv("../data/emotion_dataset_raw.csv")

# Data Cleaning
df['Clean_Text'] = df['Text'].apply(nfx.remove_userhandles)
df['Clean_Text'] = df['Clean_Text'].apply(nfx.remove_stopwords)

# Features & Labels
Xfeatures = df['Clean_Text']
ylabels = df['Emotion']



In [3]:
# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(ylabels)
y_categorical = to_categorical(y_encoded)

# Split Data
x_train, x_test, y_train, y_test = train_test_split(Xfeatures, y_categorical, test_size=0.3, random_state=42)

# Tokenization
max_words = 10000  # Maximum number of words to keep
max_len = 100      # Maximum sequence length

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(x_train)

# Convert text to sequences
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)

# Pad sequences
x_train_padded = pad_sequences(train_sequences, maxlen=max_len)
x_test_padded = pad_sequences(test_sequences, maxlen=max_len)



In [4]:
# Build LSTM Model
embedding_dim = 128

model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=max_len))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

model.compile(loss='categorical_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

print(model.summary())



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 128)          1280000   
                                                                 
 spatial_dropout1d (Spatial  (None, 100, 128)          0         
 Dropout1D)                                                      
                                                                 
 lstm (LSTM)                 (None, 100)               91600     
                                                                 
 dense (Dense)               (None, 8)                 808       
                                                                 
Total params: 1372408 (5.24 MB)
Trainable params: 1372408 (5.24 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [11]:
# Train the model
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

history = model.fit(x_train_padded, y_train, 
                    epochs=20, 
                    batch_size=64, 
                    validation_split=0.1,
                    callbacks=[early_stop])



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


In [12]:
# Evaluate the model
loss, accuracy = model.evaluate(x_test_padded, y_test)
print(f'Test accuracy: {accuracy:.4f}')



Test accuracy: 0.6153


In [13]:
# Save the model and tokenizer
model.save('../models/emotion_lstm_model.h5')
joblib.dump(tokenizer, '../models/emotion_tokenizer.pkl')
joblib.dump(label_encoder, '../models/emotion_label_encoder.pkl')



  saving_api.save_model(


['../models/emotion_label_encoder.pkl']

In [14]:
# Function to predict emotion
def predict_emotion(text):
    # Preprocess
    text = nfx.remove_userhandles(text)
    text = nfx.remove_stopwords(text)
    
    # Tokenize and pad
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_len)
    
    # Predict
    prediction = model.predict(padded)
    predicted_label = label_encoder.inverse_transform([np.argmax(prediction)])
    
    return predicted_label[0], prediction



In [15]:
# Example prediction
ex1 = "This book was so interesting it made me happy"
emotion, prob = predict_emotion(ex1)
print(f"Predicted Emotion: {emotion}")
print(f"Probabilities: {prob}")

Predicted Emotion: joy
Probabilities: [[1.2146877e-03 3.0299934e-04 9.0186438e-04 9.6351928e-01 2.3598098e-03
  8.7505067e-03 3.2211105e-06 2.2947568e-02]]
