In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, f1_score


In [2]:
# Download NLTK Resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load & Preprocess Data
df = pd.read_csv("C:/Users/user/Downloads/tweet_emotions.csv/tweet_emotions.csv")

# Encode labels
le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['sentiment'])

# Cleaning functions
lemmatizer_obj = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'@[\w]*', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def lemmatizer(text):
    tokens = word_tokenize(text)
    lemmatized = [lemmatizer_obj.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(lemmatized).strip()

df['cleaned_text'] = df['content'].apply(clean_text).apply(lemmatizer)
df = df[df['cleaned_text'].str.strip().astype(bool)].copy()


In [4]:
# Balance the Classes

X = df['cleaned_text'].astype(str)
y = df['sentiment_encoded']

X_df = pd.DataFrame({'text': X})
y_df = pd.Series(y)

oversampler = RandomOverSampler(random_state=42)
X_resampled_df, y_resampled = oversampler.fit_resample(X_df, y_df)

X = X_resampled_df['text']
y = y_resampled


In [5]:
# Train/Validation/Test Split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [6]:
# Tokenization & Padding
max_vocab_size = 10000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_sequence_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length, padding='post')


In [7]:
# Load GloVe 300D Embeddings

embedding_index = {}
with open("glove.6B.300d.txt", encoding="utf-8") as f:  
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = coeffs

print(f"Loaded {len(embedding_index)} word vectors.")

embedding_dim = 300  # 300D GloVe
vocab_size = min(max_vocab_size, len(tokenizer.word_index) + 1)

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


Loaded 400000 word vectors.


In [8]:
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.optimizers import Adam


# values to test
dropout_rates = [0.3, 0.5]
learning_rates = [0.001, 0.0005]

results = []

# Loop through combinations
for dropout in dropout_rates:
    for lr in learning_rates:
        print(f"\nTraining model with dropout={dropout}, learning_rate={lr}\n")

        # Build model with fixed structure but variable hyperparameters
        model = Sequential([
            Embedding(input_dim=vocab_size, output_dim=300,
                      weights=[embedding_matrix], input_length=max_sequence_length, trainable=True),
            
            Bidirectional(LSTM(128, dropout=dropout, recurrent_dropout=dropout)),
            Dense(64, activation='relu'),
            Dropout(dropout),
            Dense(len(le.classes_), activation='softmax')
        ])

        # Compile model
        optimizer = Adam(learning_rate=lr)
        model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

        # Callbacks
        es = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

        # Train model
        history = model.fit(
            X_train_pad, y_train,
            validation_data=(X_val_pad, y_val),
            epochs=5,
            batch_size=64,
            callbacks=[es],
        )

        # Evaluate model
        y_val_pred = np.argmax(model.predict(X_val_pad), axis=-1)
        f1 = f1_score(y_val, y_val_pred, average='weighted')
        val_acc = history.history['val_accuracy'][-1]

        print(f"\nDone: Dropout={dropout}, LR={lr}")
        print(f"Validation Accuracy: {val_acc:.4f}")
        print(f"Weighted F1-score: {f1:.4f}")

        results.append({
            "dropout": dropout,
            "learning_rate": lr,
            "val_accuracy": val_acc,
            "f1_score": f1
        })

# Summary of results
print("\nAll Results:")
for res in results:
    print(f"Dropout={res['dropout']} | LR={res['learning_rate']} → Val Acc: {res['val_accuracy']:.4f}, F1: {res['f1_score']:.4f}")



Training model with dropout=0.3, learning_rate=0.001





Epoch 1/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m422s[0m 337ms/step - accuracy: 0.2831 - loss: 2.1081 - val_accuracy: 0.5408 - val_loss: 1.3624
Epoch 2/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m425s[0m 351ms/step - accuracy: 0.5622 - loss: 1.2998 - val_accuracy: 0.6145 - val_loss: 1.1219
Epoch 3/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m405s[0m 335ms/step - accuracy: 0.6520 - loss: 1.0264 - val_accuracy: 0.6616 - val_loss: 0.9923
Epoch 4/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m417s[0m 345ms/step - accuracy: 0.6958 - loss: 0.8798 - val_accuracy: 0.6961 - val_loss: 0.8904
Epoch 5/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m415s[0m 343ms/step - accuracy: 0.7386 - loss: 0.7542 - val_accuracy: 0.7205 - val_loss: 0.8264
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 59ms/step

Done: Dropout=0.3, LR=0.001
Validation Accuracy: 0.7205
Weighted F1-score: 0.7123

Trai



Epoch 1/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m436s[0m 349ms/step - accuracy: 0.2281 - loss: 2.2584 - val_accuracy: 0.4846 - val_loss: 1.5561
Epoch 2/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m349s[0m 289ms/step - accuracy: 0.4827 - loss: 1.5472 - val_accuracy: 0.5681 - val_loss: 1.2837
Epoch 3/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m328s[0m 271ms/step - accuracy: 0.5756 - loss: 1.2640 - val_accuracy: 0.6109 - val_loss: 1.1493
Epoch 4/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m330s[0m 273ms/step - accuracy: 0.6275 - loss: 1.0969 - val_accuracy: 0.6406 - val_loss: 1.0474
Epoch 5/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 282ms/step - accuracy: 0.6633 - loss: 0.9869 - val_accuracy: 0.6612 - val_loss: 0.9926
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 52ms/step

Done: Dropout=0.3, LR=0.0005
Validation Accuracy: 0.6612
Weighted F1-score: 0.6575

Tra



Epoch 1/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 302ms/step - accuracy: 0.2145 - loss: 2.2855 - val_accuracy: 0.4666 - val_loss: 1.5812
Epoch 2/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m442s[0m 366ms/step - accuracy: 0.4578 - loss: 1.6003 - val_accuracy: 0.5620 - val_loss: 1.2907
Epoch 3/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m451s[0m 373ms/step - accuracy: 0.5511 - loss: 1.3309 - val_accuracy: 0.6020 - val_loss: 1.1599
Epoch 4/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m347s[0m 287ms/step - accuracy: 0.6081 - loss: 1.1642 - val_accuracy: 0.6358 - val_loss: 1.0669
Epoch 5/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m315s[0m 260ms/step - accuracy: 0.6432 - loss: 1.0529 - val_accuracy: 0.6547 - val_loss: 1.0073
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 37ms/step

Done: Dropout=0.5, LR=0.001
Validation Accuracy: 0.6547
Weighted F1-score: 0.6449

Trai



[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m456s[0m 371ms/step - accuracy: 0.1688 - loss: 2.4163 - val_accuracy: 0.4048 - val_loss: 1.7510
Epoch 2/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m485s[0m 402ms/step - accuracy: 0.3782 - loss: 1.8168 - val_accuracy: 0.4994 - val_loss: 1.5078
Epoch 3/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m369s[0m 305ms/step - accuracy: 0.4700 - loss: 1.5677 - val_accuracy: 0.5500 - val_loss: 1.3453
Epoch 4/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m493s[0m 408ms/step - accuracy: 0.5306 - loss: 1.3949 - val_accuracy: 0.5817 - val_loss: 1.2360
Epoch 5/5
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m484s[0m 400ms/step - accuracy: 0.5694 - loss: 1.2767 - val_accuracy: 0.6011 - val_loss: 1.1604
[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 73ms/step

Done: Dropout=0.5, LR=0.0005
Validation Accuracy: 0.6011
Weighted F1-score: 0.5940

All Results:
