In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report, f1_score


In [2]:
# Download NLTK Resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load & Preprocess Data
df = pd.read_csv("C:/Users/user/Downloads/tweet_emotions.csv/tweet_emotions.csv")

# Encode labels
le = LabelEncoder()
df['sentiment_encoded'] = le.fit_transform(df['sentiment'])

# Cleaning functions
lemmatizer_obj = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    if pd.isnull(text):
        return ""
    text = str(text).lower()
    text = re.sub(r'@[\w]*', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def lemmatizer(text):
    tokens = word_tokenize(text)
    lemmatized = [lemmatizer_obj.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(lemmatized).strip()

df['cleaned_text'] = df['content'].apply(clean_text).apply(lemmatizer)
df = df[df['cleaned_text'].str.strip().astype(bool)].copy()


In [4]:
# Balance the Classes

X = df['cleaned_text'].astype(str)
y = df['sentiment_encoded']

X_df = pd.DataFrame({'text': X})
y_df = pd.Series(y)

oversampler = RandomOverSampler(random_state=42)
X_resampled_df, y_resampled = oversampler.fit_resample(X_df, y_df)

X = X_resampled_df['text']
y = y_resampled


In [5]:
# Train/Validation/Test Split
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [6]:
# Tokenization & Padding
max_vocab_size = 10000
max_sequence_length = 100

tokenizer = Tokenizer(num_words=max_vocab_size, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_val_seq = tokenizer.texts_to_sequences(X_val)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length, padding='post')
X_val_pad = pad_sequences(X_val_seq, maxlen=max_sequence_length, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length, padding='post')


In [7]:
# Load GloVe 300D Embeddings

embedding_index = {}
with open("glove.6B.300d.txt", encoding="utf-8") as f:  
    for line in f:
        values = line.split()
        word = values[0]
        coeffs = np.asarray(values[1:], dtype="float32")
        embedding_index[word] = coeffs

print(f"Loaded {len(embedding_index)} word vectors.")

embedding_dim = 300  # 300D GloVe
vocab_size = min(max_vocab_size, len(tokenizer.word_index) + 1)

embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


Loaded 400000 word vectors.


In [8]:
# Build model 
from tensorflow.keras.layers import Bidirectional

model = Sequential([
      Embedding(input_dim=vocab_size, output_dim=300,  # Keep 300D GloVe for better semantics
              weights=[embedding_matrix], input_length=max_sequence_length, trainable=True),
    
      Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)),
      Dense(64, activation='relu'),
      Dropout(0.4),
      Dense(len(le.classes_), activation='softmax')
])



In [9]:
# Train the Model
es = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)

# Compile the model 
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

# Train the model
history = model.fit(
    X_train_pad, y_train,
    validation_data=(X_val_pad, y_val),
    epochs= 30,
    batch_size=64,
    callbacks=[es]
)


Epoch 1/30
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m502s[0m 407ms/step - accuracy: 0.2650 - loss: 2.1471 - val_accuracy: 0.5155 - val_loss: 1.4241
Epoch 2/30
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m527s[0m 436ms/step - accuracy: 0.5485 - loss: 1.3484 - val_accuracy: 0.6108 - val_loss: 1.1356
Epoch 3/30
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m478s[0m 396ms/step - accuracy: 0.6374 - loss: 1.0727 - val_accuracy: 0.6567 - val_loss: 1.0009
Epoch 4/30
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m575s[0m 475ms/step - accuracy: 0.6865 - loss: 0.9175 - val_accuracy: 0.6877 - val_loss: 0.9127
Epoch 5/30
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m686s[0m 568ms/step - accuracy: 0.7222 - loss: 0.8040 - val_accuracy: 0.7117 - val_loss: 0.8507
Epoch 6/30
[1m1209/1209[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m393s[0m 325ms/step - accuracy: 0.7515 - loss: 0.7137 - val_accuracy: 0.7315 - val_loss:

In [10]:
# Evaluate model
from sklearn.metrics import f1_score, classification_report
import numpy as np

y_val_pred = np.argmax(model.predict(X_val_pad), axis=-1)
f1 = f1_score(y_val, y_val_pred, average='weighted')
print(f"Weighted F1-score: {f1:.4f}")

print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, target_names=le.classes_))

[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 65ms/step
Weighted F1-score: 0.7568

Classification Report:
              precision    recall  f1-score   support

       anger       1.00      1.00      1.00      1275
     boredom       0.99      1.00      0.99      1276
       empty       0.90      0.98      0.94      1276
  enthusiasm       0.94      0.98      0.96      1275
         fun       0.82      0.91      0.86      1275
   happiness       0.53      0.54      0.54      1275
        hate       0.91      0.97      0.94      1275
        love       0.68      0.72      0.70      1276
     neutral       0.43      0.29      0.35      1275
      relief       0.85      0.94      0.89      1275
     sadness       0.53      0.61      0.56      1275
    surprise       0.77      0.83      0.80      1275
       worry       0.42      0.24      0.30      1276

    accuracy                           0.77     16579
   macro avg       0.75      0.77      0.76     16579
weighte

In [11]:
model.save("Best_model1.keras")
print("Model saved successfully!")

Model saved successfully!


In [12]:
import pickle

# Save tokenizer
with open('BestTokenizer1.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

# Save label encoder
with open('BestLabelEncoder.pkl', 'wb') as f:
    pickle.dump(le, f)

In [13]:
from sklearn.metrics import classification_report

# Predict on test set
y_test_pred = np.argmax(model.predict(X_test_pad), axis=-1)

# Print classification report
print("Test Set Performance:\n")
print(classification_report(y_test, y_test_pred, target_names=le.classes_))


[1m519/519[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 56ms/step
Test Set Performance:

              precision    recall  f1-score   support

       anger       0.99      1.00      1.00      1275
     boredom       0.99      1.00      1.00      1275
       empty       0.90      0.99      0.94      1275
  enthusiasm       0.94      0.99      0.96      1276
         fun       0.84      0.92      0.88      1276
   happiness       0.54      0.57      0.56      1275
        hate       0.91      0.97      0.94      1276
        love       0.69      0.72      0.70      1275
     neutral       0.43      0.29      0.35      1275
      relief       0.84      0.93      0.88      1275
     sadness       0.53      0.60      0.56      1276
    surprise       0.80      0.84      0.82      1275
       worry       0.42      0.24      0.30      1275

    accuracy                           0.77     16579
   macro avg       0.75      0.77      0.76     16579
weighted avg       0.75      0.77 

In [15]:
# Calculate weighted F1-score
f1 = f1_score(y_test, y_test_pred, average='weighted')
print(f"Weighted F1-score (Test Set): {f1:.4f}")

Weighted F1-score (Test Set): 0.7612
