In [None]:
!pip install gensim

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from gensim.models import Word2Vec, KeyedVectors
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [75]:
# 1. Load Dataset
# --------------------------
# Replace this with your actual dataset
data = pd.read_csv("/content/drive/MyDrive/0- July-Dec 2025/5th sem Intro to LLM and GenAI/Classroom Mini Projects/Part-2/Cleaned_dataset.csv")  # ['final_cleaned_text', 'Sentiments']
X = data['final_cleaned_text'].astype(str)
X.fillna('too', inplace=True)
y = data['Sentiment'].astype('category').cat.codes  # encode to 0,1,2

# --------------------------
# Helper: Macro F1 calculation
# --------------------------
def macro_f1(y_true, y_pred):
    return f1_score(y_true, y_pred, average='macro')

results = []

In [76]:
# --------------------------
# 2. BOW + GBoost
# --------------------------
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=100)

gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
f1_bow = macro_f1(y_test, y_pred)
results.append(["BOW", "GBoost", f1_bow])

# --------------------------
# 3. Word2Vec (Skip-gram) + LSTM
# --------------------------
sentences = [text.split() for text in X]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
word_index = tokenizer.word_index
X_seq = pad_sequences(sequences, maxlen=100)

embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

X_train, X_test, y_train, y_test = train_test_split(X_seq, y, test_size=0.2, random_state=42)

model_w2v = Sequential()
model_w2v.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=100, trainable=False))
model_w2v.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model_w2v.add(Dense(3, activation='softmax'))
model_w2v.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=[])
model_w2v.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)

y_pred = np.argmax(model_w2v.predict(X_test), axis=1)
f1_w2v = macro_f1(y_test, y_pred)
results.append(["Word2Vec (Skip-gram)", "LSTM", f1_w2v])

# --------------------------
# 4. GloVe (100d) + LSTM
# --------------------------
glove_path = '/content/drive/MyDrive/0- July-Dec 2025/5th sem Intro to LLM and GenAI/Classroom Mini Projects/Part-2/glove.6B.100d.txt.word2vec'
glove_model = KeyedVectors.load_word2vec_format(glove_path, binary=False)

embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    if word in glove_model:
        embedding_matrix[i] = glove_model[word]

model_glove = Sequential()
model_glove.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=100, trainable=False))
model_glove.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model_glove.add(Dense(3, activation='softmax'))
model_glove.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=[])
model_glove.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)

y_pred = np.argmax(model_glove.predict(X_test), axis=1)
f1_glove = macro_f1(y_test, y_pred)
results.append(["GloVe (100d)", "LSTM", f1_glove])

# --------------------------
# 5. Comparison Table
# --------------------------
df_results = pd.DataFrame(results, columns=["Representation Technique", "Model Type", "Macro F1 Score"])
print(df_results)


Epoch 1/5




[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 71ms/step - loss: 0.8663
Epoch 2/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 67ms/step - loss: 0.5203
Epoch 3/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 68ms/step - loss: 0.5303
Epoch 4/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 87ms/step - loss: 0.5751
Epoch 5/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 88ms/step - loss: 0.5722




[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 63ms/step
Epoch 1/5




[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 79ms/step - loss: 0.7643
Epoch 2/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 75ms/step - loss: 0.5370
Epoch 3/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 169ms/step - loss: 0.5237
Epoch 4/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 201ms/step - loss: 0.5200
Epoch 5/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 148ms/step - loss: 0.4253
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 70ms/step
  Representation Technique Model Type  Macro F1 Score
0                      BOW     GBoost        0.539407
1     Word2Vec (Skip-gram)       LSTM        0.309333
2             GloVe (100d)       LSTM        0.418399


1. Fine-tune the embeddings

In [77]:
# --------------------------
# 2. BOW + GBoost
# --------------------------
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=100)

gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
f1_bow = macro_f1(y_test, y_pred)
results.append(["BOW", "GBoost", f1_bow])

# --------------------------
# 3. Word2Vec (Skip-gram) + LSTM
# --------------------------
sentences = [text.split() for text in X]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
word_index = tokenizer.word_index
X_seq = pad_sequences(sequences, maxlen=100)

embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

X_train, X_test, y_train, y_test = train_test_split(X_seq, y, test_size=0.2, random_state=42)

model_w2v = Sequential()
model_w2v.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=100, trainable=True))
model_w2v.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model_w2v.add(Dense(3, activation='softmax'))
model_w2v.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=[])
model_w2v.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)

y_pred = np.argmax(model_w2v.predict(X_test), axis=1)
f1_w2v = macro_f1(y_test, y_pred)
results.append(["Word2Vec (Skip-gram)", "LSTM", f1_w2v])

# --------------------------
# 4. GloVe (100d) + LSTM
# --------------------------
glove_path = '/content/drive/MyDrive/0- July-Dec 2025/5th sem Intro to LLM and GenAI/Classroom Mini Projects/Part-2/glove.6B.100d.txt.word2vec'
glove_model = KeyedVectors.load_word2vec_format(glove_path, binary=False)

embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    if word in glove_model:
        embedding_matrix[i] = glove_model[word]

model_glove = Sequential()
model_glove.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=100, trainable=True))
model_glove.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
model_glove.add(Dense(3, activation='softmax'))
model_glove.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=[])
model_glove.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)

y_pred = np.argmax(model_glove.predict(X_test), axis=1)
f1_glove = macro_f1(y_test, y_pred)
results.append(["GloVe (100d)", "LSTM", f1_glove])

# --------------------------
# 5. Comparison Table
# --------------------------
df_results = pd.DataFrame(results, columns=["Representation Technique", "Model Type", "Macro F1 Score"])
print(df_results)


Epoch 1/5




[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 81ms/step - loss: 0.8000
Epoch 2/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 77ms/step - loss: 0.5314
Epoch 3/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 94ms/step - loss: 0.5484
Epoch 4/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 83ms/step - loss: 0.4896
Epoch 5/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 77ms/step - loss: 0.4107
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 60ms/step
Epoch 1/5




[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 85ms/step - loss: 0.7959
Epoch 2/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 80ms/step - loss: 0.5188
Epoch 3/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 112ms/step - loss: 0.5137
Epoch 4/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 76ms/step - loss: 0.4680
Epoch 5/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 87ms/step - loss: 0.4395
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 63ms/step
  Representation Technique Model Type  Macro F1 Score
0                      BOW     GBoost        0.539407
1     Word2Vec (Skip-gram)       LSTM        0.309333
2             GloVe (100d)       LSTM        0.418399
3                      BOW     GBoost        0.571731
4     Word2Vec (Skip-gram)       LSTM        0.384235
5             GloVe (100d)       LSTM        0.343494


2. Use Bidirectional LSTM + Dense layers

In [78]:
# --------------------------
# 2. BOW + GBoost
# --------------------------
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_bow, y, test_size=0.2, random_state=100)

gb_model = GradientBoostingClassifier()
gb_model.fit(X_train, y_train)
y_pred = gb_model.predict(X_test)
f1_bow = macro_f1(y_test, y_pred)
results.append(["BOW", "GBoost", f1_bow])

# --------------------------
# 3. Word2Vec (Skip-gram) + LSTM
# --------------------------
sentences = [text.split() for text in X]
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, sg=1)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
sequences = tokenizer.texts_to_sequences(X)
word_index = tokenizer.word_index
X_seq = pad_sequences(sequences, maxlen=100)

embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

X_train, X_test, y_train, y_test = train_test_split(X_seq, y, test_size=0.2, random_state=42)

model_w2v = Sequential()
model_w2v.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=100, trainable=True))

from tensorflow.keras.layers import Bidirectional
model_w2v.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)))
model_w2v.add(Dense(64, activation='relu'))
model_w2v.add(Dropout(0.3))

model_w2v.add(Dense(3, activation='softmax'))
model_w2v.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=[])
model_w2v.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)

y_pred = np.argmax(model_w2v.predict(X_test), axis=1)
f1_w2v = macro_f1(y_test, y_pred)
results.append(["Word2Vec (Skip-gram)", "LSTM", f1_w2v])

# --------------------------
# 4. GloVe (100d) + LSTM
# --------------------------
glove_path = '/content/drive/MyDrive/0- July-Dec 2025/5th sem Intro to LLM and GenAI/Classroom Mini Projects/Part-2/glove.6B.100d.txt.word2vec'
glove_model = KeyedVectors.load_word2vec_format(glove_path, binary=False)

embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
    if word in glove_model:
        embedding_matrix[i] = glove_model[word]

model_glove = Sequential()
model_glove.add(Embedding(len(word_index) + 1, 100, weights=[embedding_matrix], input_length=100, trainable=True))

from tensorflow.keras.layers import Bidirectional
model_glove.add(Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)))
model_glove.add(Dense(64, activation='relu'))
model_glove.add(Dropout(0.3))

model_glove.add(Dense(3, activation='softmax'))
model_glove.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(), metrics=[])
model_glove.fit(X_train, y_train, epochs=5, batch_size=32, verbose=1)

y_pred = np.argmax(model_glove.predict(X_test), axis=1)
f1_glove = macro_f1(y_test, y_pred)
results.append(["GloVe (100d)", "LSTM", f1_glove])

# --------------------------
# 5. Comparison Table
# --------------------------
df_results = pd.DataFrame(results, columns=["Representation Technique", "Model Type", "Macro F1 Score"])
print(df_results)


Epoch 1/5




[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 457ms/step - loss: 0.7510
Epoch 2/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 323ms/step - loss: 0.4960
Epoch 3/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 338ms/step - loss: 0.5186
Epoch 4/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 333ms/step - loss: 0.5092
Epoch 5/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 315ms/step - loss: 0.4643
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 168ms/step
Epoch 1/5




[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 329ms/step - loss: 0.7298
Epoch 2/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 277ms/step - loss: 0.5546
Epoch 3/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 298ms/step - loss: 0.5311
Epoch 4/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 333ms/step - loss: 0.4439
Epoch 5/5
[1m26/26[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 324ms/step - loss: 0.4198
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 153ms/step
  Representation Technique Model Type  Macro F1 Score
0                      BOW     GBoost        0.539407
1     Word2Vec (Skip-gram)       LSTM        0.309333
2             GloVe (100d)       LSTM        0.418399
3                      BOW     GBoost        0.571731
4     Word2Vec (Skip-gram)       LSTM        0.384235
5             GloVe (100d)       LSTM        0.343494
6                      BOW     GBoost        0.573488
7