In [1]:
# Cell 1: Install required libraries
!pip install pandas numpy scikit-learn nltk gensim tqdm

# Cell 2: Import libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm import tqdm

# Fetch the NLTK data packages this notebook relies on. Packages that are
# already present are reported as up to date rather than downloaded again.
# 'punkt' / 'punkt_tab': tokenizer data — presumably what nltk.word_tokenize loads.
nltk.download('punkt')
# 'stopwords': backs stopwords.words('english') used in preprocessing.
nltk.download('stopwords')
# 'wordnet': backs WordNetLemmatizer used in preprocessing.
nltk.download('wordnet')
# This last call's return value is what the cell displays as its output.
nltk.download('punkt_tab')


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


[nltk_data] Downloading package punkt to /Library/Frameworks/Python.fr
[nltk_data]     amework/Versions/3.12/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Library/Frameworks/Pytho
[nltk_data]     n.framework/Versions/3.12/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Library/Frameworks/Python.
[nltk_data]     framework/Versions/3.12/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Library/Frameworks/Pytho
[nltk_data]     n.framework/Versions/3.12/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Cell 3: Load dataset
# The CSV must provide a 'content' column (tweet text) and a 'sentiment' column (emotion label);
# those are the two columns the later cells read. Adjust the path if the file lives elsewhere.
df = pd.read_csv("../data/tweet_emotions.csv")
# Preview the first rows to confirm the columns were parsed as expected.
df.head()


Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [3]:
# Cell 4: Preprocess text

# Module-level helpers shared by preprocess_text:
# a set gives fast membership tests when filtering stop words per token.
stop_words = set(stopwords.words('english'))
# One lemmatizer instance is reused for every token.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Turn a raw string into a list of lemmatized, stop-word-free tokens.

    The text is lower-cased and every character that is not an ASCII letter
    is replaced by a space before tokenizing with ``nltk.word_tokenize``.
    Tokens found in the module-level ``stop_words`` set are dropped; the
    remaining ones are passed through ``lemmatizer.lemmatize``.

    Args:
        text: The string to clean (must support ``.lower()``).

    Returns:
        list: The surviving tokens, lemmatized, in their original order.
    """
    letters_only = re.sub(r'[^a-zA-Z]', ' ', text.lower())
    kept = []
    for token in nltk.word_tokenize(letters_only):
        # Stop-word filtering happens on the raw token, before lemmatization.
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return kept

# Enable the progress_apply method used on the next line.
tqdm.pandas()
# Tokenize every tweet; this adds a 'tokens' column (list of str per row) to df in place.
df["tokens"] = df["content"].progress_apply(preprocess_text)

100%|██████████████████████████████████████████████████████████████| 40000/40000 [00:02<00:00, 15194.80it/s]


In [4]:
# Inspect the frame now that the 'tokens' column has been added next to 'content'.
df

Unnamed: 0,tweet_id,sentiment,content,tokens
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...,"[tiffanylue, know, listenin, bad, habit, earli..."
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...,"[layin, n, bed, headache, ughhhh, waitin, call]"
2,1956967696,sadness,Funeral ceremony...gloomy friday...,"[funeral, ceremony, gloomy, friday]"
3,1956967789,enthusiasm,wants to hang out with friends SOON!,"[want, hang, friend, soon]"
4,1956968416,neutral,@dannycastillo We want to trade with someone w...,"[dannycastillo, want, trade, someone, houston,..."
...,...,...,...,...
39995,1753918954,neutral,@JohnLloydTaylor,[johnlloydtaylor]
39996,1753919001,love,Happy Mothers Day All my love,"[happy, mother, day, love]"
39997,1753919005,love,Happy Mother's Day to all the mommies out ther...,"[happy, mother, day, mommy, woman, man, long, ..."
39998,1753919043,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,"[niariley, wassup, beautiful, follow, peep, ne..."


In [5]:
# Cell 5: Load GloVe embeddings (300D recommended)
def load_glove_embeddings(glove_path):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each useful line is expected to hold a token followed by its
    whitespace-separated vector components. Blank lines and lines that
    carry a token but no components are skipped, so a stray empty line
    (for example at the end of the file) no longer raises ``IndexError``
    and no empty vectors are stored.

    Args:
        glove_path: Path to the UTF-8 encoded GloVe ``.txt`` file.

    Returns:
        dict: Maps each token (str) to a 1-D ``float32`` numpy array.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    embeddings = {}
    with open(glove_path, 'r', encoding='utf8') as f:
        for line in tqdm(f, desc="Loading GloVe"):
            parts = line.split()
            # Need at least a token and one component; anything shorter is
            # a blank or malformed line and carries no usable embedding.
            if len(parts) < 2:
                continue
            embeddings[parts[0]] = np.array(parts[1:], dtype='float32')
    return embeddings

# Load the 300-dimensional GloVe 6B vectors into memory as a plain dict (word -> vector).
# The dimensionality here should match the `dim` default of get_sentence_vector.
glove_model = load_glove_embeddings("../embeddings/glove.6B.300d.txt")


Loading GloVe: 400000it [00:10, 37019.99it/s]


In [6]:
# Cell 6: Convert text to vector
def get_sentence_vector(tokens, model, dim=300):
    """Average the embeddings of the tokens that exist in ``model``.

    Args:
        tokens: Iterable of token strings.
        model: Mapping from token to its embedding vector.
        dim: Length of the zero vector returned when no token is found.

    Returns:
        numpy.ndarray: Element-wise mean of the found vectors, or
        ``np.zeros(dim)`` if none of the tokens is in ``model``.
    """
    found = []
    for token in tokens:
        if token in model:
            found.append(model[token])
    # No known token: fall back to an all-zero vector of the requested size.
    if not found:
        return np.zeros(dim)
    return np.mean(found, axis=0)

# Build the feature matrix: one averaged GloVe vector per row of df["tokens"], in df order.
# Rows whose tokens are all missing from glove_model become all-zero vectors.
X = np.array([get_sentence_vector(tokens, glove_model) for tokens in tqdm(df["tokens"], desc="Vectorizing")])


Vectorizing: 100%|████████████████████████████████████████████████| 40000/40000 [00:00<00:00, 105009.47it/s]


In [7]:
# Cell 7: Encode target labels
# Map each sentiment string to an integer class id. The fitted encoder is kept
# because later cells need le.classes_ (report row names) and le.inverse_transform
# (turning predictions back into label strings).
le = LabelEncoder()
y = le.fit_transform(df["sentiment"])


In [8]:
# Cell 8: Train-test split
# Hold out 20% of the rows for evaluation; random_state pins the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified and the class supports in the report below are very
# uneven — consider stratify=y if per-class metrics matter.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Cell 9: Train model and evaluate
# Fit a multi-class logistic regression on the averaged-embedding features;
# max_iter is raised to 1000 iterations for the solver.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
# Predict integer class ids for the held-out rows.
y_pred = clf.predict(X_test)

# Per-class precision/recall/F1, with rows labelled by the original sentiment strings.
# NOTE(review): several rare classes score 0.00 in the recorded output, i.e. they are
# never predicted correctly on the test set.
print(classification_report(y_test, y_pred, target_names=le.classes_))


              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        19
     boredom       0.00      0.00      0.00        31
       empty       0.00      0.00      0.00       162
  enthusiasm       0.00      0.00      0.00       163
         fun       0.06      0.01      0.01       338
   happiness       0.30      0.32      0.31      1028
        hate       0.38      0.12      0.18       268
        love       0.46      0.36      0.40       762
     neutral       0.30      0.51      0.37      1740
      relief       0.12      0.01      0.02       352
     sadness       0.33      0.16      0.21      1046
    surprise       0.30      0.03      0.06       425
       worry       0.31      0.49      0.38      1666

    accuracy                           0.31      8000
   macro avg       0.20      0.15      0.15      8000
weighted avg       0.29      0.31      0.28      8000



In [10]:
# Cell: Predict emotion of a new sentence
def predict_emotion(text):
    """Predict the emotion label for a single piece of text.

    The text goes through the same pipeline as the training data:
    ``preprocess_text`` -> ``get_sentence_vector`` (using ``glove_model``)
    -> ``clf.predict`` -> ``le.inverse_transform``.

    Note that text with no token present in ``glove_model`` is represented
    by an all-zero vector and still receives a label.

    Args:
        text: Raw input string.

    Returns:
        The predicted label as stored in ``le.classes_``.
    """
    sentence_vec = get_sentence_vector(preprocess_text(text), glove_model)
    # The classifier expects a batch, so wrap the single vector in a list.
    class_ids = clf.predict([sentence_vec])
    labels = le.inverse_transform(class_ids)
    return labels[0]

In [11]:
# Example
# NOTE(review): new_text is an empty string here, so no token can be found in glove_model
# and the classifier is fed the all-zero fallback vector — replace it with a real sentence
# to get a meaningful prediction.
new_text = ""
predicted_emotion = predict_emotion(new_text)
print(f"Predicted Emotion: {predicted_emotion}")

Predicted Emotion: neutral


In [14]:
# Serialize the in-memory GloVe dictionary to disk — presumably so it can be reloaded
# later without re-parsing the text file.
# NOTE(review): this import sits outside the import cell at the top of the notebook;
# joblib files are pickle-based, so only load them from trusted sources.
import joblib
joblib.dump(glove_model, "glove.6B.300d.pkl")

['glove.6B.300d.pkl']