# Emotion Classification
data loading and cleaning

In [None]:
import pandas as pd

In [2]:
train_df = pd.read_csv('train.txt', sep=';', header=None, names=['text', 'emotion'])
val_df = pd.read_csv('val.txt', sep=';', header=None, names=['text', 'emotion'])

df = pd.concat([train_df, val_df], ignore_index=True)
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [3]:
print(df['emotion'].value_counts())
print(f'Total samples: {len(df)}')

emotion
joy         6066
sadness     5216
anger       2434
fear        2149
love        1482
surprise     653
Name: count, dtype: int64
Total samples: 18000


In [4]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [6]:
stop_words= set(stopwords.words("english"))
lemmatizer= WordNetLemmatizer()

In [7]:
def clean_text(text):
  text=text.lower()
  words=[]
  text=word_tokenize(text)
  for word in text:
    if(word.isalnum() and word not in stop_words):
      words.append(word)
  for word in words:
    lemmatizer.lemmatize(word)

  return " ".join(words)



In [8]:
a="i am feeling bored, confused what to do next?"
clean_text(a)

'feeling bored confused next'

In [11]:
df.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [12]:
df['text']=df['text'].apply(clean_text)

In [13]:
df.head()

Unnamed: 0,text,emotion
0,didnt feel humiliated,sadness
1,go feeling hopeless damned hopeful around some...,sadness
2,im grabbing minute post feel greedy wrong,anger
3,ever feeling nostalgic fireplace know still pr...,love
4,feeling grouchy,anger


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [14]:
vectorizer = TfidfVectorizer(max_features=5000)

# Transforming text into TF-IDF vectors
X = vectorizer.fit_transform(df['text'])
y = df['emotion']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [16]:
model = LogisticRegression(max_iter=1000, multi_class='multinomial')
model.fit(X_train, y_train)



In [17]:
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

       anger       0.93      0.80      0.86       487
        fear       0.89      0.80      0.84       430
         joy       0.83      0.96      0.89      1213
        love       0.86      0.66      0.74       296
     sadness       0.90      0.94      0.92      1043
    surprise       0.85      0.51      0.64       131

    accuracy                           0.87      3600
   macro avg       0.88      0.78      0.81      3600
weighted avg       0.87      0.87      0.87      3600



In [18]:
def predict_emotion(text):
    text=clean_text(text)
    vector = vectorizer.transform([text])
    probs = model.predict_proba(vector)[0]
    labels = model.classes_
    results = sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)

    for emotion, confidence in results[:3]:
        print(f"{emotion}: {confidence * 100:.2f}%")


In [42]:
predict_emotion("I'm feeling happy")

joy: 88.13%
sadness: 6.22%
anger: 2.41%


In [38]:
from sklearn.svm import LinearSVC


In [39]:
model2 = LinearSVC()
model2.fit(X_train, y_train)

In [40]:
def predict_emotion2(text):
    text=clean_text(text)
    vector = vectorizer.transform([text])
    return model2.predict(vector)[0]

In [41]:
predict_emotion2("I'm feeling happy but sad")

'sadness'

In [43]:
import pickle

In [44]:
with open('emotion_model.pkl', 'wb') as f:
    pickle.dump(model, f)

with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)