In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from collections import Counter

In [None]:
# Load the emotion dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv('combined_emotion.csv')

In [None]:
# Preview the first rows to check the `sentence` / `emotion` columns loaded as expected.
df.head()

Unnamed: 0,sentence,emotion
0,i just feel really helpless and heavy hearted,fear
1,ive enjoyed being able to slouch about relax a...,sad
2,i gave up my internship with the dmrg and am f...,fear
3,i dont know i feel so lost,sad
4,i am a kindergarten teacher and i am thoroughl...,fear


In [None]:
# Summary statistics per column (count / unique / top / freq in the saved output below).
df.describe()

Unnamed: 0,sentence,emotion
count,422746,422746
unique,393822,6
top,i feel more adventurous willing to take risks,joy
freq,16,143067


In [None]:
# Pull the raw sentences and their emotion names out of the frame as plain Python lists.
texts = df['sentence'].tolist()
labels = df['emotion'].tolist()

In [None]:
def build_vocab(texts, min_freq=1):
    """Build a token -> index vocabulary from an iterable of sentences.

    Sentences are lowercased and split on whitespace. Tokens that occur fewer
    than ``min_freq`` times are dropped. The surviving tokens are numbered
    contiguously from 0 in first-seen order, followed by the two special
    entries ``'<UNK>'`` and ``'<PAD>'`` (in that order), so the largest index
    is always ``len(vocab) - 1``.

    Args:
        texts: Iterable of strings.
        min_freq: Minimum number of occurrences a token needs to be kept.

    Returns:
        dict mapping each kept token (plus ``'<UNK>'`` and ``'<PAD>'``) to a
        unique integer index.
    """
    counts = Counter()
    for text in texts:
        counts.update(text.lower().split())

    # Filter first, then enumerate: numbering before filtering leaves gaps in
    # the index range and lets '<UNK>'/'<PAD>' (assigned via len(vocab)) land
    # on an index that a real token already owns whenever min_freq > 1.
    kept_tokens = (word for word, count in counts.items() if count >= min_freq)
    vocab = {word: idx for idx, word in enumerate(kept_tokens)}

    vocab['<UNK>'] = len(vocab)
    vocab['<PAD>'] = len(vocab)
    return vocab

In [None]:
# Build the token -> index mapping with the default min_freq=1.
# NOTE(review): this runs on the full corpus, before the train/test split further down,
# so tokens that only occur in held-out sentences also receive their own index.
vocab = build_vocab(texts)

In [None]:
#vectorize texts
def vectorize_text(text, vocab):
    """Turn a sentence into a list of vocabulary indices.

    The sentence is lowercased and split on whitespace; every resulting token
    is looked up in ``vocab``, and tokens that are missing from it are mapped
    to the index stored under ``'<UNK>'``.

    Args:
        text: The sentence to encode.
        vocab: Mapping from token to integer index; must contain ``'<UNK>'``
            whenever the sentence has at least one token.

    Returns:
        list of int, one index per whitespace-separated token (empty for an
        empty or whitespace-only sentence).
    """
    indices = []
    for word in text.lower().split():
        # Unknown words all share the '<UNK>' slot.
        indices.append(vocab.get(word, vocab['<UNK>']))
    return indices

# Encode every sentence as a list of vocabulary indices.
vectorized_texts = [vectorize_text(text, vocab) for text in texts]

# Sort the label names so each emotion gets the same class index on every run:
# the iteration order of a bare set of strings can differ between interpreter
# sessions, which would silently reshuffle class ids across kernel restarts.
label_to_index = {label: index for index, label in enumerate(sorted(set(labels)))}
numerical_labels = [label_to_index[label] for label in labels]



In [None]:
class EmotionDataset(Dataset):
    """Map-style dataset that pairs each encoded sentence with its label.

    Attributes:
        texts: Indexable collection of encoded sentences.
        labels: Indexable collection of labels, aligned with ``texts`` by position.
    """

    def __init__(self, texts, labels):
        # References are stored as given; nothing is copied or validated here.
        self.texts, self.labels = texts, labels

    def __len__(self):
        """Return the number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``(text, label)`` pair stored at position ``idx``."""
        sample = self.texts[idx]
        target = self.labels[idx]
        return sample, target

In [None]:
#pad seq
def collate_fn(batch, pad_idx=None):
    """Right-pad a batch of variable-length index lists into dense tensors.

    Args:
        batch: Sequence of ``(token_indices, label)`` pairs, where
            ``token_indices`` is a list of ints and ``label`` is an int.
        pad_idx: Index used to fill the padded positions. When omitted, the
            notebook-level ``vocab['<PAD>']`` is used, so passing this
            function directly as ``DataLoader(collate_fn=collate_fn)`` keeps
            working unchanged.

    Returns:
        Tuple ``(padded_texts, labels)`` of ``torch.LongTensor``s with shapes
        ``(batch_size, max_len)`` and ``(batch_size,)``.
    """
    if pad_idx is None:
        pad_idx = vocab['<PAD>']

    texts, labels = zip(*batch)
    # Pad to at least one position so a batch made only of empty sentences
    # still produces a (batch_size, 1) tensor instead of a zero-width one.
    max_len = max(max(len(text) for text in texts), 1)
    padded_texts = [text + [pad_idx] * (max_len - len(text)) for text in texts]
    return torch.LongTensor(padded_texts), torch.LongTensor(labels)

In [None]:
# Hold out 20% of the encoded samples for testing; random_state is fixed at 42.
X_train, X_test, y_train, y_test = train_test_split(
    vectorized_texts,
    numerical_labels,
    test_size=0.2,
    random_state=42,
)
train_dataset, test_dataset = EmotionDataset(X_train, y_train), EmotionDataset(X_test, y_test)

# Only the training loader shuffles; both loaders pad per batch through collate_fn.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
test_dataloader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=collate_fn,
)


In [None]:
#MODEL
class BiLSTM(nn.Module):
    """Bidirectional LSTM sentence classifier.

    Pipeline: token embedding -> single-layer bidirectional LSTM -> linear
    layer over the concatenated final hidden states of both directions.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of the LSTM per direction.
        output_dim: Number of output classes (logits).
        padding_idx: Index of the padding token. When omitted, the
            notebook-level ``vocab['<PAD>']`` is used, which keeps the
            original four-argument constructor call working.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, padding_idx=None):
        super(BiLSTM, self).__init__()
        if padding_idx is None:
            padding_idx = vocab['<PAD>']
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)

    def forward(self, text):
        """Compute class logits for a batch of right-padded index sequences.

        Args:
            text: LongTensor of shape ``(batch, seq_len)``; positions after the
                end of each sentence are expected to hold the padding index.

        Returns:
            Tensor of shape ``(batch, output_dim)`` with unnormalised logits.
        """
        embedded = self.embedding(text)

        # True sentence lengths = number of non-pad positions per row. This
        # assumes padding only appears as a trailing run (right padding).
        # Clamp to 1 because packing rejects zero-length sequences.
        lengths = (text != self.embedding.padding_idx).sum(dim=1).clamp(min=1).cpu()

        # Packing makes the LSTM stop at each sentence's real end. Reading
        # lstm_out[:, -1, :] on a padded batch instead yields a forward state
        # that has run over pad positions and a backward state that has only
        # seen the last (usually pad) position.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)

        # hidden is (num_directions, batch, hidden_dim) for this single-layer
        # model: [-2] is the forward direction's final state, [-1] the
        # backward direction's final state (after reading back to token 0).
        sentence_repr = torch.cat((hidden[-2], hidden[-1]), dim=1)
        return self.fc(sentence_repr)

In [None]:
#init model
# Embedding table size: one row per vocabulary entry (including <UNK> and <PAD>).
vocab_size = len(vocab)
embedding_dim = 100
hidden_dim = 64
# Count classes from the label mapping: it is the single source of truth for
# class indices and does not depend on `labels` still holding the raw label list,
# so this cell gives the same answer whenever it is re-run.
output_dim = len(label_to_index)

In [None]:
# Instantiate the classifier, an Adam optimizer over all of its parameters (lr=0.001),
# and the cross-entropy loss that consumes the model's raw logits and integer class labels.
model = BiLSTM(vocab_size, embedding_dim, hidden_dim, output_dim)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU,
# and move both the model and the loss module onto that device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
criterion = criterion.to(device)

In [None]:
epochs = 20

# --- training ---
for epoch in range(epochs):
  model.train()  # make the mode explicit so re-running after evaluation still trains
  running_loss = 0.0
  seen_samples = 0

  # Batch variables get their own names so the notebook-level `texts` / `labels`
  # lists are not overwritten by tensors while this cell runs.
  for batch_texts, batch_labels in train_dataloader:
    batch_texts = batch_texts.to(device)
    batch_labels = batch_labels.to(device)

    optimizer.zero_grad()
    predictions = model(batch_texts)
    loss = criterion(predictions, batch_labels)
    loss.backward()
    optimizer.step()

    # Accumulate a sample-weighted sum (kept as a tensor to avoid a host sync per
    # batch) so the reported value is the epoch mean, not just the last batch.
    batch_size = batch_labels.size(0)
    running_loss = running_loss + loss.detach() * batch_size
    seen_samples += batch_size

  epoch_loss = float(running_loss / max(seen_samples, 1))
  print(f"Epoch {epoch+1}/{epochs}, Loss: {epoch_loss}")

# eval
model.eval()
correct_predictions = 0
total_predictions = 0

with torch.no_grad():
  for batch_texts, batch_labels in test_dataloader:
    batch_texts = batch_texts.to(device)
    batch_labels = batch_labels.to(device)

    predictions = model(batch_texts)
    # Index of the largest logit per row is the predicted class.
    _, predicted_labels = torch.max(predictions, 1)

    correct_predictions += (predicted_labels == batch_labels).sum().item()
    total_predictions += batch_labels.size(0)

accuracy = correct_predictions/total_predictions
print(f"Accuracy on test set: {accuracy}")

Epoch 1/20, Loss: 0.4428424835205078
Epoch 2/20, Loss: 0.1835319697856903
Epoch 3/20, Loss: 0.063074991106987
Epoch 4/20, Loss: 0.2888801395893097
Epoch 5/20, Loss: 0.02831648290157318
Epoch 6/20, Loss: 0.08236129581928253
Epoch 7/20, Loss: 0.00024313732865266502
Epoch 8/20, Loss: 0.10285604000091553
Epoch 9/20, Loss: 0.037705086171627045
Epoch 10/20, Loss: 0.03708933666348457
Epoch 11/20, Loss: 0.12581363320350647
Epoch 12/20, Loss: 0.00011758295295294374
Epoch 13/20, Loss: 0.0361747145652771
Epoch 14/20, Loss: 0.033500056713819504
Epoch 15/20, Loss: 0.12036186456680298
Epoch 16/20, Loss: 5.588752537732944e-05
Epoch 17/20, Loss: 0.1107918843626976
Epoch 18/20, Loss: 0.03963441029191017
Epoch 19/20, Loss: 0.029704075306653976
Epoch 20/20, Loss: 3.661984374048188e-05
Accuracy on test set: 0.9303015966883501


In [None]:
def predict_emotion(text, model, vocab, label_to_index, device):
    """Predict the emotion label for a single sentence.

    Args:
        text: Raw sentence to classify.
        model: Trained classifier; called with a ``(1, seq_len)`` LongTensor
            and expected to return a ``(1, num_classes)`` tensor of scores.
        vocab: Token -> index mapping passed on to ``vectorize_text``.
        label_to_index: Mapping from label name to class index; inverted here
            to translate the predicted index back to its name.
        device: Device the input tensor is moved to before the forward pass.

    Returns:
        The predicted label name, or the string ``"unknown"`` when the
        sentence contains no tokens.

    Side effects:
        Switches ``model`` to evaluation mode and leaves it there.
    """
    model.eval()
    with torch.no_grad():
        vectorized_text = vectorize_text(text, vocab)
        if not vectorized_text:
            return "unknown"
        # A single sentence forms a batch of one, so no padding is needed.
        input_tensor = torch.LongTensor([vectorized_text]).to(device)
        prediction = model(input_tensor)
        _, predicted_label = torch.max(prediction, 1)
        index_to_label = {index: label for label, index in label_to_index.items()}
        return index_to_label[predicted_label.item()]


# Smoke-test the trained model on two hand-written sentences.
# NOTE(review): these inputs contain punctuation/apostrophes, while vectorize_text only
# lowercases and splits on whitespace, so tokens such as "terrifying." are presumably
# looked up with the punctuation attached and may fall back to <UNK> — confirm.
new_sentences = ["I'm so happy", "This is terrifying."]
for sentence in new_sentences:
    predicted_emotion = predict_emotion(sentence, model, vocab, label_to_index, device)
    print(f"Sentence: {sentence}")
    print(f"Predicted Emotion: {predicted_emotion}")

Sentence: I'm so happy
Predicted Emotion: joy
Sentence: This is terrifying.
Predicted Emotion: anger
