In [31]:
!pip install tweet-preprocessor
import preprocessor as p

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [40]:
!pip install emoji
import emoji

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [32]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from transformers import AutoTokenizer, AutoModel
from tqdm.notebook import tqdm
from torch.optim import Adam

In [33]:

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load data
data = pd.read_csv("/kaggle/input/figure-eight-labelled-textual-dataset/text_emotion.csv")

Using device: cuda


In [34]:
# Preprocessing: Misspell corrections
misspell_data = pd.read_csv("/kaggle/input/spelling/aspell.txt", sep=":", names=["correction", "misspell"])
misspell_data.misspell = misspell_data.misspell.str.strip()
misspell_data.misspell = misspell_data.misspell.str.split(" ")
misspell_data = misspell_data.explode("misspell").reset_index(drop=True)
misspell_data.drop_duplicates("misspell", inplace=True)
miss_corr = dict(zip(misspell_data.misspell, misspell_data.correction))

def misspelled_correction(val):
    for x in val.split():
        if x in miss_corr.keys():
            val = val.replace(x, miss_corr[x])
    return val

data["clean_content"] = data.content.apply(lambda x: misspelled_correction(x))

In [35]:

# Preprocessing: Contractions
contractions = pd.read_csv("/kaggle/input/contractions/contractions.csv")
cont_dic = dict(zip(contractions.Contraction, contractions.Meaning))

def cont_to_meaning(val):
    for x in val.split():
        if x in cont_dic.keys():
            val = val.replace(x, cont_dic[x])
    return val

data.clean_content = data.clean_content.apply(lambda x: cont_to_meaning(x))


In [36]:
p.set_options(p.OPT.MENTION, p.OPT.URL)
p.clean("hello guys @alx #sport🔥 1245 https://github.com/s/preprocessor")

'hello guys #sport🔥 1245'

In [37]:
data["clean_content"]=data.content.apply(lambda x : p.clean(x))

In [38]:
def punctuation(val): 
  
    punctuations = '''()-[]{};:'"\,<>./@#$%^&_~'''
  
    for x in val.lower(): 
        if x in punctuations: 
            val = val.replace(x, " ") 
    return val

punctuation("test @ #ldfldlf??? !! ")

'test    ldfldlf??? !! '

In [41]:
data.clean_content = data.clean_content.apply(lambda x : ' '.join(punctuation(emoji.demojize(x)).split()))

In [42]:
def clean_text(val):
    val = misspelled_correction(val)
    val = cont_to_meaning(val)
    val = p.clean(val)
    val = ' '.join(punctuation(emoji.demojize(val)).split())
    
    return val

In [43]:
clean_text("isn't 💡 adultry @ttt good bad ... ! ? ")

'is not light bulb adultery good bad ! ?'

In [44]:
data = data[data.clean_content != ""]

In [45]:
data.sentiment.value_counts()

sentiment
neutral       8579
worry         8454
happiness     5208
sadness       5162
love          3841
surprise      2187
fun           1776
relief        1526
hate          1323
empty          815
enthusiasm     759
boredom        179
anger          110
Name: count, dtype: int64

In [46]:
# Map sentiments to IDs
sent_to_id = {"empty": 0, "sadness": 1, "enthusiasm": 2, "neutral": 3, "worry": 4,
              "surprise": 5, "love": 6, "fun": 7, "hate": 8, "happiness": 9, "boredom": 10, "relief": 11, "anger": 12}

data["sentiment_id"] = data['sentiment'].map(sent_to_id)

In [47]:
data

Unnamed: 0,tweet_id,sentiment,author,content,clean_content,sentiment_id
0,1956967341,empty,xoshayzers,@tiffanylue i know i was listenin to bad habi...,i know i was listenin to bad habit earlier and...,0
1,1956967666,sadness,wannamama,Layin n bed with a headache ughhhh...waitin o...,Layin n bed with a headache ughhhh waitin on y...,1
2,1956967696,sadness,coolfunky,Funeral ceremony...gloomy friday...,Funeral ceremony gloomy friday,1
3,1956967789,enthusiasm,czareaquino,wants to hang out with friends SOON!,wants to hang out with friends SOON!,2
4,1956968416,neutral,xkilljoyx,@dannycastillo We want to trade with someone w...,We want to trade with someone who has Houston ...,3
...,...,...,...,...,...,...
39994,1753918900,happiness,courtside101,Succesfully following Tayla!!,Succesfully following Tayla!!,9
39996,1753919001,love,drapeaux,Happy Mothers Day All my love,Happy Mothers Day All my love,6
39997,1753919005,love,JenniRox,Happy Mother's Day to all the mommies out ther...,Happy Mother s Day to all the mommies out ther...,6
39998,1753919043,happiness,ipdaman1,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...,WASSUP BEAUTIFUL!!! FOLLOW ME!! PEEP OUT MY NE...,9


In [48]:
# One-hot encoding of labels
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(data.sentiment_id)

onehot_encoder = OneHotEncoder(sparse_output=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
Y = onehot_encoder.fit_transform(integer_encoded)

In [49]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(data.clean_content, Y, test_size=0.2, random_state=1995, shuffle=True)

In [50]:

# Tokenizer setup
tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')
max_len = 128



In [51]:

def regular_encode(texts, tokenizer, maxlen=128):
    texts = texts.tolist()
    enc_di = tokenizer.batch_encode_plus(
        texts,
        padding='max_length',
        truncation=True,
        max_length=maxlen,
        return_attention_mask=True
    )
    return np.array(enc_di['input_ids']), np.array(enc_di['attention_mask'])

X_train_t, train_mask = regular_encode(X_train, tokenizer, maxlen=max_len)
X_test_t, test_mask = regular_encode(X_test, tokenizer, maxlen=max_len)

In [52]:
# Dataset class
class EmotionDataset(Dataset):
    def __init__(self, input_ids, attention_mask, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_mask
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': torch.tensor(self.input_ids[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }


In [53]:

# Create datasets and dataloaders
batch_size = 32

train_dataset = EmotionDataset(X_train_t, train_mask, y_train)
test_dataset = EmotionDataset(X_test_t, test_mask, y_test)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [54]:
# Model definition
class EmotionClassifier(nn.Module):
    def __init__(self, transformer, num_classes):
        super(EmotionClassifier, self).__init__()
        self.transformer = transformer
        self.dropout = nn.Dropout(0.2)
        self.fc1 = nn.Linear(transformer.config.hidden_size, 256)
        self.fc2 = nn.Linear(256, 128)
        self.out = nn.Linear(128, num_classes)

    def forward(self, input_ids, attention_mask):
        transformer_output = self.transformer(input_ids=input_ids, attention_mask=attention_mask)
        cls_token = transformer_output.last_hidden_state[:, 0, :]
        x = self.dropout(torch.relu(self.fc1(cls_token)))
        x = self.dropout(torch.relu(self.fc2(x)))
        return self.out(x)


In [55]:

# Load ALBERT transformer
transformer_model = AutoModel.from_pretrained('albert-base-v2')
model = EmotionClassifier(transformer_model, num_classes=13).to(device)

In [56]:
# Training setup
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=2e-5)
epochs = 13

In [57]:
# Training loop
for epoch in range(epochs):
    model.train()
    train_loss = 0
    correct = 0
    total = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        _, predicted = torch.max(outputs, dim=1)
        _, labels_max = torch.max(labels, dim=1)
        total += labels.size(0)
        correct += (predicted == labels_max).sum().item()

    accuracy = 100 * correct / total
    print(f"Epoch {epoch+1}/{epochs}, Loss: {train_loss:.4f}, Accuracy: {accuracy:.2f}%")

Epoch 1/13:   0%|          | 0/998 [00:00<?, ?it/s]

Epoch 1/13, Loss: 1979.6299, Accuracy: 32.85%


Epoch 2/13:   0%|          | 0/998 [00:00<?, ?it/s]

Epoch 2/13, Loss: 1829.3707, Accuracy: 38.11%


Epoch 3/13:   0%|          | 0/998 [00:00<?, ?it/s]

Epoch 3/13, Loss: 1734.2468, Accuracy: 41.63%


Epoch 4/13:   0%|          | 0/998 [00:00<?, ?it/s]

Epoch 4/13, Loss: 1630.7456, Accuracy: 45.26%


Epoch 5/13:   0%|          | 0/998 [00:00<?, ?it/s]

Epoch 5/13, Loss: 1502.5213, Accuracy: 50.57%


Epoch 6/13:   0%|          | 0/998 [00:00<?, ?it/s]

Epoch 6/13, Loss: 1346.4827, Accuracy: 56.53%


Epoch 7/13:   0%|          | 0/998 [00:00<?, ?it/s]

Epoch 7/13, Loss: 1162.2054, Accuracy: 63.04%


Epoch 8/13:   0%|          | 0/998 [00:00<?, ?it/s]

Epoch 8/13, Loss: 979.9170, Accuracy: 69.77%


Epoch 9/13:   0%|          | 0/998 [00:00<?, ?it/s]

Epoch 9/13, Loss: 808.0934, Accuracy: 75.73%


Epoch 10/13:   0%|          | 0/998 [00:00<?, ?it/s]

Epoch 10/13, Loss: 638.1697, Accuracy: 80.90%


Epoch 11/13:   0%|          | 0/998 [00:00<?, ?it/s]

Epoch 11/13, Loss: 534.2378, Accuracy: 83.88%


Epoch 12/13:   0%|          | 0/998 [00:00<?, ?it/s]

Epoch 12/13, Loss: 448.4620, Accuracy: 86.62%


Epoch 13/13:   0%|          | 0/998 [00:00<?, ?it/s]

Epoch 13/13, Loss: 368.6887, Accuracy: 88.98%


In [58]:
torch.save(model.state_dict(), "emotion_classifier_model.pth")

In [59]:
!ls


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


emotion_classifier_model.pth


In [60]:
from IPython.display import FileLink
FileLink(r'emotion_classifier_model.pth')


In [61]:
def test_emotion_classifier(sentences):
    # Load model and tokenizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')
    transformer_model = AutoModel.from_pretrained('albert-base-v2')
    
    model = EmotionClassifier(transformer_model, num_classes=13).to(device)
    model.load_state_dict(torch.load("emotion_classifier_model.pth", map_location=device))
    model.eval()

    # Sentiment mapping
    id_to_sent = {0: "empty", 1: "sadness", 2: "enthusiasm", 3: "neutral", 
                  4: "worry", 5: "surprise", 6: "love", 7: "fun", 
                  8: "hate", 9: "happiness", 10: "boredom", 11: "relief", 12: "anger"}

    # Encode sentences
    def encode_text(text):
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )
        return encoding['input_ids'].to(device), encoding['attention_mask'].to(device)

    # Predict emotions
    results = []
    with torch.no_grad():
        for sentence in sentences:
            input_ids, attention_mask = encode_text(sentence)
            output = model(input_ids, attention_mask)
            predicted = torch.argmax(output, dim=1).cpu().numpy()[0]
            results.append({
                'sentence': sentence, 
                'emotion': id_to_sent[predicted],
                'confidence': torch.softmax(output, dim=1).cpu().numpy()[0][predicted]
            })
    
    return results

# Example usage
test_sentences = [
    "I am happy",
    "I am sad",
    "I am excited about my new job",
    "I feel worried about the future",
    "This is the most boring day ever",
    "I love my family",
    "I am so angry right now"
]

results = test_emotion_classifier(test_sentences)
for result in results:
    print(f"Sentence: {result['sentence']}")
    print(f"Predicted Emotion: {result['emotion']}")
    print(f"Confidence: {result['confidence']:.2%}\n")

Sentence: I am happy
Predicted Emotion: happiness
Confidence: 82.65%

Sentence: I am sad
Predicted Emotion: sadness
Confidence: 99.02%

Sentence: I am excited about my new job
Predicted Emotion: happiness
Confidence: 88.96%

Sentence: I feel worried about the future
Predicted Emotion: worry
Confidence: 99.83%

Sentence: This is the most boring day ever
Predicted Emotion: sadness
Confidence: 96.83%

Sentence: I love my family
Predicted Emotion: love
Confidence: 99.00%

Sentence: I am so angry right now
Predicted Emotion: hate
Confidence: 58.87%



  model.load_state_dict(torch.load("emotion_classifier_model.pth", map_location=device))


In [62]:
def test_emotion_classifier(sentences):
    # Load model and tokenizer
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    tokenizer = AutoTokenizer.from_pretrained('albert-base-v2')
    transformer_model = AutoModel.from_pretrained('albert-base-v2')
    
    model = EmotionClassifier(transformer_model, num_classes=13).to(device)
    model.load_state_dict(torch.load("emotion_classifier_model.pth", map_location=device))
    model.eval()

    # Sentiment mapping
    id_to_sent = {0: "empty", 1: "sadness", 2: "enthusiasm", 3: "neutral", 
                  4: "worry", 5: "surprise", 6: "love", 7: "fun", 
                  8: "hate", 9: "happiness", 10: "boredom", 11: "relief", 12: "anger"}

    # Encode sentences
    def encode_text(text):
        encoding = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=128,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_attention_mask=True
        )
        return encoding['input_ids'].to(device), encoding['attention_mask'].to(device)

    # Predict emotions
    results = []
    with torch.no_grad():
        for sentence in sentences:
            input_ids, attention_mask = encode_text(sentence)
            output = model(input_ids, attention_mask)
            probabilities = torch.softmax(output, dim=1).cpu().numpy()[0]
            
            # Map probabilities to emotions and sort in descending order
            sorted_emotions = sorted(
                [(id_to_sent[idx], prob) for idx, prob in enumerate(probabilities)],
                key=lambda x: x[1], 
                reverse=True
            )

            results.append({
                'sentence': sentence, 
                'sorted_emotions': sorted_emotions
            })
    
    return results

# Example usage
test_sentences = [
    "I am happy",
    "I am sad",
    "I am excited about my new job",
    "I feel worried about the future",
    "This is the most boring day ever",
    "I love my family",
    "I am so angry right now"
]

results = test_emotion_classifier(test_sentences)
for result in results:
    print(f"Sentence: {result['sentence']}")
    print("Emotion Probabilities:")
    for emotion, percentage in result['sorted_emotions']:
        print(f"  {emotion}: {percentage:.2%}")
    print()


Sentence: I am happy
Emotion Probabilities:
  happiness: 82.65%
  worry: 4.91%
  relief: 3.25%
  love: 2.15%
  surprise: 1.76%
  enthusiasm: 1.63%
  neutral: 1.43%
  fun: 1.27%
  sadness: 0.50%
  empty: 0.19%
  boredom: 0.11%
  anger: 0.10%
  hate: 0.05%

Sentence: I am sad
Emotion Probabilities:
  sadness: 99.02%
  worry: 0.87%
  neutral: 0.06%
  relief: 0.01%
  happiness: 0.01%
  surprise: 0.01%
  empty: 0.01%
  love: 0.01%
  boredom: 0.00%
  hate: 0.00%
  enthusiasm: 0.00%
  fun: 0.00%
  anger: 0.00%

Sentence: I am excited about my new job
Emotion Probabilities:
  happiness: 88.96%
  surprise: 5.40%
  love: 2.43%
  relief: 0.88%
  worry: 0.69%
  enthusiasm: 0.57%
  fun: 0.42%
  neutral: 0.35%
  sadness: 0.11%
  empty: 0.08%
  boredom: 0.05%
  anger: 0.05%
  hate: 0.02%

Sentence: I feel worried about the future
Emotion Probabilities:
  worry: 99.83%
  sadness: 0.14%
  enthusiasm: 0.01%
  hate: 0.01%
  surprise: 0.00%
  happiness: 0.00%
  neutral: 0.00%
  relief: 0.00%
  empty: 0.00

  model.load_state_dict(torch.load("emotion_classifier_model.pth", map_location=device))


In [63]:
from sklearn.metrics import f1_score, precision_score, recall_score
import numpy as np

def evaluate_model(model, test_loader, device):
    model.eval()
    true_labels = []
    predicted_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask)
            _, predicted = torch.max(outputs, dim=1)
            _, true_label = torch.max(labels, dim=1)

            true_labels.extend(true_label.cpu().numpy())
            predicted_labels.extend(predicted.cpu().numpy())

    # Calculate metrics
    f1 = f1_score(true_labels, predicted_labels, average='weighted')
    precision = precision_score(true_labels, predicted_labels, average='weighted')
    recall = recall_score(true_labels, predicted_labels, average='weighted')

    return {
        'F1 Score': f1,
        'Precision': precision,
        'Recall': recall
    }

# Usage in training script
metrics = evaluate_model(model, test_loader, device)
for metric, value in metrics.items():
    print(f"{metric}: {value:.4f}")

F1 Score: 0.3118
Precision: 0.3144
Recall: 0.3131


  _warn_prf(average, modifier, msg_start, len(result))


In [66]:
#part 1
test_sentences = [
i used to love playing in the attic as a kid.]

results = test_emotion_classifier(test_sentences)
for result in results:
    print(f"Sentence: {result['sentence']}")
    print("Emotion Probabilities:")
    for emotion, percentage in result['sorted_emotions']:
        print(f"  {emotion}: {percentage:.2%}")
    print()

Sentence: Aria, a young adventurer, finds an old map hidden in her attic.
Emotion Probabilities:
  neutral: 99.32%
  empty: 0.23%
  enthusiasm: 0.23%
  relief: 0.08%
  fun: 0.05%
  worry: 0.04%
  love: 0.01%
  sadness: 0.01%
  surprise: 0.01%
  happiness: 0.01%
  boredom: 0.00%
  anger: 0.00%
  hate: 0.00%



  model.load_state_dict(torch.load("emotion_classifier_model.pth", map_location=device))
