In [1]:
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
import pandas as pd
from typing import Counter
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def load_and_prepare_data(seed=42):
    dataset = load_dataset("dair-ai/emotion")
    train_validation = dataset["train"].train_test_split(test_size=0.25, seed=seed)
    train_data = train_validation["train"]
    validation_data = train_validation["test"]
    test_data = dataset["test"]
    return train_data, validation_data, test_data

train_data, validation_data, test_data = load_and_prepare_data()


In [3]:
def extract_features_labels(data):
    texts = data["text"]
    labels = data["label"]
    return texts, pd.Series(labels)

X_train, y_train = extract_features_labels(train_data)
X_val, y_val = extract_features_labels(validation_data)
X_test, y_test = extract_features_labels(test_data)

In [4]:
def compute_class_weights(y_train):
    label_counts = Counter(y_train)
    num_classes = len(label_counts)
    total_samples = sum(label_counts.values())
    class_weights = [
        total_samples / (num_classes * label_counts[i])
        for i in range(num_classes)
    ]
    return class_weights

class_weights = compute_class_weights(y_train)
class_weights

[0.5704506560182544,
 0.4973887092762994,
 2.0181634712411705,
 1.2338062924120914,
 1.3956734124214933,
 4.672897196261682]

In [5]:
embedding_dim = 90
model_name = "bert-base-uncased"

In [10]:
def tokenize_data(train_data, validation_data, test_data,autotokenizer):

  def _tokenize(batch):
    return autotokenizer(batch["text"], padding="max_length", truncation=True, max_length=embedding_dim)

  train_data_numericalized = train_data.map(_tokenize, batched=True, batch_size=len(train_data))
  validation_data_numericalized = validation_data.map(_tokenize, batched=True, batch_size=len(validation_data))
  test_data_numericalized = test_data.map(_tokenize, batched=True, batch_size=len(test_data))


  return train_data_numericalized, validation_data_numericalized, test_data_numericalized

autotokenizer = AutoTokenizer.from_pretrained(model_name)
train_data_numericalized, validation_data_numericalized, test_data_numericalized = tokenize_data(train_data, validation_data, test_data, autotokenizer)

Map: 100%|██████████| 2000/2000 [00:00<00:00, 15111.67 examples/s]


In [17]:
# Extracting features and labels for XGBoost
X_train_v = pd.DataFrame(train_data_numericalized["input_ids"])
X_val_v = pd.DataFrame(validation_data_numericalized["input_ids"])
X_test_v = pd.DataFrame(test_data_numericalized["input_ids"])

In [18]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
class_weight_dict = dict(zip(classes, class_weights))
print(class_weight_dict)

sample_weights = [class_weight_dict[label] for label in y_train]


{np.int64(0): np.float64(0.5704506560182544), np.int64(1): np.float64(0.4973887092762994), np.int64(2): np.float64(2.0181634712411705), np.int64(3): np.float64(1.2338062924120914), np.int64(4): np.float64(1.3956734124214933), np.int64(5): np.float64(4.672897196261682)}


In [48]:
xgb_model = XGBClassifier(
    objective='multi:softprob',
    num_class=6,
    eval_metric='mlogloss',
    n_estimators=10000,
    learning_rate=0.01,
    random_state=42,
    #use gpu cuda
    device='cuda:0'
 )

xgb_model.fit(X_train_v, y_train, eval_set=[(X_val_v, y_val)], verbose=500,sample_weight=sample_weights)

[0]	validation_0-mlogloss:1.79139
[500]	validation_0-mlogloss:1.72083
[1000]	validation_0-mlogloss:1.68804
[1500]	validation_0-mlogloss:1.66446
[2000]	validation_0-mlogloss:1.64889
[2500]	validation_0-mlogloss:1.63731
[3000]	validation_0-mlogloss:1.62686
[3500]	validation_0-mlogloss:1.62068
[4000]	validation_0-mlogloss:1.61683
[4500]	validation_0-mlogloss:1.61542
[5000]	validation_0-mlogloss:1.61683
[5500]	validation_0-mlogloss:1.62094
[6000]	validation_0-mlogloss:1.62760
[6500]	validation_0-mlogloss:1.63413
[7000]	validation_0-mlogloss:1.64302
[7500]	validation_0-mlogloss:1.65001
[8000]	validation_0-mlogloss:1.65829
[8500]	validation_0-mlogloss:1.66695
[9000]	validation_0-mlogloss:1.67459
[9500]	validation_0-mlogloss:1.68328
[9999]	validation_0-mlogloss:1.69178


0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,'cuda:0'
,early_stopping_rounds,
,enable_categorical,False


In [50]:
def calc_accuracy(y_true, y_pred):
    correct = sum(y_true == y_pred)
    total = len(y_true)
    return correct / total

In [51]:
from sklearn.metrics import classification_report
classes = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
target_names = [classes[i] for i in range(len(classes))]

y_pred = xgb_model.predict(X_test_v)
accuracy = calc_accuracy(y_test, y_pred)
print(f"Test Accuracy: {accuracy:.4f}")

print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=target_names))


Test Accuracy: 0.3680
Classification Report:
              precision    recall  f1-score   support

     sadness       0.38      0.43      0.40       581
         joy       0.44      0.56      0.49       695
        love       0.14      0.06      0.09       159
       anger       0.23      0.18      0.20       275
        fear       0.23      0.16      0.19       224
    surprise       0.22      0.03      0.05        66

    accuracy                           0.37      2000
   macro avg       0.27      0.24      0.24      2000
weighted avg       0.34      0.37      0.35      2000



In [60]:
label_map = {0: 'sadness', 1: 'joy', 2: 'love', 3: 'anger', 4: 'fear', 5: 'surprise'}
def predict_sentiment_examples(texts, xgb_model, tokenizer):
    obj = tokenizer(texts, padding="max_length", truncation=True, max_length=embedding_dim)
    ids = pd.DataFrame(obj['input_ids']) 
    pred = xgb_model.predict(ids)       
    probs = xgb_model.predict_proba(ids) 
    return pred, np.max(probs, axis=1)

In [61]:
def predict_sentiment_one_example(text, xgb_model, tokenizer):
    obj = tokenizer(text, padding="max_length", truncation=True, max_length=embedding_dim)
    ids = pd.DataFrame([obj['input_ids']]) 
    pred = xgb_model.predict(ids)       
    probs = xgb_model.predict_proba(ids) 
    return label_map[pred[0]], np.max(probs, axis=1)[0]

In [62]:
example_texts = [
    "iam very sad i lost my job and i dont know what to do i have no money and i have no friends",
    "i love you",
    "iam very upset iam sick",
    "i cant stand delaying my PhD defense any more",
    "i can punch them in the face right now",
    "the hardest part about growing up is saying goodby to childhood dreams",
    "how did you do that to me?",
    "how dare you stand where he stod?"
]

for text in example_texts:
    sentiment, prob = predict_sentiment_one_example(text, xgb_model, autotokenizer)
    print(f"Text: {text}\nPredicted: {sentiment} ({prob:.3f})\n")

Text: iam very sad i lost my job and i dont know what to do i have no money and i have no friends
Predicted: sadness (0.415)

Text: i love you
Predicted: joy (0.520)

Text: iam very upset iam sick
Predicted: sadness (0.558)

Text: i cant stand delaying my PhD defense any more
Predicted: sadness (0.934)

Text: i can punch them in the face right now
Predicted: joy (0.519)

Text: the hardest part about growing up is saying goodby to childhood dreams
Predicted: sadness (0.454)

Text: how did you do that to me?
Predicted: fear (0.339)

Text: how dare you stand where he stod?
Predicted: joy (0.468)



In [63]:

expanded_texts = [
    "iam very sad i lost my job and i dont know what to do i have no money and i have no friends",
    "It genuinely brightens my day to finally meet you — I’ve been looking forward to this moment for so long.",
    "I love you more deeply than words can express — your presence brings peace to my chaos and warmth to my coldest days.",
    "I feel completely drained and overwhelmed — being sick like this makes everything feel heavier, and I just want to curl up and disappear for a while.",
    "I’m beyond frustrated with how long this delay has dragged on — every day feels like a slap in the face to the effort I’ve poured into this PhD.",
    "I’m so furious I could scream — the way they treated me was completely disrespectful, and I honestly feel like punching them in the face.",
    "Growing up feels like slowly letting go of the dreams that once defined me — saying goodbye to those innocent hopes is the hardest part of all.",
    "After everything we’ve been through, how could you betray me like that? I trusted you, and now I feel completely shattered.",
    "How dare you stand where he stood, pretending like you belong — you have no right to be here after everything you’ve done."
]

preds, probs = predict_sentiment_examples(expanded_texts, xgb_model, autotokenizer)
for text, pred, prob in zip(expanded_texts, preds, probs):
    sentiment = label_map[pred]
    print(f"Text: {text}\nPredicted Sentiment: {sentiment} (Confidence: {prob:.3f})\n")


Text: iam very sad i lost my job and i dont know what to do i have no money and i have no friends
Predicted Sentiment: sadness (Confidence: 0.415)

Text: It genuinely brightens my day to finally meet you — I’ve been looking forward to this moment for so long.
Predicted Sentiment: sadness (Confidence: 0.468)

Text: I love you more deeply than words can express — your presence brings peace to my chaos and warmth to my coldest days.
Predicted Sentiment: joy (Confidence: 0.616)

Text: I feel completely drained and overwhelmed — being sick like this makes everything feel heavier, and I just want to curl up and disappear for a while.
Predicted Sentiment: sadness (Confidence: 0.793)

Text: I’m beyond frustrated with how long this delay has dragged on — every day feels like a slap in the face to the effort I’ve poured into this PhD.
Predicted Sentiment: joy (Confidence: 0.537)

Text: I’m so furious I could scream — the way they treated me was completely disrespectful, and I honestly feel like 

In [1]:
import gradio as gr

# Gradio wrapper
def gradio_predict(text):
    sentiment, prob = predict_sentiment_one_example(text, xgb_model, autotokenizer)
    return f"Predicted Emotion: {sentiment} ({prob:.3f} confidence)"

# Launch Gradio interface
gr.Interface(
    fn=gradio_predict,
    inputs=gr.Textbox(lines=4, placeholder="Enter a sentence to analyze..."),
    outputs="text",
    title="Emotion Classifier",
    description="Enter a sentence and get its predicted emotion using XGBoost and TF-IDF/BERT features.",
    examples=[[text] for text in example_texts]

).launch()

  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'example_texts' is not defined

In [57]:
import winsound
winsound.Beep(1000, 500)  # Beep sound to indicate completion