**Emotion Detection from Text**

In [1]:
!pip install datasets transformers torch scikit-learn plotly wordcloud streamlit pyngrok -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from datasets import load_dataset
import pandas as pd

# Load the emotion dataset (6 emotions, ~20k samples)
dataset = load_dataset('dair-ai/emotion')

# Convert to pandas
train_df = pd.DataFrame(dataset['train'])
test_df  = pd.DataFrame(dataset['test'])

# Emotion label mapping
label_map = {0: 'Sadness 😢', 1: 'Joy 🤩', 2: 'Love ❤️', 3: 'Anger 😠', 4: 'Fear 😨', 5: 'Surprise 😲'}
train_df['emotion'] = train_df['label'].map(label_map)

print(f'Training samples: {len(train_df)}')
print(f'Test samples: {len(test_df)}')
print('\nEmotion Distribution:')
print(train_df['emotion'].value_counts())

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]



split/train-00000-of-00001.parquet:   0%|          | 0.00/1.03M [00:00<?, ?B/s]

split/validation-00000-of-00001.parquet:   0%|          | 0.00/127k [00:00<?, ?B/s]

split/test-00000-of-00001.parquet:   0%|          | 0.00/129k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Training samples: 16000
Test samples: 2000

Emotion Distribution:
emotion
Joy 🤩         5362
Sadness 😢     4666
Anger 😠       2159
Fear 😨        1937
Love ❤️       1304
Surprise 😲     572
Name: count, dtype: int64


In [3]:
import plotly.express as px
import plotly.graph_objects as go

# Emotion distribution bar chart
emotion_counts = train_df['emotion'].value_counts().reset_index()
emotion_counts.columns = ['Emotion', 'Count']

fig = px.bar(
    emotion_counts,
    x='Emotion', y='Count',
    color='Emotion',
    title='📊 Emotion Distribution in Training Data',
    color_discrete_sequence=px.colors.qualitative.Bold
)
fig.update_layout(showlegend=False, plot_bgcolor='white')
fig.show()

# Show sample texts
print('\n🔍 Sample texts:')
for emotion in train_df['emotion'].unique():
    sample = train_df[train_df['emotion'] == emotion]['text'].iloc[0]
    print(f'{emotion}: "{sample[:80]}..."')


🔍 Sample texts:
Sadness 😢: "i didnt feel humiliated..."
Anger 😠: "im grabbing a minute to post i feel greedy wrong..."
Love ❤️: "i am ever feeling nostalgic about the fireplace i will know that it is still on ..."
Surprise 😲: "ive been taking or milligrams or times recommended amount and ive fallen asleep ..."
Fear 😨: "i feel as confused about life as a teenager or as jaded as a year old man..."
Joy 🤩: "i have been with petronas for years i feel that petronas has performed well and ..."


In [4]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset

# Check GPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Using device: {device} 🚀')

# Load tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Custom Dataset class
class EmotionDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length=max_len)
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Prepare data
train_texts = list(dataset['train']['text'])
train_labels = list(dataset['train']['label'])
test_texts  = list(dataset['test']['text'])
test_labels  = list(dataset['test']['label'])

train_dataset = EmotionDataset(train_texts, train_labels, tokenizer)
test_dataset  = EmotionDataset(test_texts, test_labels, tokenizer)

print(f'Train dataset size: {len(train_dataset)}')
print(f'Test dataset size: {len(test_dataset)}')

Using device: cuda 🚀


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Train dataset size: 16000
Test dataset size: 2000


In [6]:
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Load DistilBERT for classification (6 labels)
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)

# Metrics function
def compute_metrics(pred):
    labels = pred.label_ids
    preds  = pred.predictions.argmax(-1)
    acc = accuracy_score(labels, preds)
    f1  = f1_score(labels, preds, average='weighted')
    return {'accuracy': acc, 'f1': f1}

# Training arguments
training_args = TrainingArguments(
    output_dir='./emotion_model',
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir='./logs',
    eval_strategy='epoch', # Changed from evaluation_strategy
    save_strategy='epoch',
    load_best_model_at_end=True,
    logging_steps=50,
    report_to='none'
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train! ⏳ (~15-20 mins on T4 GPU)
print('🚀 Starting training...')
trainer.train()
print('✅ Training complete!')

Loading weights:   0%|          | 0/100 [00:00<?, ?it/s]

DistilBertForSequenceClassification LOAD REPORT from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.
`logging_dir` is deprecated and will be removed in v5.2. Please set `TENSORBOARD_LOGGING_DIR` instead.


🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.214227,0.202843,0.9165,0.917011
2,0.110625,0.143594,0.928,0.927333
3,0.081059,0.157075,0.932,0.930625


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].


✅ Training complete!


In [7]:
from sklearn.metrics import classification_report, confusion_matrix
import plotly.figure_factory as ff

# Get predictions
predictions = trainer.predict(test_dataset)
preds = predictions.predictions.argmax(-1)
true_labels = predictions.label_ids

emotion_names = ['Sadness 😢', 'Joy 🤩', 'Love ❤️', 'Anger 😠', 'Fear 😨', 'Surprise 😲']

# Classification report
print('📊 Classification Report:')
print(classification_report(true_labels, preds, target_names=emotion_names))

# Confusion Matrix Heatmap
cm = confusion_matrix(true_labels, preds)
fig = ff.create_annotated_heatmap(
    z=cm,
    x=emotion_names,
    y=emotion_names,
    colorscale='Blues'
)
fig.update_layout(title='🎯 Confusion Matrix', xaxis_title='Predicted', yaxis_title='Actual')
fig.show()

📊 Classification Report:
              precision    recall  f1-score   support

   Sadness 😢       0.96      0.98      0.97       581
       Joy 🤩       0.93      0.97      0.95       695
     Love ❤️       0.93      0.72      0.81       159
     Anger 😠       0.96      0.89      0.92       275
      Fear 😨       0.90      0.88      0.89       224
  Surprise 😲       0.68      0.85      0.76        66

    accuracy                           0.93      2000
   macro avg       0.89      0.88      0.88      2000
weighted avg       0.93      0.93      0.93      2000



In [8]:
# Save model and tokenizer
model.save_pretrained('./saved_emotion_model')
tokenizer.save_pretrained('./saved_emotion_model')
print('✅ Model saved to ./saved_emotion_model')

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

✅ Model saved to ./saved_emotion_model


In [9]:
# Write the Streamlit app to a file
app_code = '''
import streamlit as st
import torch
import plotly.graph_objects as go
import plotly.express as px
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
import torch.nn.functional as F
import pandas as pd

# Page config
st.set_page_config(page_title="🎭 Emotion Detector", layout="wide")

# Emotion config
EMOTIONS = [
    {"name": "Sadness",  "emoji": "😢", "color": "#6495ED"},
    {"name": "Joy",      "emoji": "🤩", "color": "#FFD700"},
    {"name": "Love",     "emoji": "❤️",  "color": "#FF69B4"},
    {"name": "Anger",    "emoji": "😠", "color": "#FF4500"},
    {"name": "Fear",     "emoji": "😨", "color": "#9370DB"},
    {"name": "Surprise", "emoji": "😲", "color": "#32CD32"},
]

@st.cache_resource
def load_model():
    tokenizer = DistilBertTokenizerFast.from_pretrained("./saved_emotion_model")
    model = DistilBertForSequenceClassification.from_pretrained("./saved_emotion_model")
    model.eval()
    return tokenizer, model

def predict_emotion(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1).squeeze().tolist()
    pred_idx = probs.index(max(probs))
    return pred_idx, probs

# Load model
tokenizer, model = load_model()

# Header
st.title("🎭 Emotion Detection from Text")
st.markdown("**Beyond Positive/Negative — Detect the real human emotion behind any text!**")
st.divider()

# Input
col1, col2 = st.columns([2, 1])
with col1:
    user_input = st.text_area("✍️ Enter any text here:", height=150,
        placeholder="e.g. I just got my first job offer! I can't believe it!")
    detect_btn = st.button("🔍 Detect Emotion", use_container_width=True, type="primary")

# History tracker
if "history" not in st.session_state:
    st.session_state.history = []

if detect_btn and user_input.strip():
    pred_idx, probs = predict_emotion(user_input, tokenizer, model)
    emotion = EMOTIONS[pred_idx]

    # Store history
    st.session_state.history.append({
        "text": user_input[:60] + "...",
        "emotion": f"{emotion[\"emoji\"]} {emotion[\"name\"]}",
        "confidence": f"{max(probs)*100:.1f}%"
    })

    # Result card
    st.markdown(f"""
    <div style="background:{emotion[\"color\"]}22; border-left: 5px solid {emotion[\"color\"]};
    padding:20px; border-radius:10px; margin:10px 0">
        <h2>{emotion[\"emoji\"]} Detected Emotion: <b>{emotion[\"name\"]}</b></h2>
        <h4>Confidence: {max(probs)*100:.1f}%</h4>
    </div>
    """, unsafe_allow_html=True)

    st.divider()

    # Charts
    c1, c2 = st.columns(2)
    with c1:
        # Bar chart - all emotion probabilities
        labels = [f"{e[\"emoji\"]} {e[\"name\"]}" for e in EMOTIONS]
        colors = [e[\"color\"] for e in EMOTIONS]
        fig_bar = go.Figure(go.Bar(
            x=labels, y=[p*100 for p in probs],
            marker_color=colors, text=[f"{p*100:.1f}%" for p in probs],
            textposition="outside"
        ))
        fig_bar.update_layout(title="📊 Emotion Probability Distribution",
            yaxis_title="Probability (%)", plot_bgcolor="white", showlegend=False)
        st.plotly_chart(fig_bar, use_container_width=True)

    with c2:
        # Gauge meter for top emotion
        fig_gauge = go.Figure(go.Indicator(
            mode="gauge+number+delta",
            value=max(probs)*100,
            title={"text": f"{emotion[\"emoji\"]} {emotion[\"name\"]} Confidence"},
            gauge={
                "axis": {"range": [0, 100]},
                "bar": {"color": emotion[\"color\"]},
                "steps": [
                    {"range": [0, 40], "color": "#f0f0f0"},
                    {"range": [40, 70], "color": "#e0e0e0"},
                    {"range": [70, 100], "color": "#d0d0d0"}
                ]
            }
        ))
        fig_gauge.update_layout(title="🎯 Confidence Meter")
        st.plotly_chart(fig_gauge, use_container_width=True)

# History table
if st.session_state.history:
    st.divider()
    st.subheader("🕐 Emotion History")
    history_df = pd.DataFrame(st.session_state.history)
    st.dataframe(history_df, use_container_width=True)

    # History emotion frequency
    freq = history_df["emotion"].value_counts().reset_index()
    freq.columns = ["Emotion", "Count"]
    fig_pie = px.pie(freq, names="Emotion", values="Count",
        title="🥧 Your Emotion History Breakdown",
        color_discrete_sequence=px.colors.qualitative.Bold)
    st.plotly_chart(fig_pie, use_container_width=True)
'''

with open('app.py', 'w') as f:
    f.write(app_code)

print('✅ Streamlit app written to app.py')

✅ Streamlit app written to app.py


In [12]:
# Install Gradio
!pip install gradio -q

import gradio as gr
import torch
import torch.nn.functional as F
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification

# Load saved model
tokenizer = DistilBertTokenizerFast.from_pretrained('./saved_emotion_model')
model = DistilBertForSequenceClassification.from_pretrained('./saved_emotion_model')
model.eval()

EMOTIONS = ['Sadness 😢', 'Joy 🤩', 'Love ❤️', 'Anger 😠', 'Fear 😨', 'Surprise 😲']

def predict(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=128)
    with torch.no_grad():
        outputs = model(**inputs)
    probs = F.softmax(outputs.logits, dim=-1).squeeze().tolist()
    return {EMOTIONS[i]: probs[i] for i in range(len(EMOTIONS))}

# Launch dashboard
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=3, placeholder="Type any text here..."),
    outputs=gr.Label(num_top_classes=6),
    title="🎭 Emotion Detection from Text",
    description="Detects 6 human emotions using fine-tuned DistilBERT!",
    examples=[
        ["I just got my dream job offer! I can't believe it!"],
        ["I miss my best friend so much, it hurts."],
        ["How dare they do this to me!"],
    ]
)

demo.launch(share=True)  # share=True gives you a public link instantly!

Loading weights:   0%|          | 0/104 [00:00<?, ?it/s]

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://eddae7657e02717db6.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


