<a href="https://colab.research.google.com/github/sru36/CyberBullyingEmotionDetection/blob/main/cyberbullying_emotion_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# cyberbullying_emotion_detection.py


# 1. IMPORT LIBRARIES

import pandas as pd
import numpy as np
import re
import nltk
import textblob

from textblob import TextBlob

from nltk.corpus import stopwords
from transformers import pipeline


# 2. DOWNLOAD REQUIRED NLTK DATA

# Fetch the English stop-word corpus only when it is not already on disk, so
# re-running the notebook does not need a network round trip and still works
# offline once the corpus has been downloaded.
try:
    nltk.data.find("corpora/stopwords")
except LookupError:
    nltk.download("stopwords")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# 3. LOAD DATASET

# Path to the raw tweets CSV inside the Colab runtime.
csv_path = "/content/CBTweets.csv"
df = pd.read_csv(csv_path)

# Confirm the load and preview the first rows.
print("Dataset loaded successfully")
print(df.head())

Dataset loaded successfully
                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying


In [3]:
# 4. TEXT CLEANING FUNCTION

# English stop words from the NLTK corpus, materialised once as a set so the
# per-token membership test inside clean_text() stays cheap.
stop_words = set(stopwords.words('english'))

def clean_text(text, custom_stop_words=None):
    """Normalise a raw tweet into a lowercase, space-joined string of content words.

    Steps: lowercase; drop URLs (``http...``), ``@mentions`` and ``#hashtags``;
    drop stop words; strip every character outside ``a-z`` from the tokens
    that remain.

    Args:
        text: Raw tweet. Non-string values are passed through ``str()``,
            except ``None`` and float NaN (a missing cell), which yield ``""``
            rather than the literal tokens "none" / "nan".
        custom_stop_words: Optional collection of lowercase words to drop.
            When omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned text, or ``""`` when nothing survives cleaning.
    """
    # NaN is the only float that does not compare equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    blocked = stop_words if custom_stop_words is None else custom_stop_words

    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)      # remove URLs
    text = re.sub(r"@\w+", "", text)          # remove mentions
    text = re.sub(r"#\w+", "", text)          # remove hashtags

    tokens = []
    for raw_token in text.split():
        # Test against the stop list BEFORE stripping punctuation, so entries
        # that themselves contain punctuation (contractions like "don't")
        # still match instead of surviving as "dont".
        if raw_token in blocked:
            continue

        word = re.sub(r"[^a-z]", "", raw_token)   # remove special characters

        # Test again after stripping so "the," / "is!" are dropped as well;
        # tokens that were pure punctuation collapse to "" and are skipped.
        if word and word not in blocked:
            tokens.append(word)

    return " ".join(tokens)

# Run the cleaner over every raw tweet and store the result in a new column
cleaned_tweets = df["tweet_text"].apply(clean_text)
df["clean_text"] = cleaned_tweets

print("Text cleaning completed")

Text cleaning completed


In [4]:
# 5. LOAD EMOTION CLASSIFIER

# Hugging Face Hub id of the pretrained emotion model
emotion_model_id = "j-hartmann/emotion-english-distilroberta-base"

emotion_classifier = pipeline(
    task="text-classification",
    model=emotion_model_id,
    return_all_scores=False,
)

print("Emotion classification model loaded")


# 6. EMOTION DETECTION FUNCTION

def detect_emotion(text, threshold=0.1):
    """Bucket a text into a coarse sentiment label from its TextBlob polarity.

    Despite its name, this function does not consult the transformer
    ``emotion_classifier`` loaded in this notebook; the label is derived
    solely from ``TextBlob(text).sentiment.polarity``.

    Args:
        text: Text to score. Anything that is not a non-blank ``str`` is
            labelled ``"neutral"`` without being scored.
        threshold: Half-width of the neutral band. Polarity must be strictly
            greater than ``threshold`` to be positive and strictly less than
            ``-threshold`` to be negative. Defaults to ``0.1``.

    Returns:
        ``"positive"``, ``"negative"`` or ``"neutral"``; ``"unknown"`` when
        scoring raised an ordinary exception.
    """
    try:
        if not isinstance(text, str) or not text.strip():
            return "neutral"

        polarity = TextBlob(text).sentiment.polarity

        if polarity > threshold:
            return "positive"
        if polarity < -threshold:
            return "negative"
        return "neutral"
    except Exception:
        # Best effort: one bad row must not abort labelling the whole column.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long-running apply can still be stopped.
        return "unknown"


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: j-hartmann/emotion-english-distilroberta-base
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


Emotion classification model loaded


In [5]:
# 7. CREATE EMOTION COLUMN
print("Emotion detection started (this may take time)...")

# TIP: For testing, use df.head(500)
# Label each cleaned tweet, then attach the labels as a new column
emotion_labels = df["clean_text"].apply(detect_emotion)
df["Emotion"] = emotion_labels

print("Emotion column added successfully")


Emotion detection started (this may take time)...
Emotion column added successfully


In [6]:
# Preview the cleaned text next to its assigned label for the first rows
df.head()[["clean_text", "Emotion"]]


Unnamed: 0,clean_text,Emotion
0,words food crapilicious,neutral
1,white,neutral
2,classy whore red velvet cupcakes,neutral
3,meh p thanks heads concerned another angry dud...,negative
4,isis account pretending kurdish account like i...,neutral


In [7]:
# 8. SAVE FINAL DATASET

# Write the enriched frame to an Excel workbook, leaving out the row index
output_file = "cyberbullying_with_emotion.xlsx"
df.to_excel(
    output_file,
    index=False,
)

print(f"Final dataset saved as {output_file}")


# 9. QUICK SUMMARY
# Count how many tweets received each label
print("\nEmotion Distribution:")
emotion_counts = df["Emotion"].value_counts()
print(emotion_counts)


Final dataset saved as cyberbullying_with_emotion.xlsx

Emotion Distribution:
Emotion
neutral     18881
negative    15424
positive    13387
Name: count, dtype: int64
