In [3]:
# cyberbullying_emotion_detection.py

# ===============================
# 1. IMPORT LIBRARIES
# ===============================
import pandas as pd
import numpy as np
import re
import nltk

from nltk.corpus import stopwords
from transformers import pipeline

# ===============================
# 2. DOWNLOAD REQUIRED NLTK DATA
# ===============================
nltk.download('stopwords')

# ===============================
# 3. LOAD DATASET
# ===============================
# Ensure CBTweets.csv is in the same directory
df = pd.read_csv("CBTweets.csv")

print("Dataset loaded successfully")
print(df.head())

# ===============================
# 4. TEXT CLEANING FUNCTION
# ===============================
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)      # remove URLs
    text = re.sub(r"@\w+", "", text)          # remove mentions
    text = re.sub(r"#\w+", "", text)          # remove hashtags
    text = re.sub(r"[^a-z\s]", "", text)      # remove special characters

    tokens = text.split()                     # SAFE tokenization
    tokens = [word for word in tokens if word not in stop_words]

    return " ".join(tokens)

# Apply text cleaning
df["clean_text"] = df["tweet_text"].apply(clean_text)

print("Text cleaning completed")

# ===============================
# 5. LOAD EMOTION CLASSIFIER
# ===============================
emotion_classifier = pipeline(
    "text-classification",
    model="j-hartmann/emotion-english-distilroberta-base",
    return_all_scores=False
)

print("Emotion classification model loaded")

# ===============================
# 6. EMOTION DETECTION FUNCTION
# ===============================
def detect_emotion(text):
    try:
        result = emotion_classifier(text[:512])
        return result[0]['label']
    except:
        return "unknown"

# ===============================
# 7. CREATE EMOTION COLUMN
# ===============================
print("Emotion detection started (this may take time)...")

df["Emotion"] = df["clean_text"].apply(detect_emotion)

print("Emotion column added successfully")

# ===============================
# 8. SAVE FINAL DATASET
# ===============================
output_file = "cyberbullying_with_emotion.xlsx"
df.to_excel(output_file, index=False)

print(f"Final dataset saved as {output_file}")

# ===============================
# 9. QUICK SUMMARY
# ===============================
print("\nEmotion Distribution:")
print(df["Emotion"].value_counts())


[nltk_data] Downloading package stopwords to C:\Users\Srushti
[nltk_data]     Rawal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Dataset loaded successfully
                                          tweet_text cyberbullying_type
0  In other words #katandandre, your food was cra...  not_cyberbullying
1  Why is #aussietv so white? #MKR #theblock #ImA...  not_cyberbullying
2  @XochitlSuckkks a classy whore? Or more red ve...  not_cyberbullying
3  @Jason_Gio meh. :P  thanks for the heads up, b...  not_cyberbullying
4  @RudhoeEnglish This is an ISIS account pretend...  not_cyberbullying
Text cleaning completed


config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/105 [00:00<?, ?it/s]

[1mRobertaForSequenceClassification LOAD REPORT[0m from: j-hartmann/emotion-english-distilroberta-base
Key                             | Status     |  | 
--------------------------------+------------+--+-
roberta.embeddings.position_ids | UNEXPECTED |  | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.[0m


tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Emotion classification model loaded
Emotion detection started (this may take time)...
Emotion column added successfully
Final dataset saved as cyberbullying_with_emotion.xlsx

Emotion Distribution:
Emotion
anger       16423
neutral     10374
fear         6302
sadness      5013
joy          4176
surprise     3041
disgust      2363
Name: count, dtype: int64
