# **Cyberbullying Detection Using Explainable AI and Machine Learning**

***Importing Libraries***

In [None]:
!pip install pandas scikit-learn nltk lime transformers torch joblib

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/275.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m266.2/275.7 kB[0m [31m9.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=3fe05bb07005fbcf189f12de1ead631ab3b5eba98b2d0601b215a05daa47ace2
  Stored in directory: /root/.cache/pip/wheels/e7/5d/0e/4b4fff9a47468fed5633211fb3b76d1db43fe806a17fb7486a
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [None]:
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer
from transformers import pipeline as hf_pipeline
from sklearn.preprocessing import LabelEncoder
import numpy as np

***Load the Dataset***

In [None]:
import os

import pandas as pd

DATA_PATH = "cyberbullying_tweets.csv"

# Only open the interactive upload widget when the CSV is not already in the
# working directory, so a Restart & Run All does not stall on the prompt once
# the data is present.
if os.path.exists(DATA_PATH):
    uploaded = {}
else:
    from google.colab import files

    # files.upload() returns a dict keyed by the uploaded file names.
    uploaded = files.upload()
    # If exactly one file was uploaded under a different name, read that file
    # instead of failing later on a missing DATA_PATH.
    if DATA_PATH not in uploaded and len(uploaded) == 1:
        DATA_PATH = next(iter(uploaded))


Saving cyberbullying_tweets.csv to cyberbullying_tweets.csv


***Setup***

In [None]:
# Tunable experiment settings used by the cells below.
TEST_SIZE = 0.2
RANDOM_STATE = 42
TFIDF_MAX_FEAT = 10000

# Fetch the NLTK stop-word corpus, then keep the English list as a set.
nltk.download('stopwords')
STOPWORDS = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


***Preprocessing Function***

In [None]:
def preprocess_text(text):
    """Normalise one raw text value for the classifier.

    Values that ``pd.isna`` reports as missing become an empty string.
    Anything else is converted to ``str``, lower-cased, stripped of every
    character listed in ``string.punctuation`` (ASCII punctuation only) and
    has runs of whitespace collapsed to single spaces.
    """
    if pd.isna(text):
        return ""
    lowered = str(text).lower()
    # A translation table that maps each punctuation character to "deleted".
    without_punct = lowered.translate(str.maketrans("", "", string.punctuation))
    return " ".join(without_punct.split())


***Load and Preprocess Dataset***

In [None]:
df = pd.read_csv(DATA_PATH)
# Fill missing tweets BEFORE casting to str: astype(str) on its own turns a
# missing value into the literal text "nan", which preprocess_text's
# pd.isna() guard can no longer recognise as missing.
df['tweet_text'] = df['tweet_text'].fillna('').astype(str)
df['clean_text'] = df['tweet_text'].apply(preprocess_text)

print(f"Loaded dataset: {df.shape[0]} rows")
df[['tweet_text','cyberbullying_type']].head()


Loaded dataset: 47692 rows


Unnamed: 0,tweet_text,cyberbullying_type
0,"In other words #katandandre, your food was cra...",not_cyberbullying
1,Why is #aussietv so white? #MKR #theblock #ImA...,not_cyberbullying
2,@XochitlSuckkks a classy whore? Or more red ve...,not_cyberbullying
3,"@Jason_Gio meh. :P thanks for the heads up, b...",not_cyberbullying
4,@RudhoeEnglish This is an ISIS account pretend...,not_cyberbullying


***Train/Test Split***

In [None]:
# Features are the cleaned tweets; the target is the bullying category.
X = df['clean_text']
y = df['cyberbullying_type']

try:
    # Stratify so every category keeps the same share in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
except ValueError as err:
    # Stratification is rejected with ValueError (e.g. a class with too few
    # rows). Fall back to a plain split, but say so instead of hiding it;
    # any other error is a real problem and is allowed to surface.
    print(f"Stratified split failed ({err}); using a non-stratified split.")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
    )

print(f"Training samples: {len(X_train)}, Test samples: {len(X_test)}")


Training samples: 38153, Test samples: 9539


***Build and Train Cyberbullying Classifier***

In [None]:
# Word uni- and bi-gram TF-IDF features, capped at TFIDF_MAX_FEAT terms.
tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEAT, ngram_range=(1,2), stop_words='english')
# Seed the SVM so repeated runs of the notebook train the same model.
svc = LinearSVC(max_iter=20000, class_weight='balanced', random_state=RANDOM_STATE)
# LinearSVC has no predict_proba; the calibration wrapper adds one, which
# the LIME explainer and predict() rely on.
calibrated = CalibratedClassifierCV(svc)

pipeline = make_pipeline(tfidf, calibrated)

print("Training cyberbullying classifier...")
pipeline.fit(X_train, y_train)
print("Training completed.")


Training cyberbullying classifier...
Training completed.


***Evaluate Classifier***

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Score the held-out split.
y_pred = pipeline.predict(X_test)
acc = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print(f"Test Accuracy: {acc:.4f}\n")
print("Classification Report:")
print(report_text)

# Confusion matrix laid out in the classifier's own class order.
class_order = pipeline.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_order)
cm_df = pd.DataFrame(cm, index=class_order, columns=class_order)
print("Confusion matrix (rows=true, cols=pred):")
cm_df


Test Accuracy: 0.8260

Classification Report:
                     precision    recall  f1-score   support

                age       0.97      0.97      0.97      1598
          ethnicity       0.97      0.97      0.97      1592
             gender       0.88      0.86      0.87      1595
  not_cyberbullying       0.59      0.53      0.56      1589
other_cyberbullying       0.59      0.68      0.63      1565
           religion       0.96      0.95      0.95      1600

           accuracy                           0.83      9539
          macro avg       0.83      0.83      0.83      9539
       weighted avg       0.83      0.83      0.83      9539

Confusion matrix (rows=true, cols=pred):


Unnamed: 0,age,ethnicity,gender,not_cyberbullying,other_cyberbullying,religion
age,1548,5,7,24,13,1
ethnicity,1,1543,6,9,29,4
gender,1,8,1368,106,108,4
not_cyberbullying,34,15,82,841,561,56
other_cyberbullying,9,19,78,396,1060,3
religion,3,0,6,52,20,1519


***Sentiment and Emotion Pipeline***

In [None]:

# Pin the sentiment checkpoint explicitly (this is the model the unnamed
# "sentiment-analysis" default resolved to) so results do not change if the
# library's default changes.
sentiment_model_name = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
sentiment_pipe = hf_pipeline("sentiment-analysis", model=sentiment_model_name)

emotion_model_name = "j-hartmann/emotion-english-distilroberta-base"
try:
    # top_k=None asks for the score of every label (replaces the legacy
    # return_all_scores=True flag).
    emotion_pipe = hf_pipeline("text-classification", model=emotion_model_name, top_k=None)
    print(f"Loaded emotion model: {emotion_model_name}")
except Exception as err:
    # Do not substitute the default text-classification model here: it is a
    # sentiment model, so its labels would be reported as "emotions".
    # With emotion_pipe unset, get_emotion_ml() reports UNKNOWN instead.
    emotion_pipe = None
    print(f"Could not load emotion model {emotion_model_name} ({err}); emotion will be reported as UNKNOWN.")


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cuda:0


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


Loaded emotion model: j-hartmann/emotion-english-distilroberta-base




***LIME***

In [None]:
# Seed the explainer so its random text perturbations, and therefore the
# reported words, are repeatable across runs.
explainer = LimeTextExplainer(class_names=list(pipeline.classes_), random_state=RANDOM_STATE)

def get_top_lime_word(text, num_features=1):
    """Return the word that most supports the classifier's predicted class.

    Args:
        text: The text to explain (passed to LIME and to
            ``pipeline.predict_proba``).
        num_features: How many words LIME should include in the explanation.

    Returns:
        ``(word, explanation)``. ``word`` is the first whitespace-separated
        token of the chosen LIME feature, or ``None`` when no feature is
        available. ``explanation`` is the LIME explanation object, or
        ``None`` when LIME itself failed.
    """
    try:
        # top_labels=1 makes LIME explain the highest-probability class.
        # Without it LIME explains label index 1, whatever was predicted.
        exp = explainer.explain_instance(
            text, pipeline.predict_proba, num_features=num_features, top_labels=1
        )
        weighted_words = exp.as_list(label=exp.top_labels[0])
    except Exception as err:
        # Report the failure rather than inventing a "reason" from the first
        # token of the text, which has nothing to do with the model.
        print(f"LIME explanation failed ({err}); no explanatory word available.")
        return None, None

    # Prefer a word that pushes TOWARDS the predicted class (positive weight);
    # if none does, fall back to the first listed feature.
    supporting = [word for word, weight in weighted_words if weight > 0]
    if supporting:
        top_feature = supporting[0]
    elif weighted_words:
        top_feature = weighted_words[0][0]
    else:
        return None, exp

    tokens = str(top_feature).split()
    return (tokens[0] if tokens else None), exp


***Explainable AI (LIME) setup***

In [None]:
from lime.lime_text import LimeTextExplainer

# Integer-encoded copy of the labels (kept for later use).
le = LabelEncoder()
y_encoded = le.fit_transform(df['cyberbullying_type'])

# Take the class names from the trained classifier so they line up with the
# columns of pipeline.predict_proba, and seed the explainer so its random
# perturbations give repeatable explanations.
explainer = LimeTextExplainer(class_names=list(pipeline.classes_), random_state=RANDOM_STATE)


***Helper Functions: Sentiment & Emotion***

In [None]:
def get_sentiment_ml(text):
    """Run the sentiment pipeline on the first 512 characters of ``text``.

    Returns:
        ``(label, score)``. Labels starting with "pos"/"neg" (any case) are
        normalised to "POSITIVE"/"NEGATIVE"; any other label is upper-cased.
        Any failure yields ``("UNKNOWN", 0.0)``.
    """
    try:
        result = sentiment_pipe(text[:512])
        # The pipeline may hand back a list of results; use the first one.
        first = result[0] if isinstance(result, list) else result
        raw_label = first['label']
        confidence = float(first['score'])

        lowered = raw_label.lower()
        if lowered.startswith('pos'):
            return "POSITIVE", confidence
        if lowered.startswith('neg'):
            return "NEGATIVE", confidence
        return raw_label.upper(), confidence
    except Exception:
        return "UNKNOWN", 0.0

def get_emotion_ml(text):
    """Run the emotion pipeline on the first 512 characters of ``text``.

    Accepts either a nested result (a list whose first item is a list of
    ``{label, score}`` dicts) or a flat list of such dicts, and picks the
    entry with the highest score.

    Returns:
        ``(LABEL, score)`` with the label upper-cased, or
        ``("UNKNOWN", 0.0)`` for any other result shape or any failure.
    """
    try:
        raw = emotion_pipe(text[:512])
        if not isinstance(raw, list):
            return "UNKNOWN", 0.0

        head = raw[0]  # an empty list raises here and is handled below
        if isinstance(head, list):
            candidates = head
        elif isinstance(head, dict):
            candidates = raw
        else:
            return "UNKNOWN", 0.0

        top = max(candidates, key=lambda item: item.get('score', 0.0))
        return str(top.get('label')).upper(), float(top.get('score', 0.0))
    except Exception:
        return "UNKNOWN", 0.0


***Main Function***

In [None]:
def predict(text, sentiment_threshold=0.8):
    """Classify one text, print a short report and return the details.

    Steps:
      1. Preprocess the text and get class probabilities from ``pipeline``.
      2. Get sentiment and emotion for the raw text.
      3. If sentiment is POSITIVE with a score of at least
         ``sentiment_threshold``, override the classifier and report
         "not_cyberbullying".
      4. Otherwise, if the predicted class is a bullying category, ask LIME
         for the most influential word.

    Args:
        text: The raw input text.
        sentiment_threshold: Minimum POSITIVE sentiment score that overrides
            the classifier's verdict.

    Returns:
        dict with keys ``text``, ``prediction``, ``probabilities``
        (class name -> probability from the classifier, not adjusted by the
        sentiment override), ``main_word``, ``sentiment`` and ``emotion``.
    """
    clean = preprocess_text(text)
    classes = list(pipeline.classes_)

    # One inference pass: the predicted class is the highest-probability
    # class, so derive it from predict_proba instead of also calling predict.
    probs = pipeline.predict_proba([clean])[0]
    pred = classes[int(probs.argmax())]

    sentiment_label, sentiment_score = get_sentiment_ml(text)
    emotion_label, emotion_score = get_emotion_ml(text)

    # Class names that mean "no bullying": anything containing "not", plus a
    # few explicit spellings.
    benign_names = ('none', 'no', 'not_cyberbullying', 'not cyberbullying')
    pred_lower = pred.lower()
    pred_is_benign = 'not' in pred_lower or pred_lower.strip() in benign_names

    top_word = None
    if sentiment_label == "POSITIVE" and sentiment_score >= sentiment_threshold:
        # Confidently positive text overrides the classifier.
        detected_flag = "Not Detected ❌"
        pred = "not_cyberbullying"
    elif pred_is_benign:
        detected_flag = "Not Detected ❌"
    else:
        detected_flag = "Cyberbullying Detected ✅"
        # Explain the same preprocessed text that was classified, so the
        # explanation refers to the input the model actually saw.
        top_word, _ = get_top_lime_word(clean, num_features=3)
        if top_word is None:
            top_word = "(none)"

    print(f"\nText: {text}")
    print(f"{detected_flag} (Category: {pred})")
    if top_word:
        print(f"MAIN REASON FOR CYBERBULLYING DETECTION: WORD → {str(top_word).upper()}")
    print(f"Sentiment: {sentiment_label} ({sentiment_score:.2f})")
    print(f"Emotion: {emotion_label} ({emotion_score:.2f})")

    return {
        "text": text,
        "prediction": pred,
        "probabilities": dict(zip(classes, probs)),
        "main_word": top_word,
        "sentiment": (sentiment_label, sentiment_score),
        "emotion": (emotion_label, emotion_score)
    }


***Sample Input Test***

In [None]:
# Hand-written probes covering threats, praise and several bullying categories.
samples = [
    "I will find you and hurt you!",
    "Have a wonderful day, you are great!",
    "She is from a different country, so weird",
    "Men like you are useless.",
    "Shut up, you’re an idiot!",
    "you’re brainwashed by your religion",
    "Great Women",
    "The way her kind of people speak is so stupid",
]

# predict() prints its own report for each sample; the return value is unused.
for sample_text in samples:
    predict(sample_text)


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset



Text: I will find you and hurt you!
Cyberbullying Detected ✅ (Category: other_cyberbullying)
MAIN REASON FOR CYBERBULLYING DETECTION: WORD → HURT
Sentiment: NEGATIVE (0.99)
Emotion: ANGER (0.80)

Text: Have a wonderful day, you are great!
Not Detected ❌ (Category: not_cyberbullying)
Sentiment: POSITIVE (1.00)
Emotion: JOY (0.94)

Text: She is from a different country, so weird
Cyberbullying Detected ✅ (Category: other_cyberbullying)
MAIN REASON FOR CYBERBULLYING DETECTION: WORD → DIFFERENT
Sentiment: NEGATIVE (0.99)
Emotion: DISGUST (0.75)

Text: Men like you are useless.
Not Detected ❌ (Category: not_cyberbullying)
Sentiment: NEGATIVE (1.00)
Emotion: DISGUST (0.57)

Text: Shut up, you’re an idiot!
Cyberbullying Detected ✅ (Category: other_cyberbullying)
MAIN REASON FOR CYBERBULLYING DETECTION: WORD → IDIOT
Sentiment: NEGATIVE (1.00)
Emotion: ANGER (0.96)

Text: you’re brainwashed by your religion
Cyberbullying Detected ✅ (Category: religion)
MAIN REASON FOR CYBERBULLYING DETECTION: W

***CODE TO TEST FROM USER INPUT***

In [None]:
# Interactive loop: classify whatever the user types until they enter "exit".
while True:
    try:
        user_input = input("\nEnter text to check for cyberbullying (or 'exit' to quit): ")
    except (EOFError, KeyboardInterrupt):
        # Closed input stream or an interrupted cell: leave the loop quietly
        # instead of ending the notebook run with a traceback.
        break

    # Ignore surrounding whitespace so " exit " still quits.
    user_input = user_input.strip()
    if user_input.lower() == "exit":
        break
    if not user_input:
        # Nothing typed: ask again rather than classifying an empty string.
        continue
    predict(user_input)



Enter text to check for cyberbullying (or 'exit' to quit): exit
