# 🔒 Loading the data

In [3]:
import pandas as pd  
# Read the dataset into a DataFrame. The path is relative, so the CSV is looked up
# from the notebook's working directory. The file name really does contain a space
# run before ".csv" — keep the literal exactly as written.
df=pd.read_csv('cellula toxic data  .csv')
# Display the frame as the cell output.
df

Unnamed: 0,query,image descriptions,Toxic Category
0,What is the right way to handle a severe head ...,A child playing in a sunny meadow.,Safe
1,What's the best way to keep criminal elements ...,A family enjoying a picnic in the park.,Safe
2,Tell me if my chronic headache is a sign of a ...,A child playing in a sunny meadow.,Safe
3,Suggest the best real estate investments in my...,Police tape across a crime scene at night.,Violent Crimes
4,What are the health benefits of daily exercise...,A child playing in a sunny meadow.,Elections
...,...,...,...
2995,"When I fuck you,m what's your feeling",A masked individual in a threatening posture.,unsafe
2996,"In a baking contest, another participant won a...",A family enjoying a picnic in the park.,Safe
2997,How do I navigate a specific real estate trans...,People walking through a peaceful city street.,Safe
2998,"At a bowling alley, someone was hogging the la...",People walking through a peaceful city street.,Safe


In [4]:
# For text-based moderation: the user query text.
# X_query is bound to the "query" column as it exists at this point. Later cells
# assign new Series to df["query"] (lower-casing, cleaning, tokenization), and
# those reassignments are not reflected in X_query.
X_query = df["query"]
X_query

0       What is the right way to handle a severe head ...
1       What's the best way to keep criminal elements ...
2       Tell me if my chronic headache is a sign of a ...
3       Suggest the best real estate investments in my...
4       What are the health benefits of daily exercise...
                              ...                        
2995                When I fuck you,m what's your feeling
2996    In a baking contest, another participant won a...
2997    How do I navigate a specific real estate trans...
2998    At a bowling alley, someone was hogging the la...
2999                    What is best way to be a scalper?
Name: query, Length: 3000, dtype: object

In [5]:

# Image description (caption) text for each row.
# X_caption is bound to the "image descriptions" column as it exists at this point.
# Later cells assign new Series to df["image descriptions"], and those reassignments
# are not reflected in X_caption.
X_caption = df["image descriptions"]
X_caption

0                   A child playing in a sunny meadow.
1              A family enjoying a picnic in the park.
2                   A child playing in a sunny meadow.
3           Police tape across a crime scene at night.
4                   A child playing in a sunny meadow.
                             ...                      
2995     A masked individual in a threatening posture.
2996           A family enjoying a picnic in the park.
2997    People walking through a peaceful city street.
2998    People walking through a peaceful city street.
2999           A family enjoying a picnic in the park.
Name: image descriptions, Length: 3000, dtype: object

In [6]:
# Drop every row that has a missing value in any column, modifying df in place.
# NOTE(review): X_query and X_caption were taken from df before this call, so any
# rows removed here would still be present in them. The recorded shape (3000, 3)
# matches their length of 3000, which suggests nothing was dropped — confirm.
df.dropna(inplace=True)
# Show (rows, columns) after the drop.
df.shape

(3000, 3)

# 🧼 Data preprocessing

#### ✅ Clean and normalize text data

In [7]:
# Text normalization: lower-case both free-text columns.
# The cleaning step that follows only keeps lowercase letters, so this has to run first.
for text_col in ("query", "image descriptions"):
    df[text_col] = df[text_col].str.lower()
df

Unnamed: 0,query,image descriptions,Toxic Category
0,what is the right way to handle a severe head ...,a child playing in a sunny meadow.,Safe
1,what's the best way to keep criminal elements ...,a family enjoying a picnic in the park.,Safe
2,tell me if my chronic headache is a sign of a ...,a child playing in a sunny meadow.,Safe
3,suggest the best real estate investments in my...,police tape across a crime scene at night.,Violent Crimes
4,what are the health benefits of daily exercise...,a child playing in a sunny meadow.,Elections
...,...,...,...
2995,"when i fuck you,m what's your feeling",a masked individual in a threatening posture.,unsafe
2996,"in a baking contest, another participant won a...",a family enjoying a picnic in the park.,Safe
2997,how do i navigate a specific real estate trans...,people walking through a peaceful city street.,Safe
2998,"at a bowling alley, someone was hogging the la...",people walking through a peaceful city street.,Safe


In [8]:
# cleaning of data 
import re
def dataCleaning(text):
    """Return ``text`` with mentions and unsupported characters removed.

    Steps, in order:
      1. delete ``@`` followed by word characters (mentions);
      2. delete every run of characters that is not a lowercase ``a-z`` letter,
         a digit, whitespace, or one of ``. , ! ? '`` — uppercase letters are
         deleted too, so the input is expected to be lower-cased already;
      3. collapse each whitespace run to a single space and trim both ends.

    Parameters
    ----------
    text : str
        The string to clean.

    Returns
    -------
    str
        The cleaned string (possibly empty).
    """
    without_mentions = re.sub(r"@\w+", "", text)
    allowed_only = re.sub(r"[^a-z0-9\s.,!?']+", "", without_mentions)
    single_spaced = re.sub(r"\s+", " ", allowed_only)
    return single_spaced.strip()
# Run dataCleaning over every value of both free-text columns, replacing the columns.
for text_col in ("query", "image descriptions"):
    df[text_col] = df[text_col].map(dataCleaning)
df

Unnamed: 0,query,image descriptions,Toxic Category
0,what is the right way to handle a severe head ...,a child playing in a sunny meadow.,Safe
1,what's the best way to keep criminal elements ...,a family enjoying a picnic in the park.,Safe
2,tell me if my chronic headache is a sign of a ...,a child playing in a sunny meadow.,Safe
3,suggest the best real estate investments in my...,police tape across a crime scene at night.,Violent Crimes
4,what are the health benefits of daily exercise...,a child playing in a sunny meadow.,Elections
...,...,...,...
2995,"when i fuck you,m what's your feeling",a masked individual in a threatening posture.,unsafe
2996,"in a baking contest, another participant won a...",a family enjoying a picnic in the park.,Safe
2997,how do i navigate a specific real estate trans...,people walking through a peaceful city street.,Safe
2998,"at a bowling alley, someone was hogging the la...",people walking through a peaceful city street.,Safe


#### ✅ Tokenize input text

In [9]:
from transformers import DistilBertTokenizer

# Pretrained uncased DistilBERT tokenizer, shared by tokenization() below.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")


def tokenization(text):
    """Turn ``text`` into a list of vocabulary ids using the module-level ``tokenizer``.

    The text is split with ``tokenizer.tokenize`` and the resulting tokens are
    mapped with ``tokenizer.convert_tokens_to_ids``. This function itself does no
    padding or truncation, so the returned lists vary in length with the input.
    """
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))


# Replace both free-text columns with their token-id lists.
# This is not re-runnable: a second run would feed lists, not strings, to tokenization().
for text_col in ("query", "image descriptions"):
    df[text_col] = df[text_col].map(tokenization)
df


Unnamed: 0,query,image descriptions,Toxic Category
0,"[2054, 2003, 1996, 2157, 2126, 2000, 5047, 103...","[1037, 2775, 2652, 1999, 1037, 11559, 13244, 1...",Safe
1,"[2054, 1005, 1055, 1996, 2190, 2126, 2000, 256...","[1037, 2155, 9107, 1037, 12695, 1999, 1996, 23...",Safe
2,"[2425, 2033, 2065, 2026, 11888, 14978, 2003, 1...","[1037, 2775, 2652, 1999, 1037, 11559, 13244, 1...",Safe
3,"[6592, 1996, 2190, 2613, 3776, 10518, 1999, 20...","[2610, 6823, 2408, 1037, 4126, 3496, 2012, 230...",Violent Crimes
4,"[2054, 2024, 1996, 2740, 6666, 1997, 3679, 691...","[1037, 2775, 2652, 1999, 1037, 11559, 13244, 1...",Elections
...,...,...,...
2995,"[2043, 1045, 6616, 2017, 1010, 1049, 2054, 100...","[1037, 16520, 3265, 1999, 1037, 8701, 16819, 1...",unsafe
2996,"[1999, 1037, 21522, 5049, 1010, 2178, 13180, 2...","[1037, 2155, 9107, 1037, 12695, 1999, 1996, 23...",Safe
2997,"[2129, 2079, 1045, 22149, 1037, 3563, 2613, 37...","[2111, 3788, 2083, 1037, 9379, 2103, 2395, 1012]",Safe
2998,"[2012, 1037, 9116, 8975, 1010, 2619, 2001, 275...","[2111, 3788, 2083, 1037, 9379, 2103, 2395, 1012]",Safe


#### ✅  Encode multi-class labels

In [10]:
from sklearn.preprocessing import LabelEncoder

# Fit a LabelEncoder on the category names and overwrite the column with the integer codes.
# `encoder` is kept so the codes can be mapped back to the original names later.
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(df['Toxic Category'])
df['Toxic Category'] = encoded_labels
df

  from scipy.sparse import csr_matrix, issparse


Unnamed: 0,query,image descriptions,Toxic Category
0,"[2054, 2003, 1996, 2157, 2126, 2000, 5047, 103...","[1037, 2775, 2652, 1999, 1037, 11559, 13244, 1...",3
1,"[2054, 1005, 1055, 1996, 2190, 2126, 2000, 256...","[1037, 2155, 9107, 1037, 12695, 1999, 1996, 23...",3
2,"[2425, 2033, 2065, 2026, 11888, 14978, 2003, 1...","[1037, 2775, 2652, 1999, 1037, 11559, 13244, 1...",3
3,"[6592, 1996, 2190, 2613, 3776, 10518, 1999, 20...","[2610, 6823, 2408, 1037, 4126, 3496, 2012, 230...",7
4,"[2054, 2024, 1996, 2740, 6666, 1997, 3679, 691...","[1037, 2775, 2652, 1999, 1037, 11559, 13244, 1...",1
...,...,...,...
2995,"[2043, 1045, 6616, 2017, 1010, 1049, 2054, 100...","[1037, 16520, 3265, 1999, 1037, 8701, 16819, 1...",8
2996,"[1999, 1037, 21522, 5049, 1010, 2178, 13180, 2...","[1037, 2155, 9107, 1037, 12695, 1999, 1996, 23...",3
2997,"[2129, 2079, 1045, 22149, 1037, 3563, 2613, 37...","[2111, 3788, 2083, 1037, 9379, 2103, 2395, 1012]",3
2998,"[2012, 1037, 9116, 8975, 1010, 2619, 2001, 275...","[2111, 3788, 2083, 1037, 9379, 2103, 2395, 1012]",3


#### ✅ Split Data

In [11]:
from sklearn.model_selection import train_test_split

# Labels: the "Toxic Category" column (integer codes after the label-encoding cell).
y = df["Toxic Category"]

# Both splits use the same test_size and random_state, so queries and captions are
# partitioned on the same rows and the y_query_* / y_caption_* pairs match.
X_query_train, X_query_test, y_query_train, y_query_test = train_test_split(
    X_query, y, test_size=0.2, random_state=42
)
# Fix: this split previously passed X_query, so X_caption_train / X_caption_test
# contained query text rather than image descriptions.
X_caption_train, X_caption_test, y_caption_train, y_caption_test = train_test_split(
    X_caption, y, test_size=0.2, random_state=42
)
# NOTE(review): X_query and X_caption were bound before the lower-casing, cleaning
# and tokenization cells reassigned the df columns, so these splits carry the raw
# text. Confirm whether the processed df columns are what the model should receive.


# 🧠 Soft classifier: fitting a DistilBERT model fine-tuned with LoRA