# Exploratory Datat Analysis

In [1]:
import pandas as pd
import seaborn as sns

# Load data
data = pd.read_csv('../Data/train.csv')
data.head()


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [2]:
print(f"total samples: {data.shape[0]}")
print("*"*50)
print(f"total features: {data.shape[1]}")

total samples: 159571
**************************************************
total features: 8


In [3]:
print(data.info())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB
None
               toxic   severe_toxic        obscene         threat  \
count  159571.000000  159571.000000  159571.000000  159571.000000   
mean        0.095844       0.009996       0.052948       0.002996   
std         0.294379       0.099477       0.223931       0.054650   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000

In [4]:
from sklearn.model_selection import train_test_split

SEED = 42
LLM_SAMPLE_SIZE = 200

# STEP 1: LOAD & CONSOLIDATE LABELS
df = data

toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Create a single binary target
df['final_label'] = df[toxicity_cols].max(axis=1)

print(f"Data loaded. Total rows: {len(df)}")
print(f"Toxic samples: {df['final_label'].sum()} | Non-Toxic samples: {len(df) - df['final_label'].sum()}")


# STEP 2: SPLIT & CREATE GOLDEN SAMPLE (80/20)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED)

# Now, extract the small "Golden Sample" specifically for the LLM
# We use .copy() to avoid SettingWithCopy warnings
llm_sample = test_df.sample(n=LLM_SAMPLE_SIZE, random_state=SEED).copy()

print(f"LLM Sample of {len(llm_sample)} rows (from Test set).")

# STEP 3: MINIMAL CLEANING
llm_sample['cleaned_text'] = llm_sample['comment_text'].str.replace('\n', ' ', regex=False)

# STEP 4: PROMPT ENGINEERING
prompt_template = """You are the best content moderator in the history.
                     If you fail in your job, your company will shutdown.
                     Classify the following text as 'Toxic' or 'Non-Toxic'.
                     Reply with ONLY one word.

                     Text: "{}"
                     Answer:
"""

# Apply the template to every row
llm_sample['llm_prompt'] = llm_sample['cleaned_text'].apply(lambda x: prompt_template.format(x))


# STEP 5: SAVE ARTIFACTS
output_cols = ['id', 'final_label', 'llm_prompt']
llm_sample[output_cols].to_csv('../Data/llm/llm_baseline_inputs.csv', index=False)
llm_sample['id'].to_csv('../Data/RoBERTa/golden_sample_ids.csv', index=False)

print("\nProcessing complete!")
print("1. 'llm_baseline_inputs.csv' -> Feed this to your LLM.")
print("2. 'golden_sample_ids.csv' -> Use this to filter RoBERTa predictions later.")

Data loaded. Total rows: 159571
Toxic samples: 16225 | Non-Toxic samples: 143346
LLM Sample of 200 rows (from Test set).

Processing complete!
1. 'llm_baseline_inputs.csv' -> Feed this to your LLM.
2. 'golden_sample_ids.csv' -> Use this to filter RoBERTa predictions later.
