# Exploratory Datat Analysis

In [1]:
import pandas as pd
import seaborn as sns

# Load data
data = pd.read_csv('../Data/train.csv')
data.head()


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [2]:
print(f"total samples: {data.shape[0]}")
print("*"*50)
print(f"total features: {data.shape[1]}")

total samples: 159571
**************************************************
total features: 8


In [3]:
print(data.info())
print(data.describe())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB
None
               toxic   severe_toxic        obscene         threat  \
count  159571.000000  159571.000000  159571.000000  159571.000000   
mean        0.095844       0.009996       0.052948       0.002996   
std         0.294379       0.099477       0.223931       0.054650   
min         0.000000       0.000000       0.000000       0.000000   
25%         0.000000       0.000000       0.000000

In [4]:
from sklearn.model_selection import train_test_split

SEED = 42
LLM_SAMPLE_SIZE = 200

# STEP 1: LOAD & CONSOLIDATE LABELS
df = data

toxicity_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Create a single binary target: If any column is 1, the text is Toxic (1)
df['final_label'] = df[toxicity_cols].max(axis=1)

print(f"Data loaded. Total rows: {len(df)}")
print(f"Toxic samples: {df['final_label'].sum()} | Non-Toxic samples: {len(df) - df['final_label'].sum()}")


# STEP 2: SPLIT & CREATE GOLDEN SAMPLE (80/20)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=SEED)

# Now, extract the small "Golden Sample" specifically for the LLM
# We use .copy() to avoid SettingWithCopy warnings
llm_sample = test_df.sample(n=LLM_SAMPLE_SIZE, random_state=SEED).copy()

print(f"LLM Sample of {len(llm_sample)} rows (from Test set).")

# STEP 3: MINIMAL CLEANING
llm_sample['cleaned_text'] = llm_sample['comment_text'].str.replace('\n', ' ', regex=False)

# STEP 4: PROMPT ENGINEERING
prompt_template = """You are the best content moderator in the history.
                     If you fail in your job, your company will shutdown.
                     Classify the following text as 'Toxic' or 'Non-Toxic'.
                     Reply with ONLY one word.

                     Text: "{}"
                     Answer:
"""

# Apply the template to every row
llm_sample['llm_prompt'] = llm_sample['cleaned_text'].apply(lambda x: prompt_template.format(x))


# STEP 5: SAVE ARTIFACTS
output_cols = ['id', 'final_label', 'llm_prompt']
llm_sample[output_cols].to_csv('../Data/llm/llm_baseline_inputs.csv', index=False)

# OPTIONAL: Save the indices/IDs to a separate file so you can filter
# the same rows when you evaluate RoBERTa later.
llm_sample['id'].to_csv('../Data/RoBERTa/golden_sample_ids.csv', index=False)

print("\nProcessing complete!")
print("1. 'llm_baseline_inputs.csv' -> Feed this to your LLM.")
print("2. 'golden_sample_ids.csv' -> Use this to filter RoBERTa predictions later.")

Data loaded. Total rows: 159571
Toxic samples: 16225 | Non-Toxic samples: 143346
LLM Sample of 200 rows (from Test set).

Processing complete!
1. 'llm_baseline_inputs.csv' -> Feed this to your LLM.
2. 'golden_sample_ids.csv' -> Use this to filter RoBERTa predictions later.


# Step 3

In [5]:
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# CONFIGURATION
# Recommended: 'TinyLlama/TinyLlama-1.1B-Chat-v1.0' for speed/testing
# Recommended: 'mistralai/Mistral-7B-Instruct-v0.2' for better quality (requires GPU + quantization)
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
INPUT_FILE = '../Data/llm/llm_baseline_inputs.csv'
OUTPUT_FILE = '../Data/llm/llm_baseline_results.csv'

# 1. LOAD MODEL & TOKENIZER
print(f"Loading model: {MODEL_ID}...")

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Load model with 4-bit quantization if using a large model on Colab (saves memory)
# quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
    # quantization_config=quantization_config # Uncomment if using Mistral-7B
)

# Create a text generation pipeline
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=10,   # We only expect "Toxic" or "Non-Toxic"
    temperature=0.1      # Low temp for deterministic results
)

# ==========================================
# 2. LOAD DATA
# ==========================================
df = pd.read_csv(INPUT_FILE)
print(f"Loaded {len(df)} samples for inference.")

# ==========================================
# 3. INFERENCE LOOP
# ==========================================
predictions = []
latencies = []
raw_outputs = []

print("Starting inference...")

for index, row in df.iterrows():
    prompt = row['llm_prompt']

    # Measure Latency: Start Timer
    start_time = time.time()

    # Run Inference
    # return_full_text=False ensures we only get the new generated part
    result = pipe(prompt, return_full_text=False)[0]['generated_text']

    # Measure Latency: Stop Timer
    end_time = time.time()
    latency_ms = (end_time - start_time) * 1000

    # Store raw results
    raw_outputs.append(result)
    latencies.append(latency_ms)

    # Simple Parsing Logic (Convert text to 0/1)
    clean_result = result.strip().lower()
    if "non-toxic" in clean_result:
        pred = 0
    elif "toxic" in clean_result:
        pred = 1
    else:
        pred = 0 # Default fallback or handle as "Unknown" (-1)

    predictions.append(pred)

    if index % 10 == 0:
        print(f"Processed {index}/{len(df)} | Last Latency: {latency_ms:.2f}ms")

# ==========================================
# 4. SAVE RESULTS
# ==========================================
df['llm_raw_output'] = raw_outputs
df['llm_pred_label'] = predictions
df['latency_ms'] = latencies

df.to_csv(OUTPUT_FILE, index=False)
print(f"Done! Results saved to {OUTPUT_FILE}")

  from .autonotebook import tqdm as notebook_tqdm


Loading model: TinyLlama/TinyLlama-1.1B-Chat-v1.0...


`torch_dtype` is deprecated! Use `dtype` instead!
Some parameters are on the meta device because they were offloaded to the disk.
Device set to use mps


Loaded 200 samples for inference.
Starting inference...
Processed 0/200 | Last Latency: 37955.29ms


RuntimeError: probability tensor contains either `inf`, `nan` or element < 0