##

## DATA PREPARATION AND MODEL TRAINING

In [37]:
import json
import pandas as pd

# Load the JSON file
with open("dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Check sample structure
print(list(data.keys())[:5])  # Print first 5 keys (post IDs)
print(data[list(data.keys())[0]])  # Print first post details


['1179055004553900032_twitter', '1179063826874032128_twitter', '1178793830532956161_twitter', '1179088797964763136_twitter', '1179085312976445440_twitter']
{'post_id': '1179055004553900032_twitter', 'annotators': [{'label': 'normal', 'annotator_id': 1, 'target': ['None']}, {'label': 'normal', 'annotator_id': 2, 'target': ['None']}, {'label': 'normal', 'annotator_id': 3, 'target': ['None']}], 'rationales': [], 'post_tokens': ['i', 'dont', 'think', 'im', 'getting', 'my', 'baby', 'them', 'white', '9', 'he', 'has', 'two', 'white', 'j', 'and', 'nikes', 'not', 'even', 'touched']}


In [1]:
import json
import pandas as pd
from collections import Counter

# Load the JSON file
with open("dataset.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Extract text and majority label
texts, labels = [], []

for post_id, post_data in data.items():
    text = " ".join(post_data["post_tokens"])  # Convert tokens to sentence
    
    # Get all annotator labels
    all_labels = [ann["label"] for ann in post_data["annotators"]]
    
    # Majority vote for final label
    final_label = Counter(all_labels).most_common(1)[0][0]

    texts.append(text)
    labels.append(final_label)

# Convert to Pandas DataFrame
df = pd.DataFrame({"text": texts, "label": labels})

# Save as CSV
df.to_csv("hatexplain_processed.csv", index=False)

# Display first few rows
print(df.head())


                                                text       label
0  i dont think im getting my baby them white 9 h...      normal
1  we cannot continue calling ourselves feminists...      normal
2                      nawt yall niggers ignoring me      normal
3  <user> i am bit confused coz chinese ppl can n...  hatespeech
4  this bitch in whataburger eating a burger with...  hatespeech


In [2]:
print(df["label"].value_counts())


label
normal        8153
hatespeech    6234
offensive     5761
Name: count, dtype: int64


## OVERSAMPLING THE MINOR DATATSET

In [3]:
from imblearn.over_sampling import RandomOverSampler
import pandas as pd

# Load dataset
df = pd.read_csv("hatexplain_processed.csv")

# Define oversampling strategy
ros = RandomOverSampler(random_state=42)

# Apply oversampling
X_resampled, y_resampled = ros.fit_resample(df[["text"]], df["label"])

# Convert back to DataFrame
df_balanced = pd.DataFrame({"text": X_resampled["text"], "label": y_resampled})

# Save the balanced dataset
df_balanced.to_csv("hatexplain_balanced.csv", index=False)

# Check new class distribution
print(df_balanced["label"].value_counts())


label
normal        8153
hatespeech    8153
offensive     8153
Name: count, dtype: int64


In [2]:
import torch
from transformers import DistilBertForSequenceClassification
from datasets import Dataset
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


## ENCODING

In [20]:
# Load dataset
df = pd.read_csv("hatexplain_balanced.csv")

# Check unique labels
print("Unique labels before conversion:", df["label"].unique())

# Convert labels from text to numeric values
label_mapping = {"normal": 0, "hatespeech": 1, "offensive": 2}
df["label"] = df["label"].map(label_mapping)

# Ensure labels are converted correctly
print("Unique labels after conversion:", df["label"].unique())

# Convert to Hugging Face Dataset
dataset = Dataset.from_pandas(df)


Unique labels before conversion: ['normal' 'hatespeech' 'offensive']
Unique labels after conversion: [0 1 2]


In [15]:
# Load tokenizer
from transformers import AutoTokenizer

# Load DistilBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

print("✅ Tokenizer loaded successfully!")



✅ Tokenizer loaded successfully!


In [16]:
def tokenize(batch):
    encoding = tokenizer(batch["text"], padding="max_length", truncation=True)
    encoding["labels"] = batch["label"]  # Ensure labels are included
    return encoding

# Apply tokenization
dataset = dataset.map(tokenize, batched=True)


Map: 100%|██████████| 24459/24459 [00:08<00:00, 2961.19 examples/s]


In [17]:
train_test = dataset.train_test_split(test_size=0.2)
train_dataset = train_test["train"]
eval_dataset = train_test["test"]


# MODEL : tinybert

In [37]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "huawei-noah/TinyBERT_General_6L_768D"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)  # 3 classes: normal, hatespeech, offensive



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_6L_768D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
from transformers import EarlyStoppingCallback
from transformers import TrainingArguments
from transformers import Trainer, EarlyStoppingCallback

# Modify Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,  # Set a higher value; early stopping will stop it automatically
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,  # Ensures we save the best model
    metric_for_best_model="eval_loss",  # Stop when validation loss stops improving
    greater_is_better=False,  # Since lower loss is better
)

# Define Trainer with Early Stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],  # Stop after 2 epochs of no improvement
)




In [31]:
print("🚀 Starting training...")
trainer.train()
print("✅ Training complete.")
model.save_pretrained("tinybert")
tokenizer.save_pretrained("tinybert_hate_speech")
print("✅ Model and tokenizer saved successfully.")


🚀 Starting training...


Epoch,Training Loss,Validation Loss
1,0.7466,0.776482
2,0.7767,0.733485
3,0.5652,0.752519
4,0.4894,0.786026


Error while downloading from https://cdn-lfs.hf.co/huawei-noah/TinyBERT_General_6L_768D/93343d2b799d2f2d29ef6c2c7ce01906d4fc47f58dbb908048c58c5d76a018b3?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27model.safetensors%3B+filename%3D%22model.safetensors%22%3B&Expires=1742919950&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0MjkxOTk1MH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5oZi5jby9odWF3ZWktbm9haC9UaW55QkVSVF9HZW5lcmFsXzZMXzc2OEQvOTMzNDNkMmI3OTlkMmYyZDI5ZWY2YzJjN2NlMDE5MDZkNGZjNDdmNThkYmI5MDgwNDhjNThjNWQ3NmEwMThiMz9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=VAB9MpbDYdqpyq9MdHxEoqnyDe5P7A4%7EDpFNOACmE3JJI-9rElzIPQZhqoYS1vzrZyFJ5S6ZCxGTltokG25%7EylOuvO1f-7qPfY4HBZCGiY6IZcVvRQkbF7-QN2hxQmgjr-EtqaHfcL2Xb4zsnKQiS%7EuqxrD0NY5W--ORaPS4LR0F7S5QAgE6RW2cRXveFFr6PQ%7ERgwBWqYa47Afut6l2Px1CfaDfvfGWspmJR8VbcLZ58MoTaIh1Hl3C0XKwadcKzseLFABdfxbDLYwbbYGsuaYd8xC2jonTCSWS6VroAEAfkLj9h%7E88v5CLI3ssiOjAiXn2d5R6Ck5FTD%7EoBcWDcw__&Key-Pair

✅ Training complete.
✅ Model and tokenizer saved successfully.


In [41]:
from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
from safetensors.torch import load_file
import torch

# ✅ Define Paths
model_path = "tinybert"
tokenizer_path = "tinybert_hate_speech"
model_file = f"{model_path}/model.safetensors"

# ✅ Load Config
config = AutoConfig.from_pretrained(model_path)  

# ✅ Load Model Architecture
model = AutoModelForSequenceClassification.from_config(config)

# ✅ Load Weights from Safetensors
state_dict = load_file(model_file)
model.load_state_dict(state_dict)
model.eval()  # Set to evaluation mode

# ✅ Load Tokenizer from Different Directory
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

print("✅ Model and tokenizer loaded successfully!")


✅ Model and tokenizer loaded successfully!


In [44]:
def predict(text, model, tokenizer):
    # Tokenize input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

    # Move input tensors to model's device (CPU/GPU)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    # Get model predictions
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Convert logits to probabilities (softmax)
    probs = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Get predicted label
    predicted_class = torch.argmax(probs, dim=-1).item()

    # Label mapping (modify if your dataset uses different labels)
    label_map = {0: "normal", 1: "hatespeech", 2: "offensive"}
    return label_map[predicted_class], probs.tolist()

# Example usage
text = "fuck you!"  # Change this for testing
prediction, probabilities = predict(text, model, tokenizer)

print(f"Prediction: {prediction}")
print(f"Probabilities: {probabilities}")


Prediction: hatespeech
Probabilities: [[0.09392083436250687, 0.5404053330421448, 0.36567381024360657]]
