In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split

hindi_train = pd.read_csv("/kaggle/input/hate-speech/hindi_train.csv")
tamil_train = pd.read_csv("/kaggle/input/hate-speech/tamil_train.csv")

In [10]:
eng_df = pd.read_csv('/kaggle/input/hate-speech-english/english_dataset.tsv', sep='\t')
eng_df = eng_df.drop(columns=['text_id','task_2', 'task_3'])

eng_df = eng_df.rename(columns={'task_1': 'label'})

# Map label values
label_map = {'NOT': 0, 'HOF': 1}
eng_df['label'] = eng_df['label'].map(label_map)
eng_df.head
eng_train, eng_val = train_test_split(eng_df, test_size=0.3, random_state=42, shuffle=True)

In [11]:
import re,string
def normalize_text(text):
  # text = text.lower()
  text = re.sub('\[.*?\]', ' ', text)
  text = re.sub('https?://\S+|www\.\S+', ' ', text)
  text = re.sub('<.*?>+', ' ', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
  text = re.sub('\n', ' ', text)
  text = re.sub('\w*\d\w*', ' ', text)
  text = re.sub('<handle replaced>', '', text)
  return text

In [12]:
hindi_train['text'] = hindi_train['text'].apply(lambda x: normalize_text(x))
tamil_train['text'] = tamil_train['text'].apply(lambda x: normalize_text(x))
eng_train['text'] = eng_train['text'].apply(lambda x: normalize_text(x))

In [13]:
combined_train = pd.concat([eng_train, hindi_train, tamil_train], ignore_index=True)
combined_train.to_csv('combined_train.csv')

hindi_val = pd.read_csv("/kaggle/input/hate-speech/hindi_val.csv")
tamil_val = pd.read_csv("/kaggle/input/hate-speech/tamil_val.csv")

hindi_val['text'] = hindi_val['text'].apply(lambda x: normalize_text(x))
tamil_val['text'] = tamil_val['text'].apply(lambda x: normalize_text(x))
eng_val['text'] = eng_val['text'].apply(lambda x: normalize_text(x))

combined_val = pd.concat([eng_val, hindi_val, tamil_val], ignore_index=True)
combined_val.to_csv('combined_val.csv')

In [14]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

MODEL_NAME = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = str(self.data.iloc[idx]['text'])
        label = int(self.data.iloc[idx]['label'])
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_len, return_tensors="pt")
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label)
        }


train_dataset = TextDataset(combined_train, tokenizer)
val_dataset = TextDataset(combined_val, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


In [15]:
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AdamW
from sklearn.metrics import f1_score
from tqdm import tqdm 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

# ✅ Training Loop with tqdm
def train_epoch(model, loader):
    model.train()
    total_loss = 0
    loop = tqdm(loader, desc="Training")
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())  # Show loss in tqdm bar
    return total_loss / len(loader)

def eval_model(model, loader):
    model.eval()
    preds, labels_list = [], []
    loop = tqdm(loader, desc="Evaluating")
    with torch.no_grad():
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            labels_list.extend(labels.cpu().numpy())
    f1 = f1_score(labels_list, preds, average='macro')
    return f1

for epoch in range(3):
    print(f"Epoch {epoch+1}")
    train_loss = train_epoch(model, train_loader)
    val_f1 = eval_model(model, val_loader)
    print(f"Train Loss = {train_loss:.4f}, Val F1 = {val_f1:.4f}")

torch.save(model.state_dict(), "stage1_hate_speech.pth")


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1


Training: 100%|██████████| 2643/2643 [08:18<00:00,  5.30it/s, loss=0.382]
Evaluating: 100%|██████████| 453/453 [00:54<00:00,  8.29it/s]


Train Loss = 0.5810, Val F1 = 0.7658
Epoch 2


Training: 100%|██████████| 2643/2643 [08:18<00:00,  5.30it/s, loss=0.448] 
Evaluating: 100%|██████████| 453/453 [00:54<00:00,  8.31it/s]


Train Loss = 0.4289, Val F1 = 0.7880
Epoch 3


Training: 100%|██████████| 2643/2643 [08:18<00:00,  5.30it/s, loss=0.0897]
Evaluating: 100%|██████████| 453/453 [00:54<00:00,  8.29it/s]


Train Loss = 0.3742, Val F1 = 0.7844


In [16]:
def data_preprocessing(df, language):
    # Select annotator columns based on language
    if language == "tamil":
        annotator_cols = ["ta_a1", "ta_a2", "ta_a3", "ta_a4", "ta_a5", "ta_a6"]
    elif language == "hindi":
        annotator_cols = ["hi_a1", "hi_a2", "hi_a3", "hi_a4", "hi_a5"]
    else:
        annotator_cols = ["en_a1", "en_a2", "en_a3", "en_a4", "en_a5", "en_a6"]

    df[annotator_cols] = df[annotator_cols].fillna("")
    def majority_label(row):
        votes = []
        for col in annotator_cols:
            val = row[col]
            try:
                if val != "":
                    num_val = float(val)
                    if num_val in [0.0, 1.0]:
                        votes.append(int(num_val))
            except:
                continue  
        if not votes:
            return 0
        return 1 if votes.count(1) > votes.count(0) else 0

    # Apply the majority label function
    df["label"] = df.apply(majority_label, axis=1)

    # Ensure label column is integer type
    df["label"] = df["label"].astype(int)

    # Drop annotation columns and unnecessary 'key' column
    df = df.drop(columns=annotator_cols + ["key"])

    # Show label distribution
    print(df["label"].value_counts())

    return df

In [17]:
df_hindi = pd.read_csv("/kaggle/input/gender-abuse-dataset/train_hi_l1.csv")
rows, columns = df_hindi.shape
print(f"Rows: {rows}, Columns: {columns}")
hindi_train_gender = data_preprocessing(df_hindi,"hindi")
hindi_train_gender['text'] = hindi_train_gender['text'].apply(lambda x: normalize_text(x))
rows, columns = hindi_train_gender.shape
print(f"Rows: {rows}, Columns: {columns}")

Rows: 6197, Columns: 7
label
0    4437
1    1760
Name: count, dtype: int64
Rows: 6197, Columns: 2


In [18]:
df_tamil = pd.read_csv("/kaggle/input/gender-abuse-dataset/train_ta_l1.csv")
rows, columns = df_tamil.shape
print(f"Rows: {rows}, Columns: {columns}")
tamil_train_gender = data_preprocessing(df_tamil,"tamil")
tamil_train_gender['text'] = tamil_train_gender['text'].apply(lambda x: normalize_text(x))
rows, columns = tamil_train_gender.shape
print(f"Rows: {rows}, Columns: {columns}")

Rows: 6779, Columns: 8
label
0    3890
1    2889
Name: count, dtype: int64
Rows: 6779, Columns: 2


In [19]:
df_eng = pd.read_csv("/kaggle/input/gender-abuse-dataset/train_en_l1.csv")
rows, columns = df_eng.shape
print(f"Rows: {rows}, Columns: {columns}")
eng_train_gender = data_preprocessing(df_eng,"english")
eng_train_gender['text'] = eng_train_gender['text'].apply(lambda x: normalize_text(x))
rows, columns = eng_train_gender.shape
print(f"Rows: {rows}, Columns: {columns}")

Rows: 6531, Columns: 8
label
0    5269
1    1262
Name: count, dtype: int64
Rows: 6531, Columns: 2


In [20]:
from tqdm import tqdm

def train_stage2(model, train_loader, epochs=3):
    model.train()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    for epoch in range(epochs):
        total_loss = 0
        loop = tqdm(train_loader, desc=f"Training Stage 2 - Epoch {epoch+1}")
        for batch in loop:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}: Train Loss = {avg_loss:.4f}")

    # Save fine-tuned model
    torch.save(model.state_dict(), "stage2_gendered_abuse.pth")
    print("Stage 2 Model Saved: stage2_gendered_abuse.pth")


In [21]:
combined_train_gender = pd.concat([eng_train_gender, hindi_train_gender, tamil_train_gender], ignore_index=True)
combined_train_gender.to_csv('combined_train_gender.csv')

gendered_train_dataset = TextDataset(combined_train_gender, tokenizer)
gendered_train_loader = DataLoader(gendered_train_dataset, batch_size=16, shuffle=True)

model.load_state_dict(torch.load("/kaggle/working/stage1_hate_speech.pth"))

# Fine-tune on gendered abuse
train_loss = train_stage2(model, gendered_train_loader)

# Save final model
torch.save(model.state_dict(), "stage2_gendered_abuse.pth")


  model.load_state_dict(torch.load("/kaggle/working/stage1_hate_speech.pth"))
Training Stage 2 - Epoch 1: 100%|██████████| 1220/1220 [03:51<00:00,  5.28it/s, loss=0.652]


Epoch 1: Train Loss = 0.5152


Training Stage 2 - Epoch 2: 100%|██████████| 1220/1220 [03:51<00:00,  5.28it/s, loss=0.123]


Epoch 2: Train Loss = 0.4440


Training Stage 2 - Epoch 3: 100%|██████████| 1220/1220 [03:50<00:00,  5.28it/s, loss=0.268] 


Epoch 3: Train Loss = 0.3908
Stage 2 Model Saved: stage2_gendered_abuse.pth


In [22]:
df_eng_test = pd.read_csv("/kaggle/input/gender-abuse-dataset/test_en_l1.csv")
rows, columns = df_eng_test.shape
print(f"Rows: {rows}, Columns: {columns}")
eng_test_gender = data_preprocessing(df_eng_test,"english")
rows, columns = eng_test_gender.shape
print(f"Rows: {rows}, Columns: {columns}")

df_hindi_test = pd.read_csv("/kaggle/input/gender-abuse-dataset/test_hi_l1.csv", engine='python')
rows, columns = df_hindi_test.shape
print(f"Rows: {rows}, Columns: {columns}")
hindi_test_gender = data_preprocessing(df_hindi_test,"hindi")
rows, columns = hindi_test_gender.shape
print(f"Rows: {rows}, Columns: {columns}")

df_tamil_test = pd.read_csv("/kaggle/input/gender-abuse-dataset/test_ta_l1.csv")
rows, columns = df_tamil_test.shape
print(f"Rows: {rows}, Columns: {columns}")
tamil_test_gender = data_preprocessing(df_tamil_test,"tamil")
rows, columns = tamil_test_gender.shape
print(f"Rows: {rows}, Columns: {columns}")

combined_test_gender = pd.concat([hindi_test_gender, tamil_test_gender], ignore_index=True)

gendered_test_dataset = TextDataset(combined_test_gender, tokenizer)
gendered_test_loader = DataLoader(gendered_test_dataset, batch_size=16, shuffle=True)


Rows: 1107, Columns: 8
label
0    877
1    230
Name: count, dtype: int64
Rows: 1107, Columns: 2
Rows: 1517, Columns: 7
label
0    1159
1     358
Name: count, dtype: int64
Rows: 1517, Columns: 2
Rows: 1135, Columns: 8
label
0    596
1    539
Name: count, dtype: int64
Rows: 1135, Columns: 2


In [23]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

model.load_state_dict(torch.load("stage2_gendered_abuse.pth"))
model.eval()

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("stage2_gendered_abuse.pth"))


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(200000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features

In [24]:
from sklearn.metrics import f1_score, classification_report
from tqdm import tqdm

def inference_on_gendered_abuse_test(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing on Gendered Abuse", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    # F1 Score (Macro)
    f1_macro = f1_score(all_labels, all_preds, average='macro')
    print(f"\nTest F1 Score (Macro): {f1_macro:.4f}")
    print("\nClassification Report:\n", classification_report(all_labels, all_preds, digits=4))
    return f1_macro

f1_macro_test = inference_on_gendered_abuse_test(model, gendered_test_loader)
print(f"Final Gendered Abuse Test F1 Score (Macro): {f1_macro_test:.4f}")

                                                                            


Test F1 Score (Macro): 0.7177

Classification Report:
               precision    recall  f1-score   support

           0     0.7735    0.9436    0.8501      1755
           1     0.8063    0.4593    0.5852       897

    accuracy                         0.7798      2652
   macro avg     0.7899    0.7014    0.7177      2652
weighted avg     0.7846    0.7798    0.7605      2652

Final Gendered Abuse Test F1 Score (Macro): 0.7177


