In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split

tamil_train = pd.read_csv("/kaggle/input/hate-speech/tamil_train.csv")

In [11]:
import re,string
def normalize_text(text):
  # text = text.lower()
  text = re.sub('\[.*?\]', ' ', text)
  text = re.sub('https?://\S+|www\.\S+', ' ', text)
  text = re.sub('<.*?>+', ' ', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
  text = re.sub('\n', ' ', text)
  text = re.sub('\w*\d\w*', ' ', text)
  text = re.sub('<handle replaced>', '', text)
  return text

In [12]:
tamil_train['text'] = tamil_train['text'].apply(lambda x: normalize_text(x))
tamil_train.head

<bound method NDFrame.head of        label                                               text
0          1                          அம்மு காலை வணக்கம் அழகுடி
1          1          அசத்தலான அழகு நடிப்பு சூப்பர் ப்ரண்ட்ஸ் 💖
2          0             இப் புண்டா மவனே வீடியோ அளி டி தேவிடியா
3          1                      சானி அள்ளு போ காஞ்சிறப்போகுது
4          1                          மிகவும் அழகான இளவரசி 💖👸💖✨
...      ...                                                ...
17995      0  அடியே மூதேவி உனக்கு கொஞ்சம் கூட மானம் மரியாதை ...
17996      0                பெருசா இருக்கு  ரெண்டு காய் d ponda
17997      0  உப்பு   செருப்பு   சிரிப்பு ஏதும் இல்லை    பூ ...
17998      1                           நீ நயண்தாரா தங்கச்சியா 😜
17999      1  நீங்கள் புடவை கட்டும் விதம்  சூப்பர் என்று சொல...

[18000 rows x 2 columns]>

In [13]:
tamil_val = pd.read_csv("/kaggle/input/hate-speech/tamil_val.csv")
tamil_val['text'] = tamil_val['text'].apply(lambda x: normalize_text(x))

In [14]:
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

MODEL_NAME = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

class TextDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len=128):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        text = str(self.data.iloc[idx]['text'])
        label = int(self.data.iloc[idx]['label'])
        encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_len, return_tensors="pt")
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label)
        }


train_dataset = TextDataset(tamil_train, tokenizer)
val_dataset = TextDataset(tamil_val, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)


config.json:   0%|          | 0.00/507 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/5.65M [00:00<?, ?B/s]

In [15]:
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, AdamW
from sklearn.metrics import f1_score
from tqdm import tqdm 

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
loss_fn = nn.CrossEntropyLoss()

def train_epoch(model, loader):
    model.train()
    total_loss = 0
    loop = tqdm(loader, desc="Training")
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        loop.set_postfix(loss=loss.item())
    return total_loss / len(loader)

def eval_model(model, loader):
    model.eval()
    preds, labels_list = [], []
    loop = tqdm(loader, desc="Evaluating")
    with torch.no_grad():
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            labels_list.extend(labels.cpu().numpy())
    f1 = f1_score(labels_list, preds, average='macro')
    return f1

for epoch in range(3):
    print(f"Epoch {epoch+1}")
    train_loss = train_epoch(model, train_loader)
    val_f1 = eval_model(model, val_loader)
    print(f"Train Loss = {train_loss:.4f}, Val F1 = {val_f1:.4f}")

torch.save(model.state_dict(), "stage1_hate_speech.pth")


pytorch_model.bin:   0%|          | 0.00/135M [00:00<?, ?B/s]

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1


Training: 100%|██████████| 1125/1125 [03:31<00:00,  5.31it/s, loss=0.371]
Evaluating: 100%|██████████| 188/188 [00:22<00:00,  8.46it/s]


Train Loss = 0.5236, Val F1 = 0.8118
Epoch 2


Training: 100%|██████████| 1125/1125 [03:30<00:00,  5.33it/s, loss=0.215] 
Evaluating: 100%|██████████| 188/188 [00:22<00:00,  8.42it/s]


Train Loss = 0.3570, Val F1 = 0.8299
Epoch 3


Training: 100%|██████████| 1125/1125 [03:30<00:00,  5.33it/s, loss=0.0775]
Evaluating: 100%|██████████| 188/188 [00:22<00:00,  8.43it/s]


Train Loss = 0.3046, Val F1 = 0.8344


In [16]:
def data_preprocessing(df, language):
    if language == "tamil":
        annotator_cols = ["ta_a1", "ta_a2", "ta_a3", "ta_a4", "ta_a5", "ta_a6"]
    elif language == "hindi":
        annotator_cols = ["hi_a1", "hi_a2", "hi_a3", "hi_a4", "hi_a5"]
    else:
        annotator_cols = ["en_a1", "en_a2", "en_a3", "en_a4", "en_a5", "en_a6"]

    df[annotator_cols] = df[annotator_cols].fillna("")
    def majority_label(row):
        votes = []
        for col in annotator_cols:
            val = row[col]
            try:
                if val != "":
                    num_val = float(val)
                    if num_val in [0.0, 1.0]:
                        votes.append(int(num_val))
            except:
                continue  
        if not votes:
            return 0
        return 1 if votes.count(1) > votes.count(0) else 0

    df["label"] = df.apply(majority_label, axis=1)
    df["label"] = df["label"].astype(int)
    df = df.drop(columns=annotator_cols + ["key"])

    # Show label distribution
    print(df["label"].value_counts())

    return df

In [17]:
df_tamil = pd.read_csv("/kaggle/input/gender-abuse-dataset/train_ta_l1.csv")
rows, columns = df_tamil.shape
print(f"Rows: {rows}, Columns: {columns}")
tamil_train_gender = data_preprocessing(df_tamil,"tamil")
tamil_train_gender['text'] = tamil_train_gender['text'].apply(lambda x: normalize_text(x))
rows, columns = tamil_train_gender.shape
print(f"Rows: {rows}, Columns: {columns}")
tamil_train_gender.head

Rows: 6779, Columns: 8
label
0    3890
1    2889
Name: count, dtype: int64
Rows: 6779, Columns: 2


<bound method NDFrame.head of                                                    text  label
0            முரசொலி அலுவலகம் அமைந்துள்ள இடம் பஞ்சமி...      0
1        சோத்துக்கு பிச்சை எடுக்கிற கடங்கார நாய்களுக...      0
2              தத்தபுத்த தத்தபுத்த ன்னு எதாவது புரியுதா      0
3         பச்சை மொளகா காரம் vicky அம்மா புண்டை நாறும் 😆      1
4       என்ன உடம்பு டா சாமி  சும்மா வளுவளுனு   முலை ...      1
...                                                 ...    ...
6774                      😭😭😭 ஒம்மாள படிக்கல் புண்ட 😭😭😭      1
6775  🙄🙄🙄🙄 என்ன எழவுயா இது      இதெல்லாம் ஒரு பெருமை...      0
6776  🚨எக்ஸ் பிரஸ் பேர்ல் கப்பல் தீ விபத்துக்கு உள்ள...      0
6777  🤣 🤣 சல்லி ஜாதி வெறி முட்டா புண்ட உங்க பொண்ணுங்...      0
6778  🤣🤣🤣 நீ சொல்றது எல்லாமும் அந்த திம்கவோட தம்பி  ...      1

[6779 rows x 2 columns]>

In [18]:
from tqdm import tqdm

def train_stage2(model, train_loader, epochs=3):
    model.train()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    for epoch in range(epochs):
        total_loss = 0
        loop = tqdm(train_loader, desc=f"Training Stage 2 - Epoch {epoch+1}")
        for batch in loop:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())
        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1}: Train Loss = {avg_loss:.4f}")

    # Save fine-tuned model
    torch.save(model.state_dict(), "stage2_gendered_abuse.pth")
    print("Stage 2 Model Saved: stage2_gendered_abuse.pth")


In [19]:
gendered_train_dataset = TextDataset(tamil_train_gender, tokenizer)
gendered_train_loader = DataLoader(gendered_train_dataset, batch_size=16, shuffle=True)

model.load_state_dict(torch.load("/kaggle/working/stage1_hate_speech.pth"))

train_loss = train_stage2(model, gendered_train_loader)

torch.save(model.state_dict(), "stage2_gendered_abuse.pth")


  model.load_state_dict(torch.load("/kaggle/working/stage1_hate_speech.pth"))
Training Stage 2 - Epoch 1: 100%|██████████| 424/424 [01:19<00:00,  5.30it/s, loss=0.571]


Epoch 1: Train Loss = 0.5704


Training Stage 2 - Epoch 2: 100%|██████████| 424/424 [01:20<00:00,  5.30it/s, loss=0.544]


Epoch 2: Train Loss = 0.4623


Training Stage 2 - Epoch 3: 100%|██████████| 424/424 [01:20<00:00,  5.30it/s, loss=0.43]  


Epoch 3: Train Loss = 0.4083
Stage 2 Model Saved: stage2_gendered_abuse.pth


In [20]:
df_tamil_test = pd.read_csv("/kaggle/input/gender-abuse-dataset/test_ta_l1.csv")
rows, columns = df_tamil_test.shape
print(f"Rows: {rows}, Columns: {columns}")
tamil_test_gender = data_preprocessing(df_tamil_test,"tamil")
rows, columns = tamil_test_gender.shape
print(f"Rows: {rows}, Columns: {columns}")

gendered_test_dataset = TextDataset(tamil_test_gender, tokenizer)
gendered_test_loader = DataLoader(gendered_test_dataset, batch_size=16, shuffle=True)


Rows: 1135, Columns: 8
label
0    596
1    539
Name: count, dtype: int64
Rows: 1135, Columns: 2


In [21]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)

model.load_state_dict(torch.load("stage2_gendered_abuse.pth"))
model.eval()

Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("stage2_gendered_abuse.pth"))


AlbertForSequenceClassification(
  (albert): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(200000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertSdpaAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features

In [22]:
from sklearn.metrics import f1_score, classification_report
from tqdm import tqdm

def inference_on_gendered_abuse_test(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing on Gendered Abuse", leave=False):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
    f1_macro = f1_score(all_labels, all_preds, average='macro')
    print(f"\nTest F1 Score (Macro): {f1_macro:.4f}")
    print("\nClassification Report:\n", classification_report(all_labels, all_preds, digits=4))
    return f1_macro

f1_macro_test = inference_on_gendered_abuse_test(model, gendered_test_loader)
print(f"Final Gendered Abuse Test F1 Score (Macro): {f1_macro_test:.4f}")

                                                                          


Test F1 Score (Macro): 0.7719

Classification Report:
               precision    recall  f1-score   support

           0     0.7661    0.8188    0.7916       596
           1     0.7831    0.7236    0.7522       539

    accuracy                         0.7736      1135
   macro avg     0.7746    0.7712    0.7719      1135
weighted avg     0.7742    0.7736    0.7729      1135

Final Gendered Abuse Test F1 Score (Macro): 0.7719


