In [2]:


import pandas as pd
from sklearn.model_selection import train_test_split

tamil_train = pd.read_csv("/kaggle/input/hertlex-hindi-data/tamil_train.csv")



In [3]:
import re,string
def normalize_text(text):
  # text = text.lower()
  text = re.sub('\[.*?\]', ' ', text)
  text = re.sub('https?://\S+|www\.\S+', ' ', text)
  text = re.sub('<.*?>+', ' ', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
  text = re.sub('\n', ' ', text)
  text = re.sub('\w*\d\w*', ' ', text)
  text = re.sub('<handle replaced>', '', text)
  return text

In [4]:
tamil_train['text'] = tamil_train['text'].apply(lambda x: normalize_text(x))
tamil_train.head


<bound method NDFrame.head of        label                                               text
0          1  சலோமியா             சுண்ட கஞ்சி   சோறு டா குழம...
1          0  டேய் பொட்ட பாடு  உன் வாழுல  ஊரா ன்   சாமானை வை...
2          1                      நான்கு கிளிகள் மூன்று பூக்கள்
3          0                               நல்ல மூடு போல உனக்கு
4          0            இந்த கேலட்டு புண்டை தொல்லை தாங்க முடியல
...      ...                                                ...
23995      0                    ஏ பாவம் டா லூசு மெண்டல் பயலுவலா
23996      0                           டேய் பொட்ட நீ பெய் ஊம்பு
23997      1  அட   ஏன்டா இப்பிடி பன்ரிங்க கிருக்கு மெண்டல் ப...
23998      0  பாத்து  ஆட்டு  அத்தாச்சி  ஒடஞ்சி  போக போது 😁😁😁...
23999      0  யாரும் தப்பா கமன்ட் பண்ணாதீங்க நு சொன்னா மட்டு...

[24000 rows x 2 columns]>

In [5]:


tamil_val = pd.read_csv("/kaggle/input/hertlex-hindi-data/tamil_val.csv")
tamil_val['text'] = tamil_val['text'].apply(lambda x: normalize_text(x))



In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
def clean_text(text):
    text = normalize_text(text)
    return word_tokenize(text)

In [8]:
all_text = tamil_train['text'].tolist() + tamil_val['text'].tolist()
tokenized_texts = [clean_text(t) for t in all_text]
vocab = {'<PAD>': 0, '<UNK>': 1}
for sentence in tokenized_texts:
    for word in sentence:
        if word not in vocab:
            vocab[word] = len(vocab)

In [9]:
def load_glove(file_path, vocab, embedding_dim=100):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector

    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    for word, idx in vocab.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
        else:
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return torch.tensor(embedding_matrix, dtype=torch.float32)


In [10]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.vec.gz
!gunzip cc.ta.300.vec.gz


--2025-04-13 19:00:34--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ta.300.vec.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 3.163.189.14, 3.163.189.108, 3.163.189.51, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|3.163.189.14|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1213735937 (1.1G) [binary/octet-stream]
Saving to: ‘cc.ta.300.vec.gz’


2025-04-13 19:00:37 (358 MB/s) - ‘cc.ta.300.vec.gz’ saved [1213735937/1213735937]



In [11]:
embedding_dim = 300
glove_path = "/kaggle/working/cc.ta.300.vec"  # Update if needed
embedding_matrix = load_glove(glove_path, vocab, embedding_dim)


In [None]:
class TextDataset(Dataset):
    def __init__(self, dataframe, word2idx, max_len):
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe['label'].tolist()
        self.vocab = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    
    def __getitem__(self, idx):
        tokens = clean_text(self.texts[idx])
        ids = [self.vocab.get(w, self.vocab['<UNK>']) for w in tokens]
        if len(ids) < self.max_len:
            ids += [self.vocab['<PAD>']] * (self.max_len - len(ids))
        else:
            ids = ids[:self.max_len]
        return torch.tensor(ids), torch.tensor(self.labels[idx])

In [14]:
train_dataset = TextDataset(tamil_train, vocab,max_len=100)
val_dataset = TextDataset(tamil_val, vocab,max_len=100)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [15]:
class CNNBiLSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix):
        super(CNNBiLSTMClassifier, self).__init__()
        vocab_size, embed_size = embedding_matrix.shape
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.dropout1 = nn.Dropout(0.1)
        self.conv1 = nn.Conv1d(embed_size, 64, kernel_size=2)
        self.lstm = nn.LSTM(64, 128, batch_first=True, bidirectional=True)
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.1)
        self.out = nn.Linear(128, 2)

    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout1(x)
        x = x.permute(0, 2, 1)
        x = self.conv1(x)
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)
        x = x.permute(0, 2, 1)
        x = self.avgpool(x).squeeze(2)
        x = torch.relu(self.fc(x))
        x = self.dropout2(x)
        return self.out(x)


In [19]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNBiLSTMClassifier(embedding_matrix).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)


  self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))


In [20]:
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for inputs,labels in tqdm(loader, desc="Training"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def eval_model(model, loader):
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for inputs , labels in tqdm(loader, desc="Evaluating"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            pred = torch.argmax(outputs, dim=1)
            preds.extend(pred.cpu().numpy())
            targets.extend(labels.cpu().numpy())
    return f1_score(targets, preds, average='macro')

In [21]:
for epoch in range(8):
    print(f"\nEpoch {epoch+1}")
    train_loss = train_epoch(model, train_loader,optimizer, criterion)
    val_f1 = eval_model(model, val_loader)
    print(f"Train Loss: {train_loss:.4f} | Val F1: {val_f1:.4f}")
import matplotlib.pyplot as plt
train_losses = []
val_f1s = []
torch.save(model.state_dict(), "tamil_glove_cnn_bilstm_model.pth")





Epoch 1


Training: 100%|██████████| 750/750 [00:06<00:00, 110.16it/s]
Evaluating: 100%|██████████| 94/94 [00:00<00:00, 166.41it/s]


Train Loss: 0.6919 | Val F1: 0.3568

Epoch 2


Training: 100%|██████████| 750/750 [00:06<00:00, 110.68it/s]
Evaluating: 100%|██████████| 94/94 [00:00<00:00, 167.60it/s]


Train Loss: 0.6625 | Val F1: 0.6770

Epoch 3


Training: 100%|██████████| 750/750 [00:06<00:00, 111.10it/s]
Evaluating: 100%|██████████| 94/94 [00:00<00:00, 165.03it/s]


Train Loss: 0.5802 | Val F1: 0.7430

Epoch 4


Training: 100%|██████████| 750/750 [00:06<00:00, 108.34it/s]
Evaluating: 100%|██████████| 94/94 [00:00<00:00, 167.30it/s]


Train Loss: 0.5092 | Val F1: 0.7697

Epoch 5


Training: 100%|██████████| 750/750 [00:06<00:00, 109.25it/s]
Evaluating: 100%|██████████| 94/94 [00:00<00:00, 162.24it/s]


Train Loss: 0.4675 | Val F1: 0.7813

Epoch 6


Training: 100%|██████████| 750/750 [00:06<00:00, 110.31it/s]
Evaluating: 100%|██████████| 94/94 [00:00<00:00, 166.57it/s]


Train Loss: 0.4478 | Val F1: 0.7962

Epoch 7


Training: 100%|██████████| 750/750 [00:06<00:00, 111.14it/s]
Evaluating: 100%|██████████| 94/94 [00:00<00:00, 168.15it/s]


Train Loss: 0.4268 | Val F1: 0.8039

Epoch 8


Training: 100%|██████████| 750/750 [00:06<00:00, 109.52it/s]
Evaluating: 100%|██████████| 94/94 [00:00<00:00, 167.86it/s]


Train Loss: 0.4135 | Val F1: 0.8079


In [22]:
def data_preprocessing(df, language):
    if language == "tamil":
        annotator_cols = ["ta_a1", "ta_a2", "ta_a3", "ta_a4", "ta_a5", "ta_a6"]
    elif language == "hindi":
        annotator_cols = ["hi_a1", "hi_a2", "hi_a3", "hi_a4", "hi_a5"]
    else:
        annotator_cols = ["en_a1", "en_a2", "en_a3", "en_a4", "en_a5", "en_a6"]

    df[annotator_cols] = df[annotator_cols].fillna("")

    def majority_label(row):
        votes = []
        for col in annotator_cols:
            val = row[col]
            try:
                if val != "":
                    num_val = float(val)
                    if num_val in [0.0, 1.0]:
                        votes.append(int(num_val))
            except:
                continue
        if not votes:
            return 0
        return 1 if votes.count(1) > votes.count(0) else 0

    df["label"] = df.apply(majority_label, axis=1)
    df["label"] = df["label"].astype(int)
    df = df.drop(columns=annotator_cols + ["key"])
    print(df["label"].value_counts())
    return df

In [25]:
df_tamil = pd.read_csv("/kaggle/input/hertlex-hindi-data/train_ta_l1.csv")
rows, columns = df_tamil.shape
print(f"Rows: {rows}, Columns: {columns}")
tamil_train_gender = data_preprocessing(df_tamil,"tamil")
tamil_train_gender['text'] = tamil_train_gender['text'].apply(lambda x: normalize_text(x))
rows, columns = tamil_train_gender.shape
print(f"Rows: {rows}, Columns: {columns}")


Rows: 6779, Columns: 8
label
0    3890
1    2889
Name: count, dtype: int64
Rows: 6779, Columns: 2


In [26]:
gendered_train_dataset = TextDataset(tamil_train_gender, vocab,max_len=100)
gendered_train_loader = DataLoader(gendered_train_dataset, batch_size=32, shuffle=True)


In [27]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CNNBiLSTMClassifier(embedding_matrix).to(device)
model.load_state_dict(torch.load("/kaggle/working/tamil_glove_cnn_bilstm_model.pth"))
print("\nLoaded Stage 1 model weights (hate speech) for fine-tuning.")



Loaded Stage 1 model weights (hate speech) for fine-tuning.


  self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
  model.load_state_dict(torch.load("/kaggle/working/tamil_glove_cnn_bilstm_model.pth"))


In [None]:
def train_stage2(model, train_loader, epochs=5):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.00005)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = 0.0
        loop = tqdm(train_loader, desc=f"Training Stage 2 - Epoch {epoch+1}")
        for inputs,labels in loop:
            optimizer.zero_grad()
            inputs,labels = inputs.to(device),labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")
        
    torch.save(model.state_dict(), "stage2_gendered_abuse.pth")
    print(" Stage 2 Model Saved: stage2_gendered_abuse.pth")

train_stage2(model, gendered_train_loader)


Training Stage 2 - Epoch 1: 100%|██████████| 212/212 [00:02<00:00, 82.23it/s, loss=0.301]


Epoch 1 Average Loss: 0.4565


Training Stage 2 - Epoch 2: 100%|██████████| 212/212 [00:02<00:00, 81.62it/s, loss=0.431]


Epoch 2 Average Loss: 0.4526


Training Stage 2 - Epoch 3: 100%|██████████| 212/212 [00:02<00:00, 82.21it/s, loss=0.376]


Epoch 3 Average Loss: 0.4526


Training Stage 2 - Epoch 4: 100%|██████████| 212/212 [00:02<00:00, 81.86it/s, loss=0.429]


Epoch 4 Average Loss: 0.4518


Training Stage 2 - Epoch 5: 100%|██████████| 212/212 [00:02<00:00, 82.23it/s, loss=0.473]


Epoch 5 Average Loss: 0.4479
✅ Stage 2 Model Saved: stage2_gendered_abuse.pth


In [None]:
from sklearn.metrics import f1_score, classification_report
df_tamil_test = pd.read_csv("/kaggle/input/hertlex-hindi-data/test_ta_l1.csv", engine='python')
print("Test Set:", df_tamil_test.shape)
tamil_test_gender = data_preprocessing(df_tamil_test, "tamil")
tamil_test_gender["text"] = tamil_test_gender["text"].apply(lambda x: normalize_text(x))

gendered_test_dataset = TextDataset(tamil_test_gender, vocab,max_len=100)
gendered_test_loader = DataLoader(gendered_test_dataset, batch_size=32,shuffle=True)

model = CNNBiLSTMClassifier(embedding_matrix).to(device)
model.load_state_dict(torch.load("/kaggle/working/stage2_gendered_abuse.pth"))
model.eval()

def inference_on_gendered_abuse_test(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        
        loop = tqdm(test_loader, desc=f"Training Stage 2 - Epoch {epoch+1}")
        for inputs,labels in loop:
            inputs,labels = inputs.to(device),labels.to(device)



            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    f1_macro = f1_score(all_labels, all_preds, average='macro')
    print(f"\nTest F1 Score (Macro): {f1_macro:.4f}")
    print("\nClassification Report:\n", classification_report(all_labels, all_preds, digits=4))
    return f1_macro

f1_macro_test = inference_on_gendered_abuse_test(model, gendered_test_loader)
print(f" Final Gendered Abuse Test F1 Score (Macro): {f1_macro_test:.4f}")


Test Set: (1135, 8)
label
0    596
1    539
Name: count, dtype: int64


  self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
  model.load_state_dict(torch.load("/kaggle/working/stage2_gendered_abuse.pth"))
Training Stage 2 - Epoch 8: 100%|██████████| 36/36 [00:00<00:00, 139.19it/s]


Test F1 Score (Macro): 0.7519

Classification Report:
               precision    recall  f1-score   support

           0     0.7450    0.8087    0.7755       596
           1     0.7664    0.6939    0.7283       539

    accuracy                         0.7542      1135
   macro avg     0.7557    0.7513    0.7519      1135
weighted avg     0.7551    0.7542    0.7531      1135

✅ Final Gendered Abuse Test F1 Score (Macro): 0.7519



