In [68]:
import pandas as pd
from sklearn.model_selection import train_test_split

eng_df = pd.read_csv("/kaggle/input/hertlex-hindi-data/english_dataset.tsv",sep='\t')

eng_df = eng_df.drop(columns=['text_id','task_2', 'task_3'])

In [69]:
eng_df = eng_df.rename(columns={'task_1': 'label'})
label_map = {'NOT': 0, 'HOF': 1}
eng_df['label'] = eng_df['label'].map(label_map)
eng_df.head
eng_train, eng_val = train_test_split(eng_df, test_size=0.3, random_state=42, shuffle=True)


In [None]:
import re,string
def normalize_text(text):
  # text = text.lower()
  emoji_pattern = re.compile("["
                              u"\U0001F600-\U0001F64F"  
                              u"\U0001F300-\U0001F5FF"  
                              u"\U0001F680-\U0001F6FF"  
                              u"\U0001F700-\U0001F77F"  
                              u"\U0001F780-\U0001F7FF"  
                              u"\U0001F800-\U0001F8FF"  
                              u"\U0001F900-\U0001F9FF"  
                              u"\U0001FA00-\U0001FA6F"  
                              u"\U0001FA70-\U0001FAFF"  
                              u"\U00002702-\U000027B0"  
                              u"\U000024C2-\U0001F251"
                              "]+", flags=re.UNICODE)
  text = text.lower()
  text = re.sub('\[.*?\]', ' ', text)
  text = re.sub('https?://\S+|www\.\S+', ' ', text)
  text = re.sub('<.*?>+', ' ', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
  text = re.sub('\n', ' ', text)
  text = re.sub('\w*\d\w*', ' ', text)
  text = re.sub('<handle replaced>', '', text)
  text = emoji_pattern.sub(r'', text)
  return text

In [71]:
eng_train['text'] = eng_train['text'].apply(lambda x: normalize_text(x))
eng_train
eng_val['text'] = eng_val['text'].apply(lambda x: normalize_text(x))

In [72]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [73]:
def clean_text(text):
    text = normalize_text(text)
    return word_tokenize(text)

In [74]:
all_text = tamil_train['text'].tolist() + tamil_val['text'].tolist()
tokenized_texts = [clean_text(t) for t in all_text]
vocab = {'<PAD>': 0, '<UNK>': 1}
for sentence in tokenized_texts:
    for word in sentence:
        if word not in vocab:
            vocab[word] = len(vocab)

In [75]:
def load_glove(file_path, vocab, embedding_dim=100):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector

    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    for word, idx in vocab.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
        else:
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return torch.tensor(embedding_matrix, dtype=torch.float32)


In [None]:
embedding_dim = 300
glove_path = "/kaggle/input/hertlex-hindi-data/glove.6B.300d.txt"  
embedding_matrix = load_glove(glove_path, vocab, embedding_dim)


In [None]:
class TextDataset(Dataset):
    def __init__(self, dataframe, word2idx, max_len):
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe['label'].tolist()
        self.vocab = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)



    def __getitem__(self, idx):
        tokens = clean_text(self.texts[idx])
        ids = [self.vocab.get(w, self.vocab['<UNK>']) for w in tokens]
        if len(ids) < self.max_len:
            ids += [self.vocab['<PAD>']] * (self.max_len - len(ids))
        else:
            ids = ids[:self.max_len]
        return torch.tensor(ids), torch.tensor(self.labels[idx])

In [78]:
train_dataset = TextDataset(eng_train, vocab,max_len=100)
val_dataset = TextDataset(eng_val, vocab,max_len=100)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [79]:
class CNNBiLSTMClassifier(nn.Module):
    def __init__(self, embedding_matrix):
        super(CNNBiLSTMClassifier, self).__init__()
        vocab_size, embed_size = embedding_matrix.shape
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.dropout1 = nn.Dropout(0.1)
        self.conv1 = nn.Conv1d(embed_size, 64, kernel_size=2)
        self.lstm = nn.LSTM(64, 128, batch_first=True, bidirectional=True)
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.1)
        self.out = nn.Linear(128, 2)

    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout1(x)
        x = x.permute(0, 2, 1)
        x = self.conv1(x)
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)
        x = x.permute(0, 2, 1)
        x = self.avgpool(x).squeeze(2)
        x = torch.relu(self.fc(x))
        x = self.dropout2(x)
        return self.out(x)


In [80]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNBiLSTMClassifier(embedding_matrix).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)


  self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))


In [81]:
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for inputs,labels in tqdm(loader, desc="Training"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def eval_model(model, loader):
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for inputs , labels in tqdm(loader, desc="Evaluating"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            pred = torch.argmax(outputs, dim=1)
            preds.extend(pred.cpu().numpy())
            targets.extend(labels.cpu().numpy())
    return f1_score(targets, preds, average='macro')

In [92]:
for epoch in range(15):
    print(f"\nEpoch {epoch+1}")
    train_loss = train_epoch(model, train_loader,optimizer, criterion)
    val_f1 = eval_model(model, val_loader)
    print(f"Train Loss: {train_loss:.4f} | Val F1: {val_f1:.4f}")
torch.save(model.state_dict(), "eng_glove_cnn_bilstm_model.pth")





Epoch 1


Training: 100%|██████████| 128/128 [00:01<00:00, 88.00it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 121.66it/s]


Train Loss: 0.8709 | Val F1: 0.4097

Epoch 2


Training: 100%|██████████| 128/128 [00:01<00:00, 86.14it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 121.87it/s]


Train Loss: 0.8702 | Val F1: 0.4097

Epoch 3


Training: 100%|██████████| 128/128 [00:01<00:00, 87.95it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 122.90it/s]


Train Loss: 0.8707 | Val F1: 0.4097

Epoch 4


Training: 100%|██████████| 128/128 [00:01<00:00, 88.59it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 121.53it/s]


Train Loss: 0.8684 | Val F1: 0.4097

Epoch 5


Training: 100%|██████████| 128/128 [00:01<00:00, 88.31it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 122.50it/s]


Train Loss: 0.8701 | Val F1: 0.4097

Epoch 6


Training: 100%|██████████| 128/128 [00:01<00:00, 88.04it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 121.72it/s]


Train Loss: 0.8717 | Val F1: 0.4097

Epoch 7


Training: 100%|██████████| 128/128 [00:01<00:00, 84.58it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 114.26it/s]


Train Loss: 0.8687 | Val F1: 0.4097

Epoch 8


Training: 100%|██████████| 128/128 [00:01<00:00, 87.95it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 123.56it/s]


Train Loss: 0.8700 | Val F1: 0.4097

Epoch 9


Training: 100%|██████████| 128/128 [00:01<00:00, 87.72it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 121.18it/s]


Train Loss: 0.8687 | Val F1: 0.4097

Epoch 10


Training: 100%|██████████| 128/128 [00:01<00:00, 88.94it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 123.19it/s]


Train Loss: 0.8698 | Val F1: 0.4097

Epoch 11


Training: 100%|██████████| 128/128 [00:01<00:00, 89.03it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 123.40it/s]


Train Loss: 0.8725 | Val F1: 0.4097

Epoch 12


Training: 100%|██████████| 128/128 [00:01<00:00, 86.32it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 120.45it/s]


Train Loss: 0.8683 | Val F1: 0.4097

Epoch 13


Training: 100%|██████████| 128/128 [00:01<00:00, 87.72it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 122.89it/s]


Train Loss: 0.8705 | Val F1: 0.4097

Epoch 14


Training: 100%|██████████| 128/128 [00:01<00:00, 89.00it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 122.67it/s]


Train Loss: 0.8687 | Val F1: 0.4097

Epoch 15


Training: 100%|██████████| 128/128 [00:01<00:00, 88.44it/s]
Evaluating: 100%|██████████| 55/55 [00:00<00:00, 122.94it/s]


Train Loss: 0.8709 | Val F1: 0.4097


In [93]:
def data_preprocessing(df, language):
    if language == "tamil":
        annotator_cols = ["ta_a1", "ta_a2", "ta_a3", "ta_a4", "ta_a5", "ta_a6"]
    elif language == "hindi":
        annotator_cols = ["hi_a1", "hi_a2", "hi_a3", "hi_a4", "hi_a5"]
    else:
        annotator_cols = ["en_a1", "en_a2", "en_a3", "en_a4", "en_a5", "en_a6"]

    df[annotator_cols] = df[annotator_cols].fillna("")

    def majority_label(row):
        votes = []
        for col in annotator_cols:
            val = row[col]
            try:
                if val != "":
                    num_val = float(val)
                    if num_val in [0.0, 1.0]:
                        votes.append(int(num_val))
            except:
                continue
        if not votes:
            return 0
        return 1 if votes.count(1) > votes.count(0) else 0

    df["label"] = df.apply(majority_label, axis=1)
    df["label"] = df["label"].astype(int)
    df = df.drop(columns=annotator_cols + ["key"])
    print(df["label"].value_counts())
    return df

In [94]:
df_eng = pd.read_csv("/kaggle/input/hertlex-hindi-data/train_en_l1.csv")
rows, columns = df_eng.shape
print(f"Rows: {rows}, Columns: {columns}")
eng_train_gender = data_preprocessing(df_eng,"english")
eng_train_gender['text'] = eng_train_gender['text'].apply(lambda x: normalize_text(x))
rows, columns = eng_train_gender.shape
print(f"Rows: {rows}, Columns: {columns}")


Rows: 6531, Columns: 8
label
0    5269
1    1262
Name: count, dtype: int64
Rows: 6531, Columns: 2


In [95]:
gendered_train_dataset = TextDataset(eng_train_gender, vocab,max_len=100)
gendered_train_loader = DataLoader(gendered_train_dataset, batch_size=32, shuffle=True)


In [96]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CNNBiLSTMClassifier(embedding_matrix).to(device)
model.load_state_dict(torch.load("/kaggle/working/eng_glove_cnn_bilstm_model.pth"))
print("\nLoaded Stage 1 model weights (hate speech) for fine-tuning.")



Loaded Stage 1 model weights (hate speech) for fine-tuning.


  self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
  model.load_state_dict(torch.load("/kaggle/working/eng_glove_cnn_bilstm_model.pth"))


In [None]:
def train_stage2(model, train_loader, epochs=15):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = 0.0
        loop = tqdm(train_loader, desc=f"Training Stage 2 - Epoch {epoch+1}")
        for inputs,labels in loop:
            optimizer.zero_grad()
            inputs,labels = inputs.to(device),labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")
        
    torch.save(model.state_dict(), "stage2_gendered_abuse.pth")
    print("Stage 2 Model Saved: stage2_gendered_abuse.pth")

train_stage2(model, gendered_train_loader)


Training Stage 2 - Epoch 1: 100%|██████████| 205/205 [00:02<00:00, 85.65it/s, loss=0.178]


Epoch 1 Average Loss: 0.3789


Training Stage 2 - Epoch 2: 100%|██████████| 205/205 [00:02<00:00, 84.90it/s, loss=0.174]


Epoch 2 Average Loss: 0.3666


Training Stage 2 - Epoch 3: 100%|██████████| 205/205 [00:02<00:00, 85.76it/s, loss=0.223]


Epoch 3 Average Loss: 0.3567


Training Stage 2 - Epoch 4: 100%|██████████| 205/205 [00:02<00:00, 86.59it/s, loss=0.684]


Epoch 4 Average Loss: 0.3467


Training Stage 2 - Epoch 5: 100%|██████████| 205/205 [00:02<00:00, 85.78it/s, loss=0.114]


Epoch 5 Average Loss: 0.3360


Training Stage 2 - Epoch 6: 100%|██████████| 205/205 [00:02<00:00, 84.06it/s, loss=0.0758]


Epoch 6 Average Loss: 0.3210


Training Stage 2 - Epoch 7: 100%|██████████| 205/205 [00:02<00:00, 85.24it/s, loss=0.319]


Epoch 7 Average Loss: 0.3118


Training Stage 2 - Epoch 8: 100%|██████████| 205/205 [00:02<00:00, 84.25it/s, loss=0.157]


Epoch 8 Average Loss: 0.3022


Training Stage 2 - Epoch 9: 100%|██████████| 205/205 [00:02<00:00, 85.61it/s, loss=0.0409]


Epoch 9 Average Loss: 0.2864


Training Stage 2 - Epoch 10: 100%|██████████| 205/205 [00:02<00:00, 84.00it/s, loss=0.0292]


Epoch 10 Average Loss: 0.2676


Training Stage 2 - Epoch 11: 100%|██████████| 205/205 [00:02<00:00, 84.92it/s, loss=0.216]


Epoch 11 Average Loss: 0.2585


Training Stage 2 - Epoch 12: 100%|██████████| 205/205 [00:02<00:00, 84.58it/s, loss=0.0536]


Epoch 12 Average Loss: 0.2396


Training Stage 2 - Epoch 13: 100%|██████████| 205/205 [00:02<00:00, 86.16it/s, loss=0.169] 


Epoch 13 Average Loss: 0.2394


Training Stage 2 - Epoch 14: 100%|██████████| 205/205 [00:02<00:00, 85.68it/s, loss=0.109] 


Epoch 14 Average Loss: 0.2234


Training Stage 2 - Epoch 15: 100%|██████████| 205/205 [00:02<00:00, 86.88it/s, loss=0.0138]


Epoch 15 Average Loss: 0.2019
✅ Stage 2 Model Saved: stage2_gendered_abuse.pth


In [None]:
from sklearn.metrics import f1_score, classification_report
df_eng_test = pd.read_csv("/kaggle/input/hertlex-hindi-data/test_en_l1.csv", engine='python')
print("Test Set:", df_eng_test.shape)
eng_test_gender = data_preprocessing(df_eng_test, "english")
eng_test_gender["text"] = eng_test_gender["text"].apply(lambda x: normalize_text(x))

gendered_test_dataset = TextDataset(eng_test_gender, vocab,max_len=100)
gendered_test_loader = DataLoader(gendered_test_dataset, batch_size=32,shuffle=True)

model = CNNBiLSTMClassifier(embedding_matrix).to(device)
model.load_state_dict(torch.load("/kaggle/working/stage2_gendered_abuse.pth"))
model.eval()

def inference_on_gendered_abuse_test(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        
        loop = tqdm(test_loader, desc=f"Training Stage 2 - Epoch {epoch+1}")
        for inputs,labels in loop:
            inputs,labels = inputs.to(device),labels.to(device)



            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    f1_macro = f1_score(all_labels, all_preds, average='macro')
    print(f"\nTest F1 Score (Macro): {f1_macro:.4f}")
    print("\nClassification Report:\n", classification_report(all_labels, all_preds, digits=4))
    return f1_macro

f1_macro_test = inference_on_gendered_abuse_test(model, gendered_test_loader)
print(f" Final Gendered Abuse Test F1 Score (Macro): {f1_macro_test:.4f}")


Test Set: (1107, 8)
label
0    877
1    230
Name: count, dtype: int64


  self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
  model.load_state_dict(torch.load("/kaggle/working/stage2_gendered_abuse.pth"))
Training Stage 2 - Epoch 15: 100%|██████████| 35/35 [00:00<00:00, 147.57it/s]


Test F1 Score (Macro): 0.5408

Classification Report:
               precision    recall  f1-score   support

           0     0.8075    0.9327    0.8656       877
           1     0.3723    0.1522    0.2160       230

    accuracy                         0.7706      1107
   macro avg     0.5899    0.5424    0.5408      1107
weighted avg     0.7171    0.7706    0.7307      1107

✅ Final Gendered Abuse Test F1 Score (Macro): 0.5408



