In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

hindi_train = pd.read_csv("/kaggle/input/hertlex-hindi-data/hindi_train.csv")


In [None]:
import re,string
def normalize_text(text):
  # text = text.lower()
  emoji_pattern = re.compile("["
                              u"\U0001F600-\U0001F64F"  
                              u"\U0001F300-\U0001F5FF"  
                              u"\U0001F680-\U0001F6FF"  
                              u"\U0001F700-\U0001F77F"  
                              u"\U0001F780-\U0001F7FF"  
                              u"\U0001F800-\U0001F8FF"  
                              u"\U0001F900-\U0001F9FF"  
                              u"\U0001FA00-\U0001FA6F"  
                              u"\U0001FA70-\U0001FAFF"  
                              u"\U00002702-\U000027B0"  
                              u"\U000024C2-\U0001F251"
                              "]+", flags=re.UNICODE)
  text = text.lower()
  text = re.sub('\[.*?\]', ' ', text)
  text = re.sub('https?://\S+|www\.\S+', ' ', text)
  text = re.sub('<.*?>+', ' ', text)
  text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
  text = re.sub('\n', ' ', text)
  text = re.sub('\w*\d\w*', ' ', text)
  text = re.sub('<handle replaced>', '', text)
  text = emoji_pattern.sub(r'', text)
  return text

In [4]:
hindi_train['text'] = hindi_train['text'].apply(lambda x: normalize_text(x))
hindi_train

Unnamed: 0,label,text
0,1,जन्म दिन मुबारक हो
1,0,तेरी मां की चोदो साला तू ने केसे कह दिया तू सा...
2,0,रंडी दूसरे से कहने से पहले अपना लैंग्वेज सुधार...
3,0,मुझे तो हिजड़ा सा लग रहा है
4,1,आप अजमेर शरीफ मे थे मेरे दोस्त इमरान के साथ फो...
...,...,...
26906,0,नाम मोदी काम बकचोदी
26907,1,मेरी बेटी का neetमें सिलेक्शन हो जाए ए मां दुर्गा
26908,1,स्त्री हे क क्या मतलब
26909,0,जो बचे कपडे हे वो भी निकाल दे तकी ओर ilke ओर फ...


In [5]:
hindi_val = pd.read_csv("/kaggle/input/hertlex-hindi-data/hindi_val.csv")
hindi_val['text'] = hindi_val['text'].apply(lambda x: normalize_text(x))


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import re
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
def clean_text(text):
    text = normalize_text(text)
    return word_tokenize(text)


In [8]:
all_text = hindi_train['text'].tolist() + hindi_val['text'].tolist()
tokenized_texts = [clean_text(t) for t in all_text]
vocab = {'<PAD>': 0, '<UNK>': 1}
for sentence in tokenized_texts:
    for word in sentence:
        if word not in vocab:
            vocab[word] = len(vocab)

In [9]:
def load_glove(file_path, vocab, embedding_dim=100):
    embeddings_index = {}
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = vector

    embedding_matrix = np.zeros((len(vocab), embedding_dim))
    for word, idx in vocab.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[idx] = embedding_vector
        else:
            embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))
    return torch.tensor(embedding_matrix, dtype=torch.float32)


In [11]:
embedding_dim = 300
glove_path = "/kaggle/input/hertlex-hindi-data/cc.hi.300.vec"  # Update if needed
embedding_matrix = load_glove(glove_path, vocab, embedding_dim)


In [None]:
class TextDataset(Dataset):
    def __init__(self, dataframe, word2idx, max_len):
        self.texts = dataframe['text'].tolist()
        self.labels = dataframe['label'].tolist()
        self.vocab = word2idx
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)



    def __getitem__(self, idx):
        tokens = clean_text(self.texts[idx])
        ids = [self.vocab.get(w, self.vocab['<UNK>']) for w in tokens]
        if len(ids) < self.max_len:
            ids += [self.vocab['<PAD>']] * (self.max_len - len(ids))
        else:
            ids = ids[:self.max_len]
        return torch.tensor(ids), torch.tensor(self.labels[idx])

In [13]:

train_dataset = TextDataset(hindi_train, vocab,max_len=100)
val_dataset = TextDataset(hindi_val, vocab,max_len=100)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)

In [None]:
class CNNBiLSTMClassifier(nn.Module):
    

    def __init__(self, embedding_matrix):
        super(CNNBiLSTMClassifier, self).__init__()
        vocab_size, embed_size = embedding_matrix.shape
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        self.dropout1 = nn.Dropout(0.1)
        self.conv1 = nn.Conv1d(embed_size, 64, kernel_size=2)
        self.lstm = nn.LSTM(64, 128, batch_first=True, bidirectional=True)
        self.avgpool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(256, 128)
        self.dropout2 = nn.Dropout(0.1)
        self.out = nn.Linear(128, 2)

    def forward(self, x):
        x = self.embedding(x)
        x = self.dropout1(x)
        x = x.permute(0, 2, 1)
        x = self.conv1(x)
        x = x.permute(0, 2, 1)
        x, _ = self.lstm(x)
        x = x.permute(0, 2, 1)
        x = self.avgpool(x).squeeze(2)
        x = torch.relu(self.fc(x))
        x = self.dropout2(x)
        return self.out(x)

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = CNNBiLSTMClassifier(embedding_matrix).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)


  self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))


In [16]:
def train_epoch(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for inputs,labels in tqdm(loader, desc="Training"):
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def eval_model(model, loader):
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for inputs , labels in tqdm(loader, desc="Evaluating"):
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            pred = torch.argmax(outputs, dim=1)
            preds.extend(pred.cpu().numpy())
            targets.extend(labels.cpu().numpy())
    return f1_score(targets, preds, average='macro')


In [17]:
for epoch in range(8):
    print(f"\nEpoch {epoch+1}")
    train_loss = train_epoch(model, train_loader,optimizer, criterion)
    val_f1 = eval_model(model, val_loader)
    print(f"Train Loss: {train_loss:.4f} | Val F1: {val_f1:.4f}")
import matplotlib.pyplot as plt
train_losses = []
val_f1s = []
torch.save(model.state_dict(), "glove_cnn_bilstm_model.pth")





Epoch 1


Training: 100%|██████████| 841/841 [00:09<00:00, 92.63it/s] 
Evaluating: 100%|██████████| 106/106 [00:00<00:00, 147.51it/s]


Train Loss: 0.6595 | Val F1: 0.7258

Epoch 2


Training: 100%|██████████| 841/841 [00:08<00:00, 103.32it/s]
Evaluating: 100%|██████████| 106/106 [00:00<00:00, 150.33it/s]


Train Loss: 0.4916 | Val F1: 0.7720

Epoch 3


Training: 100%|██████████| 841/841 [00:08<00:00, 102.67it/s]
Evaluating: 100%|██████████| 106/106 [00:00<00:00, 152.07it/s]


Train Loss: 0.4345 | Val F1: 0.8052

Epoch 4


Training: 100%|██████████| 841/841 [00:08<00:00, 103.30it/s]
Evaluating: 100%|██████████| 106/106 [00:00<00:00, 154.93it/s]


Train Loss: 0.4100 | Val F1: 0.8082

Epoch 5


Training: 100%|██████████| 841/841 [00:08<00:00, 102.72it/s]
Evaluating: 100%|██████████| 106/106 [00:00<00:00, 157.55it/s]


Train Loss: 0.3932 | Val F1: 0.8098

Epoch 6


Training: 100%|██████████| 841/841 [00:08<00:00, 103.57it/s]
Evaluating: 100%|██████████| 106/106 [00:00<00:00, 155.30it/s]


Train Loss: 0.3772 | Val F1: 0.8030

Epoch 7


Training: 100%|██████████| 841/841 [00:08<00:00, 101.99it/s]
Evaluating: 100%|██████████| 106/106 [00:00<00:00, 152.24it/s]


Train Loss: 0.3650 | Val F1: 0.8114

Epoch 8


Training: 100%|██████████| 841/841 [00:08<00:00, 103.40it/s]
Evaluating: 100%|██████████| 106/106 [00:00<00:00, 157.75it/s]


Train Loss: 0.3541 | Val F1: 0.8146


In [None]:
def data_preprocessing(df, language):
    if language == "tamil":
        annotator_cols = ["ta_a1", "ta_a2", "ta_a3", "ta_a4", "ta_a5", "ta_a6"]
    elif language == "hindi":
        annotator_cols = ["hi_a1", "hi_a2", "hi_a3", "hi_a4", "hi_a5"]
    else:
        annotator_cols = ["en_a1", "en_a2", "en_a3", "en_a4", "en_a5", "en_a6"]

    df[annotator_cols] = df[annotator_cols].fillna("")

    def majority_label(row):
        votes = []
        for col in annotator_cols:
            val = row[col]
            try:
                if val != "":
                    num_val = float(val)
                    if num_val in [0.0, 1.0]:
                        votes.append(int(num_val))
            except:
                continue
        if not votes:
            return 0
        return 1 if votes.count(1) > votes.count(0) else 0

    df["label"] = df.apply(majority_label, axis=1)
    df["label"] = df["label"].astype(int)
    df = df.drop(columns=annotator_cols + ["key"])
    print(df["label"].value_counts())
    return df


In [19]:
df_hindi = pd.read_csv("/kaggle/input/hertlex-hindi-data/train_hi_l1.csv")
rows, columns = df_hindi.shape
print(f"Rows: {rows}, Columns: {columns}")
hindi_train_gender = data_preprocessing(df_hindi,"hindi")
hindi_train_gender['text'] = hindi_train_gender['text'].apply(lambda x: normalize_text(x))
rows, columns = hindi_train_gender.shape
print(f"Rows: {rows}, Columns: {columns}")


Rows: 6197, Columns: 7
label
0    4437
1    1760
Name: count, dtype: int64
Rows: 6197, Columns: 2


In [20]:
gendered_train_dataset = TextDataset(hindi_train_gender, vocab,max_len=100)
gendered_train_loader = DataLoader(gendered_train_dataset, batch_size=32, shuffle=True)


In [21]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CNNBiLSTMClassifier(embedding_matrix).to(device)
model.load_state_dict(torch.load("/kaggle/working/glove_cnn_bilstm_model.pth"))
print("\nLoaded Stage 1 model weights (hate speech) for fine-tuning.")



Loaded Stage 1 model weights (hate speech) for fine-tuning.


  self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
  model.load_state_dict(torch.load("/kaggle/working/glove_cnn_bilstm_model.pth"))


In [None]:
def train_stage2(model, train_loader, epochs=8):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(epochs):
        total_loss = 0.0
        loop = tqdm(train_loader, desc=f"Training Stage 2 - Epoch {epoch+1}")
        for inputs,labels in loop:
            optimizer.zero_grad()
            inputs,labels = inputs.to(device),labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
            loop.set_postfix(loss=loss.item())

        avg_loss = total_loss / len(train_loader)
        print(f"Epoch {epoch+1} Average Loss: {avg_loss:.4f}")
        
    torch.save(model.state_dict(), "stage2_gendered_abuse.pth")
    print("Stage 2 Model Saved: stage2_gendered_abuse.pth")

train_stage2(model, gendered_train_loader)


Training Stage 2 - Epoch 1: 100%|██████████| 194/194 [00:02<00:00, 81.72it/s, loss=0.545]


Epoch 1 Average Loss: 0.5743


Training Stage 2 - Epoch 2: 100%|██████████| 194/194 [00:02<00:00, 78.79it/s, loss=0.46] 


Epoch 2 Average Loss: 0.5665


Training Stage 2 - Epoch 3: 100%|██████████| 194/194 [00:02<00:00, 83.95it/s, loss=0.527]


Epoch 3 Average Loss: 0.5587


Training Stage 2 - Epoch 4: 100%|██████████| 194/194 [00:02<00:00, 83.63it/s, loss=0.688]


Epoch 4 Average Loss: 0.5559


Training Stage 2 - Epoch 5: 100%|██████████| 194/194 [00:02<00:00, 81.57it/s, loss=0.499]


Epoch 5 Average Loss: 0.5500


Training Stage 2 - Epoch 6: 100%|██████████| 194/194 [00:02<00:00, 83.49it/s, loss=0.592]


Epoch 6 Average Loss: 0.5470


Training Stage 2 - Epoch 7: 100%|██████████| 194/194 [00:02<00:00, 82.29it/s, loss=0.647]


Epoch 7 Average Loss: 0.5428


Training Stage 2 - Epoch 8: 100%|██████████| 194/194 [00:02<00:00, 82.32it/s, loss=0.523]


Epoch 8 Average Loss: 0.5406
✅ Stage 2 Model Saved: stage2_gendered_abuse.pth


In [None]:
from sklearn.metrics import f1_score, classification_report
df_hindi_test = pd.read_csv("/kaggle/input/hertlex-hindi-data/test_hi_l1.csv", engine='python')
print("Test Set:", df_hindi_test.shape)
hindi_test_gender = data_preprocessing(df_hindi_test, "hindi")
hindi_test_gender["text"] = hindi_test_gender["text"].apply(lambda x: normalize_text(x))

gendered_test_dataset = TextDataset(hindi_test_gender, vocab,max_len=100)
gendered_test_loader = DataLoader(gendered_test_dataset, batch_size=32,shuffle=True)

model = CNNBiLSTMClassifier(embedding_matrix).to(device)
model.load_state_dict(torch.load("/kaggle/working/stage2_gendered_abuse.pth"))
model.eval()

def inference_on_gendered_abuse_test(model, test_loader):
    model.eval()
    all_preds = []
    all_labels = []

    with torch.no_grad():
        
        loop = tqdm(test_loader, desc=f"Training Stage 2 - Epoch {epoch+1}")
        for inputs,labels in loop:
            inputs,labels = inputs.to(device),labels.to(device)



            outputs = model(inputs)
            preds = torch.argmax(outputs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    f1_macro = f1_score(all_labels, all_preds, average='macro')
    print(f"\nTest F1 Score (Macro): {f1_macro:.4f}")
    print("\nClassification Report:\n", classification_report(all_labels, all_preds, digits=4))
    return f1_macro

f1_macro_test = inference_on_gendered_abuse_test(model, gendered_test_loader)
print(f" Final Gendered Abuse Test F1 Score (Macro): {f1_macro_test:.4f}")


  self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
  model.load_state_dict(torch.load("/kaggle/working/stage2_gendered_abuse.pth"))


Test Set: (1517, 7)
label
0    1159
1     358
Name: count, dtype: int64


Training Stage 2 - Epoch 8: 100%|██████████| 48/48 [00:00<00:00, 137.14it/s]


Test F1 Score (Macro): 0.5899

Classification Report:
               precision    recall  f1-score   support

           0     0.7976    0.9215    0.8551      1159
           1     0.4888    0.2430    0.3246       358

    accuracy                         0.7614      1517
   macro avg     0.6432    0.5823    0.5899      1517
weighted avg     0.7247    0.7614    0.7299      1517

✅ Final Gendered Abuse Test F1 Score (Macro): 0.5899



