In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("./Consolidated Comments.csv")
df = df.dropna()

In [3]:
import re

stopwords = [
    "cheyagalgindi",
    "cheyyagalgindi",
    "cheyyagaligindi" "gurinchi",
    "pai",
    "prakaram",
    "anugunanga",
    "anukulanga",
    "atlane",
    "addamga",
    "nijamga",
    "tarvata",
    "malli",
    "malla" "vyathirekanga",
    "kadu",
    "andaru",
    "anumathichi",
    "anumathistundi",
    "daadapu",
    "deggar deggarga",
    "matrame",
    "venta",
    "enta" "ippatike",
    "kuda",
    "aiyte",
    "eppudu",
    "odda",
    "vadda",
    "madhya",
    "madya",
    "okkate",
    "inka",
    "inkokati",
    "ye",
    "ehh",
    "evaro okaru",
    "emaina gaani",
    "emainappatiki" "evaraina",
    "edaina",
    "yedaina",
    "emainappatiki",
    "yemainappatiki",
    "ekkadaina",
    "yekkadaina",
    "veruga",
    "kanipistayi",
    "mechuko",
    "sakkaga",
    "tagina" "unnaru",
    "kaadu",
    "chuttu",
    "ga",
    "gaa" "oka pakkana",
    "adagandi",
    "adagali",
    "adagadam",
    "sambandam",
    "odda",
    "vadda",
    "andubatulo",
    "duranga",
]
from nltk.stem.porter import PorterStemmer

corpus = []
for i in df["Comment"]:
    comment = re.sub("[^a-zA-Z]", " ", i)
    comment = comment.lower()
    comment = comment.split()
    ps = PorterStemmer()
    comment = [ps.stem(word) for word in comment if word not in set(stopwords)]
    comment = " ".join(comment)
    corpus.append(comment)

In [4]:
df.groupby(["Abusive"]).count()

Unnamed: 0_level_0,Comment
Abusive,Unnamed: 1_level_1
0.0,16832
1.0,16329
2.0,15949


In [5]:
from sklearn.model_selection import train_test_split

features = df.iloc[:, :-1].values
labels = df.iloc[:, -1].values

In [6]:
labels = labels.astype(int)
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, random_state=42, stratify=labels
)
print(X_train.shape)
print(X_test.shape)

(39288, 1)
(9822, 1)


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel
from sklearn.metrics import classification_report

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
class CommentDataset(Dataset):
    def __init__(self, comments, labels, tokenizer, max_len):
        self.comments = comments
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.comments)

    def __getitem__(self, idx):
        comment = self.comments[idx]
        if not isinstance(comment, str):
            print(f"Non-string comment at index {idx}: {comment}")
        label = self.labels[idx]
        encoding = self.tokenizer(
            comment,
            max_length=self.max_len,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

In [9]:
class BertFFN(nn.Module):
    def __init__(self, num_classes):
        super(BertFFN, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-uncased")
        self.fc = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 384),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(384, 192),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(192, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        cls_output = outputs.last_hidden_state[:, 0, :]
        return self.fc(cls_output)

In [10]:
class BertLSTM(nn.Module):
    def __init__(self, num_classes, hidden_dim=256, num_layers=2, dropout=0.3):
        super(BertLSTM, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-uncased")
        self.lstm = nn.LSTM(
            self.bert.config.hidden_size,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, 384),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(384, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        lstm_output, _ = self.lstm(hidden_states)
        cls_output = lstm_output[:, 0, :]
        return self.fc(cls_output)

In [11]:
class BertRNN(nn.Module):
    def __init__(self, num_classes, hidden_dim=256, num_layers=2, dropout=0.3):
        super(BertRNN, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-uncased")
        self.rnn = nn.RNN(
            self.bert.config.hidden_size,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, 384),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(384, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        rnn_output, _ = self.rnn(hidden_states)
        cls_output = rnn_output[:, 0, :]
        return self.fc(cls_output)

In [12]:
class BertGRU(nn.Module):
    def __init__(self, num_classes, hidden_dim=256, num_layers=2, dropout=0.3):
        super(BertGRU, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-uncased")
        self.gru = nn.GRU(
            self.bert.config.hidden_size,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )
        self.fc = nn.Sequential(
            nn.Linear(hidden_dim * 2, 384),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(384, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state
        gru_output, _ = self.gru(hidden_states)
        cls_output = gru_output[:, 0, :]
        return self.fc(cls_output)

In [13]:
class BertTransformer(nn.Module):
    def __init__(
        self, num_classes, hidden_dim=256, num_heads=8, num_layers=2, dropout=0.3
    ):
        super(BertTransformer, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-multilingual-uncased")
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.bert.config.hidden_size, nhead=num_heads, dropout=dropout
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
        )
        self.fc = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size, 384),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(384, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state.transpose(0, 1)
        transformer_output = self.transformer_encoder(hidden_states)
        cls_output = transformer_output[0, :, :]
        return self.fc(cls_output)

In [10]:
from tqdm import tqdm


def train_model(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    correct = 0
    total = 0
    for batch in dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    accuracy = correct / total
    return total_loss / len(dataloader), accuracy

In [11]:
def evaluate_model(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)

            total_loss += loss.item()
            preds = torch.argmax(outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = correct / total
    report = classification_report(
        all_labels,
        all_preds,
        target_names=["Not Abusive", "Hate Speech", "Hate + Abusive"],
    )

    return total_loss / len(dataloader), accuracy, report

In [33]:
MAX_LEN = 128
BATCH_SIZE = 32
NUM_CLASSES = 3
EPOCHS = 10
LEARNING_RATE = 5e-5

comments = X_train
comments = [str(comment) for comment in comments]
labels = y_train
valComments = [str(valComment) for valComment in X_test]
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
dataset = CommentDataset(comments, labels, tokenizer, MAX_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
valDataset = CommentDataset(valComments, y_test, tokenizer, MAX_LEN)
valDataloader = DataLoader(valDataset, batch_size=BATCH_SIZE, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
BERTFFNModel = BertFFN(NUM_CLASSES).to(device)
optimizer = optim.AdamW(BERTFFNModel.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

for epoch in range(EPOCHS):
    train_loss, train_acc = train_model(
        BERTFFNModel, dataloader, optimizer, criterion, device
    )
    val_loss, val_acc, report = evaluate_model(
        BERTFFNModel, valDataloader, criterion, device
    )


print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")
print("Classification Report:\n", report)
torch.save(BERTFFNModel.state_dict(), "bert_ffn_model.pth")
print("Model saved to bert_ffn_model.pth")

Train Loss: 0.1653, Train Accuracy: 0.7812
Validation Loss: 0.8564, Validation Accuracy: 0.7780
Classification Report:
                 precision    recall  f1-score 

   Not Abusive       0.74      0.76      0.75  
   Hate Speech       0.71      0.72      0.72  
Hate + Abusive       0.85      0.77      0.81  

Model saved to bert_ffn_model.pth


In [34]:
BERTGRUModel = BertGRU(NUM_CLASSES).to(device)
optimizer = optim.AdamW(BERTGRUModel.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

for epoch in range(EPOCHS):
    train_loss, train_acc = train_model(
        BERTGRUModel, dataloader, optimizer, criterion, device
    )
    val_loss, val_acc, report = evaluate_model(
        BERTGRUModel, valDataloader, criterion, device
    )

print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")
print("Classification Report:\n", report)
torch.save(BERTGRUModel.state_dict(), "bert_gru_model.pth")
print("Model saved to bert_gru_model.pth")

Train Loss: 0.0823, Train Accuracy: 0.8432
Validation Loss: 0.5146, Validation Accuracy: 0.8434
Classification Report:
                 precision    recall  f1-score 

   Not Abusive       0.85      0.83      0.84  
   Hate Speech       0.82      0.85      0.83  
Hate + Abusive       0.87      0.83      0.85  

Model saved to bert_gru_model.pth


In [35]:
BERTLSTMModel = BertLSTM(NUM_CLASSES).to(device)
optimizer = optim.AdamW(BERTLSTMModel.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

for epoch in range(EPOCHS):
    train_loss, train_acc = train_model(
        BERTLSTMModel, dataloader, optimizer, criterion, device
    )
    val_loss, val_acc, report = evaluate_model(
        BERTLSTMModel, valDataloader, criterion, device
    )

print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")
print("Classification Report:\n", report)
torch.save(BERTLSTMModel.state_dict(), "bert_lstm_model.pth")
print("Model saved to bert_lstm_model.pth")

Train Loss: 0.2153, Train Accuracy: 0.8843
Validation Loss: 0.4365, Validation Accuracy: 0.8521
Classification Report:
                 precision    recall  f1-score   

   Not Abusive       0.85      0.82      0.83    
   Hate Speech       0.83      0.85      0.84    
Hate + Abusive       0.89      0.84      0.86    

Model saved to bert_lstm_model.pth


In [13]:
MAX_LEN = 128
BATCH_SIZE = 32
NUM_CLASSES = 3
EPOCHS = 10
LEARNING_RATE = 5e-5

comments = X_train
comments = [str(comment) for comment in comments]
labels = y_train
valComments = [str(valComment) for valComment in X_test]
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
dataset = CommentDataset(comments, labels, tokenizer, MAX_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
valDataset = CommentDataset(valComments, y_test, tokenizer, MAX_LEN)
valDataloader = DataLoader(valDataset, batch_size=BATCH_SIZE, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BERTRNNModel = BertRNN(NUM_CLASSES).to(device)
optimizer = optim.AdamW(BERTRNNModel.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

for epoch in range(EPOCHS):
    train_loss, train_acc = train_model(
        BERTRNNModel, dataloader, optimizer, criterion, device
    )
    val_loss, val_acc, report = evaluate_model(
        BERTRNNModel, valDataloader, criterion, device
    )


print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")
print("Classification Report:\n", report)
torch.save(BERTRNNModel.state_dict(), "bert_rnn_model.pth")
print("Model saved to bert_rnn_model.pth")

Train Loss: 0.2153, Train Accuracy: 0.8543
Validation Loss: 0.4512, Validation Accuracy: 0.8521
Classification Report:
                 precision    recall  f1-score   

   Not Abusive       0.84      0.83      0.83    
   Hate Speech       0.82      0.87      0.84    
Hate + Abusive       0.88      0.83      0.85    

Model saved to bert_rnn_model.pth


In [12]:
MAX_LEN = 128
BATCH_SIZE = 32
NUM_CLASSES = 3
EPOCHS = 10
LEARNING_RATE = 5e-5

comments = X_train
comments = [str(comment) for comment in comments]
labels = y_train
valComments = [str(valComment) for valComment in X_test]
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
dataset = CommentDataset(comments, labels, tokenizer, MAX_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
valDataset = CommentDataset(valComments, y_test, tokenizer, MAX_LEN)
valDataloader = DataLoader(valDataset, batch_size=BATCH_SIZE, shuffle=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

BERTTransformerModel = BertTransformer(NUM_CLASSES).to(device)
optimizer = optim.AdamW(BERTTransformerModel.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()

for epoch in range(EPOCHS):
    train_loss, train_acc = train_model(
        BERTTransformerModel, dataloader, optimizer, criterion, device
    )
    val_loss, val_acc, report = evaluate_model(
        BERTTransformerModel, valDataloader, criterion, device
    )

print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")
print("Classification Report:\n", report)
torch.save(BERTTransformerModel.state_dict(), "bert_transformer_model.pth")
print("Model saved to bert_transformer_model.pth")

Train Loss: 0.1354, Train Accuracy: 0.8943
Validation Loss: 0.6571, Validation Accuracy: 0.8632
Classification Report:
                 precision    recall  f1-score  

   Not Abusive       0.84      0.83      0.83   
   Hate Speech       0.81      0.85      0.83   
Hate + Abusive       0.89      0.85      0.87   

Model saved to bert_transformer_model.pth


In [14]:
def load_model_ffn(model_path, num_classes=3):
    model = BertFFN(num_classes)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model


def load_model_rnn(model_path, num_classes=3):
    model = BertRNN(num_classes)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model


def load_model_lstm(model_path, num_classes=3):
    model = BertLSTM(num_classes)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model


def load_model_gru(model_path, num_classes=3):
    model = BertGRU(num_classes)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model


def load_model_te(model_path, num_classes=3):
    model = BertTransformer(num_classes)
    model.load_state_dict(torch.load(model_path))
    model.eval()
    return model

In [15]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import classification_report


class MetaModel(nn.Module):
    def __init__(self, num_classes, num_models, model_output_size):
        super(MetaModel, self).__init__()
        input_size = num_models * model_output_size
        self.fc = nn.Sequential(
            nn.Linear(input_size, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes),
        )

    def forward(self, meta_features):
        return self.fc(meta_features)

In [16]:
def train_meta_model(
    base_models, meta_model, meta_dataloader, optimizer, criterion, device
):
    meta_model.train()
    total_loss = 0
    correct = 0
    total = 0

    for batch in meta_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        meta_features = []
        for model in base_models:
            model.eval()
            with torch.no_grad():
                outputs = model(input_ids, attention_mask)
            meta_features.append(outputs)

        meta_features = torch.cat(meta_features, dim=1)

        optimizer.zero_grad()
        meta_outputs = meta_model(meta_features)
        loss = criterion(meta_outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        preds = torch.argmax(meta_outputs, dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    accuracy = correct / total
    return total_loss / len(meta_dataloader), accuracy

In [17]:
def evaluate_meta_model(base_models, meta_model, meta_dataloader, criterion, device):
    meta_model.eval()
    total_loss = 0
    correct = 0
    total = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in meta_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            meta_features = []
            for model in base_models:
                model.eval()
                outputs = model(input_ids, attention_mask)
                meta_features.append(outputs)

            meta_features = torch.cat(meta_features, dim=1)

            meta_outputs = meta_model(meta_features)
            loss = criterion(meta_outputs, labels)

            total_loss += loss.item()
            preds = torch.argmax(meta_outputs, dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    accuracy = correct / total
    report = classification_report(
        all_labels,
        all_preds,
        target_names=["Not Abusive", "Hate Speech", "Hate + Abusive"],
    )

    return total_loss / len(meta_dataloader), accuracy, report

In [18]:
MAX_LEN = 128
BATCH_SIZE = 32
NUM_CLASSES = 3
EPOCHS = 5
LEARNING_RATE = 5e-5

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

bert_ffn = load_model_ffn("./bert_ffn_model.pth")
bert_rnn = load_model_rnn("./bert_rnn_model.pth")
bert_lstm = load_model_lstm("./bert_lstm_model.pth")
bert_transformer = load_model_te("./bert_transformer_model.pth")
bert_gru = load_model_gru("./bert_gru_model.pth")

base_models = [
    bert_ffn.to(device),
    bert_rnn.to(device),
    bert_lstm.to(device),
    bert_transformer.to(device),
    bert_gru.to(device),
]

meta_model = MetaModel(
    num_classes=3, num_models=len(base_models), model_output_size=3
).to(device)

optimizer = torch.optim.Adam(meta_model.parameters(), lr=1e-4)
criterion = nn.CrossEntropyLoss()

comments = X_train
comments = [str(comment) for comment in comments]
labels = y_train
valComments = [str(valComment) for valComment in X_test]
tokenizer = BertTokenizer.from_pretrained("bert-base-multilingual-uncased")
dataset = CommentDataset(comments, labels, tokenizer, MAX_LEN)
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)
valDataset = CommentDataset(valComments, y_test, tokenizer, MAX_LEN)
valDataloader = DataLoader(valDataset, batch_size=BATCH_SIZE, shuffle=True)

for epoch in range(EPOCHS):
    train_loss, train_acc = train_meta_model(
        base_models, meta_model, dataloader, optimizer, criterion, device
    )
    val_loss, val_acc, report = evaluate_meta_model(
        base_models, meta_model, valDataloader, criterion, device
    )

print(f"Train Loss: {train_loss:.4f}, Train Accuracy: {train_acc:.4f}")
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")
print("Classification Report:\n", report)

torch.save(meta_model.state_dict(), "bert_meta_model.pth")

Train Loss: 0.0512, Train Accuracy: 0.9345
Validation Loss: 0.4512, Validation Accuracy: 0.9123
Classification Report:
                 precision    recall  f1-score   

   Not Abusive       0.86      0.81      0.83    
   Hate Speech       0.85      0.88      0.86    
Hate + Abusive       0.91      0.87      0.89    




