In [20]:
import pandas as pd

train_path = '/kaggle/input/marathihate-clean2/clean2_train.xlsx'
test_path = '/kaggle/input/marathihate-clean2/clean2_test.xlsx'
val_path = '/kaggle/input/marathihate-clean2/clean2_valid.xlsx'

train_data = pd.read_excel(train_path)
test_data = pd.read_excel(test_path)
val_data = pd.read_excel(val_path)


In [21]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, XLMRobertaConfig, AutoModel
import torch.nn.functional as F

from sklearn.model_selection import train_test_split
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import copy
import torch
import torch.nn as nn
from transformers import AutoModel

In [22]:
train_data = pd.read_excel(train_path)
test_data = pd.read_excel(test_path)
val_data = pd.read_excel(val_path)



In [23]:
hate_words = ['मठ्ठ', 'हरामकोर','निच','नशेडी','नक्षली','माफिया','बिनडोक','बेअक्कल','भाडखावू','गद्दारांनो',
              'नीच','गद्दार','जिहादी','दलिंदर','हलकट','हरामखोर','विकृत','माजोरडे','औकात','वसुलीबाज',
              'अवलादी','लबाडी','मूर्ख','बिकावु','छपरी','निर्लज्ज','दळभद्री','पुचाट','उपद्रवी','वंगाळ',
              'दुतौंडी','फालतू','ढोंगी','औलादी','नशेड़ी','मुर्दाबाद','पळपुटे','भित्रे','भाड्यानो','महाभकास',
              'अक्कलशून्य','लाळपुसे','नौटंकी','लफडेबाज','डुक्कर','कोंबडीचोर','अंधभक्त','खटारा','भ्रष्ट','चोर']
              
offensive_words = ['देशद्रोही', 'नालायक','वाकडतोंडया','शेंबड्या','लाळचाट्या','माठ्या','भंगारवाल्या','भडवा','वासाड्या','शेमन्या',
                   'रताळ्या','भिखारड्या','बायल्या','टकल्या','भिकारडा','बडव्या','भुसनाळ्या','झाकणझुल्या','हिजड्या','केळ्या',
                   'बाजारभुंग्या','काळतोंडया','भुरट्या','लुक्क्या','मरतुकड्या','कुत्र्या','नेपाळ्या','हरामी','भिकाऱ्या','माजोरड्या',
                   'झिपऱ्या','चाट्या','भोसडीच्या','हागऱ्या','भूसनाळ्या','तोंड्या','चुत्या','ढाप्या','पोंग्या','गुजरगांड्या',
                   'नरसाळ्या','हुंग्या','लबाड्या','वाकडतोंड्या','लांडया','गांजाडा','फोद्रीच्या','अमुश्या','चोमण्या','उपट्या']

profane_words = ['झक', 'बुल्ला', 'झाटू','शेट','झवन्या','झव','गांडू','पुच्ची','नपुसंक','नागडा',
                 'रांडेच्या','लवड्या','येडझव्या','आईझवाड्या','बुल्ल्या','रंडीबाज','भडव्या', 'भेंचोद','चुत्यागीरी','झवाड्या',
                  'Zवणे','मादरचोद','झवझव','भेंचोत','चुतीया','गांडीत','बूल्ला','ढुंगण','आईघाल्या','झवले',
                 'चुतमारीच्या','संडास','च्युत्या','झाट्या','चुतमारीच्याला','झवायला','रांडच्या','पुची','रांड','भोक',
                 'झवून','चुतिया','भोसडा','झवण्या','हांडग्या','गोट्या','नागडी','गांड','चोदणे','लंड']

In [24]:
label_map = {'HATE': 0, 'OFFN': 1, 'PRFN': 2, 'NOT': 3}
train_data['label'] = train_data['label'].map(label_map)
val_data['label'] = val_data['label'].map(label_map)
test_data['label'] = test_data['label'].map(label_map)

In [25]:
tokenizer = AutoTokenizer.from_pretrained("l3cube-pune/mahahate-multi-roberta")
config = XLMRobertaConfig.from_pretrained("l3cube-pune/mahahate-multi-roberta")
config.num_labels = 4  # Number of output labels
additional_features_size = len(hate_words) + len(offensive_words) + len(profane_words)


In [26]:
#Function to tokenize input sentence and additional features
def tokenize_input(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True )
    hate_feature = torch.tensor([1 if word in sentence else 0 for word in hate_words])
    offence_feature = torch.tensor([1 if word in sentence else 0 for word in offensive_words])
    profane_feature = torch.tensor([1 if word in sentence else 0 for word in profane_words])
    additional_features = torch.cat([hate_feature, offence_feature, profane_feature])
    return inputs, additional_features

In [27]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [28]:
class CustomXLMRobertaForSequenceClassification(nn.Module):
    def __init__(self, config, additional_features_size):
        super().__init__()
        self.roberta = AutoModel.from_pretrained("l3cube-pune/mahahate-multi-roberta", config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.word_attention = nn.Linear(config.hidden_size, 1)
        self.sentence_attention = nn.Linear(config.hidden_size, 1)
        self.classifier = nn.Linear(config.hidden_size + additional_features_size, config.num_labels)

    def forward(self, input_ids=None, attention_mask=None, additional_features=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)
        hidden_states = outputs.last_hidden_state

        #Calculating word attention
        word_scores = torch.tanh(self.word_attention(hidden_states))
        word_weights = torch.softmax(word_scores, dim=1)
        word_attention_output = torch.sum(hidden_states * word_weights, dim=1)

        #Calculating sentence attention
        sentence_scores = torch.tanh(self.sentence_attention(word_attention_output.unsqueeze(1)))
        sentence_weights = torch.softmax(sentence_scores, dim=1)
        sentence_attention_output = torch.sum(word_attention_output.unsqueeze(1) * sentence_weights, dim=1)

        pooled_output = self.dropout(sentence_attention_output)
        combined_features = torch.cat((pooled_output, additional_features), dim=1)
        logits = self.classifier(combined_features)
        return logits,word_weights, sentence_weights

In [29]:
from torch.nn.utils.rnn import pad_sequence

#Custom Dataset class for incorporating additional features (i.e. features extracted(converted to binary vector) from 3 lists of words)
class CustomDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data.iloc[idx]['text']
        label = self.data.iloc[idx]['label']
        inputs, additional_features = tokenize_input(sentence)
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'additional_features': additional_features,
            'label': torch.tensor(label)
        }
    def collate_fn(self, batch):
        input_ids = [item['input_ids'] for item in batch]
        attention_mask = [item['attention_mask'] for item in batch]
        additional_features = [item['additional_features'] for item in batch]
        labels = [item['label'] for item in batch]
        input_ids = pad_sequence(input_ids, batch_first=True, padding_value=tokenizer.pad_token_id).to(device)
        attention_mask = pad_sequence(attention_mask, batch_first=True, padding_value=0).to(device)
        additional_features = torch.stack(additional_features).to(device)
        labels = torch.stack(labels)
        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'additional_features': additional_features,
            'label': labels
        }

In [30]:
#Creating the DataLoader for training, validation, and testing with collate_fn
train_dataset = CustomDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=train_dataset.collate_fn)

val_dataset = CustomDataset(val_data)
val_loader = DataLoader(val_dataset, batch_size=8, collate_fn=val_dataset.collate_fn)

test_dataset = CustomDataset(test_data)
test_loader = DataLoader(test_dataset, batch_size=8, collate_fn=test_dataset.collate_fn)

### Below Cells are for Training the Model

#### Do not run below cells if you want to only test the model

In [10]:
model = CustomXLMRobertaForSequenceClassification(config, additional_features_size).to(device)

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Some weights of XLMRobertaModel were not initialized from the model checkpoint at l3cube-pune/mahahate-multi-roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
#Defining optimizer and loss function
optimizer = AdamW(model.parameters(), lr=5e-6)
loss_fn = torch.nn.CrossEntropyLoss()



In [1]:
#TO save new model after every 5 epochs.
num_epochs = 20
save_interval = 5

for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0.0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        additional_features = batch['additional_features'].to(device)
        labels = batch['label'].to(device)
        outputs,_,_ = model(input_ids=input_ids, attention_mask=attention_mask, additional_features=additional_features)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    #Validation
    model.eval()
    with torch.no_grad():
        val_loss = 0.0
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            additional_features = batch['additional_features'].to(device)
            labels = batch['label'].to(device)
            outputs,_,_ = model(input_ids=input_ids, attention_mask=attention_mask, additional_features=additional_features)
            val_loss += loss_fn(outputs, labels).item()
        val_loss /= len(val_loader)

        print(f"Epoch {epoch+1}, Training Loss: {avg_train_loss}, Validation Loss: {val_loss}")

    if (epoch + 1) % save_interval == 0:
        model_path = f"/kaggle/working/HateSpeechWithAttention_epoch{epoch+1}.pth"
        torch.save({
            'model_state_dict': model.state_dict(),
            'tokenizer': tokenizer,
            'config': config,
            'additional_features_size': additional_features_size,
            'label_map': label_map,
        }, model_path)

        print(f"Model saved at {model_path}")

In [15]:
print("Done")

Done


In [3]:
model_path = "/kaggle/working/HateSpeechHighestAccuracy.pth"
torch.save({
    'model_state_dict': model.state_dict(),
    'tokenizer': tokenizer,
    'config': config,
    'additional_features_size': additional_features_size,
    'label_map': label_map,
}, model_path)

print(f"Model saved at {model_path}")

In [2]:
#performing evaluation on test set
model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        additional_features = batch['additional_features'].to(device)
        labels = batch['label'].to(device)
        outputs,_,_ = model(input_ids=input_ids, attention_mask=attention_mask, additional_features=additional_features)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"Accuracy on test set: {accuracy}")

In [18]:
print("Done")

Done


## Importing the Model and predicting

In [146]:
model_path = "/kaggle/input/hatespeechhighestaccuracyfinal/pytorch/marathihatehighestaccuracy/1/HateSpeechHighestAccuracy.pth"
checkpoint = torch.load(model_path,map_location=torch.device('cpu'))
loaded_model = CustomXLMRobertaForSequenceClassification(config, checkpoint['additional_features_size'])
loaded_model.load_state_dict(checkpoint['model_state_dict'])
loaded_model.eval()
tokenizer = checkpoint['tokenizer']
label_map_inverse = {v: k for k, v in label_map.items()}

Some weights of XLMRobertaModel were not initialized from the model checkpoint at l3cube-pune/mahahate-multi-roberta and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [147]:
device='cpu'

In [148]:
#performing evaluation on test set
loaded_model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        additional_features = batch['additional_features'].to(device)
        labels = batch['label'].to(device)
        outputs,_,_ = loaded_model(input_ids=input_ids, attention_mask=attention_mask, additional_features=additional_features)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"Accuracy on test set: {accuracy}")

Accuracy on test set: 0.8285


In [149]:
#performing evaluation on validation set
loaded_model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in val_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        additional_features = batch['additional_features'].to(device)
        labels = batch['label'].to(device)
        outputs,_,_ = loaded_model(input_ids=input_ids, attention_mask=attention_mask, additional_features=additional_features)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"Accuracy on validation set: {accuracy}")

Accuracy on validation set: 0.828


In [150]:
#performing evaluation on train set
loaded_model.eval()
with torch.no_grad():
    correct = 0
    total = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        additional_features = batch['additional_features'].to(device)
        labels = batch['label'].to(device)
        outputs,_,_ = loaded_model(input_ids=input_ids, attention_mask=attention_mask, additional_features=additional_features)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    accuracy = correct / total
    print(f"Accuracy on train set: {accuracy}")

Accuracy on train set: 0.9395813953488372


### Prediction on single input Sentence

In [151]:
# sentence = "तुझ्या आयची गां* आईझ*** तुझ्या बायकोची गां* झ*"
# sentence ="कसाब भडवा पाकिस्तानी"
# sentence ="याचा अर्थ हे लोक किती निच वृत्तीची आहेत हे सिद्ध होते"
sentence ="अशी लोक खूप नीच असतात "
# sentence ="तसे म्हणायला गेले तर बाळा साहेब ठाकरे ओपनली शिव्या द्यायची तुझ्या आईला झ***"
# sentence ="नीच राजकारण करणारी लाचारसेना यांच्या *** कोंबले पाहिजेत जंत लाळचाट्या लाळपुसे"
# sentence ="धानोरा फसी येथे यांच्या हस्ते कबड्डी सामन्याचे उद्घाटन"
# sentence ="अय बु**  तुला कोणी विचारलं का  "
# sentence = "आ झवन्या मजा कोणी मलिक नाय तुज्या सारख्या कुत्रंचे असतात मालक मेंदू आहे"
# sentence ="बहुतांश लोकांचा असा समज (गैरसमज) आहे की वरिल वाक्यात श्री म्हणजे एखादे दैवत असावे, मलाही लहानपणी तसंच वाटायचं. आता सहज आठवलं म्हणून जिज्ञासेपोटी हा कौल घेतला. सर्वांचे आभार "
inputs, additional_features = tokenize_input(sentence)
additional_features = additional_features.unsqueeze(0)
# Move input_ids and attention_mask to the same device as the model
inputs['input_ids'] = inputs['input_ids'].to(device)
inputs['attention_mask'] = inputs['attention_mask'].to(device)

#Moving additional_features to the same device as the model
additional_features = additional_features.to(device)
outputs,word_weights,_ = loaded_model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'], additional_features=additional_features)

In [152]:
_, predicted_label = torch.max(outputs, dim=1)
predicted_label = label_map_inverse[predicted_label.item()]
print(f"Predicted Label: {predicted_label}")


Predicted Label: HATE


In [153]:
#Converting input_ids to tokens
tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])

#Iterating over the tokens and their attention scores
for token, weight in zip(tokens, word_weights.squeeze(0)):
    print(f"Token: {token}\tWeight: {weight.item()}")

#word_weights.squeeze(0).tolist() is a 2D list where each sublist contain only one element i.e. weight of that subword
#So converting it to 1d list 
weightsList = [item for sublist in word_weights.squeeze(0).tolist() for item in sublist]

Token: <s>	Weight: 0.11116229742765427
Token: ▁अशी	Weight: 0.12045133858919144
Token: ▁लोक	Weight: 0.12630227208137512
Token: ▁खूप	Weight: 0.11846616864204407
Token: ▁न	Weight: 0.13591612875461578
Token: ीच	Weight: 0.13548597693443298
Token: ▁असतात	Weight: 0.12813222408294678
Token: </s>	Weight: 0.12408357858657837


In [154]:
tokens=tokens[1:-1]  #removing start and end token

In [155]:
tokens

['▁अशी', '▁लोक', '▁खूप', '▁न', 'ीच', '▁असतात']

In [156]:
#Concatening subwords(because of tokenization) to make it full word 
#and assigning it summation of weights of its subwords
def combine_subwords_with_weights(subwords, weights):
    words = []
    current_word = ""
    current_weight = 0.0
    for subword, weight in zip(subwords, weights):            
        if subword.startswith('▁'):
            if current_word:
                words.append((current_word, current_weight))
            current_word = subword[1:]
            current_weight = weight
        else:
            current_word += subword
            current_weight += weight
    if current_word:
        words.append((current_word, current_weight))
    return words


In [157]:
combined_words = combine_subwords_with_weights(tokens, weightsList)
combined_words

[('अशी', 0.11116229742765427),
 ('लोक', 0.12045133858919144),
 ('खूप', 0.12630227208137512),
 ('नीच', 0.25438229739665985),
 ('असतात', 0.13548597693443298)]

In [158]:
word_dict = {}
for word, value in combined_words:
    if word in word_dict:
        word_dict[word] += value
    else:
        word_dict[word] = value
combined_words_unique = [(word, value) for word, value in word_dict.items()]
print(combined_words_unique)

[('अशी', 0.11116229742765427), ('लोक', 0.12045133858919144), ('खूप', 0.12630227208137512), ('नीच', 0.25438229739665985), ('असतात', 0.13548597693443298)]


In [159]:
import plotly.graph_objects as go
import plotly.express as px

words, weights = zip(*combined_words_unique)
color_scale = px.colors.sequential.Viridis  # Use a dark sequential color scale for dark colors
fig = go.Figure(data=[go.Bar(
    x=words,
    y=weights,
    marker=dict(color=weights, colorscale=color_scale),
)])
fig.update_layout(
    title='Weights of Words for predicting the label',
    xaxis_title='Words',
    yaxis_title='Weights',
    xaxis_tickangle=-45,  # Rotating x-axis labels for better readability
)
fig.show()
