# Importing Modules

In [70]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AutoModel, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm import tqdm
import transformers
import pandas as pd
from transformers import AutoModel
from transformers import BertForSequenceClassification

In [71]:
# !git clone https://github.com/ai4bharat/indic-bert
# # !pip install -r /content/indic-bert/requirements.txt
# # !cd ..

# Loading Dataset

In [72]:
df = pd.read_csv('/content/emotions3.csv')
df.head(5)

Unnamed: 0,Review,Sentiment
0,मेरे ट्रैन का बुकिंग फेल हो रहा है बार बार,angry
1,मेरे फ्लाइट का बुकिंग फेल हो रहा है बार बार,angry
2,क्या बेकार की बातें कर रहे हो,angry
3,क्या बताओ यार मूड ही ख़राब है,angry
4,आप ऐसे कैसे मेरा पैसा काट सकते हो,angry


In [73]:
df['Sentiment'].unique()

array(['angry', 'happy', 'neutral', 'sad', nan], dtype=object)

In [74]:
df=df.dropna(subset=['Sentiment'])

# Model Building

In [75]:
class HindiSentimentDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length, label_map):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.label_map = label_map

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data.iloc[idx]['Review']
        label_str = self.data.iloc[idx]['Sentiment']
        label = self.label_map[label_str]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [76]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [77]:
label_map = {'neutral': 0,'angry':1 ,'sad':2,'happy':3}

In [78]:
# Define the model
model = BertForSequenceClassification.from_pretrained("ai4bharat/indic-bert", num_labels=len(label_map))

You are using a model of type albert to instantiate a model of type bert. This is not supported for all configurations of models and can yield errors.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ai4bharat/indic-bert and are newly initialized: ['classifier.bias', 'classifier.weight', 'embeddings.LayerNorm.bias', 'embeddings.LayerNorm.weight', 'embeddings.position_embeddings.weight', 'embeddings.token_type_embeddings.weight', 'embeddings.word_embeddings.weight', 'encoder.layer.0.attention.output.LayerNorm.bias', 'encoder.layer.0.attention.output.LayerNorm.weight', 'encoder.layer.0.attention.output.dense.bias', 'encoder.layer.0.attention.output.dense.weight', 'encoder.layer.0.attention.self.key.bias', 'encoder.layer.0.attention.self.key.weight', 'encoder.layer.0.attention.self.query.bias', 'encoder.layer.0.attention.self.query.weight', 'encoder.layer.0.attention.self.value.bias', 'encoder.layer.0.attention.self.value.weight', 'encoder.la

In [79]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(200000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [80]:
tokenizer = transformers.AutoTokenizer.from_pretrained('ai4bharat/indic-bert', keep_accents=True)
tokenizer

AlbertTokenizerFast(name_or_path='ai4bharat/indic-bert', vocab_size=200000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '[CLS]', 'eos_token': '[SEP]', 'unk_token': '<unk>', 'sep_token': '[SEP]', 'pad_token': '<pad>', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
}

In [81]:
# Define training parameters
batch_size = 32
max_length = 256
epochs = 5
learning_rate = 3e-5

In [82]:
# Create dataloaders for training and validation
train_dataset = HindiSentimentDataset(train_df, tokenizer, max_length, label_map)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

val_dataset = HindiSentimentDataset(val_df, tokenizer, max_length, label_map)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [83]:
train_dataset.__len__()

1604

In [84]:
# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

In [85]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(200000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [86]:
model.train()
for epoch in range(epochs):
    running_loss = 0.0
    for batch in tqdm(train_loader, desc="Epoch " + str(epoch + 1) + "/" + str(epochs)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Evaluate on validation set
    model.eval()
    val_accuracy = 0
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            loss = criterion(logits, labels)
            val_loss += loss.item()

            _, predicted = torch.max(logits, 1)
            val_accuracy += (predicted == labels).sum().item()

    print('Epoch:', epoch + 1, 'Train Loss:', running_loss / len(train_loader), 'Val Loss:', val_loss / len(val_loader), 'Val Accuracy:', val_accuracy / len(val_dataset))


Epoch 1/5: 100%|██████████| 51/51 [01:03<00:00,  1.25s/it]


Epoch: 1 Train Loss: 1.4428224329854928 Val Loss: 1.4042055240044227 Val Accuracy: 0.34663341645885287


Epoch 2/5: 100%|██████████| 51/51 [01:04<00:00,  1.26s/it]


Epoch: 2 Train Loss: 1.1519972997553207 Val Loss: 0.9360140195259681 Val Accuracy: 0.5810473815461347


Epoch 3/5: 100%|██████████| 51/51 [01:04<00:00,  1.27s/it]


Epoch: 3 Train Loss: 0.8000872918203765 Val Loss: 0.8175020447144141 Val Accuracy: 0.6483790523690773


Epoch 4/5: 100%|██████████| 51/51 [01:04<00:00,  1.27s/it]


Epoch: 4 Train Loss: 0.40124329230656813 Val Loss: 0.5296221054517306 Val Accuracy: 0.8179551122194514


Epoch 5/5: 100%|██████████| 51/51 [01:05<00:00,  1.27s/it]


Epoch: 5 Train Loss: 0.3012448154828128 Val Loss: 0.5548075139522552 Val Accuracy: 0.800498753117207


# Saving model

In [87]:
torch.save(model.state_dict(), 'indic_bert_sentiment_model.pth')

# Evaluation

In [88]:
# Evaluate the model on the validation set
def evaluate_model(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits

            _, predicted = torch.max(logits, 1)
            correct += (predicted == labels).sum().item()
            total += labels.size(0)

    accuracy = correct / total
    return accuracy

In [89]:
# Load the saved model
model.load_state_dict(torch.load('indic_bert_sentiment_model.pth'))
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(200000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [90]:
# Evaluate the model on the validation set
val_accuracy = evaluate_model(model, val_loader, device)
print('Validation Accuracy:', val_accuracy)

train_accuracy = evaluate_model(model, train_loader, device)
print('Training Accuracy:', train_accuracy)

Validation Accuracy: 0.800498753117207
Training Accuracy: 0.9139650872817955


# Prediction

In [91]:
# Make predictions on new data
def predict_sentiment(model, tokenizer, text, max_length, device):
    model.eval()
    inputs = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)

    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_label = torch.argmax(logits, dim=1).item()

    sentiment = [key for key, value in label_map.items() if value == predicted_label][0]
    return sentiment

In [93]:
# Example usage
text = "खिलौना ठीक-ठाक है लेकिन उसमें कुछ विशेष नहीं है।"
predicted_sentiment = predict_sentiment(model, tokenizer, text, max_length, device)
print('Predicted Sentiment:', predicted_sentiment)

Predicted Sentiment: neutral


In [95]:
# Example usage
text = "दुकान का सर्विस बहुत ही बेकर है समन भी डेफेक्टिव ठा और रेफुंड भी नही मिला गुस्सा हो रहा है"
predicted_sentiment = predict_sentiment(model, tokenizer, text, max_length, device)
print('Predicted Sentiment:', predicted_sentiment)

Predicted Sentiment: angry


In [96]:
# Example usage
text = "मुझे येह मसला पसंद आया स्वाड मस्त है और क्वालिटी भी एच्ची है खुशी हो रही है "
predicted_sentiment = predict_sentiment(model, tokenizer, text, max_length, device)
print('Predicted Sentiment:', predicted_sentiment)

Predicted Sentiment: happy


In [97]:
# Example usage
text = "बेकार प्लान है इनका "
predicted_sentiment = predict_sentiment(model, tokenizer, text, max_length, device)
print('Predicted Sentiment:', predicted_sentiment)

Predicted Sentiment: angry


In [100]:
# Example usage
text = "खेल के दौरान मेरे इस स्पोर्ट्स उपकरण का टूट जाना मेरी मनोबल को नहीं बढ़ाता बल्कि अधिक निराश कर देता है।"
predicted_sentiment = predict_sentiment(model, tokenizer, text, max_length, device)
print('Predicted Sentiment:', predicted_sentiment)

Predicted Sentiment: sad
