In [None]:
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import pandas as pd
import numpy as np
from sklearn import preprocessing
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, AutoConfig, AdamW
from transformers import AutoModelForSequenceClassification
from transformers import BertTokenizer
from torch.utils.data import Dataset
import torch
from torch import nn
from transformers import Trainer, TrainingArguments
import numpy as np
from sklearn.model_selection import train_test_split 
import json

# Load the tokenizer and the sequence-classification model from the same
# checkpoint id so the two cannot drift apart.
pretrained_name = "ai4bharat/indic-bert"
tokenizer = AutoTokenizer.from_pretrained(pretrained_name)
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_name,
    num_labels=2,
)

In [None]:
def get_data(path, train_end=9000, test_end=9500, val_end=10000):
    """Load the CSV at ``path`` and cut it into train/validation/test frames.

    Rows are split by position: ``[0, train_end)`` is the training set,
    ``[train_end, test_end)`` the test set and ``[test_end, val_end)`` the
    validation set. In every split the ``'sentences'`` column is passed
    through ``eval`` so that its text cells become Python objects again.

    Args:
        path: Location of the CSV file; it must contain a ``'sentences'``
            column.
        train_end: Positional end (exclusive) of the training rows.
        test_end: Positional end (exclusive) of the test rows.
        val_end: Positional end (exclusive) of the validation rows.

    Returns:
        tuple: ``(train, val, test)`` DataFrames. Note that validation is
        returned before test even though test rows come first in the file.
    """
    data = pd.read_csv(path)

    def _parse_split(start, stop):
        # ``assign`` returns a new frame, so the parsed column is never written
        # into a slice of ``data`` (the in-place assignment on an ``iloc``
        # slice is what triggers pandas' SettingWithCopyWarning).
        # NOTE(review): ``eval`` executes whatever expression the cell holds;
        # only use this on trusted files, or switch to ``ast.literal_eval``
        # once it is confirmed that every cell is a plain literal.
        return data.iloc[start:stop].assign(
            sentences=lambda frame: frame['sentences'].apply(eval)
        )

    train = _parse_split(0, train_end)
    test = _parse_split(train_end, test_end)
    val = _parse_split(test_end, val_end)
    return train, val, test

# Location of the CSV, relative to the notebook's working directory.
input_path = '../data/data.csv'

# get_data returns the splits in (train, val, test) order.
train, val, test = get_data(path=input_path)

In [None]:
from datasets import load_metric
# metric1 scores accuracy, metric2 scores F1; both are used by the
# compute_metrics* callbacks.
metric1, metric2 = [load_metric(metric_name) for metric_name in ("accuracy", "f1")]
def compute_metrics(eval_pred):
    """Score a batch of predictions with the accuracy metric.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of ``logits`` over its last axis.

    Returns:
        Whatever ``metric1.compute`` returns for these predictions and labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Only accuracy is reported here; the F1 score that used to be computed
    # alongside it was never returned, so that dead computation is gone.
    return metric1.compute(predictions=predictions, references=labels)

def compute_metrics_hyperparameter(eval_pred):
    """Score a batch of predictions with the F1 metric.

    Args:
        eval_pred: A ``(logits, labels)`` pair. The predicted class of each
            example is the argmax of ``logits`` over its last axis.

    Returns:
        Whatever ``metric2.compute`` returns for these predictions and labels.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Only F1 is reported here; the accuracy that used to be computed
    # alongside it was never returned, so that dead computation is gone.
    return metric2.compute(predictions=predictions, references=labels)

In [None]:
class CaseDataset(Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    Every row must offer a ``'sentences'`` entry that can be indexed with
    ``[0]`` (only that first element is tokenized) and a ``'label'`` entry
    that ``torch.tensor`` can convert.

    Args:
        data: Table of samples; must support ``len()`` and ``.iloc[idx]``.
        tokenizer: Object exposing an ``encode_plus`` method whose result
            provides ``'input_ids'`` and ``'attention_mask'``.
        max_length: Length requested from the tokenizer for padding and
            truncation. Defaults to 512, the value that used to be hard-coded.
    """

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows, i.e. the number of samples."""
        # Every row is a sample. The earlier ``len(self.data) - 1`` hid the
        # last row from any sampler and reported -1 for an empty table.
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize the row at position ``idx``.

        Args:
            idx: Integer position, or a tensor holding one.

        Returns:
            dict: ``'input_ids'``, ``'attention_mask'`` and ``'label'``, each
            as a ``torch.Tensor``.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Fetch the row once instead of once per column.
        row = self.data.iloc[idx]

        encoded = self.tokenizer.encode_plus(
            text=row['sentences'][0],
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            return_attention_mask=True,
            truncation=True,
        )

        # Keep at most the first ``max_length`` positions. This matches the
        # previous explicit ``[:512]`` cut and changes nothing when the
        # tokenizer already returned exactly ``max_length`` entries.
        input_ids = encoded['input_ids'][:self.max_length]
        attention_mask = encoded['attention_mask'][:self.max_length]

        return {
            'input_ids': torch.tensor(input_ids),
            'attention_mask': torch.tensor(attention_mask),
            'label': torch.tensor(row['label']),
        }
# Wrap the training and validation frames so they can be fed to the Trainer.
train_dataset = CaseDataset(data=train, tokenizer=tokenizer)
validation_dataset = CaseDataset(data=val, tokenizer=tokenizer)

In [None]:
# Directory handed to TrainingArguments as ``output_dir``.
output_path = '../results/classification/bert/first-last'

# Optimisation schedule first, then batch sizes, then evaluation/saving.
training_args = TrainingArguments(
    output_dir=output_path,
    learning_rate=1e-5,
    weight_decay=0.01,
    warmup_steps=50,
    num_train_epochs=100,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy='epoch',
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
# Run the trained model on the test split and report its accuracy.
test_dataset = CaseDataset(data=test, tokenizer=tokenizer)
predictions = trainer.predict(test_dataset)
test_accuracy = predictions.metrics['test_accuracy']
print(test_accuracy)