In [None]:
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForSequenceClassification
!pip install nlpaug
import nlpaug.augmenter.word as naw

In [None]:
# Load the IMDB reviews and encode the sentiment labels as integers.
# NOTE: the encoding is positive -> 0, negative -> 1.
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
# `map` + explicit cast gives an int64 column on every pandas version;
# `replace` relied on implicit object -> int downcasting, which pandas 2.2
# deprecates. An object-dtype label column would later break
# `torch.from_numpy(y.to_numpy())`.
df['sentiment'] = df['sentiment'].map({'positive': 0, 'negative': 1})
if df['sentiment'].isna().any():
    raise ValueError("unexpected value in 'sentiment'; expected 'positive' or 'negative'")
df['sentiment'] = df['sentiment'].astype('int64')

# Shuffle with a fixed seed so the 90/10 train/test split is identical on
# every run (the augmentation below is still stochastic).
df = df.sample(frac=1, random_state=42)
n = df.shape[0]
split_at = int(n * 0.9)
df_train = df.iloc[0: split_at]
df_test = df.iloc[split_at:]
x_test = df_test['review'].to_list()
y_test = df_test['sentiment']

# Build an augmented copy of the training split only (the test split is left
# untouched): first random word swaps, then spelling noise applied on top of
# the swapped text.
df_aug = df_train.copy()
aug = naw.RandomWordAug(action="swap", aug_max=30)
df_aug['review'] = aug.augment(df_aug['review'].to_list())
aug = naw.SpellingAug(aug_max=30)
df_aug['review'] = aug.augment(df_aug['review'].to_list())

# Final training set = original reviews + their augmented counterparts.
df_train = pd.concat([df_train, df_aug])
x_train = df_train['review'].to_list()
y_train = df_train['sentiment']

In [None]:
class Model(nn.Module):
    """Thin wrapper around the 'microsoft/deberta-v3-base' sequence classifier.

    The wrapped model is created with a two-label classification head and is
    stored on `self.transformer`.
    """

    def __init__(self):
        super().__init__()
        self.transformer = AutoModelForSequenceClassification.from_pretrained(
            'microsoft/deberta-v3-base',
            num_labels=2,
        )

    def forward(self, text):
        """Run the wrapped model and return only its logits.

        Args:
            text: mapping that is unpacked as keyword arguments into the
                wrapped model (e.g. the tokenizer output for one batch).

        Returns:
            The `.logits` attribute of the wrapped model's output.
        """
        outputs = self.transformer(**text)
        return outputs.logits

In [None]:
class MyDataset(data.Dataset):
    """Map-style dataset that tokenizes every text once, at construction time.

    Args:
        text: the raw text(s) handed to the tokenizer.
        y: optional labels exposing `.to_numpy()` (e.g. a pandas Series).
            When omitted the dataset is unlabeled and items are the feature
            dict alone.

    Raises:
        ValueError: if `y` is given but its length differs from the number of
            tokenized texts.
    """

    def __init__(self, text, y=None):
        super().__init__()
        self.tokenizer = AutoTokenizer.from_pretrained('microsoft/deberta-v3-base')
        # Pad/truncate the whole corpus in one call (max 512 tokens) so that
        # every item has the same length and default batching works.
        self.x = self.tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
        # `y` defaults to None, so it must not be dereferenced unconditionally
        # (the previous code raised AttributeError for unlabeled data).
        self.y = None if y is None else torch.from_numpy(y.to_numpy())
        if self.y is not None and self.y.shape[0] != len(self):
            raise ValueError(
                f'got {self.y.shape[0]} labels for {len(self)} texts'
            )

    def __getitem__(self, index):
        """Return `(features, label)`, or just `features` when unlabeled."""
        text = {k: v[index] for k, v in self.x.items()}
        if self.y is None:
            return text
        return (text, self.y[index])

    def __len__(self):
        """Number of tokenized texts (rows of `input_ids`)."""
        return self.x['input_ids'].shape[0]

In [None]:
# --- Hyperparameters ---------------------------------------------------------
batch_size = 8
learning_rate = 1e-5
num_epochs = 1

# --- Objective and target device ---------------------------------------------
loss_func = nn.CrossEntropyLoss()
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# --- Model and optimizer -----------------------------------------------------
model = Model()
print(model)
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate)

# --- Datasets and loaders ----------------------------------------------------
# Options shared by both loaders; only the shuffling differs (train: shuffled,
# test: fixed order).
loader_options = dict(batch_size=batch_size, num_workers=2, pin_memory=True)

train_dataset = MyDataset(x_train, y_train)
train_loader = data.DataLoader(dataset=train_dataset, shuffle=True, **loader_options)

test_dataset = MyDataset(x_test, y_test)
test_loader = data.DataLoader(dataset=test_dataset, shuffle=False, **loader_options)

In [None]:
def _batch_to_device(text, targets):
    """Cast one batch to int64 and move it to `device`.

    `text` is the dict of token tensors and `targets` the label tensor, as
    yielded by the data loaders.
    """
    text = {k: v.long().to(device) for k, v in text.items()}
    return text, targets.long().to(device)


for epoch in range(1, num_epochs + 1):
    # ---- optimisation pass over the training loader ----
    model.train()
    print('epoch:', epoch)
    for text, targets in train_loader:
        text, targets = _batch_to_device(text, targets)
        preds = model(text)
        loss = loss_func(preds, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # ---- evaluation on both loaders, without gradients ----
    model.eval()
    with torch.no_grad():
        for mode, loader in zip(['train', 'test'], [train_loader, test_loader]):
            epoch_loss, correct, num_samples = 0.0, 0, 0
            for text, targets in loader:
                text, targets = _batch_to_device(text, targets)
                preds = model(text)
                loss = loss_func(preds, targets)
                batch_n = targets.shape[0]
                # The loss is a batch mean; weight it by the batch size so
                # the final figure is a per-sample average even when the
                # last batch is smaller.
                epoch_loss += loss.item() * batch_n
                # `.item()` keeps the counter a plain Python int instead of
                # a device tensor.
                correct += (torch.argmax(preds, dim=1) == targets).sum().item()
                num_samples += batch_n

            if num_samples == 0:
                # An empty loader would otherwise divide by zero below.
                print(mode, '- no samples')
                continue

            epoch_loss = epoch_loss / num_samples
            accuracy = correct / num_samples
            print(mode, '- loss:', f'{epoch_loss:.2}', 'accuracy:', f'{accuracy:.4}')