In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from torch import nn
from transformers import AutoTokenizer, AutoModel
import torch
!set 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512'


In [28]:
df = pd.read_csv("./data/train.csv")
df_test = pd.read_csv("./data/test.csv")
labels = 'toxic severe_toxic obscene threat insult identity_hate'.split()


In [3]:
df.head()


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
device = torch.device('cuda')
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModel.from_pretrained("distilbert-base-uncased").to(device)


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def extract_target(df):
    return (df[labels].to_numpy())


def tokenize_data(df) -> torch.Tensor:
    return tokenizer(list(df["comment_text"]),
                     padding="max_length", truncation=True, return_tensors='pt', return_attention_mask=False)['input_ids']  # type: ignore


In [6]:
# df = pd.read_csv('./data/checkpoint_1.csv')

train, val = train_test_split(df, test_size=0.3, train_size=0.7, shuffle=True)
X_train = tokenize_data(train)
X_val = tokenize_data(val)
y_train = torch.Tensor(extract_target(train))
y_val = torch.Tensor(extract_target(val))


In [7]:
class Classifier(nn.Module):
    def __init__(self, input_dim) -> None:
        super().__init__()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 6),
        )

    def forward(self, x):
        logits = self.linear_relu_stack(x)
        return logits


In [8]:
input_dim = 768
learning_rate = 0.003
batch_size = 16
epoch_num = 1


In [39]:
from torch.utils.data import DataLoader, TensorDataset
dataset = TensorDataset(X_train.to(device), y_train.to(device))
dataloader = DataLoader(dataset, batch_size=batch_size)
test_dataloader = DataLoader(TensorDataset(
    X_val.to(device), y_val.to(device)), batch_size=batch_size)

classifier = Classifier(input_dim).to(device)
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(classifier.parameters(), lr=learning_rate)


In [41]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score


def logits_to_labels(logits):
    return torch.sigmoid(logits).round()


def train_loop(dataloader, model, classifier, loss_fn, optimizer):
    size = len(dataloader.dataset)
    for batch, (X, y) in tqdm(enumerate(dataloader)):
        # Compute prediction and loss
        embed = model(X).last_hidden_state[:, 0]
        pred = classifier(embed)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if batch % 512 == 0:
            loss, current = loss.item(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")


def test_loop(dataloader, model, classifier, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in tqdm(dataloader):
            embed = model(X).last_hidden_state[:, 0]
            pred = classifier(embed)
            test_loss += loss_fn(pred, y).item()
            correct += accuracy_score(y.cpu().numpy(),
                                      logits_to_labels(pred).cpu().numpy())

    test_loss /= num_batches
    correct /= size
    return (100*correct), test_loss


In [20]:
for epoch in range(epoch_num):
    train_loop(dataloader, model, classifier, loss_fn, optimizer)
    (accuracy, test_loss) = test_loop(
        test_dataloader, model, classifier, loss_fn)
    print(
        f"Test Error: \n Accuracy: {(accuracy):>0.1f}%, Avg loss: {test_loss:>8f} \n")


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)