In [None]:
%%capture
!wget https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xf aclImdb_v1.tar.gz
!pip install datasets
!pip install --upgrade fsspec
!pip install fsspec==0.9.0

In [None]:
import torch
import numpy as np
from pathlib import Path
from sklearn.utils import shuffle
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, AdamW

In [None]:
# Seed shared by every scikit-learn call below (shuffle, train_test_split).
RANDOM_STATE = 41
# Seed PyTorch's global RNG as well: randomly initialised model weights,
# dropout and the shuffling DataLoader draw from it, so without this two
# Restart-and-Run-All passes give different results. The trailing `;` hides
# the returned Generator repr from the cell output.
torch.manual_seed(RANDOM_STATE);

In [None]:
# Directories handed to read_imdb_split(); each is expected to contain a
# `pos/` and a `neg/` sub-folder of review text files.
TRAIN_DATA_PATH, TEST_DATA_PATH = Path('./aclImdb/train'), Path('./aclImdb/test')

In [None]:
def read_imdb_split(split_dir):
    """Read every review under ``split_dir/pos`` and ``split_dir/neg``.

    Args:
        split_dir: Directory (``str`` or ``pathlib.Path``) that contains the
            ``pos`` and ``neg`` sub-directories of review files.

    Returns:
        A ``(texts, labels)`` pair of equally long lists. ``texts[i]`` is the
        content of one review file and ``labels[i]`` is ``1`` when it came
        from ``pos`` and ``0`` when it came from ``neg``. All ``pos`` reviews
        come first, and files are ordered by path within each label.

    Raises:
        FileNotFoundError: If either label directory does not exist.
    """
    split_dir = Path(split_dir)
    texts, labels = [], []
    for label_dir in ["pos", "neg"]:
        # Compare by value: `is` against a string literal only works through
        # interning and raises a SyntaxWarning on Python 3.8+.
        label = 0 if label_dir == "neg" else 1
        # iterdir() yields entries in arbitrary, filesystem-dependent order.
        # Sorting makes the result (and any seeded shuffle applied to it)
        # reproducible across machines.
        for text_file in sorted((split_dir / label_dir).iterdir()):
            # Decode explicitly as UTF-8 rather than the platform's locale
            # encoding, which can raise on non-ASCII reviews.
            texts.append(text_file.read_text(encoding="utf-8"))
            labels.append(label)
    return texts, labels

In [None]:
def _load_subset(split_dir, limit=1000):
    """Read one IMDB split, shuffle it with RANDOM_STATE and keep the first `limit` examples."""
    texts, labels = shuffle(*read_imdb_split(split_dir), random_state=RANDOM_STATE)
    return texts[:limit], labels[:limit]


# Work on a 1000-review sample of each official split.
train_texts, train_labels = _load_subset(TRAIN_DATA_PATH)
test_texts, test_labels = _load_subset(TEST_DATA_PATH)

# Hold out 20% of the sampled training reviews for validation.
train_texts, valid_texts, train_labels, valid_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=RANDOM_STATE,
)


In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenise each split with the same settings (truncation and padding enabled).
train_encodings, valid_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, valid_texts, test_texts)
)

In [None]:
class ImdbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence, and
    ``labels`` is a sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many examples there are.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a 'labels' entry."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample
    
# Wrap each (encodings, labels) pair in an ImdbDataset.
train_dataset, valid_dataset, test_dataset = (
    ImdbDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (valid_encodings, valid_labels),
        (test_encodings, test_labels),
    )
)

# Only the training loader shuffles; validation batches keep a fixed order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=16)
valid_loader = DataLoader(dataset=valid_dataset, shuffle=False, batch_size=16)

In [None]:
# Use the GPU when one is visible, otherwise fall back to the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')

# Load the pretrained checkpoint and move it onto the chosen device.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased').to(device)
optim = AdamW(model.parameters(), lr=5e-5)

In [None]:
num_epochs = 3
model.train()  # switch to training mode before fine-tuning
for epoch in range(num_epochs):
    for cpu_batch in train_loader:
        # Move every tensor of the batch onto the model's device.
        inputs = {name: tensor.to(device) for name, tensor in cpu_batch.items()}
        # The batch carries a 'labels' entry; the loss is read off the model output.
        loss = model(**inputs).loss
        loss.backward()
        optim.step()
        # Clear the gradients so they do not accumulate into the next step.
        optim.zero_grad()

In [None]:
# Accuracy metric object; the evaluation cell below feeds it batch by batch.
# NOTE(review): newer `datasets` releases appear to deprecate `load_metric` in
# favour of the separate `evaluate` package -- confirm against the installed
# version before upgrading.
from datasets import load_metric

metric = load_metric("accuracy")

In [None]:
model.eval()  # switch to inference mode
# No gradients are needed while scoring the validation set.
with torch.no_grad():
    for batch in valid_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # Predicted class = index of the largest logit.
        predictions = model(**batch).logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

# Last expression of the cell, so the aggregated result is displayed.
metric.compute()