## Loading IMDB movie review dataset

In [1]:
import gzip
import shutil
import time

import pandas as pd
import requests
import torch
from torch.nn import functional as Flatten
import torchtext

import transformers
from transformers import DistilBertTokenizerFast, DistilBertTokenizer
from transformers import DistilBertForSequenceClassification


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.backends.cudnn.deterministic = True
random_seed = 123
torch.manual_seed(random_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs = 5


In [3]:
url = "https://github.com/rasbt/machine-learning-book/raw/main/ch08/movie_data.csv.gz"
filename = url.split("/")[-1]

with open(filename, "wb") as f:
	r = requests.get(url)
	f.write(r.content)

with gzip.open("movie_data.csv.gz", "rb") as f_in:
	with open("movie_data.csv", "wb") as f_out:
		shutil.copyfileobj(f_in, f_out)

In [4]:
df = pd.read_csv("movie_data.csv")
df.head(3)

Unnamed: 0,review,sentiment
0,"In 1974, the teenager Martha Moxley (Maggie Gr...",1
1,OK... so... I really like Kris Kristofferson a...,0
2,"***SPOILER*** Do not read this, if you think a...",0


In [5]:
train_texts = df.iloc[:35000]["review"].values
train_labels = df.iloc[:35000]["sentiment"].values

valid_texts = df.iloc[35000:40000]["review"].values
valid_labels = df.iloc[35000:40000]["sentiment"].values

test_texts = df.iloc[40000:]["review"].values
test_labels = df.iloc[40000:]["sentiment"].values


### Tokenizing the dataset 

In [16]:
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased" )

train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
valid_encodings = tokenizer(list(valid_texts), truncation=True, padding=True)
test_encodings = tokenizer(list(test_texts), truncation=True, padding=True)


In [17]:
class IMDBdataset(torch.utils.data.Dataset):
	def __init__(self, encodings, labels):
		self.encodings = encodings
		self.labels = labels

	def __getitem__(self, idx):
		item = {
			key : torch.tensor(val[idx]) for key, val in self.encodings.items()
		}
		item["labels"] = torch.tensor(self.labels[idx])

		return item
	def __len__(self):
		return len(self.labels)
	





In [23]:

train_dataset = IMDBdataset(train_encodings, train_labels)
valid_dataset = IMDBdataset(valid_encodings, valid_labels)
test_dataset = IMDBdataset(test_encodings, test_labels)

train_loader = torch.utils.data.DataLoader(
	train_dataset, batch_size=4, shuffle=True
)

valid_loader = torch.utils.data.DataLoader(
	valid_dataset, batch_size=4, shuffle=False
)

test_loader = torch.utils.data.DataLoader(
	test_dataset, batch_size=4, shuffle=False
)

### Loading and Finetuning a pretrained BERT Model


In [27]:
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
model = model.to(device)
model.train()

optim = torch.optim.Adam(model.parameters(), lr=5e-5)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


OutOfMemoryError: CUDA out of memory. Tried to allocate 90.00 MiB. GPU 0 has a total capacty of 1.95 GiB of which 7.56 MiB is free. Including non-PyTorch memory, this process has 1.93 GiB memory in use. Of the allocated memory 1.88 GiB is allocated by PyTorch, and 26.92 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [28]:
def compute_accuracy(model,data_loader, device):
	with torch.no_grad():
		correct_pred, num_examples = 0,0
		for batch_idx, batch in enumerate(data_loader):
			# Prepare data
			input_ids = batch['input_ids'].to(device)
			attention_mask = batch['attention_mask'].to(device)
			labels = batch['labels'].to(device)

			outputs = model(input_ids, attention_mask=attention_mask)
			logits = outputs["logits"]
			predicted_labels = torch.argmax(logits, 1)
			num_examples += labels.size(0)
			correct_pred += (predicted_labels == labels).sum()
	
	return correct_pred.float()/num_examples * 100

In [29]:
start_time = time.time()

for epoch in range(num_epochs):
	model.train()
	for batch_idx, batch in enumerate(train_loader):
		input_ids = batch["input_ids"].to(device)
		attention_mask = batch["attention_mask"].to(device)
		labels = batch["labels"].to(device)

		# forward pass
		outputs = model(input_ids, attention_mask = attention_mask, labels = labels)
		loss, logits = outputs["loss"], outputs["logits"]

		# Backward pass
		optim.zero_grad()
		loss.backward()
		optim.step()

		# logging
		if not batch_idx % 250:
			print(f" Epoch: {epoch+1: 04d}/{num_epochs:04d} | Batch : {batch_idx:04d}/"
		 f"{len(train_loader):04d} | "
		 f"Loss : {loss: .4f}")
			
	model.eval()
	with torch.set_grad_enabled(False):
		print(f"Training Accuracy: {compute_accuracy(model, train_loader, device):.2f}%"
		f"\n Valid accuracy : {compute_accuracy(model, valid_loader, device):.2f}%")

	print(f"Time elapsed : {(time.time() - start_time)/60:.2f} min")

print(f"Total training time:{(time.time() - start_time)/60:.2f} min ")
print(f"Test Accuracy: {compute_accuracy(model, test_loader, device):.2f}%")




RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)