## 1. Download Dataset.

In [1]:
#Download Dataset
!wget -q -nc http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

#Unzip
!tar -zxf /content/aclImdb_v1.tar.gz

## 2. Install Dependencies.

In [2]:
# Install transformers library.
!pip install -q git+https://github.com/huggingface/transformers.git
# Install helper functions.
!pip install -q git+https://github.com/gmihaila/ml_things.git

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.0/43.0 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for ml_things (setup.py) ... [?25l[?25hdone


## 3. Import dependencies and set the configuration.

In [3]:
import io
import os
import torch
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
from ml_things import plot_dict, plot_confusion_matrix, fix_text
from transformers import(set_seed,
                         TrainingArguments,
                         Trainer,
                         GPT2Config,
                         GPT2Tokenizer,
                         AdamW,
                         get_linear_schedule_with_warmup,
                         GPT2ForSequenceClassification)

# Seed every random number generator handled by `set_seed` for reproducibility.
set_seed(124)

# Training hyper-parameters:
#   epochs     - number of full passes over the training set
#   batch_size - number of reviews per batch
#   max_length - maximum text sequence length handed to the collator
epochs, batch_size, max_length = 5, 32, 60

# Device-agnostic setup: use the GPU when one is available, else the CPU.
if torch.cuda.is_available():
  device = "cuda"
else:
  device = "cpu"

# Name of the pretrained checkpoint to fine-tune.
model_name = 'gpt2'

# Mapping from the string labels to the integer ids used as targets.
label_ids = dict(neg=0, pos=1)

# Number of distinct labels.
number_labels = len(label_ids)



## 4. Define the movie review Dataset class.

In [4]:
class MovieReviewModel(Dataset):
  """In-memory dataset of raw movie reviews and their string sentiment labels.

  Despite the name, this is a ``Dataset`` and not a model: it reads every file
  found under ``<path>/pos`` and ``<path>/neg`` at construction time, passes
  the text through ``fix_text`` and stores it together with the name of the
  sub-directory (``'pos'`` or ``'neg'``) it came from.

  Arguments:
    path: directory that contains the ``pos`` and ``neg`` sub-directories.
    use_tokenizer: not used by this class; kept so existing callers that pass
      it keep working.

  Raises:
    ValueError: if ``path`` is not an existing directory.
  """

  def __init__(self,path,use_tokenizer):

    #Check if path exists.
    if not os.path.isdir(path):
      raise ValueError('Invalid "path" variable!, Needs to be a directory...')

    self.texts = []
    self.labels = []

    #Loop through the labels.
    for label in ['pos','neg']:
      sentiment_path = os.path.join(path,label)

      #os.listdir returns entries in arbitrary order; sort them so the sample
      #order (and therefore any seeded shuffling) is the same on every run.
      file_names = sorted(os.listdir(sentiment_path))

      #Read through each file.
      for file_name in tqdm(file_names, desc=f'{label} files'):
        file_path = os.path.join(sentiment_path,file_name)

        #Skip anything that is not a regular file (e.g. nested directories).
        if not os.path.isfile(file_path):
          continue

        #Read content; the context manager closes the handle right away
        #instead of leaving thousands of files open until garbage collection.
        with io.open(file_path, mode='r', encoding='utf-8') as review_file:
          content = review_file.read()

        #Resolve any unicode issues and save content.
        self.texts.append(fix_text(content))
        self.labels.append(label)

    #The number of sample texts
    self.n_examples = len(self.labels)

  def __len__(self):
    """Return the number of loaded reviews."""
    return self.n_examples

  def __getitem__(self, item) :
    """Return the review at index ``item`` as ``{'text': str, 'label': str}``."""
    return {'text':self.texts[item],
            'label':self.labels[item]}







  and should_run_async(code)


## 5. Define the GPT-2 data collator.

In [5]:
class Gpt2Classification(object):
  """Batch collator: tokenizes review texts and attaches encoded labels.

  Arguments:
    use_tokenizer: callable tokenizer; must expose ``model_max_length`` when
      ``max_sequence_len`` is not given.
    labels_encoder: mapping from each raw label to its integer id.
    max_sequence_len: maximum length passed to the tokenizer; falls back to
      ``use_tokenizer.model_max_length`` when ``None``.
  """

  def __init__(self,use_tokenizer,labels_encoder,max_sequence_len=None):
    self.use_tokenizer = use_tokenizer

    # Fall back to the tokenizer's own limit when no length was requested.
    if max_sequence_len is None:
      max_sequence_len = use_tokenizer.model_max_length
    self.max_sequence_len = max_sequence_len

    self.labels_encoder = labels_encoder

  def __call__(self,sequences):
    """Turn a list of ``{'text', 'label'}`` samples into tokenizer output plus a ``'labels'`` tensor."""
    # Collect the raw texts, then the raw labels, of every sample in the batch.
    texts = []
    for sample in sequences:
      texts.append(sample['text'])
    raw_labels = []
    for sample in sequences:
      raw_labels.append(sample['label'])

    # Translate each raw label into its integer id.
    encoded_labels = []
    for raw_label in raw_labels:
      encoded_labels.append(self.labels_encoder[raw_label])

    # Tokenize the whole batch at once, padding and truncating as configured.
    inputs = self.use_tokenizer(
        text=texts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=self.max_sequence_len,
    )

    # Attach the encoded labels as a tensor under the 'labels' key.
    label_tensor = torch.tensor(encoded_labels)
    inputs.update({'labels': label_tensor})
    return inputs




  and should_run_async(code)


## 6. Define the training function (one epoch per call).

In [6]:
def train(dataloader,optimizer_,scheduler_,device_):
  """Train the global ``model`` for one pass over ``dataloader``.

  Arguments:
    dataloader: sized iterable of batches; each batch is a mapping of tensors
      that includes a ``'labels'`` entry and is passed to the model as kwargs.
    optimizer_: optimizer stepped once per batch.
    scheduler_: learning-rate scheduler stepped once per batch.
    device_: device every batch tensor is moved to before the forward pass.

  Returns:
    Tuple ``(true_labels, predictions_labels, avg_loss)``: the flattened
    ground-truth labels, the arg-max predictions, and the mean per-batch loss.
  """

  #Use global varible.
  global model

  predictions_labels = []
  true_labels = []

  #Accumulated training loss for this epoch
  train_loss = 0

  #Put model into train mode.
  model.train()

  # For each batch of training data...
  for batch in tqdm(dataloader, total=len(dataloader)):

    # Add original labels - use later for evaluation.
    true_labels += batch['labels'].numpy().flatten().tolist()

    # move batch to target device
    batch = {k:v.type(torch.long).to(device_) for k,v in batch.items()}

    #Clear gradients left over from the previous batch
    model.zero_grad()

    #Perform a forward pass; the first two outputs are the loss and the logits
    outputs = model(**batch)

    loss, logits = outputs[:2]

    train_loss += loss.item()

    #Backward pass to compute the gradients
    loss.backward()

    #Clip the gradient norm to 1.0 in place. `clip_grad_norm_` replaces the
    #deprecated `clip_grad_norm`, which emitted a warning on every batch.
    torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)

    #Update the parameters, then advance the learning-rate schedule
    optimizer_.step()

    scheduler_.step()

    #Move logits to the CPU to compute the predicted class per sample
    logits = logits.detach().cpu().numpy()

    predictions_labels += logits.argmax(axis=-1).flatten().tolist()

  #Mean loss over the batches of this epoch
  avg_loss = train_loss / len(dataloader)

  return true_labels, predictions_labels, avg_loss



  and should_run_async(code)


## 7. Define the validation function (one pass over the data per call).

In [7]:
def validation(dataloader,device_):
  """Evaluate the global ``model`` on every batch of ``dataloader``.

  Arguments:
    dataloader: sized iterable of batches; each batch is a mapping of tensors
      that includes a ``'labels'`` entry and is passed to the model as kwargs.
    device_: device every batch tensor is moved to before the forward pass.

  Returns:
    Tuple ``(true_labels, predictions_labels, avg_loss)``: the flattened
    ground-truth labels, the arg-max predictions, and the mean per-batch loss.
  """
  global model

  gold_labels, predicted_labels = [], []
  running_loss = 0

  # Switch the model to evaluation mode.
  model.eval()

  for batch in tqdm(dataloader,total=len(dataloader)):
    # Keep the ground-truth labels of this batch for the final metrics.
    gold_labels.extend(batch['labels'].numpy().flatten().tolist())

    # Cast every tensor to long and move it to the target device.
    device_batch = {}
    for key, tensor in batch.items():
      device_batch[key] = tensor.type(torch.long).to(device_)

    # No gradients are needed while evaluating.
    with torch.no_grad():
      loss, logits = model(**device_batch)[:2]
      running_loss += loss.item()
      predicted_labels.extend(logits.argmax(axis=-1).flatten().tolist())

  # Mean loss over all batches.
  return gold_labels, predicted_labels, running_loss/len(dataloader)



  and should_run_async(code)


## 8. Load model and Tokenizer.

In [8]:
# Load the pretrained configuration for `model_name` and set the size of the
# classification head through `num_labels`.
print('Loading configuraiton...')
model_config = GPT2Config.from_pretrained(pretrained_model_name_or_path=model_name, num_labels=number_labels)

# Load the tokenizer that matches the pretrained checkpoint.
print('Loading tokenizer...')
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
# Pad on the left side of each sequence.
# NOTE(review): presumably so the final position of every row is a real token
# for the classification head — confirm against the transformers docs.
tokenizer.padding_side = "left"
# Reuse the end-of-sequence token as the padding token
# (original note: PAD Token = EOS Token = 50256).
tokenizer.pad_token = tokenizer.eos_token


# Load the pretrained weights into a sequence-classification model built from
# the configuration above. This assigns the global `model` that `train` and
# `validation` operate on.
print('Loading model...')
model = GPT2ForSequenceClassification.from_pretrained(pretrained_model_name_or_path=model_name, config=model_config)

# Resize the token embedding matrix to the tokenizer's vocabulary size.
# NOTE(review): no tokens are added to the tokenizer in this cell, so this is
# likely a no-op kept as a safeguard — confirm.
model.resize_token_embeddings(len(tokenizer))

# Tell the model which token id is padding (the same id as end-of-sequence).
model.config.pad_token_id = model.config.eos_token_id

# Move the model to the device selected in the configuration cell.
model.to(device)
print('Model loaded to `%s`'%device)

  and should_run_async(code)


Loading configuraiton...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Loading tokenizer...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



Loading model...


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded to `cuda`


## 8. Create Dataset and Collator.

In [9]:
# Create data collator to encode text and labels into numbers.
# Collator that turns a list of raw samples into tokenized tensors with
# integer labels; shared by the training and the evaluation dataloaders.
gpt2_classificaiton_collator = Gpt2Classification(
    use_tokenizer=tokenizer,
    labels_encoder=label_ids,
    max_sequence_len=max_length,
)

print('Dealing with Train...')
# Read the training reviews into a pytorch dataset.
train_dataset = MovieReviewModel(
    path='/content/aclImdb/train',
    use_tokenizer=tokenizer,
)
print(f'Created `train_dataset` with {len(train_dataset)} examples!')

print()

# Wrap the training dataset in a dataloader that shuffles the samples.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=gpt2_classificaiton_collator,
)
print(f'Created `train_dataloader` with {len(train_dataloader)} batches!')

print()

print('Dealing with Validation...')
# Read the held-out reviews into a pytorch dataset.
test_dataset = MovieReviewModel(
    path='/content/aclImdb/test',
    use_tokenizer=tokenizer,
)
print(f'Created `valid_dataset` with {len(test_dataset)} examples!')

# Wrap the held-out dataset in a dataloader; sample order is kept as is.
test_dataloader = DataLoader(
    test_dataset,
    batch_size=batch_size,
    shuffle=False,
    collate_fn=gpt2_classificaiton_collator,
)
print(f'Created `eval_dataloader` with {len(test_dataloader)} batches!')

  and should_run_async(code)


Dealing with Train...


pos files: 100%|██████████| 12500/12500 [00:04<00:00, 2880.42it/s]
neg files: 100%|██████████| 12500/12500 [00:04<00:00, 2996.09it/s]


Created `train_dataset` with 25000 examples!

Created `train_dataloader` with 782 batches!

Dealing with Validation...


pos files: 100%|██████████| 12500/12500 [00:05<00:00, 2481.83it/s]
neg files: 100%|██████████| 12500/12500 [00:05<00:00, 2347.73it/s]

Created `valid_dataset` with 25000 examples!
Created `eval_dataloader` with 782 batches!





## 9. Train the model.


In [None]:
# Note: the optimizer used here is PyTorch's own `torch.optim.AdamW`, not an
# AdamW class taken from the huggingface library.
optimizer = torch.optim.AdamW(model.parameters(),
                  lr = 2e-5, # learning rate used by this notebook (NOTE(review): torch.optim.AdamW's own default differs - presumably 1e-3; confirm)
                  eps = 1e-8 # numerical-stability term; original note says the default is 1e-8.
                  )

# Total number of optimizer steps: one per batch, for every epoch.
total_steps = len(train_dataloader)* epochs

# Learning-rate scheduler with no warmup steps, spanning all training steps.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

# Store the average loss and accuracy after each epoch so we can plot them.
all_loss = {'train_loss':[], 'test_loss':[]}
all_acc = {'train_acc':[], 'test_acc':[]}

# Loop through each epoch.
print('*****Epoch*****')
for epoch in tqdm(range(epochs)):
  print()
  print('Training on batches...')
  # Perform one full pass over the training set.
  train_labels, train_predict, train_loss = train(train_dataloader, optimizer, scheduler, device)
  train_acc = accuracy_score(train_labels, train_predict)

  # Get predictions from the model on the held-out (test) data.
  print('Validation on batches...')
  test_labels, test_predict, test_loss = validation(test_dataloader, device)
  test_acc = accuracy_score(test_labels, test_predict)

  # Print loss and accuracy values to see how training evolves.
  print("  train_loss: %.5f - val_loss: %.5f - train_acc: %.5f - valid_acc: %.5f"%(train_loss, test_loss, train_acc, test_acc))
  print()

  # Store the loss and accuracy values for plotting the learning curves.
  all_loss['train_loss'].append(train_loss)
  all_loss['test_loss'].append(test_loss)
  all_acc['train_acc'].append(train_acc)
  all_acc['test_acc'].append(test_acc)

# Plot loss curves (one point per epoch).
plot_dict(all_loss, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'])

# Plot accuracy curves (one point per epoch).
plot_dict(all_acc, use_xlabel='Epochs', use_ylabel='Value', use_linestyles=['-', '--'])



  and should_run_async(code)


*****Epoch*****


  0%|          | 0/5 [00:00<?, ?it/s]


Training on batches...



  torch.nn.utils.clip_grad_norm(model.parameters(),1.0)

  0%|          | 1/782 [00:03<44:53,  3.45s/it][A
  0%|          | 2/782 [00:04<23:13,  1.79s/it][A
  0%|          | 3/782 [00:04<15:22,  1.18s/it][A
  1%|          | 4/782 [00:05<11:42,  1.11it/s][A
  1%|          | 5/782 [00:05<10:31,  1.23it/s][A
  1%|          | 6/782 [00:06<09:38,  1.34it/s][A
  1%|          | 7/782 [00:06<08:47,  1.47it/s][A
  1%|          | 8/782 [00:07<08:13,  1.57it/s][A
  1%|          | 9/782 [00:07<07:33,  1.71it/s][A
  1%|▏         | 10/782 [00:08<07:51,  1.64it/s][A
  1%|▏         | 11/782 [00:08<07:16,  1.77it/s][A
  2%|▏         | 12/782 [00:09<07:19,  1.75it/s][A
  2%|▏         | 13/782 [00:10<07:02,  1.82it/s][A
  2%|▏         | 14/782 [00:10<07:36,  1.68it/s][A
  2%|▏         | 15/782 [00:11<07:03,  1.81it/s][A
  2%|▏         | 16/782 [00:11<06:40,  1.91it/s][A
  2%|▏         | 17/782 [00:12<07:10,  1.78it/s][A
  2%|▏         | 18/782 [00:13<07:39,  1.66it/s][A
  2%|▏         

Validation on batches...



  0%|          | 0/782 [00:00<?, ?it/s][A
  0%|          | 1/782 [00:00<01:49,  7.16it/s][A
  0%|          | 2/782 [00:00<01:45,  7.38it/s][A
  0%|          | 3/782 [00:00<01:47,  7.23it/s][A
  1%|          | 4/782 [00:00<01:55,  6.72it/s][A
  1%|          | 5/782 [00:00<01:57,  6.60it/s][A
  1%|          | 6/782 [00:00<01:53,  6.83it/s][A
  1%|          | 7/782 [00:01<01:50,  7.00it/s][A
  1%|          | 8/782 [00:01<01:46,  7.27it/s][A
  1%|          | 9/782 [00:01<01:48,  7.09it/s][A
  1%|▏         | 10/782 [00:01<01:47,  7.18it/s][A
  1%|▏         | 11/782 [00:01<01:48,  7.09it/s][A
  2%|▏         | 12/782 [00:01<01:54,  6.72it/s][A
  2%|▏         | 13/782 [00:01<01:51,  6.91it/s][A
  2%|▏         | 14/782 [00:02<01:50,  6.94it/s][A
  2%|▏         | 15/782 [00:02<01:50,  6.95it/s][A
  2%|▏         | 16/782 [00:02<01:49,  6.98it/s][A
  2%|▏         | 17/782 [00:02<01:48,  7.04it/s][A
  2%|▏         | 18/782 [00:02<01:47,  7.11it/s][A
  2%|▏         | 19/782 [00:0

## 10. Plot a confusion matrix to evaluate results.

In [None]:
# Run the fine-tuned model over the held-out data to collect the true labels,
# the predicted labels and the average loss.
true_labels, predictions_labels, avg_loss = validation(test_dataloader, device)

# Build the per-class evaluation report (label ids paired with their names).
evaluation_report = classification_report(
    true_labels,
    predictions_labels,
    labels=list(label_ids.values()),
    target_names=list(label_ids),
)
# Show the evaluation report.
print(evaluation_report)

# Plot the normalized confusion matrix; the trailing `;` suppresses the
# cell's return-value echo.
plot_confusion_matrix(
    y_true=true_labels,
    y_pred=predictions_labels,
    classes=list(label_ids),
    normalize=True,
    magnify=0.1,
);