In [None]:
!pip install transformers
!pip install torch

In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from transformers import Trainer, TrainingArguments, AdamW
import torch
import os
import mlflow
import mlflow.pytorch
import math

In [None]:
config = GPT2Config()
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel(config)

In [4]:
config.n_embd = 256
config.n_head = 4
config.n_layer = 4
config.n_positions = 128

In [5]:

model = GPT2LMHeadModel(config)

In [6]:
num_params = model.num_parameters()
print(f"Number of parameters in the model: {num_params}")

Number of parameters in the model: 16058112


In [7]:
class DataLoaderLite:
  def __init__(self, B, T, process_rank, num_processes, train_ratio=0.6, valid_ratio=0.2, mode='train'):
      self.B = B
      self.T = T
      self.process_rank = 0
      self.num_processes = 1
      self.train_ratio = train_ratio
      self.valid_ratio = valid_ratio
      self.mode = mode

      with open('input.txt', 'r') as f:
          text = f.read()
      enc = tokenizer
      tokens = enc.encode(text)
      self.tokens = torch.tensor(tokens)

      total_length = len(self.tokens)
      split_index_train = int(total_length * self.train_ratio)
      split_index_val = int(total_length * (self.train_ratio + self.valid_ratio))

      print("total tokens: ", len(self.tokens))

      if self.mode == 'train':
          self.tokens = self.tokens[:split_index_train]
      elif self.mode == 'val':
          self.tokens = self.tokens[split_index_train:split_index_val]
      else:  
          self.tokens = self.tokens[split_index_val:]

      if self.process_rank == 0:
          print(f"Loaded {len(self.tokens)} tokens for {self.mode} set")

      self.current_position = self.B * self.T * self.process_rank

  def next_batch(self):
      B, T = self.B, self.T
      buf = self.tokens[self.current_position : self.current_position + B * T + 1]
      x = (buf[:-1]).view(B, T)  
      y = (buf[1:]).view(B, T)    
      
      self.current_position += B * T * self.num_processes
      if self.current_position + (B * T * self.num_processes + 1) > len(self.tokens):
          self.current_position = self.B * self.T * self.process_rank
      return x, y


B = 32  
T = 32  
train_ratio = 0.6  
valid_ratio = 0.2   
test_ratio = 0.2    

train_loader = DataLoaderLite(B, T, train_ratio, valid_ratio, mode='train')
val_loader = DataLoaderLite(B, T, train_ratio, valid_ratio, mode='val')
test_loader = DataLoaderLite(B, T, train_ratio, valid_ratio, mode='test')

x_train, y_train = train_loader.next_batch()
print("Training batch:", x_train.shape, y_train.shape)

x_val, y_val = val_loader.next_batch()
print("Validation batch:", x_val.shape, y_val.shape)

x_test, y_test = test_loader.next_batch()
print("Test batch:", x_test.shape, y_test.shape)

total tokens:  338025
Loaded 202815 tokens for train set
total tokens:  338025
Loaded 67605 tokens for val set
total tokens:  338025
Loaded 67605 tokens for test set
Training batch: torch.Size([32, 32]) torch.Size([32, 32])
Validation batch: torch.Size([32, 32]) torch.Size([32, 32])
Test batch: torch.Size([32, 32]) torch.Size([32, 32])


In [9]:
mlflow_log_dir = 'mlp/mlflow_logs/'
os.makedirs(mlflow_log_dir, exist_ok=True)
mlflow.set_tracking_uri(mlflow_log_dir)

In [10]:
optimizer = AdamW(model.parameters(), lr=5e-5)
tokenizer.pad_token = tokenizer.eos_token

num_train_epochs = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

experiment_name = "My_Experiment"  
mlflow.set_experiment(experiment_name)

mlflow.start_run()

mlflow.log_param("learning_rate", 5e-5)
mlflow.log_param("num_train_epochs", num_train_epochs)

model.train()
for epoch in range(num_train_epochs):
  for step in range(len(train_loader.tokens) // (B * T)):
      batch = train_loader.next_batch()
      input_ids = batch[0].to(device)
      labels = batch[1].to(device)

      outputs = model(input_ids, labels=labels)
      loss = outputs.loss

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      if step % 100 == 0:
          print(f"Epoch {epoch + 1}, Step {step}, Training Loss: {loss.item()}")
          mlflow.log_metric("training_loss", loss.item(), step=step + epoch * (len(train_loader.tokens) // (B * T)))

  model.eval()  
  val_loss = 0.0
  num_val_batches = len(val_loader.tokens) // (B * T)

  with torch.no_grad(): 
      for val_step in range(num_val_batches):
          val_batch = val_loader.next_batch()
          val_input_ids = val_batch[0].to(device)
          val_labels = val_batch[1].to(device)

          val_outputs = model(val_input_ids, labels=val_labels)
          val_loss += val_outputs.loss.item()

  val_loss /= num_val_batches
  print(f"Epoch {epoch + 1}, Validation Loss: {val_loss}")
  mlflow.log_metric("validation_loss", val_loss, step=epoch)

mlflow.pytorch.log_model(model, "model")
mlflow.end_run()
print("Training complete.")



Epoch 1, Step 0, Training Loss: 10.837706565856934


Epoch 1, Step 100, Training Loss: 9.00510025024414
Epoch 1, Validation Loss: 7.558545264330777
Epoch 2, Step 0, Training Loss: 7.365004062652588
Epoch 2, Step 100, Training Loss: 6.9494547843933105
Epoch 2, Validation Loss: 6.600368246887669
Epoch 3, Step 0, Training Loss: 6.260554790496826
Epoch 3, Step 100, Training Loss: 6.532742023468018
Epoch 3, Validation Loss: 6.451968019658869
Epoch 4, Step 0, Training Loss: 6.025532245635986
Epoch 4, Step 100, Training Loss: 6.336957931518555
Epoch 4, Validation Loss: 6.392018455447572
Epoch 5, Step 0, Training Loss: 5.896519660949707
Epoch 5, Step 100, Training Loss: 6.1768317222595215
Epoch 5, Validation Loss: 6.346493713783495
Epoch 6, Step 0, Training Loss: 5.7864227294921875
Epoch 6, Step 100, Training Loss: 6.041418075561523
Epoch 6, Validation Loss: 6.327969782280199
Epoch 7, Step 0, Training Loss: 5.697930335998535
Epoch 7, Step 100, Training Loss: 5.917299270629883
Epoch 7, Validation Loss: 6.320606340061534
Epoch 8, Step 0, Training 



Epoch 10, Validation Loss: 6.33620548248291
Training complete.


In [11]:
def evaluate_perplexity(model, dataloader):
  model.eval()
  total_loss = 0.0
  total_count = 0

  with torch.no_grad():
      for step in range(len(dataloader.tokens) // (B * T)):
          batch = dataloader.next_batch()
          input_ids = batch[0].to(device)
          labels = batch[1].to(device)

          outputs = model(input_ids, labels=labels)
          loss = outputs.loss
          total_loss += loss.item() * input_ids.size(0)
          total_count += input_ids.size(0)

  avg_loss = total_loss / total_count
  perplexity = math.exp(avg_loss)
  return perplexity

test_perplexity = evaluate_perplexity(model, test_loader)
print(f'Test Perplexity: {test_perplexity}')

Test Perplexity: 591.0631681861266
