In [30]:
!pip install datasets tiktoken wandb --quiet
import tiktoken
import math
import torch
import numpy as np
import gc
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
import os
import time

In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [3]:
encoder_1 = tiktoken.get_encoding("gpt2")

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
!mkdir /root/.kaggle

with open("/root/.kaggle/kaggle.json", "w+") as f:
    f.write('{"username":"shravanidhote12","key":"d238c4c92c6c26cc53fcc6382afeb017"}') # Put your kaggle username & key here


!chmod 600 /root/.kaggle/kaggle.json

Collecting kaggle==1.5.8
  Downloading kaggle-1.5.8.tar.gz (59 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/59.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.8-py3-none-any.whl size=73248 sha256=10c115fad4dff013b01a8825bfaf144b22d1877e1313acf6454f4308ba851ba6
  Stored in directory: /root/.cache/pip/wheels/0b/76/ca/e58f8afa83166a0e68f0d5cd2e7f99d260bdc40e35da080eee
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.16
    Uninstalling kaggle-1.5.16:
      Successfully uninstalled kaggle-1.5.16
Successfully installed kaggle-1.5.8


In [6]:
main_data = np.load('/content/drive/MyDrive/IDL_HW5/complete_data_2.npz')

train_dataset = main_data['train']
validation_dataset   = main_data['val']
test_dataset  = main_data['test']

# Finetuning loader

In [7]:
class Fine_tuning_dataloader(torch.utils.data.DataLoader):
    def __init__(self, data, bs=32):

        self.data, self.bs, len_data = data, bs, len(data)

        temp = len_data / self.bs
        self.batch_count = math.ceil(temp)

    def __len__(self):

        return self.batch_count

    def __iter__(self):

        np.random.shuffle(self.data)

        for point in range(0, len(self.data), self.bs):

            temp = point + self.bs

            ip = self.data[point:temp, :]

            column_except_first = ip[:, 1:]

            column_of_eot_tokens = np.full((ip.shape[0], 1), encoder_1.eot_token)

            op = np.concatenate((column_except_first, column_of_eot_tokens), axis=1)

            ip_as_int64 = ip.astype(np.int64)
            op_as_int64 = op.astype(np.int64)

            yield torch.from_numpy(ip_as_int64), torch.from_numpy(op_as_int64)


train_dataset_fineloader = Fine_tuning_dataloader(train_dataset)
val_dataset_fineloader = Fine_tuning_dataloader(validation_dataset)
test_dataset_fineloader = Fine_tuning_dataloader(test_dataset)

## Model Classes

In [8]:
# Architecture Configuration

n_head = 6
num_heads = 6
n_embd = 512
n_layer = 6
dropout = 0.2
block_size = 256
batch_size = 40
learning_rate = 3e-4

vocab_size = encoder_1.n_vocab

class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        # Linear layers for key, query, and value
        self.linear_layers = nn.ModuleList([
            nn.Linear(n_embd, head_size, bias=False) for _ in range(3)
        ])
        self.key_layer, self.query_layer, self.value_layer = self.linear_layers
        # Lower triangular mask for attention
        identity_matrix = torch.eye(block_size)
        lower_triangular_mask = torch.tril(identity_matrix)
        self.register_buffer('tril', lower_triangular_mask)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # Self-attention mechanism
        batch_size = x.shape[0]
        time_steps = x.shape[1]
        channels = x.shape[2]
        key, query = self.key_layer(x), self.query_layer(x)
        key_transposed = key.transpose(-2, -1)
        attention_scores = torch.matmul(query, key_transposed) / torch.sqrt(torch.tensor(key.shape[-1], dtype=torch.float32))
        attention_mask = torch.where(self.tril[:time_steps, :time_steps] == 0, float('-inf'), attention_scores)
        attention_probs = self.drop(torch.nn.functional.log_softmax(attention_mask, dim=-1).exp())
        value = self.value_layer(x)
        output = attention_probs @ value
        return output

# Define Multi-Head Attention module
class MultiHeadAttention(nn.Module):
    """ Multiple heads of self-attention in parallel """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList()
        for _ in range(num_heads):
            self.heads.append(Head(head_size))
        self.projection = nn.Linear(head_size * num_heads, n_embd)
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # Combine multiple self-attention heads
        individual_heads = [head_module(x) for head_module in self.heads]
        concatenated_heads_expanded = torch.cat(individual_heads, dim=-1)
        projected_out = self.projection(concatenated_heads_expanded)
        dropped_out = self.drop(projected_out)
        return dropped_out

# Define FeedForward module
class FeedForward(nn.Module):
    def __init__(self, n_embd, dropout):
        super(FeedForward, self).__init__()
        self.layer1 = nn.Linear(n_embd, 4 * n_embd)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # FeedForward neural network
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        x = self.dropout(x)
        return x

# Define a Block in the GPT model
class Block(nn.Module):
    def __init__(self, n_embd, n_head):
        super().__init__()
        self.self_attention = MultiHeadAttention(n_head, (n_embd // n_head))
        self.feed_forward = FeedForward(n_embd, dropout)
        self.linearlayer1, self.linearlayer2 = nn.LayerNorm(n_embd), nn.LayerNorm(n_embd)

    def forward(self, x):
        # Block in the GPT model
        x = x + self.self_attention(self.linearlayer1(x)) + self.feed_forward(self.linearlayer2(x))
        return x

# Define the GPT Language Model
class GPTLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_emb_tbl = nn.Embedding(vocab_size, n_embd)
        self.pos_emb_tbl = nn.Embedding(block_size, n_embd)
        attention_blocks = [Block(n_embd, n_head=n_head) for _ in range(n_layer)]
        self.attention_blocks = nn.Sequential(*attention_blocks)
        self.layer_norm_final = nn.LayerNorm(n_embd)
        self.mode_head = nn.Linear(n_embd, vocab_size)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        # Initialize weights for linear and embedding layers
        if isinstance(module, (nn.Linear, nn.Embedding)):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            torch.nn.init.zeros_(module.bias) if hasattr(module, 'bias') and module.bias is not None else None

    def forward(self, idx, targets=None):
        # Forward pass through the GPT model
        embedding_indices = idx
        token_embeddings = self.token_emb_tbl(embedding_indices)
        pos_indices = torch.arange(idx.shape[1], device=device)
        positional_embeddings = self.pos_emb_tbl(pos_indices).unsqueeze(0)
        normalized_embeddings = self.layer_norm_final(self.attention_blocks(token_embeddings + positional_embeddings.expand_as(token_embeddings)))
        logits = self.mode_head(normalized_embeddings)
        loss = F.cross_entropy(logits.flatten(0, 1), targets.flatten()) if targets is not None else None
        return logits, loss

    def generate(self, idx, max_new_tokens):
        # Generate new tokens
        for _ in range(max_new_tokens):
            currentlogits, _ = self.forward(idx[:, -block_size:])
            currentlogits = currentlogits[:, -1, :]
            nexttoken = torch.multinomial(F.softmax(currentlogits, dim=-1), num_samples=1).view(-1, 1)
            idx = torch.hstack((idx, nexttoken))
        return idx


In [9]:
def generate_tokens_greedily(initial_indices, newtokens):
  for i in range(newtokens):
        indices_cond = initial_indices[:, -block_size:]
        with torch.no_grad():
          output_logits, loss = model(indices_cond)
        output_logits = output_logits[:, -1, :]
        prob = torch.nn.functional.softmax(output_logits, dim=-1)
        max_indices = torch.max(prob, dim=-1, keepdim=True)[1]
        initial_indices = torch.cat((initial_indices, max_indices), dim=1)
  return initial_indices

# Run

In [None]:
# wandb.finish()
import wandb
wandb.login(key="bc23ae38b83159616a86ecc84dc25bc2e82d4da4")

run = wandb.init(
    name = "hw5_finetuning-Kaggle-2", ## Wandb creates random run names if you skip this field
    reinit = True,  ### Allows reinitalizing runs when you re-run this cell
    # id = "ii86awr7",  # Insert specific run id here if you want to resume a previous run
    # resume = "must",  ### You need this to resume previous runs, but comment out reinit = True when using this
    project = "hw5-ablations")

# Initialize model

In [10]:
model = GPTLanguageModel()
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=2, factor =0.1)

In [None]:
# path = "/content/drive/MyDrive/IDL/HW5/Finetune_tiral/xyz.pth"
# or
# url = wandb.restore("xyz.pth").name
# model.load_state_dict(torch.load(path,map_location=torch.device(device))["model_state_dict"])
# optimizer.load_state_dict(torch.load(path)["optimizer_state_dict"])

In [15]:
avg = 0
sum = 0
for param_group in optimizer.param_groups:
    avg = avg + param_group['lr']
    sum = sum+ 1

lr = avg/sum
print("Learning rate: ",lr)

for param_group in optimizer.param_groups:
    param_group['lr'] = 1e-04

avg = 0
sum = 0
for param_group in optimizer.param_groups:
    avg = avg + param_group['lr']
    sum = sum+ 1
lr = avg/sum
print("Set Learning rate to: ",lr)

Learning rate:  0.0001
Set Learning rate to:  0.0001


In [21]:
def generate_and_display_sample(max_tokens=20):

    input_content = ["This chocolate cake ", "The government is doing"]

    encoded_content = encoder_1.encode_batch(input_content)
    content_tensors = torch.tensor(encoded_content)
    content_tensors = content_tensors.to(device)

    contents_decoded = encoder_1.decode_batch(model.generate(content_tensors, max_tokens).cpu().numpy())

    print("\n####\n".join(contents_decoded))


interval_to_save = 1500

dir_to_save= "/content/drive/MyDrive/IDL_HW5/HW5/finetuning_trials/checkpoints"


In [23]:
def validate(loader, epoch, train_iter, loss):

    progress_bar = tqdm(total=len(loader), dynamic_ncols=True,
                   leave=False, pos=0, desc='Validate', ncols=5)

    training_error = 0.0

    for i, (p, q) in enumerate(loader):

        model.eval()

        p, q = p.to(device), q.to(device)

        with torch.no_grad():

              model_out = model(p, q)
              logits = model_out[0]
              loss = model_out[1]

        training_error = training_error + loss

        progress_bar.set_postfix(loss=f"{training_error / (train_iter + 1):.04f}").update()

        del p, q, logits
        gc.collect()
        torch.cuda.empty_cache()

    progress_bar.close()

    return training_error / i


### Run 1

In [29]:
evaluation_interval = 10000
max_iterations = 500000


In [None]:

# for epoch in range(5):

#     epoch_number = epoch + 1
#     print(f"Epoch: {epoch_number}")

#     progress_bar = tqdm(total=len(train_dataset_fineloader), dynamic_ncols=True,
#                    leave=False, desc='Train', ncols=5, position=0)

#     train_loss, start_time = 0, time.time()

#     for step, (p, q) in enumerate(train_dataset_fineloader):

#         if (step != 0 and step % evaluation_interval == 0) or step == max_iterations - 1:

#             filename = f"finetuning_2_{epoch}_{step}_{loss:.4f}.pth"
#             checkpoint_path = os.path.join(dir_to_save, filename)

#             model_state = model.state_dict()
#             optimizer_state = optimizer.state_dict()

#             states_dict = {'model_state_dict': model_state, 'optimizer_state_dict': optimizer_state}
#             torch.save(states_dict, checkpoint_path)

#             wandb.save(checkpoint_path)

#             average_train_loss = float(train_loss / (step + 1))
#             print(f"Step {step}: Average train loss {average_train_loss}")

#             validation_loss = validate(val_dataset_fineloader, epoch, step, average_train_loss)
#             print("Validation loss: ", validation_loss)

#             wandb.log({ 'train_loss': average_train_loss, 'valid_loss': validation_loss })


#         p, q = p.to(device), q.to(device)


#         result, error = model(p, q)

#         train_loss = train_loss + error

#         optimizer.zero_grad(set_to_none=True); error.backward(); optimizer.step()

#         average_loss = train_loss / (step + 1)
#         formatted_loss = "{:.04f}".format(average_loss)
#         progress_bar.set_postfix(loss=formatted_loss)


#         if step % 30 == 0:

#             # average_train_loss = train_loss / (step + 1)
#             # wandb.log({'train_loss': float(average_train_loss)})

#             mins = int((time.time() - start_time) // 60)
#             secs = int((time.time() - start_time) % 60)

#         if step % 2000 == 0:

#             iteration = step + 1
#             average_train_loss = train_loss / iteration
#             elapsed_time_formatted = f"{mins}m {secs}s"
#             print("Iter: {}  Train loss: {:.4f}  Elapsed Time: {}".format(iteration, float(average_train_loss), elapsed_time_formatted))

#         progress_bar.update()

#         del p, q, result

#         gc.collect()

#         torch.cuda.empty_cache()


Epoch:  1


 ######################################## 




Train:   0%|          | 1/51675 [00:06<90:34:56,  6.31s/it, loss=4.8965]

Iter: 1  Train loss: 4.896520614624023  Elapsed Time: 0m 6s


Train:   4%|▍         | 2001/51675 [31:23<13:02:03,  1.06it/s, loss=4.8026]

Iter: 2001  Train loss: 4.802559852600098  Elapsed Time: 31m 4s


Train:   8%|▊         | 4001/51675 [1:02:49<12:29:51,  1.06it/s, loss=4.8019]

Iter: 4001  Train loss: 4.801863193511963  Elapsed Time: 62m 39s


Train:  12%|█▏        | 6001/51675 [1:34:09<11:56:56,  1.06it/s, loss=4.7996]

Iter: 6001  Train loss: 4.799614429473877  Elapsed Time: 94m 9s


Train:  15%|█▌        | 8001/51675 [2:05:27<11:23:50,  1.06it/s, loss=4.7972]

Iter: 8001  Train loss: 4.797238826751709  Elapsed Time: 125m 8s




step 10000: train loss 4.795764923095703


                                                                          

Val loss:  tensor(4.8324, device='cuda:0')


Train:  19%|█▉        | 10001/51675 [2:52:11<3226:36:41, 278.73s/it, loss=4.7962]

Iter: 10001  Train loss: 4.796226978302002  Elapsed Time: 156m 36s


Train:  23%|██▎       | 12001/51675 [3:23:20<10:17:04,  1.07it/s, loss=4.7912]   

Iter: 12001  Train loss: 4.791209697723389  Elapsed Time: 203m 20s


Train:  27%|██▋       | 14001/51675 [3:54:25<9:46:42,  1.07it/s, loss=4.7869] 

Iter: 14001  Train loss: 4.7868757247924805  Elapsed Time: 234m 6s


Train:  31%|███       | 16001/51675 [4:25:33<9:15:07,  1.07it/s, loss=4.7827]

Iter: 16001  Train loss: 4.782687664031982  Elapsed Time: 265m 24s


Train:  35%|███▍      | 18001/51675 [4:56:38<8:41:46,  1.08it/s, loss=4.7794]

Iter: 18001  Train loss: 4.779421806335449  Elapsed Time: 296m 38s


Train:  39%|███▊      | 20000/51675 [5:27:43<8:13:51,  1.07it/s, loss=4.7767]

step 20000: train loss 4.776421070098877


                                                                          

Val loss:  tensor(4.8193, device='cuda:0')


Train:  39%|███▊      | 20001/51675 [5:43:34<2516:31:32, 286.02s/it, loss=4.7766]

Iter: 20001  Train loss: 4.776648044586182  Elapsed Time: 327m 25s


Train:  43%|████▎     | 22001/51675 [6:14:38<7:38:44,  1.08it/s, loss=4.7737]    

Iter: 22001  Train loss: 4.7736616134643555  Elapsed Time: 374m 29s


Train:  46%|████▋     | 24001/51675 [6:45:41<7:15:03,  1.06it/s, loss=4.7715]

Iter: 24001  Train loss: 4.771488666534424  Elapsed Time: 405m 41s


Train:  50%|█████     | 26001/51675 [7:16:46<6:39:04,  1.07it/s, loss=4.7693]

Iter: 26001  Train loss: 4.7693190574646  Elapsed Time: 436m 27s


Train:  54%|█████▍    | 28001/51675 [7:47:51<6:09:31,  1.07it/s, loss=4.7677]

Iter: 28001  Train loss: 4.76768159866333  Elapsed Time: 467m 42s


Train:  58%|█████▊    | 30000/51675 [8:18:55<5:35:44,  1.08it/s, loss=4.7656]

step 30000: train loss 4.765425205230713


                                                                          

Val loss:  tensor(4.8148, device='cuda:0')


Train:  58%|█████▊    | 30001/51675 [8:34:51<1730:35:06, 287.45s/it, loss=4.7656]

Iter: 30001  Train loss: 4.7655792236328125  Elapsed Time: 514m 51s


Train:  62%|██████▏   | 32001/51675 [9:05:53<5:04:39,  1.08it/s, loss=4.7640]    

Iter: 32001  Train loss: 4.7639970779418945  Elapsed Time: 545m 35s


Train:  65%|██████▍   | 33365/51675 [9:54:56<1400:13:51, 275.30s/it, loss=4.7630]wandb: Network error (SSLError), entering retry loop.
Train:  65%|██████▍   | 33374/51675 [11:54:28<3314:21:07, 651.97s/it, loss=4.7630]

### Run 2

In [None]:

for epoch in range(5):

    epoch_number = epoch + 1
    print(f"Epoch: {epoch_number}")

    progress_bar = tqdm(total=len(train_dataset_fineloader), dynamic_ncols=True,
                   leave=False, desc='Train', ncols=5, position=0)

    train_loss, start_time = 0, time.time()

    for step, (p, q) in enumerate(train_dataset_fineloader):

        if (step != 0 and step % evaluation_interval == 0) or step == max_iterations - 1:

            filename = f"finetuning_2_{epoch}_{step}_{loss:.4f}.pth"
            checkpoint_path = os.path.join(dir_to_save, filename)

            model_state = model.state_dict()
            optimizer_state = optimizer.state_dict()

            states_dict = {'model_state_dict': model_state, 'optimizer_state_dict': optimizer_state}
            torch.save(states_dict, checkpoint_path)

            wandb.save(checkpoint_path)

            average_train_loss = float(train_loss / (step + 1))
            print(f"Step {step}: Average train loss {average_train_loss}")

            validation_loss = validate(val_dataset_fineloader, epoch, step, average_train_loss)
            print("Validation loss: ", validation_loss)

            wandb.log({ 'train_loss': average_train_loss, 'valid_loss': validation_loss })


        p, q = p.to(device), q.to(device)


        result, error = model(p, q)

        train_loss = train_loss + error

        optimizer.zero_grad(set_to_none=True); error.backward(); optimizer.step()

        average_loss = train_loss / (step + 1)
        formatted_loss = "{:.04f}".format(average_loss)
        progress_bar.set_postfix(loss=formatted_loss)


        if step % 30 == 0:

            average_train_loss = train_loss / (step + 1)
            wandb.log({'train_loss': float(average_train_loss)})

            mins = int((time.time() - start_time) // 60)
            secs = int((time.time() - start_time) % 60)

        if step % 2000 == 0:

            iteration = step + 1
            average_train_loss = train_loss / iteration
            elapsed_time_formatted = f"{mins}m {secs}s"
            print("Iter: {}  Train loss: {:.4f}  Elapsed Time: {}".format(iteration, float(average_train_loss), elapsed_time_formatted))

        progress_bar.update()

        del p, q, result

        gc.collect()

        torch.cuda.empty_cache()


Epoch:  1


 ######################################## 




Train:   0%|          | 1/51675 [00:05<79:52:45,  5.56s/it, loss=3.7210]

Iter: 1  Train loss: 3.7209949493408203  Elapsed Time: 0m 5s


Train:   4%|▍         | 2001/51675 [11:11<4:36:23,  3.00it/s, loss=3.1947]

Iter: 2001  Train loss: 3.1946802139282227  Elapsed Time: 11m 5s


Train:   8%|▊         | 4001/51675 [22:16<4:26:41,  2.98it/s, loss=3.1872]

Iter: 4001  Train loss: 3.1871705055236816  Elapsed Time: 22m 13s


Train:  12%|█▏        | 6001/51675 [33:20<4:13:24,  3.00it/s, loss=3.1806]

Iter: 6001  Train loss: 3.1806278228759766  Elapsed Time: 33m 20s


Train:  15%|█▌        | 8001/51675 [44:26<4:02:57,  3.00it/s, loss=3.1786]

Iter: 8001  Train loss: 3.178565263748169  Elapsed Time: 44m 19s




step 10000: train loss 3.175849199295044




Val loss:  tensor(3.2184, device='cuda:0')


Train:  19%|█▉        | 10001/51675 [1:04:23<1840:47:43, 159.02s/it, loss=3.1762]

Iter: 10001  Train loss: 3.1761674880981445  Elapsed Time: 55m 30s


Train:  23%|██▎       | 12001/51675 [1:15:52<3:46:10,  2.92it/s, loss=3.1420]

Iter: 12001  Train loss: 3.1419801712036133  Elapsed Time: 75m 52s


Train:  27%|██▋       | 14001/51675 [1:27:16<3:32:22,  2.96it/s, loss=3.1161]

Iter: 14001  Train loss: 3.116067886352539  Elapsed Time: 87m 9s


Train:  31%|███       | 16001/51675 [1:38:37<3:23:48,  2.92it/s, loss=3.0960]

Iter: 16001  Train loss: 3.0960099697113037  Elapsed Time: 98m 33s


Train:  35%|███▍      | 18001/51675 [1:50:00<3:10:03,  2.95it/s, loss=3.0798]

Iter: 18001  Train loss: 3.079780101776123  Elapsed Time: 110m 0s


Train:  38%|███▊      | 19724/51675 [1:59:49<3:03:34,  2.90it/s, loss=3.0686]