In [2]:
!pip install pytorch-lightning

Collecting pytorch-lightning
  Downloading pytorch_lightning-2.4.0-py3-none-any.whl.metadata (21 kB)
Collecting torchmetrics>=0.7.0 (from pytorch-lightning)
  Downloading torchmetrics-1.4.1-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.10.0 (from pytorch-lightning)
  Downloading lightning_utilities-0.11.6-py3-none-any.whl.metadata (5.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=2.1.0->pytorch-lightning)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=2.1.0->pytorch-lightning)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=2.1.0->pytorch-lightning)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=2.1.0->pytorch-lightning)
  Using cached nv

In [151]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
from torch.utils.data import TensorDataset, DataLoader ## We'll store our data in DataLoaders
from torch.optim import Adam

import pytorch_lightning as pl

In [152]:
# Variables
d_model = 256 #dimension of model, also the number of value coming out from the embedding value
max_length = 65
no_of_token = max_length
d_ff = 1024 #dimension of feed forward layers
num_attention_heads = 3

In [153]:
def tokenize(sentence):
    sentence = sentence.split()
    tokens = []
    for word in sentence:
      if word == "<EOS>":
        tokens.append(word)
      else:
        word = word.lower()
        if word[-1] in [",", ".", "!", "?", ":", ";", ")", "]", "}", "'", '"']:
          tokens.append(word[:-1])
          tokens.append(word[-1])
        elif word[0] in ["(", "[", "{", "'", '"']:
          tokens.append(word[0])
          tokens.append(word[1:])
        else:
          tokens.append(word)
    return tokens
assert tokenize("i hAVe a brother, and a sister. <EOS>") == ['i', 'have', 'a', 'brother', ',', 'and', 'a', 'sister', '.', '<EOS>']

In [154]:

with open("/content/QnA_pair.txt", "r") as data:
  data = data.readlines()
token_to_idx = {}
for line in data:
  for word in tokenize(line):
    if word not in token_to_idx:
      token_to_idx[word] = len(token_to_idx)
print(token_to_idx)
idx_to_token = dict(map(reversed, token_to_idx.items()))
print(idx_to_token)

inputs = []
labels = []

for line in data:
  EOS = 0
  #line = What is Chardonnay? <EOS> Chardonnay is a ... <EOS>
  input = []
  output = []
  #get input:
  for word in tokenize(line):
    if word == "<EOS>":
      if EOS == 0: #if encounter the first <EOS> keep going until meet the second one
        EOS =+ 1
        input.append(token_to_idx[word])
      elif EOS == 1:
        break
    else:
      input.append(token_to_idx[word])
  #get output
  for word in tokenize(line)[1:]:
      output.append(token_to_idx[word])

  inputs.append(input)
  labels.append(output)
inputs.pop()
labels.pop()

print(inputs)
print(labels)
assert len(inputs) == len(labels)
for i in inputs:
  assert i.count(4) == 1 #check if in input have only one EOS
for i in labels:
  assert i.count(4) == 2 #check if in labels have only two EOS

# Open the file for writing
with open('file.txt', 'w') as f:
    f.write(f"token_to_idx: {token_to_idx}\n\n")
    f.write(f"idx_to_token: {idx_to_token}\n\n")
    f.write(f"inputs: {inputs}\n\n")
    f.write(f"labels: {labels}")

{'what': 0, 'is': 1, 'chardonnay': 2, '?': 3, '<EOS>': 4, 'a': 5, 'full-bodied': 6, 'white': 7, 'wine': 8, 'known': 9, 'for': 10, 'its': 11, 'rich': 12, ',': 13, 'creamy': 14, 'texture': 15, 'and': 16, 'flavors': 17, 'of': 18, 'butter': 19, 'vanilla': 20, 'oak': 21, '.': 22, 'it': 23, 'originates': 24, 'from': 25, 'burgundy': 26, 'france': 27, 'but': 28, 'now': 29, 'produced': 30, 'in': 31, 'many': 32, 'regions': 33, 'around': 34, 'the': 35, 'world': 36, 'pairs': 37, 'well': 38, 'with': 39, 'seafood': 40, 'dishes': 41, 'can': 42, 'you': 43, 'describe': 44, 'are': 45, 'characteristics': 46, 'where': 47, 'does': 48, 'originate': 49, 'food': 50, 'tell': 51, 'me': 52, 'about': 53, 'pinot': 54, 'noir': 55, 'light-bodied': 56, 'red': 57, 'berries': 58, 'such': 59, 'as': 60, 'cherry': 61, 'raspberry': 62, 'delicate': 63, 'aroma': 64, 'silky': 65, 'versatile': 66, 'that': 67, 'complements': 68, 'like': 69, 'roast': 70, 'chicken': 71, 'salmon': 72, 'main': 73, 'features': 74, 'how': 75, 'taste'

In [155]:
combine = inputs + labels
print(len(inputs), len(labels))
print(len(combine))

65 65
130


In [156]:
combine = inputs+labels
length = [len(sentence) for sentence in combine]
print(max(length))

64


In [157]:
def pad_sequences(ins, labs):
  combine = ins+labs
  length = [len(sentence) for sentence in combine]
  max_length = 65
  padded_ins = []
  padded_labs = []
  for sequence in ins:
    padding = [0] * (max_length - len(sequence))
    padded_sequence = sequence + padding
    padded_ins.append(padded_sequence)
  for sequence in labs:
    padding = [0] * (max_length - len(sequence))
    padded_sequence = sequence + padding
    padded_labs.append(padded_sequence)
  return padded_ins, padded_labs

one, two = pad_sequences([[3,2,4,2,3,2,3], [3,2,2]],
                         [[3,2],[3,2,1]])
assert len(one) == len(two)


In [158]:
inputs, labels = pad_sequences(inputs, labels)
inputs = torch.tensor(inputs)
labels = torch.tensor(labels)
print(inputs.shape)
print(labels.shape)
#(no of sentence, max_lenght of each sentence)

dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset)

torch.Size([65, 65])
torch.Size([65, 65])


In [159]:
#nn.embedding

In [160]:
a = torch.zeros(10, 5)
print(a) #max_len=10, d_model=5

tensor([[0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0.]])


In [161]:
print(a.size())
print(a.size(0))

torch.Size([10, 5])
10


In [162]:
class positional_encoding(nn.Module):
  def __init__(self, d_model, max_length):
    super().__init__()

    pe = torch.zeros(max_length, d_model)

    position = torch.arange(0, max_length, step=1).float().unsqueeze(1)
    embedded_index = torch.arange(0, d_model, step=2).float() #i    #step = 2 because i = 0 can be use for sin and cos

    div_term = 1/10000 ** (embedded_index / d_model)
    # fill in the zeros table
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)

    self.register_buffer("pe", pe)

  def forward(self, word_embedding): #take output from word_embedding then calculate #over
    return word_embedding + self.pe[:word_embedding.size(0)] # just self.pe since all inputs are already uniform due to padding


In [172]:
class Attention(nn.Module):
  def __init__(self, d_model):
    super().__init__()
    self.w_Q = nn.Linear(in_features=d_model, out_features=d_model)
    self.w_K = nn.Linear(in_features=d_model, out_features=d_model)
    self.w_V = nn.Linear(in_features=d_model, out_features=d_model)

    self.row_dim = 0
    self.col_dim = 1

  def forward(self, encodings_Q, encodings_K, encodings_V, mask=None):
    Q = self.w_Q(encodings_Q) #self.w_Q, w_K, w_V is a linear layer with the formula: out = in*W + b (the layer already include W and b to train itself)
    K = self.w_K(encodings_K)
    V = self.w_V(encodings_V)

    sims = torch.matmul(Q, K.transpose(-2, -1))

    scaled_sims = sims / torch.tensor(K.size(-1)).sqrt()

    if mask is not None:
      scaled_sims = scaled_sims.masked_fill(mask == 0, -1e9)

    attention_weights = F.softmax(scaled_sims, dim=-1) # which dimension should the weight after softmax add up to 1 in this case the sum of all column in one row equal 1

    attention_score = torch.matmul(attention_weights, V)

    return attention_score

    return torch.matmul(scaled_sims, V)


In [189]:
class Decoder_only_transformer(pl.LightningModule):
  def __init__(self, no_of_token, d_model, max_length, num_attention_heads=3):
    super().__init__()
    self.em = nn.Embedding(no_of_token, d_model)
    self.pe = positional_encoding(d_model, max_length)

    self.attention = Attention(d_model)
    self.attention_2 = Attention(d_model=d_model)
    self.attention_3 = Attention(d_model=d_model)

    self.reduce_attention_dim = nn.Linear(in_features=(d_model), out_features=d_model)



    self.fc = nn.Linear(d_model, d_model)
    self.final_fc = nn.Linear(d_model, no_of_token)
    self.loss = nn.CrossEntropyLoss()

  def forward(self, token_ids):  #input is token_ids not the token itself e.g. "4" instead of <EOS>
    word_embedding = self.em(token_ids)
    position_encoding = self.pe(word_embedding)

    mask = torch.tril(torch.ones((token_ids.size(dim=0), token_ids.size(dim=0)), device=self.device))

    mask = mask == 0


    #first decoder
    attention_score = self.attention(position_encoding, position_encoding, position_encoding, mask=mask)
    attention_score_2 = self.attention_2(attention_score, attention_score, attention_score, mask=mask)
    attention_score_3 = self.attention_3(attention_score_2, attention_score_2, attention_score_2, mask=mask)
    attention_score_3 = self.reduce_attention_dim(attention_score_3)
    residual_value = attention_score_3 + position_encoding
    output1 = self.fc(residual_value)

    #second decoder (same as above but the input is output from the previous decoder instead of the position value)
    attention_score = self.attention(output1, output1, output1, mask=mask)
    attention_score_2 = self.attention_2(attention_score, attention_score, attention_score, mask=mask)
    attention_score_3 = self.attention_3(attention_score_2, attention_score_2, attention_score_2, mask=mask)
    attention_score_3 = self.reduce_attention_dim(attention_score_3)
    residual_value = attention_score_3 + position_encoding
    output2 = self.fc(residual_value)

    #third decoder

    attention_score = self.attention(output2, output2, output2, mask=mask)
    attention_score_2 = self.attention_2(attention_score, attention_score, attention_score, mask=mask)
    attention_score_3 = self.attention_3(attention_score_2, attention_score_2, attention_score_2, mask=mask)

    attention_score_3 = self.reduce_attention_dim(attention_score_3)

    residual_value = attention_score_3 + position_encoding
    output3 = self.fc(residual_value)

    #fourth decoder

    attention_score = self.attention(output3, output3, output3, mask=mask)
    attention_score_2 = self.attention_2(attention_score, attention_score, attention_score, mask=mask)
    attention_score_3 = self.attention_3(attention_score_2, attention_score_2, attention_score_2, mask=mask)

    attention_score_3 = self.reduce_attention_dim(attention_score_3)

    residual_value = attention_score_3 + position_encoding
    output4 = self.fc(residual_value)

    #fifth decoder

    attention_score = self.attention(output4, output4, output4, mask=mask)
    attention_score_2 = self.attention_2(attention_score, attention_score, attention_score, mask=mask)
    attention_score_3 = self.attention_3(attention_score_2, attention_score_2, attention_score_2, mask=mask)
    attention_score_3 = self.reduce_attention_dim(attention_score_3)

    residual_value = attention_score_3 + position_encoding
    output5 = self.fc(residual_value)

    #sixth decoder

    attention_score = self.attention(output5, output5, output5, mask=mask)
    attention_score_2 = self.attention_2(attention_score, attention_score, attention_score, mask=mask)
    attention_score_3 = self.attention_3(attention_score_2, attention_score_2, attention_score_2, mask=mask)

    attention_score_3 = self.reduce_attention_dim(attention_score_3)

    residual_value = attention_score_3 + position_encoding
    output = self.final_fc(residual_value)
    output = torch.clamp(output, min=0, max=self.em.num_embeddings - 1)

    return output

  def configure_optimizers(self):
    return Adam(self.parameters(), lr=0.0005)

  def training_step(self, batch, batch_idx):
    inputs, labels = batch
    inputs = torch.clamp(inputs, min=0, max=self.em.num_embeddings - 1)
    outputs = self.forward(inputs)

    labels = torch.clamp(labels, min=0, max=self.em.num_embeddings - 1)
    loss = self.loss(outputs.view(-1, self.em.num_embeddings), labels.view(-1))
    self.log("train_loss", loss)
    return loss




In [190]:
model = Decoder_only_transformer(no_of_token=65, d_model=d_model, max_length=max_length)

model_input = torch.tensor([token_to_idx["what"],
                            token_to_idx["is"],
                            token_to_idx["chardonnay"],
                            token_to_idx["?"],
                            token_to_idx["<EOS>"]])

input_length = model_input.size(dim=0)

prediction = model(model_input)
predict_id = torch.tensor([torch.argmax(prediction[-1, :])])
predict_ids = predict_id #right now only 1 word was generated

for i in range(max_length):
  if predict_id == token_to_idx["<EOS>"]:
    break
  model_input = torch.cat((model_input, predict_id))
  prediction = model(predict_id)
  predict_id = torch.tensor([torch.argmax(prediction[-1, :])])
  predict_ids = torch.cat((predict_ids, predict_id))

print("predict_ids")
for i in predict_ids:
  print(idx_to_token[i.item()])

predict_ids
france
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries
known
delicate
berries


In [191]:
trainer = pl.Trainer(max_epochs=30)
trainer.fit(model, dataloader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name                 | Type                | Params | Mode 
---------------------------------------------------------------------
0 | em                   | Embedding           | 16.6 K | train
1 | pe                   | positional_encoding | 0      | train
2 | attention            | Attention           | 197 K  | train
3 | attention_2          | Attention           | 197 K  | train
4 | attention_3          | Attention           | 197 K  | train
5 | reduce_attention_dim | Linear              | 65.8 K | train
6 | fc                   | Linear              | 65.8 K | train
7 | final_fc             | Linear              | 16.7 K | train
8 | loss                 | CrossEntropyLoss    | 0      | train
--

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=30` reached.


In [196]:
trainer = pl.Trainer(max_epochs=5)
trainer.fit(model, dataloader)

INFO:pytorch_lightning.utilities.rank_zero:GPU available: False, used: False
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.callbacks.model_summary:
  | Name                 | Type                | Params | Mode 
---------------------------------------------------------------------
0 | em                   | Embedding           | 16.6 K | train
1 | pe                   | positional_encoding | 0      | train
2 | attention            | Attention           | 197 K  | train
3 | attention_2          | Attention           | 197 K  | train
4 | attention_3          | Attention           | 197 K  | train
5 | reduce_attention_dim | Linear              | 65.8 K | train
6 | fc                   | Linear              | 65.8 K | train
7 | final_fc             | Linear              | 16.7 K | train
8 | loss                 | CrossEntropyLoss    | 0      | train
--

Training: |          | 0/? [00:00<?, ?it/s]

INFO:pytorch_lightning.utilities.rank_zero:`Trainer.fit` stopped: `max_epochs=5` reached.


In [200]:

model_input = torch.tensor([token_to_idx["what"],
                            token_to_idx["is"],
                            token_to_idx["chardonnay"],
                            token_to_idx["?"],
                            token_to_idx["<EOS>"]])

input_length = model_input.size(dim=0)

prediction = model(model_input)
predict_id = torch.tensor([torch.argmax(prediction[-1, :])])
predict_ids = predict_id #right now only 1 word was generated

for i in range(max_length):
  if predict_id == token_to_idx["<EOS>"]:
    break
  model_input = torch.cat((model_input, predict_id))
  prediction = model(predict_id)
  predict_id = torch.tensor([torch.argmax(prediction[-1, :])])
  predict_ids = torch.cat((predict_ids, predict_id))

print("predict_ids")
for i in predict_ids:
  print(idx_to_token[i.item()], end=" ")

predict_ids
the aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma aroma 