<a href="https://colab.research.google.com/github/sgajendra/LanguageModels/blob/main/Attention_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
from torch import nn
import torch.nn.functional as F

# Select the GPU when PyTorch can see one, otherwise fall back to the CPU.
USE_CUDA = torch.cuda.is_available()
device = torch.device("cuda" if USE_CUDA else "cpu")
device  # bare expression so the notebook displays the selected device

device(type='cuda')

In [None]:
# Make the dataset folder the working directory so later cells can open the corpus files by bare name.
# NOTE(review): hard-coded Google Drive path; presumably requires Drive to be mounted beforehand — confirm.
os.chdir('/content/drive/MyDrive/archive')
os.listdir()

['movie_lines.txt',
 'movie_titles_metadata.txt',
 'movie_characters_metadata.txt',
 '.DS_Store',
 'raw_script_urls.txt',
 'README.txt',
 'chameleons.pdf',
 'movie_conversations.txt']

In [None]:
# Load both corpus files as lists of raw text lines (trailing newlines kept).
# errors='ignore' silently drops bytes that are not valid UTF-8.
with open('movie_lines.txt','r',encoding='utf-8',errors='ignore') as file:
  lines = file.readlines()
with open('movie_conversations.txt','r',encoding='utf-8',errors='ignore') as file:
  convs = file.readlines()

In [None]:
# Preview the first ten raw records of each file to see the ' +++$+++ ' field layout.
lines[:10], convs[:10]

(['L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!\n',
  'L1044 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ They do to!\n',
  'L985 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I hope so.\n',
  'L984 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ She okay?\n',
  "L925 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Let's go.\n",
  'L924 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ Wow\n',
  "L872 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Okay -- you're gonna need to learn how to lie.\n",
  'L871 +++$+++ u2 +++$+++ m0 +++$+++ CAMERON +++$+++ No\n',
  'L870 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ I\'m kidding.  You know how sometimes you just become this "persona"?  And you don\'t know how to quit?\n',
  'L869 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ Like my fear of wearing pastels?\n'],
 ["u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L194', 'L195', 'L196', 'L197']\n",
  "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L198', 'L199']\n",
  "u0 +++$+++ u2 +++$+++ m0 +++$+++ ['L200', '

In [None]:
# Map each line id (first field of a movie_lines record) to its utterance text (last field).
lines_split = {}
for raw_line in lines:
  fields = raw_line.split(' +++$+++ ')
  line_id, utterance = fields[0].strip(), fields[-1].strip()
  lines_split[line_id] = utterance

# The last field of a conversation record is a stringified list such as "['L194', 'L195']\n".
# Drop the opening bracket and the two trailing characters, remove the quotes, and
# turn the comma separators into spaces so a plain split() yields the line ids.
convs_split = []
for raw_conv in convs:
  id_field = raw_conv.split(' +++$+++ ')[-1]
  id_field = id_field[1:-2]
  id_field = id_field.replace("'", "").replace(", ", " ")
  convs_split.append(id_field.split())

In [None]:
import string
def remove_punct(strings):
  """Return `strings` lower-cased with every ASCII punctuation character removed.

  Characters listed in `string.punctuation` are dropped; everything else
  (letters, digits, whitespace) is kept in its original order.
  """
  kept_chars = [ch for ch in strings if ch not in string.punctuation]
  return "".join(kept_chars).lower()


In [None]:
max_len = 25  # each utterance is truncated to at most this many tokens
pairs = []
for line_ids in convs_split:
  # Every two consecutive lines of a conversation form one [question, reply] example.
  for question_id, reply_id in zip(line_ids, line_ids[1:]):
    question_tokens = remove_punct(lines_split[question_id]).split()[:max_len]
    reply_tokens = remove_punct(lines_split[reply_id]).split()[:max_len]
    pairs.append([question_tokens, reply_tokens])


In [None]:
# Number of [question, reply] examples extracted from the corpus.
len(pairs)

221616

In [None]:
from collections import Counter
min_word_freq = 5

# Count every token on both sides of every pair.
word_freq = Counter()
for question_tokens, reply_tokens in pairs:
  word_freq.update(question_tokens)
  word_freq.update(reply_tokens)

# Keep only words seen strictly more than min_word_freq times.
# The stored value is the raw count plus one; the vocabulary cell below only uses the keys.
word_freq_trim = {
  word: freq + 1
  for word, freq in word_freq.items()
  if freq > min_word_freq
}

In [None]:
## Vocabulary: word -> index, with 0 reserved for padding.
# Regular words take indices 1..N in the iteration order of word_freq_trim.
word_indx = {word: idx for idx, word in enumerate(word_freq_trim, start=1)}
# The special tokens are appended one by one, each getting the next free index.
for special_token in ('<unk>', '<start>', '<end>'):
  word_indx[special_token] = len(word_indx) + 1
word_indx['<pad>'] = 0

# Reverse lookup: index -> word.
indx_word = {idx: word for word, idx in word_indx.items()}

In [None]:
def encode_questions(words, word_indx):
  """Convert a tokenised question into a list of vocabulary indices.

  Words missing from `word_indx` map to the '<unk>' index. The result is
  right-padded with the '<pad>' index by `max_len - len(words)` entries, where
  `max_len` is the module-level truncation length.
  """
  token_ids = []
  for word in words:
    if word in word_indx:
      token_ids.append(word_indx[word])
    else:
      token_ids.append(word_indx['<unk>'])
  padding = [word_indx['<pad>']] * (max_len - len(words))
  return token_ids + padding

In [None]:
def encode_answers(words, word_indx):
  """Convert a tokenised reply into indices framed by '<start>' and '<end>'.

  Words missing from `word_indx` map to the '<unk>' index. After the '<end>'
  index the list is right-padded with the '<pad>' index by
  `max_len - len(words)` entries (`max_len` is the module-level truncation
  length), so the result is two entries longer than an encoded question.
  """
  token_ids = [word_indx['<start>']]
  for word in words:
    if word in word_indx:
      token_ids.append(word_indx[word])
    else:
      token_ids.append(word_indx['<unk>'])
  token_ids.append(word_indx['<end>'])
  padding = [word_indx['<pad>']] * (max_len - len(words))
  return token_ids + padding

In [None]:
# Encode every [question, reply] token pair into a pair of index lists.
pairs_encoded = [
  [encode_questions(question_tokens, word_indx), encode_answers(reply_tokens, word_indx)]
  for question_tokens, reply_tokens in pairs
]

In [None]:
from torch.utils.data import Dataset

class dataset(Dataset):
  """Map-style dataset over pre-encoded [question, answer] index lists."""

  def __init__(self, input_pairs):
    # input_pairs[i][0] is the encoded question, input_pairs[i][1] the encoded answer.
    self.input_pairs = input_pairs
    self.input_pairs_len = len(input_pairs)

  def __len__(self):
    return self.input_pairs_len

  def __getitem__(self, index):
    """Return the pair at `index` as two LongTensors: (question, answer)."""
    encoded_pair = self.input_pairs[index]
    return torch.LongTensor(encoded_pair[0]), torch.LongTensor(encoded_pair[1])

In [None]:
# Convert the list of encoded pairs into shuffled mini-batches of 100 examples.
# Questions are padded to max_len entries by encode_questions; replies are two entries
# longer because encode_answers adds <start> and <end>.
from torch.utils.data import DataLoader
train_loader =  DataLoader(dataset(pairs_encoded), batch_size=100, shuffle=True)

In [None]:
##A look-ahead mask is required to prevent the decoder from attending to succeeding words, 
##such that the prediction for a particular word can only depend on known outputs for the words 
##that come before it.
##https://machinelearningmastery.com/joining-the-transformer-encoder-and-decoder-and-masking/

def create_mask(question, answers_input, answers_target):
  """Build the padding and look-ahead masks consumed by the transformer.

  Index 0 is treated as the padding token. Each mask lives on the device of the
  tensor it is derived from, so the function does not depend on a global device.

  Args:
    question: (batch, q_len) integer tensor of encoder token ids.
    answers_input: (batch, a_len) integer tensor fed to the decoder.
    answers_target: (batch, a_len) integer tensor the decoder should predict.

  Returns:
    question_mask: (batch, 1, 1, q_len), True at non-padding question tokens.
    answer_input_mask: (batch, 1, a_len, a_len); entry [b, 0, i, j] is True when
      decoder position i may attend to position j (j <= i and token j is not padding).
    answers_target_mask: (batch, a_len), True at non-padding target tokens.
  """

  def look_ahead_mask(size):
    # Lower-triangular matrix: row i is True for columns 0..i only.
    return torch.tril(torch.ones(size, size, device=answers_input.device)) != 0

  # Two singleton axes let the mask broadcast over heads and query positions.
  question_mask = (question != 0).unsqueeze(1).unsqueeze(1)

  padding_mask = (answers_input != 0).unsqueeze(1)  # (batch, 1, a_len)
  causal_mask = look_ahead_mask(answers_input.size(-1)).type_as(padding_mask)
  answer_input_mask = padding_mask & causal_mask    # (batch, a_len, a_len)
  # Add the head axis; without it the mask cannot broadcast against the
  # (batch, heads, a_len, a_len) attention scores.
  answer_input_mask = answer_input_mask.unsqueeze(1)

  answers_target_mask = (answers_target != 0)

  return question_mask, answer_input_mask, answers_target_mask

In [None]:
import math
class embedding(nn.Module):
  """Token embedding scaled by sqrt(d_model) plus fixed sinusoidal positional encoding.

  Args:
    vocab_size: number of rows in the embedding table.
    d_model: width of each embedding vector.
    max_len: longest sequence the positional table supports (default 50).
  """

  def __init__(self, vocab_size, d_model, max_len=50):
    super(embedding,self).__init__()
    self.d_model = d_model
    self.dropout = nn.Dropout(0.1)
    self.embed = nn.Embedding(vocab_size, d_model)
    # Non-persistent buffer: it follows the module through .to()/.cuda() but is
    # kept out of state_dict, so the table no longer depends on a global device.
    self.register_buffer('pe', self.create_positional_encoding(max_len, d_model), persistent=False)

  def create_positional_encoding(self, max_len, d_model):
    """Return the (1, max_len, d_model) sinusoidal position table.

    For every even feature index i:
      pe[pos, i]     = sin(pos / 10000 ** (i / d_model))
      pe[pos, i + 1] = cos(pos / 10000 ** (i / d_model))
    Both members of a pair share one wavelength. The index i already steps by
    two, so it must not be doubled again in the exponent.
    """
    positions = torch.arange(max_len, dtype=torch.float).unsqueeze(1)  # (max_len, 1)
    even_dims = torch.arange(0, d_model, 2, dtype=torch.float)        # 0, 2, 4, ...
    wavelengths = torch.pow(10000.0, even_dims / d_model)
    angles = positions / wavelengths                                   # (max_len, ceil(d_model / 2))
    pe = torch.zeros(max_len, d_model)
    pe[:, 0::2] = torch.sin(angles)
    # An odd d_model has one fewer odd column than even columns.
    pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])
    return pe.unsqueeze(0)

  def forward(self, encoded_words):
    """Embed (batch, seq_len) token ids into (batch, seq_len, d_model) vectors."""
    embeddings = self.embed(encoded_words) * math.sqrt(self.d_model)
    # Slice the table to the actual sequence length; broadcasts over the batch axis.
    embeddings = embeddings + self.pe[:, :embeddings.size(1)]
    return self.dropout(embeddings)



In [None]:
class multi_head_attention(nn.Module):
  """Multi-head scaled dot-product attention.

  Args:
    heads: number of attention heads; must divide d_model.
    d_model: width of the input and output vectors.

  Raises:
    ValueError: if d_model is not divisible by heads.
  """

  def __init__(self,heads, d_model):
    super(multi_head_attention,self).__init__()
    if d_model % heads != 0:
      raise ValueError("d_model ({}) must be divisible by heads ({})".format(d_model, heads))
    self.heads = heads
    self.dropout = nn.Dropout(0.1)
    self.d_k = d_model//heads
    self.query = nn.Linear(d_model, d_model)
    self.key = nn.Linear(d_model, d_model)
    self.value = nn.Linear(d_model, d_model)
    self.concat = nn.Linear(d_model, d_model)

  def _split_heads(self, projected, batch_size):
    # (batch, words, d_model) -> (batch, words, heads, d_k) -> (batch, heads, words, d_k)
    return projected.view(batch_size, -1, self.heads, self.d_k).permute(0, 2, 1, 3)

  def forward(self, query, key, value,mask):
    """
    query: (batch_size, query_words, d_model)
    key, value: (batch_size, key_words, d_model)
    mask: broadcastable to (batch_size, heads, query_words, key_words),
          e.g. (batch_size, 1, 1, key_words); positions where mask == 0 are hidden.
    Returns a tensor of shape (batch_size, query_words, d_model).
    """
    batch_size = query.size(0)

    query = self._split_heads(self.query(query), batch_size)
    key = self._split_heads(self.key(key), batch_size)
    value = self._split_heads(self.value(value), batch_size)

    # (batch, h, q_words, d_k) @ (batch, h, d_k, k_words) -> (batch, h, q_words, k_words)
    score = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.d_k)
    # A large negative score makes masked keys receive ~0 weight after softmax.
    score = score.masked_fill(mask == 0, -1e9)
    # Normalise over the key axis (last dimension).
    weights = F.softmax(score, dim=-1)
    weights = self.dropout(weights)

    # (batch, h, q_words, k_words) @ (batch, h, k_words, d_k) -> (batch, h, q_words, d_k)
    context = torch.matmul(weights, value)
    # Bring heads next to d_k and merge them back into d_model. permute() returns a
    # non-contiguous tensor, so contiguous() is required before view().
    context = context.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.heads * self.d_k)
    interacted = self.concat(context)

    return interacted

In [None]:
class feedforward(nn.Module):
  """Two-layer MLP applied to the last dimension: d_model -> middle_dim -> d_model."""

  def __init__(self,d_model, middle_dim=2048):
    super().__init__()
    self.fc1 = nn.Linear(d_model, middle_dim)
    self.dropout = nn.Dropout(0.1)
    self.fc2 = nn.Linear(middle_dim, d_model)

  def forward(self, interacted):
    """Expand, apply ReLU and dropout, then project back to d_model."""
    hidden = self.dropout(F.relu(self.fc1(interacted)))
    return self.fc2(hidden)

In [None]:
class encoder_layer(nn.Module):
  """One encoder block: self-attention, then feed-forward, each followed by a
  residual add and layer normalisation.

  The same LayerNorm instance is applied after both sub-layers, and dropout is
  applied only to the feed-forward output.
  """

  def __init__(self, d_model, heads):
    super().__init__()
    self.self_multiheadattention = multi_head_attention(heads, d_model)
    self.feedforward = feedforward(d_model)
    self.layerNorm = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(0.1)

  def forward(self, embeddings, mask):
    """Encode `embeddings` using `mask` to hide positions from self-attention."""
    # Sub-layer 1: self-attention with a residual connection.
    attended = self.self_multiheadattention(embeddings, embeddings, embeddings, mask)
    attended = self.layerNorm(attended + embeddings)
    # Sub-layer 2: feed-forward with dropout and a residual connection.
    transformed = self.dropout(self.feedforward(attended))
    return self.layerNorm(transformed + attended)

In [None]:
class decoder_layer(nn.Module):
  """One decoder block: masked self-attention, attention over the encoder output,
  and a feed-forward network, each followed by dropout, a residual add and layer norm.

  The same LayerNorm instance is applied after all three sub-layers.
  """

  def __init__(self, d_model, heads):
    super(decoder_layer,self).__init__()
    self.self_multiheadattention = multi_head_attention(heads, d_model)
    self.src_multiheadattention = multi_head_attention(heads, d_model)
    self.feedforward = feedforward(d_model)
    self.layerNorm = nn.LayerNorm(d_model)
    self.dropout = nn.Dropout(0.1)

  def forward(self, embeddings, encoded, src_mask, target_mask):
    """Decode one layer.

    Args:
      embeddings: decoder-side input representations.
      encoded: encoder output that the second attention reads from.
      src_mask: mask applied when attending over `encoded`.
      target_mask: mask applied in the decoder self-attention.
    """
    # Sub-layer 1: masked self-attention over the decoder input.
    query = self.self_multiheadattention(embeddings,embeddings,embeddings,target_mask)
    query = self.dropout(query)
    # The residual must add the input tensor `embeddings`, not the `embedding` class.
    query = self.layerNorm(query + embeddings)
    # Sub-layer 2: queries come from the decoder, keys and values from the encoder.
    interacted = self.src_multiheadattention(query,encoded,encoded,src_mask)
    interacted = self.dropout(interacted)
    interacted = self.layerNorm(interacted+query)
    # Sub-layer 3: position-wise feed-forward.
    feed_forward_out = self.dropout(self.feedforward(interacted))
    decoded = self.layerNorm(feed_forward_out + interacted)
    return decoded
    

In [None]:
# Transformer: `num_layers` stacked encoder layers and `num_layers` stacked decoder layers (the original paper uses 6 of each)

class Transformer(nn.Module):
  """Encoder-decoder transformer with one embedding shared by source and target.

  Args:
    d_model: width of all hidden representations.
    heads: number of attention heads per attention module.
    num_layers: number of encoder layers and of decoder layers.
    word_map: vocabulary mapping; only its length (the vocabulary size) is used.
  """

  def __init__(self, d_model, heads, num_layers, word_map):
    super(Transformer,self).__init__()
    self.d_model = d_model
    self.vocab_size = len(word_map)
    self.embed = embedding(self.vocab_size, d_model)
    self.encoder = nn.ModuleList([encoder_layer(d_model, heads) for _ in range(num_layers)])
    self.decoder = nn.ModuleList([decoder_layer(d_model, heads) for _ in range(num_layers)])
    self.logit = nn.Linear(d_model, self.vocab_size)

  def encode(self, src_words, src_mask):
    """Embed the source ids and pass them through every encoder layer."""
    src_embeddings =  self.embed(src_words)
    for layers in self.encoder:
      src_embeddings = layers(src_embeddings,src_mask)
    return src_embeddings

  def decode(self,target_words, target_mask,src_embeddings, src_mask):
    """Embed the target ids and pass them through every decoder layer.

    `src_embeddings` is the output of `encode`; the returned tensor has not yet
    been projected to vocabulary logits.
    """
    tgt_embeddings =  self.embed(target_words)
    for layers in self.decoder:
      tgt_embeddings = layers(tgt_embeddings, src_embeddings,src_mask, target_mask)
    return tgt_embeddings

  def forward(self, src_word, src_mask, target_words, target_mask):
    """Return log-probabilities over the vocabulary for every target position."""
    encoded = self.encode(src_word,src_mask)
    decoded = self.decode(target_words, target_mask,encoded, src_mask)
    # Normalise over the vocabulary axis. Without an explicit dim, the deprecated
    # implicit choice for a 3-D input is dim 0, i.e. the batch axis.
    out = F.log_softmax(self.logit(decoded), dim=-1)
    return out

In [None]:
class Adamwarmup:
  """Wrap an optimizer with a warm-up learning-rate schedule.

  lr(step) = model_size ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

  The rate grows linearly for `warmup_steps` steps and then decays with the
  inverse square root of the step number.

  Args:
    model_size: model width used to scale the rate.
    warmup_steps: number of steps of linear warm-up.
    optimizer: wrapped optimizer; must expose `param_groups` and `step()`.
  """

  def __init__(self, model_size, warmup_steps, optimizer):
    self.model_size = model_size
    self.warmup_steps = warmup_steps
    self.optimizer = optimizer
    self.current_step = 0
    self.lr = 0

  def get_lr(self):
    """Return the learning rate for `current_step` (0.0 before the first step)."""
    # 0 ** -0.5 raises ZeroDivisionError; the schedule starts from zero anyway.
    if self.current_step <= 0:
      return 0.0
    return self.model_size ** (-0.5) * min(self.current_step **(-0.5), self.current_step * self.warmup_steps ** (-1.5))

  def step(self):
    """Advance the schedule by one step, set the new rate, and update the weights."""
    self.current_step += 1
    lr = self.get_lr()
    for param_group in self.optimizer.param_groups:
      param_group['lr'] = lr
    self.lr = lr
    # Apply the gradients with the freshly set learning rate.
    self.optimizer.step()


In [None]:
class losswithLS(nn.Module):
  """KL-divergence loss against label-smoothed targets, averaged over unmasked tokens.

  Args:
    size: number of classes (vocabulary size).
    smoth: probability mass taken from the true class and spread evenly over the
      other `size - 1` classes.
  """

  def __init__(self, size, smoth):
    super(losswithLS, self).__init__()
    # reduction='none' keeps the per-element loss so padding can be masked out below.
    self.criteria = nn.KLDivLoss(reduction='none')
    self.confidence =1-smoth
    self.smooth = smoth
    self.size = size

  def forward(self, prediction, target, mask):
    """
    prediction: (batch_size, max_words, vocab_size) log-probabilities
    target and mask: (batch_size, max_words)
    Returns the scalar loss averaged over positions where mask is non-zero.
    """
    # reshape() instead of view(): the target is typically a non-contiguous slice
    # such as reply[:, 1:].
    predictions = prediction.reshape(-1, prediction.size(-1))
    target = target.reshape(-1)
    mask = mask.float().reshape(-1)

    # Smoothed target distribution: smooth / (size - 1) everywhere, `confidence`
    # at the true class. scatter_ writes in place; the out-of-place scatter()
    # returns a new tensor and would leave `labels` unchanged.
    labels = torch.full_like(predictions, self.smooth / (self.size - 1))
    labels.scatter_(1, target.unsqueeze(1), self.confidence)

    # Per-token loss, zeroed on padding, averaged over the real tokens.
    loss = self.criteria(predictions, labels)
    # clamp avoids 0/0 when a batch contains no unmasked token.
    loss = (loss.sum(1) * mask).sum() / mask.sum().clamp(min=1.0)
    return loss

In [None]:
# Hyper-parameters.
d_model = 512
heads =8
num_layers =1
# Re-evaluates the same device selection made in the first cell.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs =1
# Alias of the vocabulary dict; only its length is used by the model and the loss.
word_map = word_indx

# Model, optimizer with warm-up schedule, and label-smoothed loss.
transformer = Transformer(d_model=d_model, heads=heads, num_layers=num_layers, word_map=word_map)
transformer.to(device)
# lr starts at 0 because Adamwarmup overwrites it on every step.
adam_optimizer = torch.optim.Adam(transformer.parameters(), lr = 0,betas=(0.9,0.98), eps=1e-9)
tranformer_optimer = Adamwarmup(model_size=d_model, warmup_steps=4000, optimizer=adam_optimizer)
criterion = losswithLS(size=len(word_map), smoth=0.3)




In [None]:
def train(train_loader, transformer, criterion, epoch):
  """Train `transformer` for one pass over `train_loader`.

  Relies on the module-level `device`, `create_mask` and `tranformer_optimer`.

  Args:
    train_loader: iterable yielding (question, reply) index-tensor batches.
    transformer: model called as transformer(question, question_mask, reply_input, reply_input_mask).
    criterion: loss called as criterion(output, reply_target, reply_target_mask).
    epoch: index of the current epoch; used only in the progress log.
  """
  transformer.train()
  sum_loss = 0
  count = 0
  for i, (question, reply) in enumerate(train_loader):

    samples = question.shape[0]
    question = question.to(device)
    reply = reply.to(device)

    # Teacher forcing: the decoder sees tokens [0, n-1) and must predict tokens [1, n).
    reply_input = reply[:, :-1]
    # contiguous() so the loss can flatten the target whichever way it reshapes it.
    reply_target = reply[:, 1:].contiguous()

    question_mask, reply_input_mask, reply_target_mask = create_mask(question, reply_input, reply_target)

    out = transformer(question, question_mask, reply_input, reply_input_mask)
    loss = criterion(out, reply_target, reply_target_mask)

    # Clear stale gradients, back-propagate, then step the schedule and the optimizer.
    tranformer_optimer.optimizer.zero_grad()
    loss.backward()
    tranformer_optimer.step()

    # The loss is weighted by batch size, so the running mean divides by the
    # number of samples seen, not the number of batches.
    sum_loss += loss.item()*samples
    count += samples

    if i % 100 ==0:
      print("Epochs [{}][{}/{}]\tLoss: {:.3f}".format(epoch, i, len(train_loader), sum_loss/count))

In [None]:
def Evaluation(transformer, question, question_mask, max_len, word_indx):
  """Greedy-decode a reply to a single encoded question.

  Args:
    transformer: model exposing `encode`, `decode`, `logit` and `eval`.
    question: (1, q_len) LongTensor of token ids.
    question_mask: mask for `question`, as expected by `transformer.encode`.
    max_len: cap on the decoded length counted with the start token, so at most
      max_len - 1 words are produced; any value accepted by int() works.
    word_indx: vocabulary mapping word -> index; must contain '<start>' and '<end>'.

  Returns:
    The generated words joined by single spaces, without the start token.
  """
  rev_indx_word = {v:k for k, v in word_indx.items()}
  transformer.eval()

  max_len = int(max_len)  # tolerate a string read from input()
  start_token = word_indx["<start>"]
  end_token = word_indx["<end>"]
  run_device = question.device  # keep every new tensor on the question's device

  with torch.no_grad():  # inference only, no gradients needed
    encoded = transformer.encode(question, question_mask)
    words = torch.LongTensor([[start_token]]).to(run_device)  # (1, generated_len)

    for step in range(max_len - 1):
      size = words.shape[1]  # number of tokens generated so far (axis 0 is the batch)
      # Lower-triangular look-ahead mask shaped (1, 1, size, size).
      target_mask = torch.tril(torch.ones(size, size, device=run_device)) != 0
      target_mask = target_mask.unsqueeze(0).unsqueeze(0)
      decoded = transformer.decode(words, target_mask, encoded, question_mask)
      # Only the last position predicts the next word: (1, vocab_size).
      predictions = transformer.logit(decoded[:, -1])
      _, next_word = torch.max(predictions, dim=1)
      next_word = next_word.item()
      if next_word == end_token:
        break
      words = torch.cat([words, torch.LongTensor([[next_word]]).to(run_device)], dim=1)

  words = words.squeeze(0).tolist()  # one-dimensional list of token ids

  sen_indx = [w for w in words if w != start_token]
  sentence = " ".join(rev_indx_word[idx] for idx in sen_indx)
  return sentence

In [None]:
for epoch in range(epochs):
  train(train_loader, transformer, criterion, epoch)
  # After each epoch, snapshot the whole model and scheduler objects to
  # 'checkpoint_<epoch>.tar' in the current working directory.
  state = dict(epoch=epoch, transformer=transformer, transformer_optimizer=tranformer_optimer)
  torch.save(state, 'checkpoint_{}.tar'.format(epoch))

AttributeError: ignored

In [None]:
# Reload the checkpoint written after the last training epoch; the training loop
# names files 'checkpoint_<epoch>.tar', so 'checkpoint_.tar' is never created.
# map_location keeps the load working on a machine without the GPU it was saved on.
# The file holds whole pickled objects, not just weights, hence weights_only=False.
# Unpickling executes code, so only load checkpoints produced by this notebook.
checkpoint = torch.load('checkpoint_' + str(epochs - 1) + '.tar', map_location=device, weights_only=False)
transformer = checkpoint['transformer']

In [None]:
# Interactive chat: read a question, encode it like the training data, decode a reply.
while True:
  question = input("Question:")
  if question == 'quit':
    break
  # Separate name so the module-level max_len used by the encoders is not clobbered;
  # int() because input() returns a string.
  max_reply_len = int(input("Enter max word to be granted"))
  # Same preprocessing as training: strip punctuation and lower-case, then split.
  tokens = remove_punct(question).split()
  enc_ques = [word_map.get(word, word_indx['<unk>']) for word in tokens]
  if not enc_ques:
    continue  # nothing left to encode
  question_ids = torch.LongTensor(enc_ques).to(device).unsqueeze(0)          # (1, q_len)
  question_mask = (question_ids != 0).to(device).unsqueeze(1).unsqueeze(1)   # (1, 1, 1, q_len)
  sentence = Evaluation(transformer, question_ids, question_mask, max_reply_len, word_indx)
  print(sentence)


In [None]:
# Scratch check: whitespace tokenisation of a sample question.
question='How are you'
question.split()

In [None]:
# Scratch check of the decoder-input slice (every column but the last).
# NOTE(review): `a` is not defined in any visible cell, so this fails on a fresh kernel;
# the recorded output suggests it was a (10, 5) tensor — confirm.
a[:, :-1].shape

torch.Size([10, 4])

In [None]:
# Scratch check of the decoder-target slice (every column but the first).
# NOTE(review): `a` is not defined in any visible cell, so this fails on a fresh kernel;
# the recorded output suggests it was a (10, 5) tensor — confirm.
a[:, 1:].shape

torch.Size([10, 4])