In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchtext
import nltk

import unicodedata
import os
import string
import re


import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

USE_GPU = True
if USE_GPU and torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print('using device:', device)



using device: cuda


In [2]:
def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
      if unicodedata.category(c) != 'Mn')


def preprocess_sentence(text):
    text = unicode_to_ascii(text.lower().strip())
    text = re.sub(r"i'm", "i am", text)
    text = re.sub(r"\r", "", text)
    text = re.sub(r"he's", "he is", text)
    text = re.sub(r"she's", "she is", text)
    text = re.sub(r"it's", "it is", text)
    text = re.sub(r"that's", "that is", text)
    text = re.sub(r"what's", "that is", text)
    text = re.sub(r"where's", "where is", text)
    text = re.sub(r"how's", "how is", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"won't", "will not", text)
    text = re.sub(r"can't", "cannot", text)
    text = re.sub(r"n't", " not", text)
    text = re.sub(r"n'", "ng", text)
    text = re.sub(r"'bout", "about", text)
    text = re.sub(r"'til", "until", text)
    text = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,]", "", text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub("(\\W)"," ",text)
    text = re.sub('\S*\d\S*\s*','', text)
    return text

In [3]:
lines = open('/kaggle/input/chatbot-data/cornell movie-dialogs corpus/movie_lines.txt', encoding='utf-8',
             errors='ignore').read().split('\n')

conversatino = open('/kaggle/input/chatbot-data/cornell movie-dialogs corpus/movie_conversations.txt', encoding='utf-8',
             errors='ignore').read().split('\n')

In [4]:
conversatino_rel_line = []
for row in conversatino:
    line_ids = row.split(' +++$+++ ')[-1][1:-1].replace("'", "").split(', ')
    conversatino_rel_line.append(line_ids)

dialog = {}
for line in lines:
    row = line.split(' +++$+++ ')
    dialog[row[0]] = row[-1]


questions = []
answers = []
for link_ids in conversatino_rel_line:
    for i in range(len(link_ids) - 1):
        question = dialog[link_ids[i]].strip()
        answer = dialog[link_ids[i + 1]].strip()
        
        #  Take sentence with 4 and more words
        if len(question.split()) > 3 and len(answer.split()) > 3:
          questions.append(question)
          answers.append(answer)

df = pd.DataFrame({'QUESTION': questions[0:10000], 'ANSWER': answers[0:10000]})
# df = pd.DataFrame({'QUESTION': questions, 'ANSWER': answers})

df.dropna(axis=0, how='any', inplace=True)

df['QUESTION_CLEAN'] = df['QUESTION'].apply(lambda x: preprocess_sentence(x))
df['ANSWER_CLEAN'] = df['ANSWER'].apply(lambda x: preprocess_sentence(x))

df

Unnamed: 0,QUESTION,ANSWER,QUESTION_CLEAN,ANSWER_CLEAN
0,Can we make this quick? Roxanne Korrine and A...,"Well, I thought we'd start with pronunciation,...",can we make this quick roxanne korrine and an...,well i thought we would start with pronunciati...
1,"Well, I thought we'd start with pronunciation,...",Not the hacking and gagging and spitting part....,well i thought we would start with pronunciati...,not the hacking and gagging and spitting part ...
2,Not the hacking and gagging and spitting part....,Okay... then how 'bout we try out some French ...,not the hacking and gagging and spitting part ...,okay then how about we try out some french cui...
3,"The thing is, Cameron -- I'm at the mercy of a...",Seems like she could get a date easy enough...,the thing is cameron i am at the mercy of a p...,seems like she could get a date easy enough
4,"Gosh, if only we could find Kat a boyfriend...",Let me see what I can do.,gosh if only we could find kat a boyfriend,let me see what i can do
...,...,...,...,...
9995,We're not going that route. Sonrisa's not som...,What else can we do?,we are not going that route sonrisas not some...,what else can we do
9996,What else can we do?,Like I said-- This isn't my regular line of wo...,what else can we do,like i said this is not my regular line of wor...
9997,Goodnight Bears. Goodnight chairs. Goodnight ...,I don't want to move away.,goodnight bears goodnight chairs goodnight ki...,i do not want to move away
9998,I don't want to move away.,"I know, Honey, but just think how exiting it w...",i do not want to move away,i know honey but just think how exiting it wil...


In [5]:
df = pd.read_parquet("/kaggle/input/squad-datasets/squad.parquet")
df.head()
df["answers"][0]["text"][0]
df["answersTmp"] = df["answers"].apply(lambda x : str(x["text"]))
df.dropna(axis=0, how='any', inplace=True)
df['QUESTION_CLEAN'] = df['question'].apply(lambda x: preprocess_sentence(x))
df['ANSWER_CLEAN'] = df['answersTmp'].apply(lambda x: preprocess_sentence(x))
df


Unnamed: 0,id,title,context,question,answers,answersTmp,QUESTION_CLEAN,ANSWER_CLEAN
0,56be85543aeaaa14008c9063,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce start becoming popular?,"{'text': ['in the late 1990s'], 'answer_start'...",['in the late 1990s'],when did beyonce start becoming popular,in the late
1,56be85543aeaaa14008c9065,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What areas did Beyonce compete in when she was...,"{'text': ['singing and dancing'], 'answer_star...",['singing and dancing'],what areas did beyonce compete in when she was...,singing and dancing
2,56be85543aeaaa14008c9066,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,When did Beyonce leave Destiny's Child and bec...,"{'text': ['2003'], 'answer_start': [526]}",['2003'],when did beyonce leave destinys child and beco...,
3,56bf6b0f3aeaaa14008c9601,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In what city and state did Beyonce grow up?,"{'text': ['Houston, Texas'], 'answer_start': [...","['Houston, Texas']",in what city and state did beyonce grow up,houston texas
4,56bf6b0f3aeaaa14008c9602,Beyoncé,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,In which decade did Beyonce become famous?,"{'text': ['late 1990s'], 'answer_start': [276]}",['late 1990s'],in which decade did beyonce become famous,late
...,...,...,...,...,...,...,...,...
130314,5a7e070b70df9f001a875439,Matter,"The term ""matter"" is used throughout physics i...",Physics has broadly agreed on the definition o...,"{'text': [], 'answer_start': []}",[],physics has broadly agreed on the definition o...,
130315,5a7e070b70df9f001a87543a,Matter,"The term ""matter"" is used throughout physics i...",Who coined the term partonic matter?,"{'text': [], 'answer_start': []}",[],who coined the term partonic matter,
130316,5a7e070b70df9f001a87543b,Matter,"The term ""matter"" is used throughout physics i...",What is another name for anti-matter?,"{'text': [], 'answer_start': []}",[],what is another name for antimatter,
130317,5a7e070b70df9f001a87543c,Matter,"The term ""matter"" is used throughout physics i...",Matter usually does not need to be used in con...,"{'text': [], 'answer_start': []}",[],matter usually does not need to be used in con...,


In [6]:
df = pd.read_parquet("/kaggle/input/gsm-general-qa/gsm_general_qa_instructions_train.parquet")
df.dropna(axis=0, how='any', inplace=True)
df['QUESTION_CLEAN'] = df['prompt'].apply(lambda x: preprocess_sentence(x))
df['ANSWER_CLEAN'] = df['chosen'].apply(lambda x: preprocess_sentence(x))
df

Unnamed: 0,prompt,chosen,QUESTION_CLEAN,ANSWER_CLEAN
7332,Human: John decides to start collecting art. ...,"The first 3 pieces each cost 45000/3=$15,000\n...",human john decides to start collecting art he...,the first pieces each cost so the next piece w...
15658,Human: Identify the major theme of the followi...,The major theme of this excerpt is the pursuit...,human identify the major theme of the followin...,the major theme of this excerpt is the pursuit...
11209,Human: Pretend you are a teacher explaining th...,"The water cycle, also known as the hydrologic ...",human pretend you are a teacher explaining the...,the water cycle also known as the hydrologic c...
1725,Human: Explain the plot of the movie Inception...,Inception follows a skilled thief named Dom Co...,human explain the plot of the movie inception ...,inception follows a skilled thief named dom co...
6071,Human: Take the following text and rewrite it ...,"Dear colleagues, I wanted to inform you that o...",human take the following text and rewrite it i...,dear colleagues i wanted to inform you that ou...
...,...,...,...,...
6310,"Human: Tracy, John and Jake found their combin...",Jake weighs 8kg more than 52 kg which is 8+52 ...,human tracy john and jake found their combined...,jake weighs more than kg which is kg tracy and...
17757,Human: Rewrite the following paragraph in a mo...,Greetings. Upon reflecting on the influence of...,human rewrite the following paragraph in a mor...,greetings upon reflecting on the influence of ...
9640,Human: Read the paragraph and find any spellin...,Jake had just gotten his new puppy when sudden...,human read the paragraph and find any spelling...,jake had just gotten his new puppy when sudden...
14736,Human: Imagine you're a detective trying to so...,1. Collect and analyze the surveillance footag...,human imagine you are a detective trying to so...,collect and analyze the surveillance footage t...


In [7]:
df = pd.read_parquet("/kaggle/input/ubuntu-datasets/ubuntu_qa.parquet")
df.head()
df.dropna(axis=0, how='any', inplace=True)
df['QUESTION_CLEAN'] = df['question'].apply(lambda x: preprocess_sentence(x))
df['ANSWER_CLEAN'] = df['answer'].apply(lambda x: preprocess_sentence(x))

In [8]:
df.drop(df[df.ANSWER_CLEAN ==""].index, inplace=True)
df.drop(df[df.ANSWER_CLEAN.str.split().str.len() <16].index, inplace=True)

In [9]:
df

Unnamed: 0,question,answer,QUESTION_CLEAN,ANSWER_CLEAN
1,is there a difference between using 'bash scri...,./foo.sh executes the file. if the file starts...,is there a difference between using bash scrip...,foosh executes the file if the file starts wit...
4,any help with figuring out what driver my x se...,"the x server isn't just the video driver, its ...",any help with figuring out what driver my x se...,the x server is not just the video driver its ...
5,can anyone help me please installing ubuntu fr...,what is so hard in that? just download unetboo...,can anyone help me please installing ubuntu fr...,what is so hard in that just download unetboot...
8,it seems that 10.10 is still weak on drivers f...,how interesting... i wish i knew that before i...,it seems that is still weak on drivers for old...,how interesting i wish i knew that before i in...
12,"hey all, i just created a new fat32 partition....",if this is on your hard drive and not flash ju...,hey all i just created a new partition however...,if this is on your hard drive and not flash ju...
...,...,...,...,...
12056,"hi, how can i stop someone from resetting my r...",you may be able to disable usb & cd booting in...,hi how can i stop someone from resetting my ro...,you may be able to disable usb cd booting in ...
12063,"when i install kde, do i get the option to cho...",yes... via gdm or kdm ... you can edit /etc/x1...,when i install kde do i get the option to choo...,yes via gdm or kdm you can edit to configure ...
12093,i have just got one game and it's looks like b...,it's probably not executable. either chmod it ...,i have just got one game and it is looks like ...,it is probably not executable either chmod it ...
12096,"hi, i need some help. i installed ubuntu 7.04 ...",edit the kernel boot options... nosplash quiet...,hi i need some help i installed ubuntu in my c...,edit the kernel boot options nosplash quiet t...


In [10]:
from sklearn.model_selection import train_test_split
from torch.nn.utils.rnn import pad_sequence
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader

tokenizer = get_tokenizer('basic_english')

special_tokens = ['<SOS', '<EOS>', '<UNK>', '<PAD>']

# create zip from querstion and answer
qa_pairs = list(zip(df['QUESTION_CLEAN'], df['ANSWER_CLEAN']))


# Split into training and validation sets
train_pairs, val_pairs = train_test_split(qa_pairs, test_size=0.5, random_state=42)

# Separate questions and answers for convenience
train_questions, train_answers = zip(*train_pairs)
val_questions, val_answers = zip(*val_pairs)

# Define the tokenizer
tokenizer = get_tokenizer('basic_english')

# Function to yield list of tokens
def yield_tokens(data_iter):
    for text in data_iter:
        yield tokenizer(text)

# Build vocabulary
train_texts = train_questions + train_answers + val_questions + val_answers
VOCAB = build_vocab_from_iterator(yield_tokens(train_texts), specials=['<PAD>', '<SOS>', '<EOS>', '<UNK>'])
VOCAB.set_default_index(VOCAB['<UNK>'])

VOCAB_SIZE = len(VOCAB)
# INPUT_SEQ_LEN = 25
# TARGET_SEQ_LEN = 25

# INPUT_SEQ_LEN = df['QUESTION_CLEAN'].str.split().str.len().max()
# TARGET_SEQ_LEN = df['ANSWER_CLEAN'].str.split().str.len().max()

INPUT_SEQ_LEN = df['QUESTION_CLEAN'].str.split().str.len().median().__int__()
TARGET_SEQ_LEN = df['ANSWER_CLEAN'].str.split().str.len().median().__int__()


print('VOCAB_SIZE:', VOCAB_SIZE)
print('INPUT_SEQ_LEN:', INPUT_SEQ_LEN)
print('TARGET_SEQ_LEN:', TARGET_SEQ_LEN)


def tokens_to_text(tokens):
    # check if token is tensor or numpy array
    if isinstance(tokens, torch.Tensor):
        tokens = tokens.cpu().numpy()
    special_tokens = np.array([VOCAB['<SOS>'], VOCAB['<PAD>'], VOCAB['<UNK>'], VOCAB['<EOS>']])
    tokens = [token for token in tokens if token not in special_tokens]
    return ' '.join(VOCAB.lookup_tokens(tokens))


def text_to_tokens(text):
    return [VOCAB[token] for token in tokenizer(text)]

# Create a custom PyTorch Dataset
class QADataset(Dataset):
    def __init__(self, df, vocab, tokenizer, input_seq_len, target_seq_len):
        self.df = df
        self.vocab = vocab
        self.tokenizer = tokenizer
        self.input_seq_len = input_seq_len
        self.target_seq_len = target_seq_len

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        question, answer = row['QUESTION_CLEAN'], row['ANSWER_CLEAN']

        # Tokenize and encode the sequences
        question_tokens = text_to_tokens(question)
        answer_tokens = text_to_tokens(answer)

        # Pad the sequences
        enc_src = self.pad_sequence(question_tokens + [self.vocab['<EOS>']], self.input_seq_len)
        dec_src = self.pad_sequence([self.vocab['<SOS>']] + answer_tokens, self.target_seq_len)
        trg = self.pad_sequence([self.vocab['<SOS>']] + answer_tokens + [self.vocab['<EOS>']], self.target_seq_len)

        return enc_src, dec_src, trg

    def pad_sequence(self, seq, max_len):
        return F.pad(torch.LongTensor(seq), (0, max_len - len(seq)), mode='constant', value=self.vocab['<PAD>'])
    

dataset = QADataset(df, VOCAB, tokenizer, INPUT_SEQ_LEN, TARGET_SEQ_LEN)


dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

out = next(iter(dataloader))

VOCAB_SIZE: 6694
INPUT_SEQ_LEN: 19
TARGET_SEQ_LEN: 20


In [11]:
import math

class SelfAttention(nn.Module):
    def __init__(self, emb_size, heads):
        super(SelfAttention, self).__init__()
        self.emb_size = emb_size
        self.heads = heads
        self.head_dim = emb_size // heads
        
        assert (self.head_dim * heads == emb_size), "Embedding size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, emb_size)

    def forward(self, values, keys, query, mask):
        batch_size = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]
        
        # Split the embedding into self.heads different pieces
        values = values.reshape(batch_size, value_len, self.heads, self.head_dim)
        keys = keys.reshape(batch_size, key_len, self.heads, self.head_dim)
        queries = query.reshape(batch_size, query_len, self.heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        # Calculate energy
        # energy = torch.matmul(queries, keys.transpose(1, 2)) # (batch_size, head, query_len, key_len)')
        energy = torch.einsum('bqhd,bkhd->bhqk', [queries, keys]) # (batch_size, head, query_len, key_len)')

        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        d_k = keys.shape[3]
        attention = torch.softmax(energy / (d_k ** 0.5), dim=3) # (batch_size, head, query_len, key_len)

        out = torch.einsum('bhqk,bvhd->bqhd', [attention, values]) # (batch_size, query_len, head, embed_dim)
        out = out.reshape(batch_size, query_len, self.heads * self.head_dim) # (batch_size, query_len, embed_dim)
        out = self.fc_out(out) # (batch_size, query_len, embed_dim)
        return out
     
class TransformerBlock(nn.Module):
    def __init__(self, emb_size, heads, forward_expansion, drop_out) -> None:
        super(TransformerBlock, self).__init__()

        self.attention = SelfAttention(emb_size, heads)
        self.norm1 = nn.LayerNorm(emb_size)
        self.norm2 = nn.LayerNorm(emb_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(in_features=emb_size, out_features=forward_expansion * emb_size, bias=True),
            nn.ReLU(),
            nn.Linear(in_features=forward_expansion * emb_size, out_features=emb_size, bias=True),
        )

        self.dropout = nn.Dropout(drop_out)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)

        # Add skip connection, run through normalization and finally dropout
        norm = self.norm1(attention + query)
        norm = self.dropout(norm)

        forward = self.feed_forward(norm)
        out = self.norm2(forward + norm)
        return out
    
    
class WordPositionEmbedding(nn.Module):
    def __init__(self, vocab_size, max_seq_len, emb_size, device, fixed=True):
        super(WordPositionEmbedding, self).__init__()
        
        self.device = device
        self.fixed = fixed

        self.word_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_size, device=device)
        
        if fixed:
            # Create fixed (non-learnable) position embeddings
            position = torch.arange(max_seq_len).unsqueeze(1)
            div_term = torch.exp(torch.arange(0, emb_size, 2).float() * (-math.log(10000.0) / emb_size))
            position_embedding = torch.zeros(max_seq_len, emb_size)
            position_embedding[:, 0::2] = torch.sin(position * div_term)
            position_embedding[:, 1::2] = torch.cos(position * div_term)
            # Register position_embedding as a buffer
            self.register_buffer('position_embedding', position_embedding)
        else:
            self.position_embedding = nn.Embedding(max_seq_len, emb_size)

    def forward(self, X):
        batch_size, seq_len = X.shape

        # Get word embeddings
        word = self.word_embedding(X)

        if self.fixed:
            # Use fixed position embeddings
            position = self.position_embedding[:seq_len, :]
        else:
            # Get position embeddings
            position_ids = torch.arange(seq_len, device=self.device).unsqueeze(0).expand(batch_size, seq_len)
            position = self.position_embedding(position_ids)
        
        # Add word and position embeddings
        embeddings = word + position
        return embeddings
    

class Encoder(nn.Module):
    def __init__(self, vocab_size, seq_len, emb_size, n_layers, heads, forward_expansion, drop_out, device):
        super(Encoder, self).__init__()

        self.emb_size = emb_size
        self.device = device
        
        self.embedding = WordPositionEmbedding(vocab_size, seq_len, emb_size, device, fixed=True)

        self.layers = nn.ModuleList(
            [
                TransformerBlock(emb_size, heads, forward_expansion, drop_out) for _ in range(n_layers)
            ]
        )

        self.dropout = nn.Dropout(drop_out)

    def forward(self, X, mask):
        batch_size, seq_len = X.shape
        out = self.dropout(self.embedding(X))

        for layer in self.layers:
            out = layer(out, out, out, mask)
        return out
        

class DecoderBlock(nn.Module):
    def __init__(self, emb_size, heads, forward_expansion, drop_out) -> None:
        super(DecoderBlock, self).__init__()

        self.attention = SelfAttention(emb_size, heads)
        self.norm = nn.LayerNorm(emb_size)
        self.transformer_block = TransformerBlock(emb_size, heads, forward_expansion, drop_out)
        self.dropout = nn.Dropout(drop_out)
    
    def forward(self, X, value, key, src_mask, trg_mask):
        attention = self.attention(X, X, X, trg_mask)
        query = self.dropout(self.norm(attention + X))
        out = self.transformer_block(value, key, query, src_mask)
        return out
    

class Decoder(nn.Module):
    def __init__(self, vocab_size, seq_len, emb_size, n_layers, heads, forward_expansion, drop_out, device) -> None:
        super(Decoder, self).__init__()

        self.device = device
        self.embedding = WordPositionEmbedding(vocab_size, seq_len, emb_size, device, fixed=True)

        self.layers = nn.ModuleList(
            [
                DecoderBlock(emb_size, heads, forward_expansion, drop_out) for _ in range(n_layers)
            ]
        )

        self.fc_out = nn.Linear(emb_size, vocab_size)
        self.dropout = nn.Dropout(drop_out)

    def forward(self, X, enc_out, src_mask, trg_mask):
        batch_size, seq_len = X.shape

        out = self.dropout(self.embedding(X))

        for layer in self.layers:
            out = layer(out, enc_out, enc_out, src_mask, trg_mask)

        out = self.fc_out(out)
        return out
    

class TransformerScratch(nn.Module):
    def __init__(self, 
                 inp_vocab_size, 
                 trg_vocab_size,
                 src_pad_idx,
                 trg_pad_idx,
                 emb_size,
                 n_layers=1,
                 heads=1,
                 forward_expansion=1,
                 drop_out=0.2,
                 max_seq_len=100,
                 device=torch.device('cuda'),
                 ) -> None:
        super(TransformerScratch, self).__init__()

        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

        self.encoder = Encoder(inp_vocab_size, max_seq_len, emb_size, n_layers, heads, forward_expansion, drop_out, device).to(device)
        self.decoder = Decoder(trg_vocab_size, max_seq_len, emb_size, n_layers, heads, forward_expansion, drop_out, device).to(device)

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        # (batch_size, 1, 1, src_len)
        return src_mask.to(self.device)
    
    def make_trg_mask(self, trg):
        batch_size, trg_seq_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_seq_len, trg_seq_len))).expand(batch_size, 1, trg_seq_len, trg_seq_len)
        # (batch_size, 1, trg_len, trg_len)
        return trg_mask.to(self.device)


    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_out = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_out, src_mask, trg_mask)
        return out

In [12]:
def step(model, enc_src, dec_src, trg, loss_fn, VOCAB, device):
    enc_src = enc_src.to(device)
    dec_src = dec_src.to(device)
    trg = trg.to(device)

    # Forward pass through the model
    logits = model(enc_src, dec_src)

    # Shift so we don't include the SOS token in targets, and remove the last logit to match targets
    logits = logits[:, :-1, :].contiguous()
    trg = trg[:, 1:].contiguous()
    
    loss = loss_fn(logits.view(-1, logits.shape[-1]), trg.view(-1))
    # loss = loss_fn(logits.permute(0, 2, 1), trg)

    # Calculate accuracy
    non_pad_elements = (trg != VOCAB['<PAD>']).nonzero(as_tuple=True)
    correct_predictions = (logits.argmax(dim=2) == trg).sum().item()
    accuracy = correct_predictions / len(non_pad_elements[0])

    return loss, accuracy

def train_step(model, iterator, optimizer, loss_fn, clip, VOCAB, device):
    model.train()  # Set the model to training mode
    epoch_loss = 0
    epoch_acc = 0

    for i, batch in enumerate(iterator):
        enc_src, dec_src, trg = batch
        
        # Zero the gradients
        optimizer.zero_grad()

        loss, accuracy = step(model, enc_src, dec_src, trg, loss_fn, VOCAB, device)
        
        # Backward pass
        loss.backward()

        # Clip gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)

        # Update parameters
        optimizer.step()

        # Accumulate loss and accuracy
        epoch_loss += loss.item()
        epoch_acc += accuracy

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate_step(model, iterator, loss_fn, VOCAB, device):
    model.eval()  # Set the model to evaluation mode
    epoch_loss = 0
    epoch_acc = 0

    with torch.no_grad():  # Disable gradient computation
        for i, batch in enumerate(iterator):
            enc_src, dec_src, trg = batch

            loss, accuracy = step(model, enc_src, dec_src, trg, loss_fn, VOCAB, device)

            # Accumulate loss and accuracy
            epoch_loss += loss.item()
            epoch_acc += accuracy

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def train(model, train_loader, optimizer, loss_fn, clip, epochs, VOCAB, device, val_loader=None):
    for epoch in range(epochs):
        train_loss, train_acc = train_step(model, train_loader, optimizer, loss_fn, clip, VOCAB, device)
        result = f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%'

        if val_loader:
            eval_loss, eval_acc = evaluate_step(model, val_loader, loss_fn, VOCAB, device)
            result += f'|| Eval Loss: {eval_loss:.3f} | Eval Acc: {eval_acc * 100:.2f}%'
            
        print(f'Epoch: {epoch + 1:02}')
        print(result)

    return model

In [13]:
import torch.optim as optim


transformer = TransformerScratch(
    inp_vocab_size = VOCAB_SIZE,
    trg_vocab_size = VOCAB_SIZE,
    src_pad_idx = VOCAB['<PAD>'],
    trg_pad_idx = VOCAB['<PAD>'],
    emb_size = 256,
    n_layers=2,
    heads=8,
    forward_expansion=4,
    drop_out=0.05,
    max_seq_len=TARGET_SEQ_LEN,
    device=device
).to(device)

loss_function = torch.nn.CrossEntropyLoss(ignore_index=VOCAB['<PAD>'], reduction='mean')
optimizer = optim.Adam(transformer.parameters(), lr=0.001)

transformer = train(transformer, dataloader, optimizer, loss_function, clip=1, epochs=100, VOCAB=VOCAB, device=device)

Epoch: 01
	Train Loss: 6.817 | Train Acc: 3.90%
Epoch: 02
	Train Loss: 6.245 | Train Acc: 4.08%
Epoch: 03
	Train Loss: 6.219 | Train Acc: 4.33%
Epoch: 04
	Train Loss: 6.204 | Train Acc: 4.30%
Epoch: 05
	Train Loss: 6.200 | Train Acc: 4.22%
Epoch: 06
	Train Loss: 6.198 | Train Acc: 3.91%
Epoch: 07
	Train Loss: 6.202 | Train Acc: 3.95%
Epoch: 08
	Train Loss: 6.199 | Train Acc: 3.99%
Epoch: 09
	Train Loss: 6.198 | Train Acc: 4.21%
Epoch: 10
	Train Loss: 6.186 | Train Acc: 4.30%
Epoch: 11
	Train Loss: 6.187 | Train Acc: 4.24%
Epoch: 12
	Train Loss: 6.180 | Train Acc: 4.24%
Epoch: 13
	Train Loss: 6.162 | Train Acc: 3.93%
Epoch: 14
	Train Loss: 6.150 | Train Acc: 4.21%
Epoch: 15
	Train Loss: 6.128 | Train Acc: 4.30%
Epoch: 16
	Train Loss: 6.114 | Train Acc: 4.31%
Epoch: 17
	Train Loss: 6.113 | Train Acc: 4.39%
Epoch: 18
	Train Loss: 6.085 | Train Acc: 4.35%
Epoch: 19
	Train Loss: 6.072 | Train Acc: 4.68%
Epoch: 20
	Train Loss: 5.980 | Train Acc: 7.03%
Epoch: 21
	Train Loss: 5.696 | Train Acc

In [14]:
class TransformerPytroch(nn.Module):
    def __init__(
        self,
        inp_vocab_size, 
        trg_vocab_size,
        src_pad_idx,
        trg_pad_idx,
        emb_size,
        n_layers=1,
        heads=1,
        forward_expansion=1,
        drop_out=0.2,
        max_seq_len=100,
        device=torch.device('cuda')
    ):
        super(TransformerPytroch, self).__init__()

        self.enc_embedding = WordPositionEmbedding(inp_vocab_size, max_seq_len, emb_size, device, fixed=False)
        self.dec_embedding = WordPositionEmbedding(trg_vocab_size, max_seq_len, emb_size, device, fixed=False)


        self.device = device
        self.transformer = nn.Transformer(
            d_model=emb_size,
            nhead=heads,
            num_encoder_layers=n_layers,
            num_decoder_layers=n_layers,
            dim_feedforward=forward_expansion,
            dropout=drop_out,
            batch_first=True,
            device=device
        )
        self.fc_out = nn.Linear(emb_size, trg_vocab_size)
        self.dropout = nn.Dropout(drop_out)
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

    def make_src_mask(self, src):
        src_mask = src == self.src_pad_idx

        # (N, src_len)
        return src_mask.to(self.device)

    def forward(self, src, trg):
        batch_size, trg_seq_length = trg.shape

        src_padding_mask = self.make_src_mask(src)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
            self.device
        )

        src_emb = self.dropout(self.enc_embedding(src))
        trg_emb = self.dropout(self.dec_embedding(trg))

        out = self.transformer(
            src_emb,
            trg_emb,
            src_key_padding_mask=src_padding_mask,
            tgt_mask=trg_mask,
        )
        out = self.fc_out(out)
        return out

In [15]:
pytorch_transformer = TransformerPytroch(
    inp_vocab_size = VOCAB_SIZE,
    trg_vocab_size = VOCAB_SIZE,
    src_pad_idx = VOCAB['<PAD>'],
    trg_pad_idx = VOCAB['<PAD>'],
    emb_size = 512,
    n_layers=1,
    heads=4,
    forward_expansion=4,
    drop_out=0.1,
    max_seq_len=TARGET_SEQ_LEN,
    device=device
).to(device)

loss_function = torch.nn.CrossEntropyLoss(ignore_index=VOCAB['<PAD>'], reduction='mean')
optimizer = optim.Adam(pytorch_transformer.parameters())
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)


pytorch_transformer = train(pytorch_transformer, dataloader, optimizer, loss_function, clip=1, epochs=100, VOCAB=VOCAB, device=device)


Epoch: 01
	Train Loss: 6.675 | Train Acc: 4.55%
Epoch: 02
	Train Loss: 6.070 | Train Acc: 8.00%
Epoch: 03
	Train Loss: 5.739 | Train Acc: 10.06%
Epoch: 04
	Train Loss: 5.365 | Train Acc: 12.21%
Epoch: 05
	Train Loss: 4.941 | Train Acc: 14.58%
Epoch: 06
	Train Loss: 4.496 | Train Acc: 16.47%
Epoch: 07
	Train Loss: 4.024 | Train Acc: 19.72%
Epoch: 08
	Train Loss: 3.572 | Train Acc: 25.28%
Epoch: 09
	Train Loss: 3.146 | Train Acc: 31.80%
Epoch: 10
	Train Loss: 2.737 | Train Acc: 39.51%
Epoch: 11
	Train Loss: 2.385 | Train Acc: 46.75%
Epoch: 12
	Train Loss: 2.092 | Train Acc: 52.39%
Epoch: 13
	Train Loss: 1.826 | Train Acc: 57.38%
Epoch: 14
	Train Loss: 1.603 | Train Acc: 62.29%
Epoch: 15
	Train Loss: 1.415 | Train Acc: 66.42%
Epoch: 16
	Train Loss: 1.256 | Train Acc: 69.68%
Epoch: 17
	Train Loss: 1.130 | Train Acc: 72.20%
Epoch: 18
	Train Loss: 1.014 | Train Acc: 74.85%
Epoch: 19
	Train Loss: 0.915 | Train Acc: 77.11%
Epoch: 20
	Train Loss: 0.855 | Train Acc: 78.39%
Epoch: 21
	Train Loss:

In [16]:
def test_model(model, enc_src, dec_src, trg, VOCAB, max_length=15):
    enc_src = enc_src.to(device)
#     dec_src = dec_src.to(device)
    trg = trg.to(device)

    model.eval()

    generated_answers = torch.LongTensor().to(device)
    batch_size, seq_len = dec_src.shape
    
    with torch.no_grad():
        initial_dec_src = dec_src  # Shape: (batch_size, seq_len)

        dec_src = torch.LongTensor([VOCAB['<SOS>']]).unsqueeze(0).repeat(batch_size, 1).to(device)
#         dec_src = torch.LongTensor(dec_src[:, :2]).to(device)

        for i in range(max_length):
            logits = model(enc_src, dec_src)
            predictions = torch.argmax(logits, dim=2)
            
            # Append the last prediction
            dec_src = torch.cat((dec_src, predictions[:, i].unsqueeze(1)), dim=1)

            generated_answers = torch.cat((generated_answers, predictions[:, i].unsqueeze(1)), dim=1)

    
    for i in range(max_length):
        print('\nquestion: ', tokens_to_text(enc_src[i]))
        print('generated_answers: ', tokens_to_text(generated_answers[i]))
        print('real answer: ', tokens_to_text(trg[i]))
            
        




# Example usage:
batch = next(iter(dataloader))
enc_src, dec_src, trg = batch  # Ensure that these tensors are correctly shaped as per model requirements

# test_model(transformer, enc_src, dec_src, trg, VOCAB, max_length=TARGET_SEQ_LEN)
test_model(pytorch_transformer, enc_src, dec_src, trg, VOCAB, max_length=TARGET_SEQ_LEN)


question:  can i make cron start a process at say at night and stop it at say in the morning
generated_answers:  make a script to start the app and chave cron call that at then another sc a
real answer:  make a script to start the app and chave cron call that at then another sc

question:  how do i configure vpn
generated_answers:  it is not difficult to update to copy it should be updategrub to the kernel it is in the next
real answer:  it is focused on the settings for the vyprvpn service so just ignore the settings it requires for that

question:  hey can someone tell me how to install a firefox theme that is in a jar
generated_answers:  open firefox go to tools themes menu and drag and drop the jar file onto the list of themes can
real answer:  open firefox go to tools themes menu and drag and drop the jar file onto the list of themes

question:  anyone ever had an issue with evolution not grabbing pop email via gmail
generated_answers:  yes tough did not found a solution was not 

  output = torch._nested_tensor_from_mask(output, src_key_padding_mask.logical_not(), mask_check=False)
  return torch._native_multi_head_attention(


In [17]:
import torch


def prepare_model_input(question, max_length=50):
    # Tokenize the input question
    tokenized_question = text_to_tokens(question)
    enc_src = tokenized_question + [VOCAB['<EOS>']]  # Add SOS and EOS tokens
    
    # Prepare a placeholder for the decoder's input
    dec_src = torch.LongTensor([VOCAB['<SOS>']]).unsqueeze(0).to(device)

    # Convert to tensor and add batch dimension
    enc_src = F.pad(torch.LongTensor(enc_src), (0, max_length - len(enc_src)), mode='constant', value=VOCAB['<PAD>']).unsqueeze(0).to(device)

    return enc_src, dec_src


def chat_with_transformer(model, question, max_length=50, temperature=1):
    model.eval()
    with torch.no_grad():
        enc_src, dec_src = prepare_model_input(question, max_length=max_length)
        
        # Placeholder for the generated answer
        generated_answer = []
        for i in range(max_length):
            # Forward pass through the model
            logits = model(enc_src, dec_src)

            # Get the token with the highest probability for the next position from the last time step
            predictions = F.softmax(logits / temperature, dim=2)[:, i, :]
            predicted_token = torch.multinomial(predictions, num_samples=1).squeeze(1)

            # Break if the EOS token is predicted
            if predicted_token.item() == VOCAB['<EOS>']:
                break

            # Append the predicted token to the decoder's input for the next time step
            dec_src = torch.cat((dec_src, predicted_token.unsqueeze(0)), dim=1)

            # Append the predicted token to the generated answer
            generated_answer.append(predicted_token.item())

        # Convert the generated tokens to words
        return tokens_to_text(generated_answer)


# Example usage:
batch = next(iter(dataloader))
dec_src, enc_src, trg = batch

question = tokens_to_text(dec_src[0].tolist())
answer = tokens_to_text(trg[0].tolist())
print('Question:', question)
print('Real Answer:', answer)

transformer_response = chat_with_transformer(transformer, question, max_length=TARGET_SEQ_LEN, temperature=1.0)
print('\nScratch Transformer Response:', transformer_response)


transformer_response = chat_with_transformer(pytorch_transformer, question, max_length=TARGET_SEQ_LEN, temperature=1.0)
print('\nPytorch Transformer Response:', transformer_response)


question = 'Do you have friends?'
transformer_response = chat_with_transformer(pytorch_transformer, question, max_length=TARGET_SEQ_LEN, temperature=1.0)
print('\nTransformer Response 2:', transformer_response)

Question: hi i just tried to boot off the ubuntu livecd on my laptop but it fails in the initrd
Real Answer: try going to your bios setting and turning on legacy emulation for your disc drives etc

Scratch Transformer Response: to it to to it it it it it it it it it it it it now it it it

Pytorch Transformer Response: try going to your bios setting and turning on legacy emulation for your disc drives etc

Transformer Response 2: i also do not the same place but i do not see the legacy your system and got
