## English to Python code

Steps taken:

1. Read/clean the dataset. The dataset consists of English sentences written as comments, each followed by the Python code that corresponds to the sentence. The data required a lot of cleaning, which is discussed below alongside the reading code. We read the dataset line by line to extract pairs of an English sentence and its corresponding Python block. The difficulty in extraction comes from the fact that the English sentences start with "#", exactly like every ordinary comment inside the Python code. We create an ignore pattern to first filter out lines that contain `In[00]:`, that are just `#[num]`, that contain only whitespace (`\s`), or that contain the words "driver block/code". We also use a minimum length, chosen from the histogram of English sentence lengths, to filter out the remaining comments. 


### Importing Libs 

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!cp  drive/MyDrive/NLP/english_python_data_cleaned.txt .

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch.optim as optim
import os, sys, re
from pathlib import Path
import matplotlib.pyplot as plt
from ipywidgets import widgets, interact
import cgi, string, sys, io
import keyword, token, tokenize
import spacy
import torchtext
from torchtext.legacy.data import Dataset
from torchtext import data
from torchtext.legacy.data import BucketIterator, Field, Example
import math, time, random
from collections import defaultdict
from torchtext.data.metrics import bleu_score

In [None]:
Path.ls = lambda x: list(x.iterdir())

In [None]:
ignore_pattern = re.compile(r'^\s*#+\s*[0-9]*\s*\n|^\s*\n+\s*$|\s*#!.*|^\s*#\s+in\[[0-9]+\]|^\s*#\s+driver')


In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs end-to-end on machines without CUDA.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### Reading Data

In [None]:
datapath = Path(os.getcwd())

In [None]:
# Extract (english sentence, python code block) pairs from the raw text file.
# The file is scanned twice: a first pass drops / normalises noisy lines, a second
# pass walks the remaining lines and groups them into prompts and code blocks.

# empty variables to store the english sentences and python codeblocks
english_sents = []
python_codes = []
# NOTE(review): `ignored_lines` is initialised here but never appended to in this cell.
ignored_lines = []
# '#'-lines that were neither used as a prompt nor merged into the previous prompt
removed_comments = []
# '#'-lines shorter than this many characters are not treated as prompts
# (unless they look like "# <digits> <word>", see the second pass)
minimum_length = 30
# open the file to read
with open(datapath/'english_python_data_cleaned.txt', 'r') as f:
    # flag to check for the first code block
    start_extraction = False
    # lines of the code block that belongs to the most recent prompt
    python_block = []
    all_lines = f.readlines()
    # remove certain lines from the data set
    clean_lines = []
    for line in all_lines:
        # drop every line matched by the module-level `ignore_pattern`
        if ignore_pattern.search(line.lower()):
            continue
        # an indented "# [num] write ..." comment: strip the indentation so the
        # '#' sits at column 0 and the second pass recognises it as a prompt
        elif re.search(r'^\s+#\s*[0-9]*\s*write',line.lower()):
            clean_lines.append(re.sub(r'^\s+#','#',line))
        else:
            clean_lines.append(line)
    
    # walk through each line by line
    for i, line in enumerate(clean_lines):
        # a line whose first character is '#' is a candidate prompt
        if re.search(r'^#.*',line.lower()):
            # short '#'-lines that do not look like "# <digits> <word>" are skipped
            # entirely: they become neither a prompt nor part of the code block
            if len(line)<minimum_length and not re.search(r'^\s*#\s*[0-9]+\s*\w+',line):
                continue
            # code was collected since the previous prompt: close that block
            if len(python_block)>0:
                python_codes.append(''.join(python_block))
            # no code since the previous prompt, i.e. two '#'-lines in a row
            elif start_extraction:
                # a line containing "write" is merged into the previous prompt
                if re.search(r'write', line.lower()):
                    english_sents[-1]+='\n'
                    english_sents[-1]+=line
                # so is any line that follows a prompt which is still too short
                elif len(english_sents[-1])<minimum_length:
                    english_sents[-1]+='\n' 
                    english_sents[-1]+=line
                # everything else is recorded as a removed comment
                else:
                    removed_comments.append(line)
                continue
                
            if not start_extraction: 
                start_extraction = True
            # start a new pair: exactly one more prompt than finished code blocks
            english_sents.append(line)
            assert len(english_sents) == len(python_codes)+1,f"{len(english_sents)}, {len(python_codes)}, {english_sents[-1]}"
            python_block = []
        else:
            # non-'#' lines before the first prompt are discarded
            if not start_extraction:
                continue
            python_block.append(line)
    # flush the code block that belongs to the last prompt
    if len(python_block)>0:
        python_codes.append(''.join(python_block))
# every prompt must have exactly one code block
assert len(english_sents) == len(python_codes)

In [None]:
print(f"Number of python programs to learn from --> {len(python_codes)}")
print(f"Number of comments removed --> {len(removed_comments)}")

Number of python programs to learn from --> 4411
Number of comments removed --> 61


### Cleaning data

In [None]:
# Sanity check on the extracted prompts: a prompt that splits into more than two
# pieces on '\n' is printed for inspection and left out of the cleaned list.
# NOTE(review): entries dropped here are not dropped from `python_codes`, so any
# printed entry would shift the sentence/code pairing downstream - confirm that
# nothing is printed.
english_sents_cleaned = []
for sentence in english_sents:
    pieces = sentence.split('\n')
    if len(pieces) <= 2:
        english_sents_cleaned.append(sentence)
        continue
    print(pieces)

In [None]:
# measure the length of the english sentence 
english_sent_lens = np.array([len(l) for l in english_sents_cleaned])
# get the 5 percentile 
q5 = np.percentile(english_sent_lens, [5])
print(q5)
# if the 5% is less than 30-40 characters then fix the data

[40.]


In [None]:
# Remove full-line comments (optional whitespace followed by '#') from every
# code block; all other lines are kept untouched and re-joined with '\n'.
python_codes_cleaned = [
    '\n'.join(
        code_line
        for code_line in codeblock.split('\n')
        if not re.search(r'^\s*#', code_line)
    )
    for codeblock in python_codes
]
        

In [None]:
# Normalise the indentation of every code block.
# Heuristic: when a line's leading whitespace is not a multiple of 4 characters,
# one extra space is prepended (this repairs 3 -> 4, 7 -> 8, ...). Trailing
# whitespace is stripped from every line.
# NOTE(review): widths such as 1, 2, 5 or 6 are still not multiples of 4 after
# this step - confirm that the dataset only contains off-by-one indents.
python_indent_fixed = []
for codeblock in python_codes_cleaned:
    fixed_lines = []
    for line in codeblock.split('\n'):
        leading_ws = re.match(r'\s+', line)
        if leading_ws and leading_ws.end() % 4 != 0:
            line = ' ' + line
        fixed_lines.append(line.rstrip())
    python_indent_fixed.append('\n'.join(fixed_lines))
    

### Tokenizing

In [None]:
class PyTokenizer:
    """Turn a Python source snippet into a flat list of string tokens.

    The snippet is normalised once in the constructor (tabs expanded, outer
    whitespace stripped) and tokenised with the standard ``tokenize`` module.
    """

    def __init__(self, raw):
        """Store the normalised source text.

        Args:
            raw: Python source code as a single string.
        """
        self.raw = raw.expandtabs().strip()

    def printme(self):
        """Print the normalised source text."""
        print(self.raw)

    def tokenize(self):
        """Tokenise ``self.raw``.

        Structural tokens (empty string, a bare newline, or INDENT) are replaced by
        the name of their token type, e.g. ``'NEWLINE'``, ``'INDENT'``,
        ``'DEDENT'``, ``'ENDMARKER'``. Comments are dropped. Every other token is
        emitted verbatim.

        Token types are compared through the symbolic constants of the ``token`` /
        ``tokenize`` modules because their numeric values differ between Python
        versions.

        Returns:
            list[str]: the cleaned token sequence.

        Raises:
            tokenize.TokenError, IndentationError: propagated from
                ``tokenize.generate_tokens`` for malformed source.
        """
        # self.lines stores the start offset of every line, framed by [0, 0] and
        # len(raw); kept as an instance attribute although tokenize() itself does
        # not read it.
        self.lines = [0, 0]
        self.lines.extend(idx + 1 for idx, ch in enumerate(self.raw) if ch == '\n')
        self.lines.append(len(self.raw))
        self.pos = 0

        cleantokens = []
        for tok in tokenize.generate_tokens(io.StringIO(self.raw).readline):
            if len(tok.string) == 0 or tok.string == '\n' or tok.type == token.INDENT:
                if tok.type == tokenize.NL:
                    # NOTE(review): a non-logical newline is emitted as 'INDENT',
                    # exactly as before - confirm this is intended and not meant
                    # to be 'NEWLINE'.
                    cleantokens.append(token.tok_name[token.INDENT])
                else:
                    cleantokens.append(token.tok_name[tok.type])
            elif tok.type == tokenize.COMMENT:
                continue
            else:
                cleantokens.append(tok.string)
        return cleantokens
    

In [None]:
trial = python_indent_fixed[10]

In [None]:
mtokenizer = PyTokenizer(trial)
mtokenizer.printme()

def two_power(terms):
    result = list(map(lambda x: 2 ** x, range(terms)))
    print(f"The total terms are: {terms}")
    for i in range(terms):
        print(f"2^{i} = {result[i]}")


In [None]:
#mtokenizer.tokenize()

In [None]:
spacy_en = spacy.load('en_core_web_sm')

In [None]:
def tokenizer_en(text):
    """Tokenise an English prompt with the spaCy tokenizer.

    Every '#' character is removed and surrounding whitespace is stripped
    before tokenising.

    Returns:
        list[str]: the text of each spaCy token.
    """
    cleaned = text.replace('#', '').strip()
    doc = spacy_en.tokenizer(cleaned)
    return [piece.text for piece in doc]

def tokenizer_py(text):
    """Tokenise a Python snippet by delegating to ``PyTokenizer.tokenize``."""
    return PyTokenizer(text).tokenize()
    

### Creating Dataset

In [None]:
SRC = Field(tokenize=tokenizer_en, lower=True, init_token='<sos>', eos_token='<eos>', batch_first=True)
TRG = Field(tokenize=tokenizer_py, lower=True, init_token='<sos>', eos_token='<eos>', batch_first=True)

In [None]:
fields = [('src', SRC), ('trg', TRG)]

In [None]:
examples = []
for i, (s,t) in enumerate(zip(english_sents_cleaned, python_indent_fixed)):
    try:
        examples.append(Example.fromlist([s,t],fields))
    except (tokenize.TokenError, IndentationError) as ex:
        print(i, ex)


In [None]:
ds = Dataset(examples, fields)

In [None]:
print(re.sub('newline', '\n', ''.join(vars(ds[0])['trg'])))

num1=1.5
num2=6.3
sum=num1+num2
print(f'sum: {sum}')
endmarker


In [None]:
train_ds, valid_ds = ds.split(split_ratio=0.8)

In [None]:
len(train_ds), len(valid_ds)

(3529, 882)

In [None]:
train_iterator, valid_iterator = BucketIterator.splits((train_ds, valid_ds), batch_size = 32, sort_key = lambda x: len(x.src),
                                                            sort_within_batch=True, device = device)

### Build Vocab

In [None]:
SRC.build_vocab(train_ds, min_freq=2)
TRG.build_vocab(train_ds, min_freq=2)

In [None]:
len(SRC.vocab), len(TRG.vocab)

(1454, 3615)

### Model

In [None]:
class Encoder(nn.Module):
    """Transformer encoder: token + learned positional embeddings followed by
    a stack of ``EncoderLayer`` modules.

    Args:
        input_dim: size of the source vocabulary.
        hid_dim: embedding / hidden size.
        n_layers: number of ``EncoderLayer`` blocks.
        n_heads: attention heads per layer.
        pf_dim: inner size of the position-wise feed-forward layer.
        dropout: dropout probability.
        device: forwarded to the sub-layers and kept as ``self.device``; the
            forward pass itself follows the device of its input.
        max_length: number of learned positions, i.e. the longest supported
            source sequence.
    """

    def __init__(self, 
                 input_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim,
                 dropout, 
                 device,
                 max_length = 1000):
        super().__init__()

        self.device = device
        
        self.tok_embedding = nn.Embedding(input_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([EncoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim,
                                                  dropout, 
                                                  device) 
                                     for _ in range(n_layers)])
        
        self.dropout = nn.Dropout(dropout)
        
        # A plain Python float instead of a tensor created on `device`: it needs
        # no device placement, so the module keeps working after model.to(...).
        self.scale = math.sqrt(hid_dim)
        
    def forward(self, src, src_mask):
        """Encode a batch of source token ids.

        Args:
            src: [batch size, src len] token ids.
            src_mask: [batch size, 1, 1, src len] padding mask.

        Returns:
            Tensor of shape [batch size, src len, hid dim].

        Raises:
            ValueError: if ``src len`` exceeds the number of learned positions.
        """
        batch_size = src.shape[0]
        src_len = src.shape[1]

        max_length = self.pos_embedding.num_embeddings
        if src_len > max_length:
            raise ValueError(
                f"source length {src_len} exceeds max_length {max_length}")
        
        # Build the position ids directly on the input's device.
        pos = torch.arange(0, src_len, device=src.device).unsqueeze(0).repeat(batch_size, 1)
        
        #pos = [batch size, src len]
        
        src = self.dropout((self.tok_embedding(src) * self.scale) + self.pos_embedding(pos))
        
        #src = [batch size, src len, hid dim]
        
        for layer in self.layers:
            src = layer(src, src_mask)
            
        #src = [batch size, src len, hid dim]
            
        return src

In [None]:
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and a position-wise feed-forward
    network, each followed by dropout, a residual connection and LayerNorm."""

    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim,  
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src, src_mask):
        """Apply the block.

        Args:
            src: [batch size, src len, hid dim]
            src_mask: [batch size, 1, 1, src len]

        Returns:
            Tensor of shape [batch size, src len, hid dim].
        """
        # Self-attention sub-layer; the attention weights are not needed here.
        attended, _ = self.self_attention(src, src, src, src_mask)
        normed = self.self_attn_layer_norm(src + self.dropout(attended))

        # Feed-forward sub-layer.
        transformed = self.positionwise_feedforward(normed)
        return self.ff_layer_norm(normed + self.dropout(transformed))

In [None]:
class MultiHeadAttentionLayer(nn.Module):
    """Scaled dot-product multi-head attention.

    Args:
        hid_dim: model width; must be divisible by ``n_heads``.
        n_heads: number of attention heads.
        dropout: dropout probability applied to the attention weights.
        device: accepted for interface compatibility; the layer itself follows
            the device of its inputs.
    """

    def __init__(self, hid_dim, n_heads, dropout, device):
        super().__init__()
        
        assert hid_dim % n_heads == 0
        
        self.hid_dim = hid_dim
        self.n_heads = n_heads
        self.head_dim = hid_dim // n_heads
        
        self.fc_q = nn.Linear(hid_dim, hid_dim)
        self.fc_k = nn.Linear(hid_dim, hid_dim)
        self.fc_v = nn.Linear(hid_dim, hid_dim)
        
        self.fc_o = nn.Linear(hid_dim, hid_dim)
        
        self.dropout = nn.Dropout(dropout)
        # A plain Python float instead of a tensor created on `device`: a tensor
        # attribute is not moved by model.to(...), which broke the layer as soon
        # as the model was moved after construction.
        self.scale = math.sqrt(self.head_dim)

    def _split_heads(self, x, batch_size):
        """[batch, len, hid dim] -> [batch, n heads, len, head dim]."""
        return x.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3)
        
    def forward(self, query, key, value, mask = None):
        """Attend from ``query`` positions to ``key``/``value`` positions.

        Args:
            query: [batch size, query len, hid dim]
            key:   [batch size, key len, hid dim]
            value: [batch size, value len, hid dim]
            mask: optional tensor broadcastable to
                [batch size, n heads, query len, key len]; positions where it
                equals 0 are excluded from attention.

        Returns:
            (x, attention) with x = [batch size, query len, hid dim] and
            attention = [batch size, n heads, query len, key len].
        """
        batch_size = query.shape[0]
                
        Q = self._split_heads(self.fc_q(query), batch_size)
        K = self._split_heads(self.fc_k(key), batch_size)
        V = self._split_heads(self.fc_v(value), batch_size)
        
        #Q = [batch size, n heads, query len, head dim]
        #K = [batch size, n heads, key len, head dim]
        #V = [batch size, n heads, value len, head dim]
                
        energy = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
        
        #energy = [batch size, n heads, query len, key len]
        
        if mask is not None:
            # A large negative score becomes ~0 probability after the softmax.
            energy = energy.masked_fill(mask == 0, -1e10)
        
        attention = torch.softmax(energy, dim = -1)
                
        #attention = [batch size, n heads, query len, key len]
                
        x = torch.matmul(self.dropout(attention), V)
        
        #x = [batch size, n heads, query len, head dim]
        
        # Merge the heads back: [batch size, query len, hid dim]
        x = x.permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.hid_dim)
        
        x = self.fc_o(x)
        
        #x = [batch size, query len, hid dim]
        
        return x, attention

In [None]:
class PositionwiseFeedforwardLayer(nn.Module):
    """Two-layer feed-forward network applied independently at every position:
    Linear(hid_dim -> pf_dim), ReLU, dropout, Linear(pf_dim -> hid_dim)."""

    def __init__(self, hid_dim, pf_dim, dropout):
        super().__init__()
        self.fc_1 = nn.Linear(hid_dim, pf_dim)
        self.fc_2 = nn.Linear(pf_dim, hid_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        """Map [batch size, seq len, hid dim] to a tensor of the same shape."""
        # Expand to [batch size, seq len, pf dim] and apply the non-linearity.
        hidden = torch.relu(self.fc_1(x))
        hidden = self.dropout(hidden)
        # Project back to [batch size, seq len, hid dim].
        return self.fc_2(hidden)

In [None]:
class Decoder(nn.Module):
    """Transformer decoder: token + learned positional embeddings, a stack of
    ``DecoderLayer`` modules and a final projection onto the target vocabulary.

    Args:
        output_dim: size of the target vocabulary.
        hid_dim: embedding / hidden size.
        n_layers: number of ``DecoderLayer`` blocks.
        n_heads: attention heads per layer.
        pf_dim: inner size of the position-wise feed-forward layer.
        dropout: dropout probability.
        device: forwarded to the sub-layers and kept as ``self.device``; the
            forward pass itself follows the device of its input.
        max_length: number of learned positions, i.e. the longest supported
            target sequence.
    """

    def __init__(self, 
                 output_dim, 
                 hid_dim, 
                 n_layers, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device,
                 max_length = 1000):
        super().__init__()
        
        self.device = device
        
        self.tok_embedding = nn.Embedding(output_dim, hid_dim)
        self.pos_embedding = nn.Embedding(max_length, hid_dim)
        
        self.layers = nn.ModuleList([DecoderLayer(hid_dim, 
                                                  n_heads, 
                                                  pf_dim, 
                                                  dropout, 
                                                  device)
                                     for _ in range(n_layers)])
        
        self.fc_out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
        # A plain Python float instead of a tensor created on `device`: it needs
        # no device placement, so the module keeps working after model.to(...).
        self.scale = math.sqrt(hid_dim)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Decode a batch of target token ids against the encoder output.

        Args:
            trg: [batch size, trg len] token ids.
            enc_src: [batch size, src len, hid dim] encoder output.
            trg_mask: [batch size, 1, trg len, trg len]
            src_mask: [batch size, 1, 1, src len]

        Returns:
            (output, attention) with output = [batch size, trg len, output dim]
            and attention = the encoder-attention weights of the last layer,
            [batch size, n heads, trg len, src len] (``None`` when the decoder
            has no layers).

        Raises:
            ValueError: if ``trg len`` exceeds the number of learned positions.
        """
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]

        max_length = self.pos_embedding.num_embeddings
        if trg_len > max_length:
            raise ValueError(
                f"target length {trg_len} exceeds max_length {max_length}")
        
        # Build the position ids directly on the input's device.
        pos = torch.arange(0, trg_len, device=trg.device).unsqueeze(0).repeat(batch_size, 1)
                            
        #pos = [batch size, trg len]
            
        trg = self.dropout((self.tok_embedding(trg) * self.scale) + self.pos_embedding(pos))
                
        #trg = [batch size, trg len, hid dim]
        
        # Stays None only when the layer stack is empty.
        attention = None
        for layer in self.layers:
            trg, attention = layer(trg, enc_src, trg_mask, src_mask)
        
        #trg = [batch size, trg len, hid dim]
        #attention = [batch size, n heads, trg len, src len]
        
        output = self.fc_out(trg)
        
        #output = [batch size, trg len, output dim]
            
        return output, attention

In [None]:
class DecoderLayer(nn.Module):
    """One decoder block: masked self-attention, encoder attention and a
    position-wise feed-forward network, each followed by dropout, a residual
    connection and LayerNorm."""

    def __init__(self, 
                 hid_dim, 
                 n_heads, 
                 pf_dim, 
                 dropout, 
                 device):
        super().__init__()
        
        self.self_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.enc_attn_layer_norm = nn.LayerNorm(hid_dim)
        self.ff_layer_norm = nn.LayerNorm(hid_dim)
        self.self_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.encoder_attention = MultiHeadAttentionLayer(hid_dim, n_heads, dropout, device)
        self.positionwise_feedforward = PositionwiseFeedforwardLayer(hid_dim, pf_dim, dropout)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, trg, enc_src, trg_mask, src_mask):
        """Apply the block.

        Args:
            trg: [batch size, trg len, hid dim]
            enc_src: [batch size, src len, hid dim]
            trg_mask: [batch size, 1, trg len, trg len]
            src_mask: [batch size, 1, 1, src len]

        Returns:
            (trg, attention) with trg = [batch size, trg len, hid dim] and
            attention = [batch size, n heads, trg len, src len] (the weights of
            the encoder-attention sub-layer).
        """
        # 1) self-attention over the target sequence
        self_attended, _ = self.self_attention(trg, trg, trg, trg_mask)
        state = self.self_attn_layer_norm(trg + self.dropout(self_attended))

        # 2) attention over the encoder output
        cross_attended, attention = self.encoder_attention(state, enc_src, enc_src, src_mask)
        state = self.enc_attn_layer_norm(state + self.dropout(cross_attended))

        # 3) position-wise feed-forward
        transformed = self.positionwise_feedforward(state)
        state = self.ff_layer_norm(state + self.dropout(transformed))

        return state, attention

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that builds the attention masks.

    Args:
        encoder: module called as ``encoder(src, src_mask)``.
        decoder: module called as ``decoder(trg, enc_src, trg_mask, src_mask)``.
        src_pad_idx: padding index of the source vocabulary.
        trg_pad_idx: padding index of the target vocabulary.
        device: kept as ``self.device``; the masks are created on the device of
            the tensors they are built from.
    """

    def __init__(self, 
                 encoder, 
                 decoder, 
                 src_pad_idx, 
                 trg_pad_idx, 
                 device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device
        
    def make_src_mask(self, src):
        """Padding mask for the source.

        Args:
            src: [batch size, src len] token ids.

        Returns:
            Bool tensor [batch size, 1, 1, src len], True at non-pad positions.
        """
        return (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
    
    def make_trg_mask(self, trg):
        """Combined padding + look-ahead mask for the target.

        Args:
            trg: [batch size, trg len] token ids.

        Returns:
            Bool tensor [batch size, 1, trg len, trg len]; entry (i, j) is True
            when j <= i and position j is not padding.
        """
        trg_pad_mask = (trg != self.trg_pad_idx).unsqueeze(1).unsqueeze(2)
        
        #trg_pad_mask = [batch size, 1, 1, trg len]
        
        trg_len = trg.shape[1]
        
        # Allocate on trg's own device (not self.device) so both operands of
        # the `&` below always live on the same device.
        trg_sub_mask = torch.tril(torch.ones((trg_len, trg_len), device = trg.device)).bool()
        
        #trg_sub_mask = [trg len, trg len]
            
        return trg_pad_mask & trg_sub_mask

    def forward(self, src, trg):
        """Run the encoder and decoder.

        Args:
            src: [batch size, src len]
            trg: [batch size, trg len]

        Returns:
            (output, attention) as returned by the decoder:
            output = [batch size, trg len, output dim],
            attention = [batch size, n heads, trg len, src len].
        """
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        
        #src_mask = [batch size, 1, 1, src len]
        #trg_mask = [batch size, 1, trg len, trg len]
        
        enc_src = self.encoder(src, src_mask)
        
        #enc_src = [batch size, src len, hid dim]
                
        output, attention = self.decoder(trg, enc_src, trg_mask, src_mask)
        
        return output, attention

### Training utils

In [None]:
def initialize_weights(m):
    """Xavier-normal initialise ``m.weight`` in place when the module has a
    weight with more than one dimension; other modules are left untouched.

    Intended to be passed to ``nn.Module.apply``.
    """
    if not hasattr(m, 'weight'):
        return
    weight = m.weight
    if weight.dim() > 1:
        nn.init.xavier_normal_(weight.data)
    

In [None]:
def count_parameters(model):
    """Print the number of trainable parameters (``requires_grad`` is True) of
    ``model``, formatted with thousands separators."""
    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Model has {total_params:,} trainable parameters")

In [None]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: called as ``model(src, trg_input)``; returns ``(output, _)``
            with output = [batch size, trg len - 1, output dim].
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: optimizer over ``model.parameters()``.
        criterion: loss module; ``nn.CTCLoss`` takes a separate code path.
        clip: max gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()

    running_loss = 0

    for batch in iterator:
        src, trg = batch.src, batch.trg

        optimizer.zero_grad()

        # The decoder sees the target without its last token ...
        output, _ = model(src, trg[:,:-1])

        if isinstance(criterion, nn.CTCLoss):
            # NOTE(review): the lengths are passed as plain ints here, while
            # evaluate() builds a tensor for the input lengths - confirm this
            # branch is usable before switching the criterion to CTCLoss.
            output_lengths = output.shape[1]
            target_lengths = trg.shape[1]-1
            time_major = output.contiguous().permute(1,0,2)
            targets = trg[:,1:].contiguous()
            loss = criterion(time_major, targets, output_lengths, target_lengths)
        else:
            # ... and is scored against the target without its first token.
            vocab_size = output.shape[-1]
            logits = output.contiguous().view(-1, vocab_size)
            targets = trg[:,1:].contiguous().view(-1)
            loss = criterion(logits, targets)

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [None]:
def evaluate(model, iterator, criterion, metrics=None):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    Args:
        model: called as ``model(src, trg_input)``; returns ``(output, _)``
            with output = [batch size, trg len - 1, output dim].
        iterator: sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: loss module; ``nn.CTCLoss`` takes a separate code path.
        metrics: accepted but not used.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()

    total_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src, trg = batch.src, batch.trg

            # The decoder sees the target without its last token ...
            output, _ = model(src, trg[:,:-1])

            if isinstance(criterion, nn.CTCLoss):
                output_lengths = torch.full(size=(output.shape[0],), fill_value = output.shape[1], dtype=torch.long)
                target_lengths = trg.shape[1]-1
                time_major = output.contiguous().permute(1,0,2)
                targets = trg[:,1:].contiguous()
                loss = criterion(time_major, targets, output_lengths, target_lengths)
            else:
                # ... and is scored against the target without its first token.
                vocab_size = output.shape[-1]
                logits = output.contiguous().view(-1, vocab_size)
                targets = trg[:,1:].contiguous().view(-1)
                loss = criterion(logits, targets)

            total_loss += loss.item()

    return total_loss / len(iterator)

In [None]:
def epoch_time(start_time, end_time):
    """Split the interval ``end_time - start_time`` (seconds) into whole
    minutes and the remaining whole seconds.

    Returns:
        tuple[int, int]: (minutes, seconds), both truncated towards zero.
    """
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - (minutes * 60))
    return minutes, seconds

In [None]:
# Smoke test for torchtext's bleu_score, using the example from its documentation.
# Every token has to be a string: the previous version of this cell used the
# integers 0, 0 as the first two candidate tokens and the cell raised.
candidate_corpus = [['My', 'full', 'pytorch', 'test'], ['Another', 'Sentence']]
references_corpus = [[['My', 'full', 'pytorch', 'test'], ['Completely', 'Different']], [['No', 'Match']]]
bleu_score(candidate_corpus, references_corpus)


AttributeError: ignored

### Training Model


In [None]:
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
HID_DIM = 256
ENC_LAYERS = 3
DEC_LAYERS = 3
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.2

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]
TRG_PAD_IDX = TRG.vocab.stoi[TRG.pad_token]

model = Seq2Seq(enc, dec, SRC_PAD_IDX, TRG_PAD_IDX, device).to(device)

In [None]:
count_parameters(model)

Model has 6,692,383 trainable parameters


In [None]:
model.apply(initialize_weights);

In [None]:
LEARNING_RATE = 0.0005

optimizer = optim.Adam(model.parameters(), lr = LEARNING_RATE)

In [None]:
criterion = nn.CrossEntropyLoss(ignore_index = TRG_PAD_IDX)
#criterion = nn.CTCLoss()

In [None]:
N_EPOCHS = 40
CLIP = 1
# Number of non-improving epochs that are tolerated; training stops on the
# next non-improving epoch after this many have been counted.
PATIENCE = 5

best_valid_loss = float('inf')
# Initialised up front: previously `patience` was first assigned inside the
# "improved" branch, so a first epoch that did not improve (e.g. a NaN
# validation loss) raised NameError.
patience = 0

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        # New best model: checkpoint it and reset the early-stopping counter.
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'best_model.pt')
        patience = 0
    elif patience < PATIENCE:
        patience += 1
    else:
        print(f"Early stopping as model's validation loss failed to improve beyond {best_valid_loss}")
        break
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 8s
	Train Loss: 4.210 | Train PPL:  67.381
	 Val. Loss: 2.830 |  Val. PPL:  16.940
Epoch: 02 | Time: 0m 8s
	Train Loss: 2.777 | Train PPL:  16.078
	 Val. Loss: 2.392 |  Val. PPL:  10.931
Epoch: 03 | Time: 0m 8s
	Train Loss: 2.408 | Train PPL:  11.115
	 Val. Loss: 2.131 |  Val. PPL:   8.423
Epoch: 04 | Time: 0m 8s
	Train Loss: 2.162 | Train PPL:   8.690
	 Val. Loss: 1.975 |  Val. PPL:   7.208
Epoch: 05 | Time: 0m 8s
	Train Loss: 1.972 | Train PPL:   7.185
	 Val. Loss: 1.867 |  Val. PPL:   6.467
Epoch: 06 | Time: 0m 8s
	Train Loss: 1.815 | Train PPL:   6.142
	 Val. Loss: 1.751 |  Val. PPL:   5.760
Epoch: 07 | Time: 0m 8s
	Train Loss: 1.672 | Train PPL:   5.324
	 Val. Loss: 1.684 |  Val. PPL:   5.388
Epoch: 08 | Time: 0m 8s
	Train Loss: 1.550 | Train PPL:   4.710
	 Val. Loss: 1.592 |  Val. PPL:   4.914
Epoch: 09 | Time: 0m 8s
	Train Loss: 1.438 | Train PPL:   4.212
	 Val. Loss: 1.556 |  Val. PPL:   4.741
Epoch: 10 | Time: 0m 8s
	Train Loss: 1.336 | Train PPL:   3.806


In [None]:
!cp best_model.pt drive/MyDrive/NLP/EngToPython/.

In [None]:
model.load_state_dict(torch.load('best_model.pt'))

test_loss = evaluate(model, valid_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 1.226 | Test PPL:   3.407 |


### Testing Model


In [None]:
def translate_to_python(sentence, src_field, trg_field, model, device, max_len = 1000):
    """Greedy-decode the target token sequence for one source sentence.

    Args:
        sentence: a raw string (tokenised with ``tokenizer_en``) or an
            already tokenised sequence of strings.
        src_field: source field; ``init_token``, ``eos_token`` and
            ``vocab.stoi`` are read.
        trg_field: target field; ``init_token``, ``eos_token``,
            ``vocab.stoi`` and ``vocab.itos`` are read.
        model: object exposing ``eval``, ``make_src_mask``, ``make_trg_mask``,
            ``encoder`` and ``decoder``.
        device: device the index tensors are moved to.
        max_len: maximum number of decoding steps.

    Returns:
        (tokens, attention): the predicted tokens without the leading init
        token (the eos token is included when it was produced), and the
        attention returned by the last decoder call (``None`` when
        ``max_len`` is 0).
    """
    model.eval()

    if isinstance(sentence, str):
        tokens = tokenizer_en(sentence)
    else:
        tokens = sentence

    # Lower-case in BOTH cases. Previously only pre-tokenised input was
    # lower-cased, so for raw strings every capitalised word missed the
    # lower-cased source vocabulary.
    tokens = [tok.lower() for tok in tokens]
    tokens = [src_field.init_token] + tokens + [src_field.eos_token]

    src_indexes = [src_field.vocab.stoi[tok] for tok in tokens]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(0).to(device)
    src_mask = model.make_src_mask(src_tensor)

    trg_indexes = [trg_field.vocab.stoi[trg_field.init_token]]
    eos_index = trg_field.vocab.stoi[trg_field.eos_token]  # loop-invariant
    attention = None

    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

        for _ in range(max_len):
            # Re-run the decoder on everything generated so far and keep only
            # the prediction for the last position.
            trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
            trg_mask = model.make_trg_mask(trg_tensor)

            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)

            pred_token = output.argmax(2)[:,-1].item()
            trg_indexes.append(pred_token)

            if pred_token == eos_index:
                break

    trg_tokens = [trg_field.vocab.itos[i] for i in trg_indexes]

    return trg_tokens[1:], attention

In [None]:
# Index every token of the cleaned code blocks:
#   token_type_dict  : token type -> list of all token strings seen with that type
#   token_dictionary : token string -> token type (the last occurrence wins)
token_type_dict = defaultdict(list)
token_dictionary = {}
for python_block in python_indent_fixed:
    reader = io.StringIO(python_block.expandtabs().strip())
    for tok_info in tokenize.generate_tokens(reader.readline):
        token_type_dict[tok_info.type].append(tok_info.string)
        token_dictionary[tok_info.string] = tok_info.type
        

In [None]:
def print_code(code, tok_dict=None):
    """Rebuild source text from a token sequence and print it.

    Args:
        code: iterable of token strings. The lower-case structural tokens
            'newline', 'indent', 'dedent' and 'endmarker' control the layout.
        tok_dict: optional mapping token string -> token type; tokens missing
            from it are treated as ``token.NAME``.

    Layout rules: the first token after a newline is prefixed with four spaces
    per open indent level; an operator token, and the token that directly
    follows one, are appended without a separating space; any other token is
    separated from the previous text by a single space.

    The operator type is taken from ``token.OP`` rather than a hard-coded
    number because its numeric value differs between Python versions.
    """
    if tok_dict is None:
        tok_dict = {}
    code_str = ''
    num_indents = 0
    after_newline = False
    prev_special = False  # True when the previously emitted token was an operator
    for tok in code:
        str_type = tok_dict.get(tok, token.NAME)
        if tok=='endmarker':
            break
        elif tok=='indent':
            num_indents+=1
        elif tok=='newline':
            code_str+='\n'
            after_newline = True
        elif tok=='dedent':
            num_indents-=1
        elif after_newline:
            # first token of a line: emit the indentation, then the token
            code_str+='    '*num_indents
            code_str+=tok
            after_newline=False
            prev_special = False
        elif str_type==token.OP:
            prev_special = True
            code_str+=tok
        elif prev_special:
            code_str+=tok
            prev_special=False
        else:
            code_str+=' '+tok if len(code_str) > 0 else tok
    print(code_str)


In [None]:
token_dictionary['while']

1

In [None]:
num=55
print(english_sents_cleaned[num])
code, _ = translate_to_python(english_sents_cleaned[num], SRC, TRG, model, device, max_len = 1000)
print_code(code[:-1], tok_dict=token_dictionary)

# Write a function that takes number of disks in tower of hanaoi problem and returns the minimum number of steps required

def knapsack(w,n):
    if n==0:
        return 0
    return 1



### Evaluate Model - BLEU

In [None]:
from torchtext.data.metrics import bleu_score
def calculate_bleu(data, src_field, trg_field, model, device, max_len = 1000):    
    """Average sentence-level BLEU of the model's greedy translations.

    Each example's ``src`` is translated with ``translate_to_python``; the
    last predicted token is dropped (cut off <eos>) and the result is scored
    against the example's ``trg`` with NLTK's ``sentence_bleu``.

    Args:
        data: iterable of examples exposing ``src`` and ``trg`` attributes.
        src_field, trg_field, model, device, max_len: forwarded to
            ``translate_to_python``.

    Returns:
        Mean of the per-example BLEU scores, or 0.0 when ``data`` is empty.
    """
    from nltk.translate.bleu_score import sentence_bleu

    trgs = []
    pred_trgs = []
    
    for datum in data:
        
        src = vars(datum)['src']
        trg = vars(datum)['trg']
        
        pred_trg, _ = translate_to_python(src, src_field, trg_field, model, device, max_len)
        
        #cut off <eos> token
        pred_trgs.append(pred_trg[:-1])
        # sentence_bleu expects a list of references per hypothesis
        trgs.append([trg])

    # Nothing to score: avoid dividing by zero below.
    if not trgs:
        return 0.0
    
    score = 0.0
    for references, hypothesis in zip(trgs, pred_trgs):
        score += sentence_bleu(references, hypothesis)
    return score/len(trgs)

In [None]:
# Scratch cell: token-level BLEU between the arg-max predictions of one batch
# and its targets.
# NOTE(review): `output` and `trg` are not assigned at notebook scope in any
# visible cell (they are local variables of train()/evaluate()), so this cell
# presumably relied on leftover kernel state and would fail on
# Restart & Run All - confirm or remove.
# NOTE(review): `trgs_list` holds one token list per sample, not a list of
# reference lists per sample - verify against the signature of bleu_score.
trgs_list = []
preds_list = []
pred_tokens = output.argmax(2)
trg_field=TRG
for j in range(len(pred_tokens)):
     #cut off <eos> token and <pad> token
    ig1= trg_field.vocab.stoi[trg_field.pad_token]
    ig2= trg_field.vocab.stoi[trg_field.eos_token]
    pred_list = [trg_field.vocab.itos[i] for i in list(pred_tokens[j]) if ((i!=ig1) and (i!=ig2))]
    trg_list = [trg_field.vocab.itos[i] for i in list(trg[j,1:]) if ((i!=ig1) and (i!=ig2))]
    preds_list.append(pred_list)
    trgs_list.append(trg_list)
bleu_score(preds_list, trgs_list) 

In [None]:
# Store the result under its own name: assigning it to `bleu_score` replaced the
# imported torchtext function with a float, so every cell that calls
# bleu_score(...) failed when re-run afterwards.
valid_bleu = calculate_bleu(valid_ds, SRC, TRG, model, device)
print(f'BLEU score = {valid_bleu*100:.2f}')

Corpus/Sentence contains 0 counts of 4-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 3-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().
Corpus/Sentence contains 0 counts of 2-gram overlaps.
BLEU scores might be undesirable; use SmoothingFunction().


BLEU score = 38.47
