# Code Autocomplete 

<b>Name:</b> Pyae Sone Kyaw  <b>Student Id:</b> st123225

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchtext, datasets, math
from torchtext.vocab import vocab as torchTextVocab
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer
from tqdm import tqdm

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

#make our work comparable if restarted the kernel
SEED = 1234
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

cpu


# 1) First thing first , Load Dataset as always

In [3]:
dataset = load_dataset("lvwerra/codeparrot-clean", split="train", streaming=True)

Using custom data configuration lvwerra--codeparrot-clean-fb728533b9673c8b


In [4]:
iter_dataset = iter(dataset)

why iter? because it is a generator and it is memory efficient! represents an iterable over data samples. particularly suitable for cases where random reads are expensive, such as reading a large dataset from disk.

In [5]:
list(next(iter_dataset))

['repo_name',
 'path',
 'copies',
 'size',
 'content',
 'license',
 'hash',
 'line_mean',
 'line_max',
 'alpha_frac',
 'autogenerated']

# 2) Preprocessing

### Extracting data related to pytorch code

Since our goal is to extract code related to Python code, we are going to do that by extracting the first 1000 repo suspected to contain Pytorch code. 
However, we also need to know that other libraries are also used in Pytorch code. So, we are going to extract the first 1000 repo suspected to contain Pytorch code and also the first 1000 repo suspected to contain other libraries used in Pytorch code. This is done by using the following code:

In [6]:
pytorch_related = []

In [7]:
threshold = 1000
count = 0

for repo in iter_dataset:
    if 'torch' in repo['repo_name']:
        pytorch_related.append(repo)
        count += 1
        
        if count % 100 == 0:
            print(count)
        
        if count == threshold:
            break

100
200
300
400
500
600
700
800
900
1000


In [8]:
idx = 0
print(pytorch_related[idx]["repo_name"])
print(pytorch_related[idx]["content"][:1100])

huggingface/pytorch-transformers
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest

import pytest

from transformers import pipeline
from transformers.testing_utils import is_pipeline_test, is_torch_available, require_torch, slow

from .test_pipelines_common import MonoInputPipelineCommonMixin


if is_torch_available():
    from transformers.models.mbart import MBart50TokenizerFast, MBartForConditionalGeneration


class Tra

### Creating Vocabulary - Tokenizing First 1000 Repo
of course, to tokenize the code, we need to create a vocabulary. We are going to use a simple tokenize function to create a vocab. it's meant to provide a lexical scanner for Python code. However, in this case, the module was extremely useful since apart from returning tokens, it also return its type which made it possible to treat numbers and strings alike. Also during tokenizing, comments were excluded since they are not useful for training the model.

In [9]:
import tokenize
import io

In [10]:
tok_name = tokenize.tok_name

In [11]:
print(tok_name)

{0: 'ENDMARKER', 1: 'NAME', 2: 'NUMBER', 3: 'STRING', 4: 'NEWLINE', 5: 'INDENT', 6: 'DEDENT', 7: 'LPAR', 8: 'RPAR', 9: 'LSQB', 10: 'RSQB', 11: 'COLON', 12: 'COMMA', 13: 'SEMI', 14: 'PLUS', 15: 'MINUS', 16: 'STAR', 17: 'SLASH', 18: 'VBAR', 19: 'AMPER', 20: 'LESS', 21: 'GREATER', 22: 'EQUAL', 23: 'DOT', 24: 'PERCENT', 25: 'LBRACE', 26: 'RBRACE', 27: 'EQEQUAL', 28: 'NOTEQUAL', 29: 'LESSEQUAL', 30: 'GREATEREQUAL', 31: 'TILDE', 32: 'CIRCUMFLEX', 33: 'LEFTSHIFT', 34: 'RIGHTSHIFT', 35: 'DOUBLESTAR', 36: 'PLUSEQUAL', 37: 'MINEQUAL', 38: 'STAREQUAL', 39: 'SLASHEQUAL', 40: 'PERCENTEQUAL', 41: 'AMPEREQUAL', 42: 'VBAREQUAL', 43: 'CIRCUMFLEXEQUAL', 44: 'LEFTSHIFTEQUAL', 45: 'RIGHTSHIFTEQUAL', 46: 'DOUBLESTAREQUAL', 47: 'DOUBLESLASH', 48: 'DOUBLESLASHEQUAL', 49: 'AT', 50: 'ATEQUAL', 51: 'RARROW', 52: 'ELLIPSIS', 53: 'COLONEQUAL', 54: 'OP', 55: 'AWAIT', 56: 'ASYNC', 57: 'TYPE_IGNORE', 58: 'TYPE_COMMENT', 59: 'ERRORTOKEN', 60: 'COMMENT', 61: 'NL', 62: 'ENCODING', 63: 'N_TOKENS', 256: 'NT_OFFSET'}


In [13]:
def python_code_tokenizer(content):
    tokenized_code = []
    
    try:
        for token in tokenize.generate_tokens(io.StringIO(content).readline):
            encoding = tok_name[token.type]
            if encoding == "COMMENT" or encoding== "NL":
                continue
            elif encoding == "NUMBER":
                tokenized_code.append("<NUMBER>")
            elif encoding == "STRING":
                tokenized_code.append("<STRING>")
            else:
                tokenized_code.append(token.string)
    except:
        return []
    
    return tokenized_code

In [14]:
tokenized_code_list = []

for code in pytorch_related:
    tokenized_code = python_code_tokenizer(code["content"])
    
    if len(tokenized_code) > 0:
        tokenized_code_list.append(tokenized_code)

In [15]:
print(tokenized_code_list[0][:20])

['import', 'unittest', '\n', 'import', 'pytest', '\n', 'from', 'transformers', 'import', 'pipeline', '\n', 'from', 'transformers', '.', 'testing_utils', 'import', 'is_pipeline_test', ',', 'is_torch_available', ',']


In [16]:
len(tokenized_code_list)

999

### Splitting datasets into train,valid and test set 

In [17]:
Len = len(tokenized_code_list)

In [18]:
end   = int(0.7 * Len)
train = tokenized_code_list[:end]

start = int(0.7 * Len)
end   = int(0.8 * Len)
valid = tokenized_code_list[start: end]

start = int(0.8 * Len)
test  = tokenized_code_list[start:]

In [19]:
len(train), len(valid), len(test)

(699, 100, 200)

In [20]:
def convert_to_Dataset(dataset, feature_name):
    my_list = []
    
    for data in dataset:
        my_list.append({feature_name: data})
    
    return Dataset.from_list(my_list)

In [21]:
train_dataset = convert_to_Dataset(train, "tokens")
val_dataset   = convert_to_Dataset(valid, "tokens")
test_dataset  = convert_to_Dataset(test, "tokens")

In [22]:
print(train_dataset, val_dataset, test_dataset)

Dataset({
    features: ['tokens'],
    num_rows: 699
}) Dataset({
    features: ['tokens'],
    num_rows: 100
}) Dataset({
    features: ['tokens'],
    num_rows: 200
})


### Creating Vocab

In [23]:
# flattened_code = [item for sublist in train_dataset["tokens"] for item in sublist]
# vocab = list(set(flattened_code))

In [24]:
# vocab.insert(0, '<unk>')
# vocab.insert(1, '<eos>')

In [25]:
# print(vocab[:10])

In [26]:
# from collections import Counter, OrderedDict

In [27]:
# counter = Counter(vocab)

In [28]:
# counter_dict = OrderedDict(counter)

In [29]:
# vocab = Vocab(counter_dict)
# vocab.set_default_index(vocab['<unk>'])
# print(len(vocab))                         
# print(vocab.get_itos()[:10])    

In [30]:
vocab = torchtext.vocab.build_vocab_from_iterator(train_dataset['tokens']) 
vocab.insert_token('<unk>', 0)           
vocab.insert_token('<eos>', 1)            
vocab.set_default_index(vocab['<unk>'])   
print(len(vocab))                         
print(vocab.get_itos()[:10])      

24900
['<unk>', '<eos>', '\n', '.', ',', '(', ')', '=', '<STRING>', ':']


#### Saving Vocab Object for future use in web app 

In [31]:
torch.save(vocab, 'vocab_obj.pth')

In [34]:
# vocab_obj = torch.load('vocab_obj.pth')

# 3) Prepare Data loader for Model

In [35]:
def get_data(dataset, vocab, batch_size):
    data = []                                                   
    for example in dataset:
        if example['tokens']:         
            #appends eos so we know it ends....so model learn how to end...                             
            tokens = example['tokens'].append('<eos>')   
            #numericalize          
            tokens = [vocab[token] for token in example['tokens']] 
            data.extend(tokens)                                    
    data = torch.LongTensor(data)                                 
    num_batches = data.shape[0] // batch_size #get the int number of batches...
    data = data[:num_batches * batch_size] #make the batch evenly, and cut out any remaining                      
    data = data.view(batch_size, num_batches)          
    return data #[batch size, bunch of tokens]

In [36]:
batch_size = 128
train_data = get_data(train_dataset, vocab, batch_size)
valid_data = get_data(val_dataset, vocab, batch_size)
test_data  = get_data(test_dataset, vocab, batch_size)

# 4) Model

In [37]:
class LSTMLanguageModel(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, num_layers, dropout_rate):
                
        super().__init__()
        self.num_layers = num_layers
        self.hid_dim = hid_dim
        self.emb_dim = emb_dim

        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.lstm = nn.LSTM(emb_dim, hid_dim, num_layers=num_layers, 
                    dropout=dropout_rate, batch_first=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hid_dim, vocab_size)
        
        self.init_weights()
        
    def init_weights(self):
        init_range_emb = 0.1
        init_range_other = 1/math.sqrt(self.hid_dim)
        self.embedding.weight.data.uniform_(-init_range_emb, init_range_emb)
        self.fc.weight.data.uniform_(-init_range_other, init_range_other)
        self.fc.bias.data.zero_()
        for i in range(self.num_layers):
            self.lstm.all_weights[i][0] = torch.FloatTensor(self.emb_dim,
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 
            self.lstm.all_weights[i][1] = torch.FloatTensor(self.hid_dim, 
                    self.hid_dim).uniform_(-init_range_other, init_range_other) 

    def init_hidden(self, batch_size, device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        cell   = torch.zeros(self.num_layers, batch_size, self.hid_dim).to(device)
        return hidden, cell
    
    def detach_hidden(self, hidden):
        hidden, cell = hidden
        hidden = hidden.detach()
        cell = cell.detach()
        return hidden, cell

    def forward(self, src, hidden):
        #src: [batch size, seq len]
        embedding = self.dropout(self.embedding(src))
        #embedding: [batch size, seq len, emb_dim]
        output, hidden = self.lstm(embedding, hidden)      
        #output: [batch size, seq len, hid_dim]
        #hidden = h, c = [num_layers * direction, seq len, hid_dim)
        output = self.dropout(output) 
        prediction = self.fc(output)
        #prediction: [batch size, seq_len, vocab size]
        return prediction, hidden

# 5) Training

In [216]:
vocab_size = len(vocab)
emb_dim = 1024
hid_dim = 1024
num_layers = 2
dropout_rate = 0.65              
lr = 1e-3   

In [217]:
model = LSTMLanguageModel(vocab_size, emb_dim, hid_dim, num_layers, dropout_rate).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f'The model has {num_params:,} trainable parameters')

The model has 67,813,700 trainable parameters


In [218]:
def get_batch(data, seq_len, idx):
    #this data is from get_data()
    #train_data.shape # [batch_size, number of batches....]
    src    = data[:, idx:idx+seq_len]                   
    target = data[:, idx+1:idx+seq_len+1]  #target simply is ahead of src by 1            
    return src, target

In [219]:
def train(model, data, optimizer, criterion, batch_size, seq_len, clip, device):
    
    epoch_loss = 0
    model.train()
    # drop all batches that are not a multiple of seq_len
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)
    
    for idx in tqdm(range(0, num_batches - 1, seq_len), desc='Training: ',leave=False):
        optimizer.zero_grad()
        hidden = model.detach_hidden(hidden)

        src, target = get_batch(data, seq_len, idx) #src, target: [batch size, seq len]
        src, target = src.to(device), target.to(device)
        batch_size = src.shape[0]
        prediction, hidden = model(src, hidden)               

        prediction = prediction.reshape(batch_size * seq_len, -1)  #prediction: [batch size * seq len, vocab size]  
        target = target.reshape(-1)
        loss = criterion(prediction, target)
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip) #prevent gradient explosion - clip is basically 
        optimizer.step()
        epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [220]:
def evaluate(model, data, criterion, batch_size, seq_len, device):

    epoch_loss = 0
    model.eval()
    num_batches = data.shape[-1]
    data = data[:, :num_batches - (num_batches -1) % seq_len]
    num_batches = data.shape[-1]

    hidden = model.init_hidden(batch_size, device)

    with torch.no_grad():
        for idx in range(0, num_batches - 1, seq_len):
            hidden = model.detach_hidden(hidden)
            src, target = get_batch(data, seq_len, idx)
            src, target = src.to(device), target.to(device)
            batch_size= src.shape[0]

            prediction, hidden = model(src, hidden)
            prediction = prediction.reshape(batch_size * seq_len, -1)
            target = target.reshape(-1)

            loss = criterion(prediction, target)
            epoch_loss += loss.item() * seq_len
    return epoch_loss / num_batches

In [221]:
n_epochs = 50
seq_len  = 15
clip    = 0.25

lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.5, patience=0)

best_valid_loss = float('inf')

for epoch in range(n_epochs):
    train_loss = train(model, train_data, optimizer, criterion, 
                batch_size, seq_len, clip, device)
    valid_loss = evaluate(model, valid_data, criterion, batch_size, 
                seq_len, device)

    lr_scheduler.step(valid_loss)

    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './models/best-val-auto.pt')
    print(f'\tepoch: {epoch+1}')
    print(f'\tTrain Perplexity: {math.exp(train_loss):.3f}')
    print(f'\tValid Perplexity: {math.exp(valid_loss):.3f}')

                                                           

	epoch: 1
	Train Perplexity: 77.465
	Valid Perplexity: 64.815


                                                           

	epoch: 2
	Train Perplexity: 18.981
	Valid Perplexity: 17.753


                                                           

	epoch: 3
	Train Perplexity: 13.566
	Valid Perplexity: 15.742


                                                           

	epoch: 4
	Train Perplexity: 11.074
	Valid Perplexity: 14.332


                                                           

	epoch: 5
	Train Perplexity: 9.491
	Valid Perplexity: 13.856


                                                           

	epoch: 6
	Train Perplexity: 8.325
	Valid Perplexity: 13.241


                                                           

	epoch: 7
	Train Perplexity: 7.456
	Valid Perplexity: 13.185


                                                           

	epoch: 8
	Train Perplexity: 6.751
	Valid Perplexity: 12.913


                                                           

	epoch: 9
	Train Perplexity: 6.173
	Valid Perplexity: 12.680


                                                           

	epoch: 10
	Train Perplexity: 5.713
	Valid Perplexity: 12.516


                                                           

	epoch: 11
	Train Perplexity: 5.322
	Valid Perplexity: 12.624


                                                           

	epoch: 12
	Train Perplexity: 4.976
	Valid Perplexity: 12.665


                                                           

	epoch: 13
	Train Perplexity: 4.792
	Valid Perplexity: 12.156


                                                           

	epoch: 14
	Train Perplexity: 4.652
	Valid Perplexity: 12.269


                                                           

	epoch: 15
	Train Perplexity: 4.632
	Valid Perplexity: 11.775


                                                           

	epoch: 16
	Train Perplexity: 4.555
	Valid Perplexity: 11.737


                                                           

	epoch: 17
	Train Perplexity: 4.490
	Valid Perplexity: 11.800


                                                           

	epoch: 18
	Train Perplexity: 4.506
	Valid Perplexity: 11.528


                                                           

	epoch: 19
	Train Perplexity: 4.475
	Valid Perplexity: 11.530


                                                           

	epoch: 20
	Train Perplexity: 4.539
	Valid Perplexity: 11.383


                                                           

	epoch: 21
	Train Perplexity: 4.522
	Valid Perplexity: 11.340


                                                           

	epoch: 22
	Train Perplexity: 4.502
	Valid Perplexity: 11.357


                                                           

	epoch: 23
	Train Perplexity: 4.582
	Valid Perplexity: 11.265


                                                           

	epoch: 24
	Train Perplexity: 4.557
	Valid Perplexity: 11.266


                                                           

	epoch: 25
	Train Perplexity: 4.625
	Valid Perplexity: 11.208


                                                           

	epoch: 26
	Train Perplexity: 4.617
	Valid Perplexity: 11.167


                                                           

	epoch: 27
	Train Perplexity: 4.602
	Valid Perplexity: 11.146


                                                           

	epoch: 28
	Train Perplexity: 4.582
	Valid Perplexity: 11.164


                                                           

	epoch: 29
	Train Perplexity: 4.624
	Valid Perplexity: 11.150


                                                           

	epoch: 30
	Train Perplexity: 4.667
	Valid Perplexity: 11.143


                                                           

	epoch: 31
	Train Perplexity: 4.656
	Valid Perplexity: 11.135


                                                           

	epoch: 32
	Train Perplexity: 4.652
	Valid Perplexity: 11.131


                                                           

	epoch: 33
	Train Perplexity: 4.648
	Valid Perplexity: 11.124


                                                           

	epoch: 34
	Train Perplexity: 4.640
	Valid Perplexity: 11.114


                                                           

	epoch: 35
	Train Perplexity: 4.640
	Valid Perplexity: 11.108


                                                           

	epoch: 36
	Train Perplexity: 4.638
	Valid Perplexity: 11.098


                                                           

	epoch: 37
	Train Perplexity: 4.633
	Valid Perplexity: 11.090


                                                           

	epoch: 38
	Train Perplexity: 4.626
	Valid Perplexity: 11.080


                                                           

	epoch: 39
	Train Perplexity: 4.623
	Valid Perplexity: 11.075


                                                           

	epoch: 40
	Train Perplexity: 4.618
	Valid Perplexity: 11.071


                                                           

	epoch: 41
	Train Perplexity: 4.620
	Valid Perplexity: 11.066


                                                           

	epoch: 42
	Train Perplexity: 4.615
	Valid Perplexity: 11.069


                                                           

	epoch: 43
	Train Perplexity: 4.622
	Valid Perplexity: 11.069


                                                           

	epoch: 44
	Train Perplexity: 4.631
	Valid Perplexity: 11.067


                                                           

	epoch: 45
	Train Perplexity: 4.632
	Valid Perplexity: 11.067


                                                           

	epoch: 46
	Train Perplexity: 4.631
	Valid Perplexity: 11.067


                                                           

	epoch: 47
	Train Perplexity: 4.636
	Valid Perplexity: 11.067


                                                           

	epoch: 48
	Train Perplexity: 4.634
	Valid Perplexity: 11.067


                                                           

	epoch: 49
	Train Perplexity: 4.630
	Valid Perplexity: 11.067


                                                           

	epoch: 50
	Train Perplexity: 4.628
	Valid Perplexity: 11.067


# 6) Testing the Model

In [222]:
model.load_state_dict(torch.load('./models/best-val-auto.pt',  map_location=device))
test_loss = evaluate(model, test_data, criterion, batch_size, seq_len, device)
print(f'Test Perplexity: {math.exp(test_loss):.3f}')

Test Perplexity: 15.408


# 7) Real Time Inference / Testing

In [224]:
def generate(prompt, max_seq_len, temperature, model, tokenizer, vocab, device, seed=None):
    if seed is not None:
        torch.manual_seed(seed)
    model.eval()
    tokens = tokenizer(prompt)
    indices = [vocab[t] for t in tokens]
    batch_size = 1
    hidden = model.init_hidden(batch_size, device)
    with torch.no_grad():
        for i in range(max_seq_len):
            src = torch.LongTensor([indices]).to(device)
            prediction, hidden = model(src, hidden)
            
            #prediction: [batch size, seq len, vocab size]
            #prediction[:, -1]: [batch size, vocab size] #probability of last vocab
            
            probs = torch.softmax(prediction[:, -1] / temperature, dim=-1)  
            prediction = torch.multinomial(probs, num_samples=1).item()    
            
            while prediction == vocab['<unk>']: #if it is unk, we sample again
                prediction = torch.multinomial(probs, num_samples=1).item()

            if prediction == vocab['<eos>']:    #if it is eos, we stop
                break

            indices.append(prediction) #autoregressive, thus output becomes input

    itos = vocab.get_itos()
    tokens = [itos[i] for i in indices]
    return tokens

In [226]:
prompt = 'import'
max_seq_len = 10
seed = 0

#smaller the temperature, more diverse tokens but comes 
#with a tradeoff of less-make-sense sentence
temperatures = [0.5, 0.7, 0.75, 0.8, 1.0]
for temperature in temperatures:
    generation = generate(prompt, max_seq_len, temperature, model, python_code_tokenizer, 
                          vocab, device, seed)
    print(str(temperature)+'\n'+' '.join(generation)+'\n')

0.5
import   def _score_sentence ( x , * args , ** kwargs

0.7
import   def _score_sentence ( d , * args , ** kwargs

0.75
import   def _score_sentence ( d , * args , ** kwargs

0.8
import   def _score_sentence ( d , * args , ** kwargs

1.0
import   from . test_generation_utils import LayoutLMTokenizerFast 
 from . join import

