<a href="https://colab.research.google.com/github/shabalin13/code-search/blob/main/delivery3/PML%26DL_delivery3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Delivery 2

## Implementing search

In [25]:
# When True, previously saved embeddings are loaded from disk instead of being
# recomputed with the model.
EMBEDDINGS_PRECOMPUTED = True
# When True, Google Drive is mounted and the embedding files are read from /
# written to a folder on it.
EMBEDDINGS_ON_GOOGLE_DRIVE = True
if EMBEDDINGS_ON_GOOGLE_DRIVE:
    # Imported only when needed, so the Colab-specific module is not required
    # while the flag is off.
    from google.colab import drive
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers --quiet
!pip install datasets --quiet
!apt install libomp-dev
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import DataLoader
from enum import Enum, auto
from tqdm import tqdm

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Install the FAISS package that matches the selected device.
if device == 'cuda':
    !pip install faiss-gpu -q
else:
    !pip install faiss -q

# Tokenizer and encoder are both loaded from the "microsoft/codebert-base" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
model = AutoModel.from_pretrained("microsoft/codebert-base")

# 'python' configuration of the "code_x_glue_ct_code_to_text" dataset.
dataset = load_dataset("code_x_glue_ct_code_to_text", 'python')

Reading package lists... 0%Reading package lists... 0%Reading package lists... 0%Reading package lists... 4%Reading package lists... 4%Reading package lists... 4%Reading package lists... 4%

In [None]:
# Unpack the three splits of the dataset.
# NOTE: the name `train` is rebound further down by `def train(...)`; once that
# cell has run, `train` no longer refers to the training split.
train, valid, test = dataset['train'], dataset['validation'], dataset['test']

In [None]:
class SeqType(Enum):
    """Selects which kind of model input a collator builds."""

    # Docstring followed by the code tokens.
    CODE = 1
    # Docstring only.
    DOC = 2


class TokenizeCollator(object):
    """Collate function that turns dataset rows into padded encoder inputs.

    Every row is rendered as one string of the form
    ``<cls>docstring<sep>code<sep>`` (the code part is empty for
    ``SeqType.DOC``). The strings of a batch are tokenized together and the
    resulting tensors are moved to the module-level ``device``.

    Args:
        tokenizer: Callable tokenizer exposing ``cls_token`` and ``sep_token``.
        seq_type: ``SeqType`` member selecting the input layout.
    """

    def __init__(self, tokenizer, seq_type):
        self.tokenizer = tokenizer
        self.seq_type = seq_type

    def __call__(self, batch):
        """Make the instance usable directly as a ``collate_fn``."""
        return self.create_one_batch(batch)

    def create_one_batch(self, batch):
        """Tokenize a list of rows.

        Returns:
            Tuple ``(input_ids, token_type_ids, attention_mask)``, each already
            moved to ``device``.
        """
        texts = [self.get_formatted_input(row) for row in batch]
        # NOTE(review): the strings already contain cls/sep tokens and this call
        # does not pass add_special_tokens=False, so the tokenizer presumably
        # adds its own special tokens on top of them - confirm this is intended.
        encoded = self.tokenizer(
            texts,
            padding=True,
            return_tensors='pt',
            return_token_type_ids=True,
            truncation=True,
        )
        field_names = ('input_ids', 'token_type_ids', 'attention_mask')
        return tuple(getattr(encoded, name).to(device) for name in field_names)

    def get_formatted_input(self, item):
        """Render one row according to ``self.seq_type``.

        Raises:
            Exception: If ``self.seq_type`` is neither CODE nor DOC.
        """
        if self.seq_type == SeqType.CODE:
            formatter = self.get_formatted_input_for_code
        elif self.seq_type == SeqType.DOC:
            formatter = self.get_formatted_input_for_doc
        else:
            raise Exception("Incorrect sequence type")
        return formatter(item)

    def get_formatted_input_for_code(self, item):
        """Return ``<cls>docstring<sep>code<sep>`` for one row."""
        tok = self.tokenizer
        doc_text = ' '.join(item['docstring_tokens'])
        code_text = ' '.join(item['code_tokens'])
        pieces = (tok.cls_token, doc_text, tok.sep_token, code_text, tok.sep_token)
        return ''.join(pieces)

    def get_formatted_input_for_doc(self, item):
        """Return ``<cls>docstring<sep><sep>`` for one row (empty code part)."""
        tok = self.tokenizer
        doc_text = ' '.join(item['docstring_tokens'])
        pieces = (tok.cls_token, doc_text, tok.sep_token, '', tok.sep_token)
        return ''.join(pieces)


# One collator per input layout: docstring + code, and docstring only.
code_tokenize_collate_fn = TokenizeCollator(tokenizer, SeqType.CODE)
doc_tokenize_collate_fn = TokenizeCollator(tokenizer, SeqType.DOC)

# Number of examples per batch; shared by every DataLoader in this notebook.
BATCH_SIZE = 256
# Both loaders walk `test` in its original order (shuffle=False), so row i of
# the code embeddings and row i of the doc embeddings come from the same example.
test_code_tokens_ids = DataLoader(test, batch_size=BATCH_SIZE, shuffle=False, collate_fn=code_tokenize_collate_fn, num_workers=0)
test_doc_tokens_ids = DataLoader(test, batch_size=BATCH_SIZE, shuffle=False, collate_fn=doc_tokenize_collate_fn, num_workers=0)

# for idx, batch in enumerate(test_tokens_ids):
#   # print(batch.shape)
#   print(batch)
#   if idx >= 0:
#     break

In [None]:
# torch.cuda.empty_cache()
# Move the encoder to the device selected at the top of the notebook.
model.to(device)


def print_gpu_memory_usage(idx=''):
    """Print how much memory PyTorch currently holds on CUDA device 0.

    Args:
        idx: Label printed on the first line (for example a batch index) so
            that successive reports can be told apart. Defaults to ''.

    Prints:
        The label, a header, the memory occupied by tensors ("Allocated") and
        the memory held by the caching allocator ("Cached"), both in GB
        rounded to one decimal, followed by a blank line.
    """
    bytes_per_gb = 1024 ** 3
    print(idx)
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / bytes_per_gb, 1), 'GB')
    # torch.cuda.memory_cached() is deprecated and warns on every call;
    # memory_reserved() is its replacement and reports the same quantity.
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / bytes_per_gb, 1), 'GB')
    print()


if not EMBEDDINGS_PRECOMPUTED:
    batched_test_code_embs = []
    for batch in tqdm(test_code_tokens_ids):
        tokens_ids, token_type_ids, attention_mask = batch
        with torch.no_grad():
            embs = model(input_ids=tokens_ids, token_type_ids=token_type_ids, attention_mask=attention_mask).pooler_output
            batched_test_code_embs.append(embs)

    batched_test_doc_embs = []
    for batch in tqdm(test_doc_tokens_ids):
        tokens_ids, token_type_ids, attention_mask = batch
        with torch.no_grad():
            embs = model(input_ids=tokens_ids, token_type_ids=token_type_ids, attention_mask=attention_mask).pooler_output
            batched_test_doc_embs.append(embs)

    test_code_embeddings = torch.cat(batched_test_code_embs, dim=0)
    test_doc_embeddings = torch.cat(batched_test_doc_embs, dim=0)
    if EMBEDDINGS_ON_GOOGLE_DRIVE:
        %cd /content/drive/MyDrive/PML&DL/Project
    torch.save(test_code_embeddings, 'test_code_embeddings.pt')
    torch.save(test_doc_embeddings, 'test_doc_embeddings.pt')

In [None]:
# Load the embedding files instead of recomputing them with the model.
if EMBEDDINGS_PRECOMPUTED:
    if EMBEDDINGS_ON_GOOGLE_DRIVE:
        # The files are read relative to the working directory, so switch to
        # the Drive folder first.
        %cd /content/drive/MyDrive/PML&DL/Project
    # map_location puts the tensors on the CPU regardless of the device they
    # were saved from.
    test_code_embeddings = torch.load('test_code_embeddings.pt', map_location=torch.device('cpu'))
    test_doc_embeddings = torch.load('test_doc_embeddings.pt', map_location=torch.device('cpu'))

In [None]:
import numpy as np
import faiss

class FaissKNeighbors:
    """Nearest-neighbour lookup over a set of vectors with a FAISS flat L2 index.

    Args:
        is_cuda: When true, the index is moved to GPU 0 each time it is built.
    """

    def __init__(self, is_cuda):
        self.index = None
        self.is_cuda = is_cuda
        # Created on the first GPU fit and reused afterwards, so repeated fit()
        # calls do not allocate new GPU resources and the handle stays alive
        # for as long as the index that uses it.
        self._gpu_resources = None

    @staticmethod
    def _as_float32_matrix(X):
        """Convert a tensor or array-like into a C-contiguous float32 ndarray."""
        if isinstance(X, torch.Tensor):
            # Plain .numpy() raises for tensors that live on the GPU or track
            # gradients; detach().cpu() makes both cases work.
            X = X.detach().cpu().numpy()
        return np.ascontiguousarray(X, dtype=np.float32)

    def fit(self, X):
        """Build a fresh index containing the rows of `X`.

        Args:
            X: 2-D torch.Tensor or array-like of shape (n_vectors, dim).
        """
        X = self._as_float32_matrix(X)
        self.index = faiss.IndexFlatL2(X.shape[1])
        if self.is_cuda:
            if self._gpu_resources is None:
                self._gpu_resources = faiss.StandardGpuResources()
            self.index = faiss.index_cpu_to_gpu(self._gpu_resources, 0, self.index)
        self.index.add(X)

    def predict(self, X, k):
        """Return the indices of the `k` nearest indexed vectors for each row of `X`.

        Args:
            X: 2-D torch.Tensor or array-like of query vectors.
            k: Number of neighbours to return per query.

        Returns:
            The index array returned by the FAISS search, one row per query,
            nearest neighbour first.

        Raises:
            RuntimeError: If called before fit().
        """
        if self.index is None:
            raise RuntimeError("fit() must be called before predict()")
        _distances, indices = self.index.search(self._as_float32_matrix(X), k=k)
        return indices

In [None]:
# Use the GPU-backed index only when the notebook itself runs on CUDA.
test_faiss = FaissKNeighbors(is_cuda=device=='cuda')

In [None]:
# Retrieval is scored in independent chunks of `k` examples: inside a chunk,
# every docstring embedding is ranked against the `k` code embeddings of that
# same chunk.
k = 1000
mrrs = []
num_examples = len(test_code_embeddings)
for beg_idx in tqdm(range(0, num_examples, k)):
    end_idx = beg_idx + k
    # A trailing chunk with fewer than `k` examples is left out of the score.
    if end_idx > num_examples:
        break
    code_embs_subset = test_code_embeddings[beg_idx:end_idx]
    doc_embs_subset = test_doc_embeddings[beg_idx:end_idx]
    test_faiss.fit(code_embs_subset)
    preds = test_faiss.predict(doc_embs_subset, k=k)

    # Row i holds the value i in every column: the index (within the chunk) of
    # the code snippet that belongs to docstring i.
    targets = np.tile(np.arange(k).reshape(k, 1), (1, k))

    # The column at which row i of `preds` equals i is the 0-based rank of the
    # correct snippet for docstring i.
    hit_columns = np.equal(preds, targets).nonzero()[1]
    reciprocal_ranks = 1 / (hit_columns + 1)
    mrr_ = reciprocal_ranks.mean()
    mrrs.append(mrr_)

In [None]:
# Average the per-chunk scores into a single number and report it.
mrr = np.mean(mrrs)
print('Mean Reciprocal rank is: ', mrr)

# Delivery 3

In [None]:
# class FineTunedCodeBert:
#     def __init__(self, model, is_freeze_bert=True):
#         self.model = model

#         if is_freeze_bert:
#             for p in self.model.parameters():
#                 p.requires_grad = False
#             for p in self.model.pooler.parameters():
#                 p.requires_grad = True

#     def forward(self, X):
#         tokens_ids, token_type_ids, attention_mask = X
#         embs = self.model(input_ids=tokens_ids, token_type_ids=token_type_ids, attention_mask=attention_mask).pooler_output
#         return embs


# Fine-tune the pooler only: switch gradients off for every parameter of the
# model, then switch them back on for the pooler's parameters.
model.requires_grad_(False)
model.pooler.requires_grad_(True)

In [None]:
# Settings for the fine-tuning run.
learning_rate = 1e-5
epochs = 8
# Every model parameter is handed to Adam.
# NOTE(review): presumably only parameters with requires_grad=True are actually
# updated, since the others never receive a gradient - confirm.
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
# Mean squared error; the training loop applies it to a docstring-embedding
# batch and the matching code-embedding batch.
loss_fn = torch.nn.MSELoss()

In [None]:
def train(code_dataloader, doc_dataloader, epoch):
    """Run one epoch that pulls docstring embeddings towards their code embeddings.

    The two loaders are consumed in lockstep, so batch i of `code_dataloader`
    must describe the same examples as batch i of `doc_dataloader` (both built
    over the same data, in the same order).

    Args:
        code_dataloader: Iterable of (input_ids, token_type_ids,
            attention_mask) batches for the code inputs.
        doc_dataloader: Same, for the docstring-only inputs.
        epoch: Epoch number; used only in the progress log.

    Side effects:
        Updates the global `model` through the global `optimizer`, prints a
        progress line every 50 iterations, and writes the whole model to
        'codebert.pt' when the epoch finishes.
    """
    model.train()
    running_loss = 0

    # zip() has no length, so give tqdm the number of batches explicitly; fall
    # back to an open-ended bar for iterables that do not support len().
    try:
        total = min(len(code_dataloader), len(doc_dataloader))
    except TypeError:
        total = None
    paired_batches = zip(code_dataloader, doc_dataloader)

    for iteration, (code_tokens, doc_tokens) in enumerate(tqdm(paired_batches, total=total)):
        optimizer.zero_grad()
        tokens_ids, token_type_ids, attention_mask = code_tokens
        code_embs = model(input_ids=tokens_ids, token_type_ids=token_type_ids, attention_mask=attention_mask).pooler_output
        tokens_ids, token_type_ids, attention_mask = doc_tokens
        doc_embs = model(input_ids=tokens_ids, token_type_ids=token_type_ids, attention_mask=attention_mask).pooler_output
        loss = loss_fn(doc_embs, code_embs)
        # Keep the plain number for logging; formatting the loss object itself
        # would print its full tensor representation.
        loss_value = loss.item()
        running_loss += loss_value
        loss.backward()
        optimizer.step()

        if iteration % 50 == 0:
            # Mean loss over the iterations seen so far in this epoch.
            _loss = running_loss / (iteration + 1)
            print("epoch: {}\titeration: {}\tloss: {}\tthis iteration loss: {}".format(epoch, iteration, _loss, loss_value))

    torch.save(model, 'codebert.pt')

In [None]:
# Read the splits straight from `dataset`: by the time this cell runs, the name
# `train` has been rebound to the training function defined in the cell above,
# so passing `train` here would hand DataLoader a function instead of the
# training split.
# shuffle=False keeps each code loader and its doc loader aligned batch for
# batch, which the training loop relies on when it zips them together.
train_code_model_input = DataLoader(dataset['train'], batch_size=BATCH_SIZE, shuffle=False, collate_fn=code_tokenize_collate_fn, num_workers=0)
train_doc_model_input = DataLoader(dataset['train'], batch_size=BATCH_SIZE, shuffle=False, collate_fn=doc_tokenize_collate_fn, num_workers=0)
val_code_model_input = DataLoader(dataset['validation'], batch_size=BATCH_SIZE, shuffle=False, collate_fn=code_tokenize_collate_fn, num_workers=0)
val_doc_model_input = DataLoader(dataset['validation'], batch_size=BATCH_SIZE, shuffle=False, collate_fn=doc_tokenize_collate_fn, num_workers=0)


In [None]:
# Fine-tune for the configured number of epochs; every call to train() also
# re-saves the model to 'codebert.pt'.
for epoch in range(epochs):
    train(train_code_model_input, train_doc_model_input, epoch)

In [None]:
# Print the pooler's parameter tensors for manual inspection.
for p in model.pooler.parameters():
    print(p)