In [None]:
# Add ../codesearchnet to the module search path so modules located there can be imported
import sys
sys.path.append("../codesearchnet")


# Restrict CUDA to device 0. This is set before TensorFlow is imported below.
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"

import tensorflow as tf
# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
# Iterating over the device list (rather than indexing [0]) keeps this cell from
# raising IndexError on a machine where no GPU is visible.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)

In [None]:
import os.path as osp


# import torch
# import torch.nn.functional as F
# from torch_geometric.datasets import Planetoid
# import torch_geometric.transforms as T
# from torch_geometric.nn import GCNConv, GAE, VGAE
# from torch_geometric.utils import train_test_split_edges


import swifter
import fasttext as ft
import numpy as np
import networkx as nx
from livelossplot import PlotLosses
from livelossplot.outputs import TensorboardTFLogger
from matplotlib import pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, Lambda, LSTM, Embedding, Bidirectional
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras import backend as K
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization


from code_parser import *
from data_reader import get_data_df
from siamese_model_keras import *
from keras_preprocessing_helper import *

In [None]:
# Experiment configuration used by the cells below.

# Per-token embedding size for code and query inputs
embeddings_dim_c = 256
embeddings_dim_q = 256

# Fixed sequence lengths for code and query inputs
max_len_code = 48
max_len_query = 28

# Training schedule
batch_size, epochs = 32, 10

# Used as the TensorBoard run id and as the checkpoint file name
exp_name = "try"

In [None]:
# Load the fastText models used for the query side and the code side
query_ft = ft.load_model("../resources/python_processed/query_ft.bin")
code_ft = ft.load_model("../resources/python_processed/code_no_ast.bin")

In [None]:
# One generator per data split, built in train -> valid -> test order.
# Each call gets its own fresh ["python"] / [split] lists.
train_gen, valid_gen, test_gen = (
    get_generator("../resources/data/", ["python"], [split_name],
                  max_len_query, max_len_code, query_ft, code_ft)
    for split_name in ("train", "valid", "test")
)

In [None]:
def _make_pair_dataset(generator):
    """Wrap a generator in a tf.data.Dataset yielding (query, code) float32 pairs.

    The element shapes are fixed from the notebook configuration:
    query -> [max_len_query, embeddings_dim_q], code -> [max_len_code, embeddings_dim_c].

    Args:
        generator: object accepted by `tf.data.Dataset.from_generator`
            (here, one of the values returned by `get_generator`).

    Returns:
        An unbatched `tf.data.Dataset` of (query, code) tensor pairs.
    """
    return tf.data.Dataset.from_generator(
        generator,
        (tf.float32, tf.float32),
        (tf.TensorShape([max_len_query, embeddings_dim_q]),
         tf.TensorShape([max_len_code, embeddings_dim_c])))


# The three splits share one element spec, so they go through the same helper
train_ds = _make_pair_dataset(train_gen)
valid_ds = _make_pair_dataset(valid_gen)
test_ds = _make_pair_dataset(test_gen)

In [None]:
# Batch every split and prefetch. prefetch() is applied after batch(), so the
# buffer size below is counted in batches.
# NOTE: this cell rebinds the *_ds names; re-running it without re-running the
# previous cell would batch the already-batched datasets again.
prefetch_buffer = batch_size*2

train_ds = train_ds.batch(batch_size)
train_ds = train_ds.prefetch(prefetch_buffer)

valid_ds = valid_ds.batch(batch_size)
valid_ds = valid_ds.prefetch(prefetch_buffer)

test_ds = test_ds.batch(batch_size)
test_ds = test_ds.prefetch(prefetch_buffer)

In [None]:
# Build the model with the configured sequence lengths and embedding sizes.
# get_model_lstm is not defined in this notebook; presumably it comes from one of
# the star imports above (siamese_model_keras?) - verify.
model = get_model_lstm(max_len_query, max_len_code, embeddings_dim_q, embeddings_dim_c)

In [None]:
# Adam optimizer; no arguments are passed, so the library defaults are used
optimizer = tf.optimizers.Adam()

In [None]:
# Custom training loop.
# Each epoch: one pass over train_ds with gradient updates, one pass over
# valid_ds without updates, then the epoch means are pushed to livelossplot /
# TensorBoard. The model is checkpointed whenever the mean validation loss improves.
liveloss = PlotLosses(outputs=[TensorboardTFLogger("./exp/tb/", run_id=exp_name)])
logs = {}
# Start from +inf so the first epoch always counts as an improvement
best_val_loss = float("inf")

# Make sure the checkpoint directory exists before the first model.save() call
os.makedirs("exp", exist_ok=True)

for epoch in range(epochs):

    # TRAINING
    losses = []
    mrrs = []
    for x in train_ds:

        # Record the forward pass so the loss can be differentiated
        with tf.GradientTape() as tape:
            # NOTE(review): the model is called without training=True; if it
            # contains layers that behave differently in training (e.g. Dropout),
            # confirm this is intended.
            logits = model(x)
            loss_value = softmax_loss(None, logits)

        mrr_value = mrr(None, logits)

        # Compute gradients and apply them to the same variable list
        gradients = tape.gradient(loss_value, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

        losses.append(loss_value)
        mrrs.append(mrr_value)

        print(f"Epoch: {epoch}; Loss: {loss_value}; MRR: {mrr_value} <- Train", end="\r")

    logs['loss'] = np.mean(losses)
    logs['mrr'] = np.mean(mrrs)


    # VALIDATION
    losses = []
    mrrs = []
    for x in valid_ds:

        logits = model(x)

        loss_value = softmax_loss(None, logits)
        mrr_value = mrr(None, logits)

        losses.append(loss_value)
        mrrs.append(mrr_value)

        # Labelled "Valid": this pass runs over valid_ds, not the test split
        print(f"Epoch: {epoch}; Loss: {loss_value}; MRR: {mrr_value} <- Valid", end="\r")

    logs['val_loss'] = np.mean(losses)
    logs['val_mrr'] = np.mean(mrrs)

    # Keep only the best checkpoint, judged by mean validation loss
    if logs['val_loss'] < best_val_loss:
        best_val_loss = logs['val_loss']
        model.save(f"exp/{exp_name}.h5")

    liveloss.update(logs)
    liveloss.send()

In [None]:
# Reload the checkpoint written by the training loop (same exp/<exp_name>.h5 path)
model = tf.keras.models.load_model(f"exp/{exp_name}.h5")

In [None]:
    # Evaluate the reloaded model on the batched test split.
    # Uses test_ds and softmax_loss, the dataset and loss the rest of this notebook
    # works with, so the cell runs on a fresh kernel and reports the same loss
    # that was optimised during training.
    losses = []
    mrrs = []
    for x in test_ds:

        logits = model(x)

        loss_value = softmax_loss(None, logits)
        mrr_value = mrr(None, logits)

        losses.append(loss_value)
        mrrs.append(mrr_value)

        print(f"Epoch: test; Loss: {loss_value}; MRR: {mrr_value} <- Test", end="\r")

In [None]:
# Mean of the per-batch MRR values collected by the preceding evaluation loop
np.mean(mrrs)

## Predict/Demo

In [None]:
from annoy import AnnoyIndex
import random


In [None]:
# Load the saved checkpoint again so the demo section can start from this cell
model = tf.keras.models.load_model(f"exp/{exp_name}.h5")

In [None]:
# Print the layer list; the layer names shown here are the ones looked up by name in the next cell
model.summary()

In [None]:
# Split the loaded model into two stand-alone encoders, one per input branch.
# Layers are looked up by name, so these names must match model.summary().
query_encoder = tf.keras.models.Model(
    inputs=model.get_layer('input_1').input,
    outputs=model.get_layer('lstm').output,
)
code_encoder = tf.keras.models.Model(
    inputs=model.get_layer('input_2').input,
    outputs=model.get_layer('lstm_1').output,
)

In [None]:
# Encode query/code pairs from the test set into feature vectors.
# test_ds is already batched, so take(1000) yields up to 1000 batches; extend()
# iterates each encoder output along its first axis and appends one vector per row.
querys, codes = [], []

for query_batch, code_batch in test_ds.take(1000):
    querys.extend(query_encoder(query_batch))
    codes.extend(code_encoder(code_batch))

In [None]:
# Create an Annoy index over the code vectors for fast nearest-neighbour lookup.
# Item ids are the positions in `codes`.
# NOTE(review): 256 is hard-coded and must equal the code encoder's output size - confirm.
t = AnnoyIndex(256, 'angular')
for item_id, code_vector in enumerate(codes):
    t.add_item(item_id, code_vector)

In [None]:
# build the tree
#
# The forest is built in memory with 100 trees. Annoy's on_disk_build() has to be
# called before any add_item() call; the items were already added in the previous
# cell, so calling it here would be out of order and it is not used. The built
# index is written to disk by the explicit save() in the next cell.
t.build(100)

In [None]:
# Save the built index to disk; the loading snippet further down reads this same path
t.save('exp/code_no_ast_embedding_screath_try.annoyme')

In [None]:
# Spot check: query the index with an indexed code vector and show its 2 nearest item ids.
# Presumably item 2155 itself should come back first - verify.
# Requires len(codes) > 2155.
result = t.get_nns_by_vector(codes[2155], n=2, include_distances=False)
result

In [None]:
# Count the queries whose own position shows up among the 10 nearest code items.
# querys[i] and codes[i] come from the same pair, and item id i was assigned to codes[i].
correct = sum(
    1
    for query_idx, query_vector in enumerate(querys)
    if query_idx in t.get_nns_by_vector(query_vector, n=10, include_distances=False)
)

In [None]:
# Fraction of queries whose matching code item was among the 10 retrieved neighbours
correct/len(querys)

In [None]:
## Snippet for loading a previously saved index from disk.
## The dimension and metric passed to AnnoyIndex are the same values used when the index was created above.
from annoy import AnnoyIndex
t = AnnoyIndex(256, 'angular')
t.load('exp/code_no_ast_embedding_screath_try.annoyme')

In [None]:
import pickle

# Load the pickled definitions file into `definations`.
# NOTE(review): pickle.load can execute arbitrary code while loading - only use
# this with a file obtained from a trusted source.
with open("../resources/data/python_dedupe_definitions_v2.pkl", "rb") as pkl_file:
    definations = pickle.load(pkl_file)