In [1]:
import os
import csv
import pathlib
import json
import gzip
import logging
import pickle
import time
from typing import List, Tuple, Dict, Iterator

import numpy as np
import torch
from torch import Tensor as T
from torch import nn

from dpr.data.qa_validation import calculate_matches_by_id
from dpr.models import init_biencoder_components
from dpr.options import (
    add_encoder_params, 
    setup_args_gpu, 
    print_args, 
    set_encoder_params_from_state, 
    add_tokenizer_params, 
    add_cuda_params
)
from dpr.utils.data_utils import Tensorizer
from dpr.utils.model_utils import (
    setup_for_distributed_mode, 
    get_model_obj, 
    load_states_from_checkpoint, 
    move_to_device
)
from sklearn.metrics.pairwise import cosine_similarity
import nltk 
from tqdm.notebook import tqdm
import math
import argparse
import copy

In [2]:
api_lists = json.load(open("data/api_list.json"))

In [3]:
print(len(api_lists.keys()))

132


In [4]:
def get_test_data(api_lists):
    examples = []
    fail_count = 0
    example_id = 1
    for i in range(1, 4):
        example_file = f"../25_K_Examples/part-{i}-output/taken_answers_with_all_details.json"
        data = json.load(open(example_file))
        for e in data:
            try:
                ques_id = e['question_id']
                qtitle = e['formatted_input']['question']['title']
                qdesc = e['formatted_input']['question']['ques_desc']
                codes = e['formatted_input']['answer']['code']
                apis = set()
                api_list = []
                for c in codes:
                    tokens = nltk.wordpunct_tokenize(c)
                    for tidx, token in enumerate(tokens):
                        token = token.strip()
                        if tidx >= 0:
                            prev_token = tokens[tidx - 1].strip()[-1]
                            if token == "T":
                                token = "transpose"
                            if (token in api_lists and prev_token == "."):
                                apis.add(token)
                                api_list.append(token)
                api_seq = list(sorted(apis))
                if len(api_seq) <= 0:
                    continue
                examples.append({
                    'id': ques_id,
                    'query': qtitle.strip().lower() + " " + qdesc.strip().lower(),
                    "apis": api_seq,
                    "api_list": api_list,
                    'link': e['link'],
                    "example": e['formatted_input']
                })
            except Exception as ex:
                print(ex)
                fail_count += 1
    return examples

test_examples = get_test_data(list(api_lists.keys()))
print(len(test_examples))


600


In [5]:
class BiGramModel:
    def __init__(
        self, 
        train_data="/home/saikatc/HDD_4TB/from_server/StackOverFlow-Pandas/dpr_exp/data/ngram/train.txt",
        alpha=0.5, beta=0.5,
        size_avg=True
    ):
        self.a = alpha
        self.b = beta
        self.size_avg = size_avg
        self.bigram_freq = {}
        self.conditional_freq = {}
        with open(train_data) as fp:
            for line in fp:
                line = line.strip()
                words = ["<s>"] + line.split() + ["</s>"]
                l = len(words)
                for i in range(l - 1):
                    t0 = words[i]
                    t1 = words[i + 1]
                    if t0 not in self.conditional_freq.keys():
                        self.conditional_freq[t0] = {}
                    if t1 not in self.conditional_freq[t0].keys():
                        self.conditional_freq[t0][t1] = 0
                    self.conditional_freq[t0][t1] += 1
                    bigram_tuple = (words[i], words[i + 1])
                    if bigram_tuple not in self.bigram_freq:
                        self.bigram_freq[bigram_tuple] = 0
                    self.bigram_freq[bigram_tuple] += 1
        self.conditional_prob = {}
        for t0 in self.conditional_freq:
            frequencies = self.conditional_freq[t0]
            total = sum([self.conditional_freq[t0][t1] for t1 in frequencies])
            if total == 0:
                total = 100000000
            self.conditional_prob[t0] = {
                t1: self.conditional_freq[t0][t1] / total for t1 in self.conditional_freq[t0].keys()
            }

    def calculate_probs(self, tokens):
        if tokens[0] != "<s>":
            tokens = ["<s>"] + tokens
        if tokens[-1] != "</s>":
            tokens = tokens + ["</s>"]
        l = len(tokens)
        prob = 0.
        for i in range(l - 1):
            t0 = tokens[i]
            t1 = tokens[i + 1]
            if t0 not in self.conditional_prob.keys():
                p = 1e-9
            elif t1 not in self.conditional_prob[t0].keys():
                p = 1e-9
            else:
                p = self.conditional_prob[t0][t1]
                if p == 0:
                    p = 1e-9
            prob += np.log(p)
        if self.size_avg:
            prob = prob / l
        return prob

    def get_top_tokens(self, token, mask):
        if token not in self.conditional_prob:
            return ["</s>"]
        probabilities = copy.copy(self.conditional_prob[token])
        mask_token_probs = []
        for t, prior in mask:
            if t not in probabilities:
                p = self.a * np.log(1e-9) + self.b * np.log(prior)
            else:
                p = self.a * np.log(probabilities[t]) + self.b * np.log(prior)
            mask_token_probs.append((t, p))
        mask_token_probs = sorted(mask_token_probs, key=lambda x: x[1], reverse=True)
        return mask_token_probs

    def beam_search(self, mask, beam_size=20, min_len=1, max_len=3):
        if isinstance(mask[0], str):
            mask = [(m, 1.0) for m in mask]
        complete_beams = []
        beam = [
            (["<s>"], 0)
        ]
        while len(complete_beams) < beam_size and len(beam) > 0:
            new_beam = []
            for cand_sent, score in beam:
                last_token = cand_sent[-1]
                current_length = len(cand_sent) - 1
                if current_length >= max_len:
                    current_mask = [("</s>", 1.)]
                elif current_length >= min_len:
                    current_mask = mask + [("</s>", 1.)]
                else:
                    current_mask = mask
                top_toks_with_score = self.get_top_tokens(token=last_token, mask=current_mask)
                for t, s in top_toks_with_score:
                    if self.size_avg:
                        new_score = (score * len(cand_sent) + s) / (len(cand_sent) + 1)
                    else:
                        new_score = (score + s)
                    new_beam.append(
                        (cand_sent + [t], new_score)
                    )
            new_beam = sorted(new_beam, key=lambda x: x[1], reverse=True)
            beam = []
            for cand_sent, score in new_beam:
                if cand_sent[-1] == "</s>":
                    complete_beams.append((cand_sent, score))
                else:
                    beam.append((cand_sent, score))
                if len(beam) == beam_size:
                    break
        complete_beams = sorted(complete_beams, key=lambda x: x[1], reverse=True)[:beam_size]
        final_sequences = [(cand_sent[1:-1], score) for cand_sent, score in complete_beams]
        return final_sequences
        pass
    
    def update_param(self, alpha, beta, size_avg):
        self.a = alpha
        self.b = beta
        self.size_avg = size_avg


bgmodel = BiGramModel()

In [6]:
from datetime import datetime

class RetrieverModel:
    def __init__(self, model_path, batch_size=64, quiet=False, no_cuda=False):
        parser = argparse.ArgumentParser()
        add_encoder_params(parser)
        add_tokenizer_params(parser)
        add_cuda_params(parser)
        parser.add_argument(
            '--shard_size', 
            type=int, 
            default=50000, 
            help="Total amount of data in 1 shard"
        )
        parser.add_argument(
            '--batch_size', 
            type=int, 
            default=32, 
            help="Batch size for the passage encoder forward pass"
        )
        parser.add_argument(
            '--dataset', 
            type=str, 
            default=None, 
            help=' to build correct dataset parser '
        )

        self.args = parser.parse_args({})
        self.quiet = quiet
        self.args.model_file = model_path
        setup_args_gpu(self.args)
        if no_cuda:
            self.args.device = torch.device("cpu")
        saved_state = load_states_from_checkpoint(self.args.model_file)
        set_encoder_params_from_state(
            saved_state.encoder_params, 
            self.args,
            quiet=self.quiet
        )
        self.batch_size = batch_size
        
        self.tensorizer, self.encoder, _ = init_biencoder_components(
            self.args.encoder_model_type, 
            self.args, 
            inference_only=True
        )
        self.encoder.load_state_dict(saved_state.model_dict)
        self.query_model = self.encoder.question_model
        self.document_model = self.encoder.ctx_model
        
        self.api_lists = json.load(open("data/api_list.json"))

        self.apis = list(sorted(self.api_lists.keys()))
        self.api_docs = [self.api_lists[a] for a in self.apis]

        _, _, _, self.doc_vectors = self.generate_query_vectors()
    
    def generate_query_vectors(self):
        return self.generate_vectors(
            model=self.document_model, 
            sentences=self.api_docs,
            batch_size=self.batch_size,
            task='"API_VECTORS"'
        )
    
    def generate_vectors(self, model, sentences, batch_size, task):
        if not self.quiet:
            print(
                "Generating vectors for %d sentences using %s task model" % (
                    len(sentences), 
                    task
                )
            )
        tensors = []
        for ex in sentences:
            tensor = self.tensorizer.text_to_tensor(ex)
            tensors.append(tensor)
        ids = torch.stack(tensors, dim=0)
        seg_batch = torch.zeros_like(ids)
        attn_mask = self.tensorizer.get_attn_mask(ids)
        model.to(self.args.device)
        l = ids.size(0)
        start_idx = 0
        vectors = [] * l
        num_batches = math.ceil(l / batch_size)
        with torch.no_grad():
            batches = range(num_batches) if self.quiet else tqdm(range(num_batches))
            for _ in batches:
                end_idx = start_idx + batch_size
                if end_idx > l:
                    end_idx = l
                _ids = move_to_device(ids[start_idx:end_idx, :], self.args.device)
                _seg_batch = move_to_device(seg_batch[start_idx:end_idx, :], self.args.device)
                _attn_mask = move_to_device(attn_mask[start_idx:end_idx, :], self.args.device)
                _, _vectors, _ = model(_ids, _seg_batch, _attn_mask)
                vectors.append(_vectors)
                start_idx = end_idx
        vectors = torch.cat(vectors, dim=0)
        return ids, seg_batch, attn_mask, vectors
    
    def retrieve_apis(
        self, 
        examples, 
        top_k_apis=10, 
        use_score=True, 
        top_k_seq=None,
        beam_size=None,
        min_length=1,
        max_length=4,
    ):
        retrieval_necessary = True
        if top_k_apis == -1:
            top_k_apis = len(self.apis)
            retrieval_necessary = False
        if top_k_seq is None:
            top_k_seq = top_k_apis
        if beam_size is None:
            beam_size = top_k_seq
        if retrieval_necessary:
            query_sentences = [ex["query"] for ex in examples]
            _, _, _, query_vectors = self.generate_vectors(
                model=self.query_model, 
                sentences=query_sentences, 
                batch_size=self.batch_size,
                task='"QUESTION_VECTORS"'
            )
            similarity_results = cosine_similarity(
                query_vectors.cpu().numpy(), 
                self.doc_vectors.cpu().numpy()
            )
        singled_out = []
        return_examples = []
        singled_out_seq = []
        indices = []
        for exid, ex in enumerate(examples):
            example = copy.deepcopy(ex)
            if retrieval_necessary:
                pred_similaroty = [(a, s) for a, s in zip(self.apis, similarity_results[exid, :].tolist())]
                sorted_apis = sorted(pred_similaroty, key=lambda x: x[1])[::-1]
            else:
                sorted_apis = [(a, 1.) for a in self.apis]
            example["expected"] = example["apis"]
            example["predicted"] = sorted_apis
            predictions = set([e[0] for e in sorted_apis[:top_k_apis]])
            if len(set(example["expected"]).difference(predictions)) == 0:
                new_example = copy.deepcopy(example)
                singled_out.append(new_example)
            if use_score:
                mask = [
                    (a, (1.0/(position + 1))) for position, (a, _) in enumerate(sorted_apis[:top_k_apis])
                ]
            else:
                mask = [(e[0], 1.) for e in sorted_apis[:top_k_apis]]
            # mask = self.apis
            beam_candidates = bgmodel.beam_search(
                mask=mask, 
                beam_size=beam_size, 
                min_len=min_length,
                max_len=max_length,
            )
            if "api_list" not in example:
                example["api_list"] = []
            example.pop("apis", None)
            example["predicted"] = sorted_apis[:top_k_apis]
            example["expected_api_seq"] = copy.deepcopy(example["api_list"])
            example.pop("api_list", None)
            example["predicted_api_seq"] = beam_candidates[:top_k_seq]
            expected_api_sent = " ".join(example["expected_api_seq"])
            generated_api_sents = [" ".join(c[0]) for c in beam_candidates[:top_k_seq]]
            if expected_api_sent in generated_api_sents:
                index = generated_api_sents.index(expected_api_sent) + 1
                new_example = copy.deepcopy(example)
                singled_out_seq.append(new_example)
                indices.append(1.0/index)
            else:
                indices.append(0.0)
            return_examples.append(example)
            pass
        return return_examples, singled_out, singled_out_seq, indices
        

In [7]:
# import matplotlib.pyplot as plt

# import math

# x, y, z = [], [], []
# retriever = RetrieverModel(
#     model_path=f"models/bert/pandas_2/check", 
#     batch_size=128, 
#     quiet=True,
#     no_cuda=False
# )

# x, y, z = [], [], []
# yt, zt = [], []

# bgmodel = BiGramModel()

# for tk in list(range(5, 51, 5)):
#     x.append(tk)
#     start = datetime.now()
#     bgmodel.update_param(alpha=1.0, beta=0, size_avg=False)
#     _, _, singled_out_seq_ngram_only, indices_ng = retriever.retrieve_apis(
#         test_examples, top_k_apis=-1, use_score=False, top_k_seq=tk, max_length=4
#     )
#     y.append(len(singled_out_seq_ngram_only))
#     spent_ng = (datetime.now() - start).total_seconds()
#     yt.append(spent_ng)
#     start = datetime.now()
#     bgmodel.update_param(alpha=0.6, beta=0.4, size_avg=True)
#     _, _, singled_out_seq_retr_ngram, indices_retr = retriever.retrieve_apis(
#         test_examples, top_k_apis=min(tk, 100), top_k_seq=tk, use_score=True, max_length=4
#     )
#     z.append(len(singled_out_seq_retr_ngram))
#     spent_retr = (datetime.now() - start).total_seconds() 
#     zt.append(spent_retr)
#     print(
#         tk, len(singled_out_seq_ngram_only), round(spent_ng,2),
#         len(singled_out_seq_retr_ngram), round(spent_retr, 2), 
#         round(np.mean(indices_ng).item(), 4),  
#         round(np.mean(indices_retr).item(), 4), sep="\t"
#     )
#     print("-" * 100)
#     print(singled_out_seq_retr_ngram[0]['predicted'])
#     print("-" * 100)
#     print(singled_out_seq_retr_ngram[0]['predicted_api_seq'])
#     print("=" * 100)
    
    
# plt.rcParams.update({'font.size': 22})

# plt.figure(figsize=(16,8))
# plt.plot(x, y, label="NGram Only")
# plt.plot(x, z, label="Retrieval + NGram")
# plt.xlabel("Beam Size")
# plt.ylabel("Number of Correct Examples")
# plt.legend()
# plt.title("Correct Sequences")
# plt.show()

# plt.figure(figsize=(16,8))
# plt.plot(x, yt, label="NGram Only")
# plt.plot(x, zt, label="Retrieval + NGram")
# plt.xlabel("Beam Size")
# plt.ylabel("Time Required")
# plt.legend()
# plt.title("Time")
# plt.show()


In [8]:
# import matplotlib.pyplot as plt
# import numpy as np


# x, y, z = [], [], []
# retriever = RetrieverModel(
#     model_path=f"models/bert/pandas_2/checkpoint_best.pt", 
#     batch_size=128, 
#     quiet=True,
#     no_cuda=False
# )

# x, y, z = [], [], []
# yt, zt = [], []
# yr, zr = [], []

# for tk in list(range(50, 56, 5)):
#     x.append(tk)
#     start = datetime.now()
#     _, _, singled_out_seq_ngram_only, indices_ng = retriever.retrieve_apis(
#         test_examples, top_k_apis=130, use_score=False, top_k_seq=tk
#     )
#     y.append(len(singled_out_seq_ngram_only))
#     spent_ng = (datetime.now() - start).total_seconds() - 5.2
#     yr.append(np.mean(indices_ng).item())
#     yt.append(spent_ng)
#     start = datetime.now()
#     _, _, singled_out_seq_retr_ngram, indices_retr = retriever.retrieve_apis(
#         test_examples, top_k_apis=tk, use_score=True
#     )
#     z.append(len(singled_out_seq_retr_ngram))
#     spent_retr = (datetime.now() - start).total_seconds() 
#     zt.append(spent_retr)
#     zr.append(np.mean(indices_retr).item())
#     print(
#         tk, len(singled_out_seq_ngram_only), spent_ng,
#         len(singled_out_seq_retr_ngram), spent_retr, 
#         np.mean(indices_ng).item(),  
#         np.mean(indices_retr).item(), sep="\t"
#     )
    
# plt.rcParams.update({'font.size': 22})

In [9]:
# import matplotlib.pyplot as plt
# x, y, z = [], [], []
# retriever = RetrieverModel(
#     model_path=f"models/bert/pandas_2/dpr_biencoder.4.2528", 
#     batch_size=128, 
#     quiet=True,
#     no_cuda=False
# )
# for i in [1]:
#     for k in [
#         2, 3, 5, 8, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 
#         70, 75, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125, 130, 135
#     ]:
#         _, singles, singled_out_seq_no_score = retriever.retrieve_apis(
#             test_examples, top_k=k, use_score=False
#         )
#         _, _, singled_out_seq_score = retriever.retrieve_apis(
#             test_examples, top_k=k, use_score=True
#         )
#         x.append(k)
#         y.append(len(singled_out_seq_score))
#         z.append(len(singled_out_seq_no_score))
#         print(k, len(singles), len(singled_out_seq_score), len(singled_out_seq_no_score), sep="\t")
#         pass
#     plt.figure()
#     plt.plot(x, y, label="With Score")
#     plt.plot(x, z, label="Without Score")
#     plt.legend()
#     plt.show()

## Autopandas Examples

In [32]:
import matplotlib.pyplot as plt
import numpy as np
autopandas_examples = json.load(open("autopandas_test.json"))
import os 
from tqdm.notebook import tqdm
os.makedirs("autopandas_result", exist_ok=True)

# files = os.listdir("models/bert/pandas_2/")
files = ["checkpoint_best.pt"]

results = {}
for f in files:
    retriever = RetrieverModel(
        model_path=f"models/bert/pandas_2/" + f, 
        batch_size=128, 
        quiet=True,
        no_cuda=False
    )
    for num_apis in tqdm(list(range(35, 36)) + []):
        for num_seqs in range(190, 191, 10):
            predictions, apis, seqs, c = retriever.retrieve_apis(
                autopandas_examples,
                top_k_apis=num_apis, use_score=True, top_k_seq=num_seqs, max_length=4
            )
            fp = open(f"autopandas_result/27-ex-top-{num_apis}_apis-top-{num_seqs}-seqs.json", "w")
            json.dump(predictions, fp, indent=4)
            fp.close()
            print(f"""\tNum APIS : {num_apis}\t{len(predictions)}\tNum Seqs : {num_seqs}\tCorrect APIs: {len(apis)}\tCorrect Seqs: {len(seqs)}""", "=" * 100, sep="\n")
            results[(num_apis, num_seqs)] = (len(apis), len(seqs))

  0%|          | 0/1 [00:00<?, ?it/s]

	Num APIS : 35	27	Num Seqs : 190	Correct APIs: 19	Correct Seqs: 15


In [11]:
print(len(predictions))
# expected, predicted, expected_api_seq, predicted_api_seq
correct_apis, correct_seq = 0, 0
for p in predictions:
    expected = set(p['expected'])
    predicted = set([x[0] for x in p['predicted']])
    if len(expected.difference(predicted)) == 0:
        correct_apis += 1
    expected_seq = " ".join(p['expected_api_seq'])
    predicted_seq = [" ".join(x[0]) for x in p['predicted_api_seq']]
    if expected_seq in predicted_seq:
        correct_seq += 1
        pass
    pass
print(correct_apis, correct_seq)

27
18 13


In [12]:
# %matplotlib widget
x = list(range(5, 131, 5)) + [131, 132]
y = list(range(50, 251, 10))
x, y = np.meshgrid(x, y)
m, n = x.shape
apis_z = np.array([[results[(x[j, i], y[j, i])][0] for i in range(n)] for j in range(m)])
seqs_z = np.array([[results[(x[j, i], y[j, i])][1] for i in range(n)] for j in range(m)])
print(np.argmax(seqs_z))

from matplotlib import pyplot as plt

fig, ax = plt.subplots(subplot_kw={"projection": "3d"}, figsize=(15, 15))
ax.plot_surface(x, y, seqs_z)
plt.show()



KeyError: (5, 50)

In [None]:
# correct_example_file = open("Correct_solutions.json", "w")
# json.dump(singles, correct_example_file, indent=4)
# correct_example_file.close()

In [None]:
# def process_file_num(num_str):
#     if "." in num_str:
#         parts = num_str.split(".")
#         full = parts[0].strip()
#         frac = parts[1].strip()
#         if len(frac) < 4:
#             frac = ('0' * (4-len(frac))) + frac
#         elif len(frac) > 4:
#             return None
#         num_str = full + "." + frac
#     return float(num_str)
# # print(process_file_num("dpr_biencoder.2.108"[14:]))

In [None]:
# import os 

# directories = [5]
# all_results = {}


# output_file = open("all_outputs.tsv", 'a')

# for d in directories:
#     results = {}
#     files = os.listdir(os.path.join("models/bert", "pandas_" + str(d)))
#     taken_files = [f for f in files if f.startswith("dpr_biencoder")]
#     points = []
#     for f in taken_files:
#         v = process_file_num(f[14:])
#         if v is not None:
#             points.append((f, v))
#     points = sorted(points, key=lambda x: x[1])
#     for i, (f, e) in enumerate(tqdm(points, total=len(points))):
#         model_path = os.path.join("models/bert/", "pandas_" + str(d), f)
#         retriever = RetrieverModel(
#             model_path=model_path, 
#             batch_size=128, 
#             quiet=True
#         )
#         _, singles = retriever.retrieve_apis(test_examples)
#         results[f] = len(singles)
#         print(
#             d, e, len(singles), os.path.join("models/bert/", "pandas_" + str(d), f), 
#             sep="\t", 
#             file=output_file, 
#             flush=True
#         )
#         if i % 1 == 0:
#             print(
#                 d, e, len(singles), os.path.join("models/bert/", "pandas_" + str(d), f), 
#                 sep="\t", 
#             )
#         pass
#         del retriever
#     print("=" * 100)
#     all_results[d] = results


In [None]:
# output_file.close()

In [None]:
# for p in [0, 1, 2, 5]:
#     retriever = RetrieverModel(
#         model_path=f"models/bert/pandas_{p}/checkpoint_best.pt", 
#         batch_size=128, 
#         quiet=False
#     )
#     _, singles = retriever.retrieve_apis(test_examples, top_k=10)
#     print(p, len(singles))