In [1]:
import os
import csv
import pathlib
import json
import gzip
import logging
import pickle
import time
from typing import List, Tuple, Dict, Iterator

import numpy as np
import torch
from torch import Tensor as T
from torch import nn

from dpr.data.qa_validation import calculate_matches_by_id
from dpr.models import init_biencoder_components
from dpr.options import (
    add_encoder_params, 
    setup_args_gpu, 
    print_args, 
    set_encoder_params_from_state, 
    add_tokenizer_params, 
    add_cuda_params
)
from dpr.utils.data_utils import Tensorizer
from dpr.utils.model_utils import (
    setup_for_distributed_mode, 
    get_model_obj, 
    load_states_from_checkpoint, 
    move_to_device
)

In [2]:
import json
import nltk 

# Load the API-name -> one-line description mapping. The context manager
# closes the file handle as soon as parsing finishes instead of leaking it.
with open("data/api_list.json") as api_list_file:
    api_lists = json.load(api_list_file)

def get_test_data(api_lists, example_files=None):
    """Build retrieval test examples from the JSON answer dumps.

    For every record, the answer's code snippets are tokenized with
    ``nltk.wordpunct_tokenize``. A token counts as a used API when it is a
    known API name whose preceding token ends with ``"."`` (i.e. it looks like
    a method/attribute access), or when it is the literal ``DataFrame``.
    Records in which no API is found are dropped.

    Args:
        api_lists: Iterable of known API names.
        example_files: Optional iterable of JSON file paths to read. Defaults
            to the three ``../25_K_Examples/part-{1..3}-output`` dumps.

    Returns:
        A list of dicts with keys ``id``, ``query`` (lower-cased title and
        description joined by a space), ``apis`` (sorted list of API names),
        ``link`` and ``example`` (the record's ``formatted_input``).

    Notes:
        Records that raise while being processed (for example because a key
        is missing) are printed and skipped; a summary line is printed when
        at least one record was skipped. Errors opening or parsing a file
        are not caught.
    """
    if example_files is None:
        example_files = [
            f"../25_K_Examples/part-{i}-output/taken_answers_with_all_details.json"
            for i in range(1, 4)
        ]
    # Set membership is constant-time; the caller passes a list of names.
    known_apis = set(api_lists)
    examples = []
    fail_count = 0
    for example_file in example_files:
        with open(example_file) as fp:
            data = json.load(fp)
        for e in data:
            try:
                ques_id = e['question_id']
                question = e['formatted_input']['question']
                qtitle = question['title']
                qdesc = question['ques_desc']
                codes = e['formatted_input']['answer']['code']
                apis = set()
                for c in codes:
                    tokens = [t.strip() for t in nltk.wordpunct_tokenize(c)]
                    for tidx, token in enumerate(tokens):
                        # The first token has no predecessor. Indexing
                        # tokens[tidx - 1] there would wrap around to the
                        # LAST token and could admit a spurious API.
                        after_dot = tidx > 0 and tokens[tidx - 1].endswith(".")
                        if (token in known_apis and after_dot) or token == "DataFrame":
                            apis.add(token)
                if not apis:
                    continue
                examples.append({
                    'id': ques_id,
                    'query': qtitle.strip().lower() + " " + qdesc.strip().lower(),
                    "apis": sorted(apis),
                    'link': e['link'],
                    "example": e['formatted_input']
                })
            except Exception as ex:
                # Best effort: report the malformed record and keep going.
                print(ex)
                fail_count += 1
    if fail_count:
        print(f"Skipped {fail_count} malformed example(s)")
    return examples

# Evaluation examples, restricted to the API names known to `api_lists`.
_examples = get_test_data(list(api_lists))

# API names in alphabetical order, with their descriptions aligned by index.
_apis = sorted(api_lists)
_api_docs = list(map(api_lists.__getitem__, _apis))
print(len(_examples))
print(json.dumps(api_lists, indent=4))


608
{
    "DataFrame": "two-dimensional  size-mutable  potentially heterogeneous tabular data.",
    "agg": "aggregate using one or more operations over the specified axis.",
    "nunique": "count number of distinct elements in specified axis.",
    "copy": "make a copy of this objects indices and data.",
    "swaplevel": "swap levels i and j in a multiindex.",
    "count": "count non-na cells for each column or row.",
    "pivot": "return reshaped dataframe organized by given index / column values.",
    "idxmin": "return index of first occurrence of minimum over requested axis.",
    "drop_duplicates": "return dataframe with duplicate rows removed.",
    "mul": "get multiplication of dataframe and other  element-wise (binary operator mul).",
    "pct_change": "percentage change between the current and a prior element.",
    "melt": "unpivot a dataframe from wide to long format  optionally leaving identifiers set.",
    "to_string": "render a dataframe to a console-friendly tabular ou

In [3]:
# Checkpoint file; assigned to args.model_file and read with
# load_states_from_checkpoint in the argument-setup cell.
model_path = "models/bert/pandas_1/checkpoint_best.pt"

In [4]:
import argparse
# Assemble the DPR encoder / tokenizer / CUDA options plus three local extras.
parser = argparse.ArgumentParser()

add_encoder_params(parser)
add_tokenizer_params(parser)
add_cuda_params(parser)
parser.add_argument(
    '--shard_size', 
    type=int, 
    default=50000, 
    help="Total amount of data in 1 shard"
)
parser.add_argument(
    '--batch_size', 
    type=int, 
    default=32, 
    help="Batch size for the passage encoder forward pass"
)
parser.add_argument(
    '--dataset', 
    type=str, 
    default=None, 
    help=' to build correct dataset parser '
)

# parse_args expects a sequence of argument strings. An explicit empty list
# keeps argparse away from the kernel's own sys.argv, so every option takes
# its declared default.
args = parser.parse_args([])
args.model_file = model_path
setup_args_gpu(args)
# Encoder hyper-parameters stored in the checkpoint take precedence over the
# defaults above (the overrides are logged when this cell runs).
saved_state = load_states_from_checkpoint(args.model_file)
set_encoder_params_from_state(saved_state.encoder_params, args)
print(args)

Overriding args parameter value from checkpoint state. Param = pretrained_model_cfg, value = google/bert_uncased_L-6_H-512_A-8
Overriding args parameter value from checkpoint state. Param = encoder_model_type, value = hf_bert
Overriding args parameter value from checkpoint state. Param = sequence_length, value = 512


Namespace(batch_size=32, dataset=None, device=device(type='cuda'), distributed_world_size=1, do_lower_case=False, encoder_model_type='hf_bert', fp16=False, fp16_opt_level='O1', local_rank=-1, model_file='models/bert/pandas_1/checkpoint_best.pt', n_gpu=1, no_cuda=False, pretrained_file=None, pretrained_model_cfg='google/bert_uncased_L-6_H-512_A-8', projection_dim=0, sequence_length=512, shard_size=50000)


In [5]:
# Build the tensorizer and bi-encoder described by `args`, then restore the
# weights from the checkpoint state loaded in the previous cell.
tensorizer, encoder, _ = init_biencoder_components(args.encoder_model_type, args, inference_only=True)
encoder.load_state_dict(saved_state.model_dict)
# torch.no_grad() only disables gradient tracking; eval() is what switches
# dropout off. Set it explicitly so the embeddings are deterministic no matter
# which mode the components were constructed in.
encoder.eval()

# The two towers of the bi-encoder: one embeds queries, the other documents.
query_model = encoder.question_model
document_model = encoder.ctx_model

In [6]:
# Run everything below on CPU, replacing the device picked by setup_args_gpu
# (the Namespace printed above shows device(type='cuda')).
args.device = torch.device("cpu")

In [7]:
import copy 

# Private copies so later cells cannot alter the originals loaded above.
api_docs = copy.deepcopy(_api_docs)
apis = copy.deepcopy(_apis)

# One token-id tensor per API: the description is the text, the name the title.
document_tensors = [
    tensorizer.text_to_tensor(description, title=api_name)
    for api_name, description in zip(apis, api_docs)
]

doc_ids = move_to_device(torch.stack(document_tensors, dim=0), args.device)
doc_seg_batch = move_to_device(torch.zeros_like(doc_ids), args.device)
doc_attn_mask = move_to_device(tensorizer.get_attn_mask(doc_ids), args.device)

print(doc_ids.shape, doc_seg_batch.shape, doc_attn_mask.shape)

# Encode every API document in a single forward pass, without gradients.
document_model.to(args.device)
with torch.no_grad():
    _, doc_vectors, _ = document_model(doc_ids, doc_seg_batch, doc_attn_mask)
print(doc_vectors.shape)

torch.Size([132, 512]) torch.Size([132, 512]) torch.Size([132, 512])
torch.Size([132, 512])


In [8]:
from tqdm.notebook import tqdm


def get_query_vectors(sentences, batch_size=None):
    """Tensorize `sentences` and embed them with the query encoder.

    Args:
        sentences: Sequence of query strings. Must not be empty, because
            ``torch.stack`` rejects an empty list.
        batch_size: Optional number of queries per forward pass. ``None``
            (the default) embeds everything in one pass, as before. A smaller
            value bounds peak memory when there are many long queries.

    Returns:
        ``(query_ids, query_seg_batch, query_attn_mask, query_vectors)``: the
        stacked token ids, an all-zero segment tensor of the same shape, the
        tensorizer's attention mask, and the second element of the query
        model's output (one row per sentence).

    Raises:
        ValueError: If `batch_size` is given and is not positive.
    """
    if batch_size is not None and batch_size <= 0:
        raise ValueError("batch_size must be a positive integer or None")

    query_tensors = [tensorizer.text_to_tensor(sentence) for sentence in sentences]

    query_ids = move_to_device(torch.stack(query_tensors, dim=0), args.device)
    query_seg_batch = move_to_device(torch.zeros_like(query_ids), args.device)
    query_attn_mask = move_to_device(tensorizer.get_attn_mask(query_ids), args.device)

    query_model.to(args.device)

    total = query_ids.shape[0]
    step = total if batch_size is None else batch_size
    chunks = []
    with torch.no_grad():
        for start in range(0, total, step):
            stop = start + step
            _, chunk_vectors, _ = query_model(
                query_ids[start:stop],
                query_seg_batch[start:stop],
                query_attn_mask[start:stop],
            )
            chunks.append(chunk_vectors)
    # Re-assemble the per-chunk embeddings in the original sentence order.
    query_vectors = torch.cat(chunks, dim=0)
    return query_ids, query_seg_batch, query_attn_mask, query_vectors

# Work on a private copy: later cells add keys to these dicts.
test_examples = copy.deepcopy(_examples)

query_sentences = []
for example in test_examples:
    query_sentences.append(example["query"])

(
    query_ids,
    query_seg_batch,
    query_attn_mask,
    query_vectors,
) = get_query_vectors(query_sentences)
print(query_vectors.shape)

torch.Size([608, 512])


In [9]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every query embedding and every API embedding.
query_matrix = query_vectors.cpu().numpy()
doc_matrix = doc_vectors.cpu().numpy()
similarity_results = cosine_similarity(query_matrix, doc_matrix)
print(similarity_results.shape)

# Persist the inputs, embeddings and scores together in one file.
snapshot = (
    query_sentences,
    query_ids.cpu(),
    query_seg_batch.cpu(),
    query_attn_mask.cpu(),
    query_vectors.cpu(),
    doc_vectors.cpu(),
    similarity_results,
)
torch.save(snapshot, "from_jupyter.pt")

(608, 132)


In [10]:
# Attach to each example its ground truth and the full API ranking,
# ordered from highest to lowest similarity.
for row_index, example in enumerate(test_examples):
    scores = similarity_results[row_index, :].tolist()
    ranking = list(zip(apis, scores))
    # Ascending sort followed by a reversal, so ties keep the same relative
    # order as slicing the sorted list with [::-1].
    ranking.sort(key=lambda pair: pair[1])
    ranking.reverse()
    example["expected"] = example["apis"]
    example["predicted"] = ranking

print(test_examples[0].keys())

dict_keys(['id', 'query', 'apis', 'link', 'example', 'expected', 'predicted'])


In [23]:
def analyze_performance(top_k, examples=None):
    """Score how many expected APIs appear among each example's top-k predictions.

    Args:
        top_k: Number of highest-ranked predictions to consider per example.
        examples: Optional iterable of example dicts carrying ``expected``
            (API names) and ``predicted`` (ranked ``(api, score)`` pairs).
            Defaults to the notebook-level ``test_examples``.

    Returns:
        A 4-tuple ``(result, percentage, count, full_correct)``:

        * ``result``: one dict per example with ``example``, ``num_apis``,
          ``match_found``, ``percentage`` and ``full_match`` (1 or 0).
          NOTE: ``percentage`` here is a fraction in [0, 1].
        * ``percentage``: matched share per example on a 0-100 scale.
        * ``count``: number of matched APIs per example.
        * ``full_correct``: deep copies of the examples whose expected APIs
          were all found, with ``predicted`` cut to ``top_k`` and the
          ``apis`` key removed. The input examples are not modified.

    Raises:
        ZeroDivisionError: If an example has no expected APIs.
    """
    if examples is None:
        examples = test_examples

    result = []
    percentage = []
    count = []
    full_correct = []
    for example in examples:
        top_names = {entry[0] for entry in example["predicted"][:top_k]}
        expected = set(example['expected'])
        num_apis = len(expected)
        match_found = len(expected & top_names)
        fraction = match_found / float(num_apis)
        is_full_match = match_found == num_apis
        result.append({
            'example': example,
            'num_apis': num_apis,
            'match_found': match_found,
            'percentage': fraction,
            'full_match': 1 if is_full_match else 0
        })
        if is_full_match:
            # Copy before trimming so the caller's example keeps its full ranking.
            _ex = copy.deepcopy(example)
            _ex["predicted"] = _ex["predicted"][:top_k]
            _ex.pop("apis", None)
            full_correct.append(_ex)
        count.append(match_found)
        percentage.append(fraction * 100)
    return result, percentage, count, full_correct


def show_performance(k, max_apis=8):
    """Print a top-k hit table grouped by the number of expected APIs.

    Each row covers the examples with a given number of expected APIs and
    reports how many of them had all, at least half, at least a third and at
    least a quarter of those APIs among the top `k` predictions. Groups with
    more than `max_apis` expected APIs are omitted from rows and totals.

    Returns:
        ``(top_k_res, count_to_percentage)``: the per-example results from
        ``analyze_performance(k)`` and the same results bucketed by
        ``num_apis`` (all buckets, including those not printed).
    """
    top_k_res, _, _, _ = analyze_performance(k)

    # Bucket the per-example results by their number of expected APIs.
    count_to_percentage = {}
    for entry in top_k_res:
        count_to_percentage.setdefault(entry['num_apis'], []).append(entry)

    heavy_rule = "|" + ("=" * 55) + "|"
    light_rule = "|" + ("-" * 55) + "|"

    print(heavy_rule)
    print("|" + (" " * 25) + ("Top %2d" % k) + (" " * 24) + "|")
    print(light_rule)
    print("| #APIs\t| #Examples\t| Full\t| Half\t| 1/3\t| 1/4\t|")
    print(light_rule)

    totals = [0, 0, 0, 0]  # full, half, one third, one fourth
    total_nums = 0
    for num_actual_api in sorted(count_to_percentage):
        if num_actual_api > max_apis:
            break
        bucket = count_to_percentage[num_actual_api]
        fractions = [r['percentage'] for r in bucket]
        # Thresholds sit slightly below the exact fractions to absorb
        # floating-point error.
        row = [
            sum(1 for p in fractions if p > 0.999),
            sum(1 for p in fractions if p >= 0.499),
            sum(1 for p in fractions if p >= 0.33),
            sum(1 for p in fractions if p >= 0.2499),
        ]
        totals = [running + hits for running, hits in zip(totals, row)]
        total_nums += len(bucket)
        print("| %d\t| %d\t\t| %d\t| %d\t| %d\t| %d\t|" % (num_actual_api, len(bucket), *row))

    print(heavy_rule)
    print("| %s\t| %d\t\t| %d\t| %d\t| %d\t| %d\t|" % ("total", total_nums, *totals))
    print(heavy_rule)
    return top_k_res, count_to_percentage

# Examples whose expected APIs are all contained in the top-10 predictions.
top_k_res, top_k_per, top_k_count, full_correct = analyze_performance(10)
print(len(full_correct))

taken_ids = set()
for hit in full_correct:
    hit_id = hit['id']
    source_qid = hit['example']['qid']
    # Sanity check: the example id should equal the id stored in the raw record.
    if hit_id != source_qid:
        print(hit_id, source_qid)
    taken_ids.add(hit_id)
    # Dump only the harder cases, i.e. those with four or more expected APIs.
    if len(hit['expected']) >= 4:
        print(json.dumps(hit, indent=4))
        print("=" * 100)
# Differing counts would mean that some question id occurs more than once.
print(len(full_correct), len(taken_ids))

113
{
    "id": 63264777,
    "query": "python: append 2 columns of a dataframe together i am loading a csv file into a data frame using pandas. my dataframe looks something like this: i wish to append 2 of the columns into a new column: col4 needs to be created by appending the contents of col1 and col2 together. how can i do this in pandas/python? edit",
    "link": "https://stackoverflow.com/questions/63264777/python-append-2-columns-of-a-dataframe-together",
    "example": {
        "qid": 63264777,
        "link": "https://stackoverflow.com/questions/63264777/python-append-2-columns-of-a-dataframe-together",
        "question": {
            "title": "Python: Append 2 columns of a dataframe together",
            "ques_desc": "I am loading a csv file into a data frame using pandas. My dataframe looks something like this: I wish to append 2 of the columns into a new column: col4 needs to be created by appending the contents of col1 and col2 together. How can I do this in pandas/pyt

In [12]:
# Preview a slice of the fully matched examples, then persist all of them.
print(json.dumps(full_correct[6:15], indent=4))
# The context manager closes the file even if serialization raises.
with open("full_correct_results_top_10.json", "w") as fp:
    json.dump(full_correct, fp, indent=4)

[
    {
        "id": 67257898,
        "query": "how to add a value to a new column by referencing the values in a column i have a dataframe like this: the xy column must be filled with the value of the column names in the reason column. let's look at the first row. the reason column shows our value x1. so our value in column xy, will be the value of x1 column in the first row. like this: is there a way to do this?",
        "link": "https://stackoverflow.com/questions/67257898/how-to-add-a-value-to-a-new-column-by-referencing-the-values-in-a-column",
        "example": {
            "qid": 67257898,
            "link": "https://stackoverflow.com/questions/67257898/how-to-add-a-value-to-a-new-column-by-referencing-the-values-in-a-column",
            "question": {
                "title": "How to add a value to a new column by referencing the values in a column",
                "ques_desc": "I have a dataframe like this: The xy column must be filled with the value of the column names

In [13]:
# print("""
#                    No Negative Samples
# |=======================================================|
# |                         Top  1                        |
# |-------------------------------------------------------|
# | #APIs	| #Examples	| Full	| Half	| 1/3	| 1/4	|
# |-------------------------------------------------------|
# | 1	| 111		| 13	| 13	| 13	| 13	|
# | 2	| 111		| 0	| 23	| 23	| 23	|
# | 3	| 110		| 0	| 0	| 28	| 28	|
# | 4	| 81		| 0	| 0	| 0	| 18	|
# | 5	| 66		| 0	| 0	| 0	| 0	|
# | 6	| 52		| 0	| 0	| 0	| 0	|
# | 7	| 19		| 0	| 0	| 0	| 0	|
# | 8	| 14		| 0	| 0	| 0	| 0	|
# |=======================================================|
# | total	| 564		| 13	| 36	| 64	| 82	|
# |=======================================================|

# |=======================================================|
# |                         Top  2                        |
# |-------------------------------------------------------|
# | #APIs	| #Examples	| Full	| Half	| 1/3	| 1/4	|
# |-------------------------------------------------------|
# | 1	| 111		| 23	| 23	| 23	| 23	|
# | 2	| 111		| 2	| 33	| 33	| 33	|
# | 3	| 110		| 0	| 4	| 38	| 38	|
# | 4	| 81		| 0	| 4	| 4	| 30	|
# | 5	| 66		| 0	| 0	| 4	| 4	|
# | 6	| 52		| 0	| 0	| 2	| 2	|
# | 7	| 19		| 0	| 0	| 0	| 1	|
# | 8	| 14		| 0	| 0	| 0	| 0	|
# |=======================================================|
# | total	| 564		| 25	| 64	| 104	| 131	|
# |=======================================================|

# |=======================================================|
# |                         Top  5                        |
# |-------------------------------------------------------|
# | #APIs	| #Examples	| Full	| Half	| 1/3	| 1/4	|
# |-------------------------------------------------------|
# | 1	| 111		| 31	| 31	| 31	| 31	|
# | 2	| 111		| 6	| 53	| 53	| 53	|
# | 3	| 110		| 0	| 13	| 57	| 57	|
# | 4	| 81		| 0	| 11	| 11	| 44	|
# | 5	| 66		| 0	| 0	| 12	| 12	|
# | 6	| 52		| 0	| 1	| 7	| 7	|
# | 7	| 19		| 0	| 0	| 0	| 4	|
# | 8	| 14		| 0	| 0	| 3	| 6	|
# |=======================================================|
# | total	| 564		| 37	| 109	| 174	| 214	|
# |=======================================================|

# |=======================================================|
# |                         Top 10                        |
# |-------------------------------------------------------|
# | #APIs	| #Examples	| Full	| Half	| 1/3	| 1/4	|
# |-------------------------------------------------------|
# | 1	| 111		| 43	| 43	| 43	| 43	|
# | 2	| 111		| 16	| 64	| 64	| 64	|
# | 3	| 110		| 3	| 24	| 70	| 70	|
# | 4	| 81		| 1	| 25	| 25	| 62	|
# | 5	| 66		| 0	| 6	| 21	| 21	|
# | 6	| 52		| 0	| 3	| 16	| 16	|
# | 7	| 19		| 0	| 0	| 2	| 6	|
# | 8	| 14		| 0	| 3	| 4	| 8	|
# |=======================================================|
# | total	| 564		| 63	| 168	| 245	| 290	|
# |=======================================================|
# """)

In [14]:
# Top-1 summary table; the trailing ';' hides the repr of the returned tuple.
show_performance(1);

|                         Top  1                        |
|-------------------------------------------------------|
| #APIs	| #Examples	| Full	| Half	| 1/3	| 1/4	|
|-------------------------------------------------------|
| 1	| 127		| 20	| 20	| 20	| 20	|
| 2	| 143		| 0	| 45	| 45	| 45	|
| 3	| 108		| 0	| 0	| 34	| 34	|
| 4	| 74		| 0	| 0	| 0	| 25	|
| 5	| 64		| 0	| 0	| 0	| 0	|
| 6	| 25		| 0	| 0	| 0	| 0	|
| 7	| 20		| 0	| 0	| 0	| 0	|
| 8	| 23		| 0	| 0	| 0	| 0	|
| total	| 584		| 20	| 65	| 99	| 124	|


In [15]:
# Top-2 summary table; the trailing ';' hides the repr of the returned tuple.
show_performance(2);

|                         Top  2                        |
|-------------------------------------------------------|
| #APIs	| #Examples	| Full	| Half	| 1/3	| 1/4	|
|-------------------------------------------------------|
| 1	| 127		| 32	| 32	| 32	| 32	|
| 2	| 143		| 7	| 65	| 65	| 65	|
| 3	| 108		| 0	| 4	| 48	| 48	|
| 4	| 74		| 0	| 4	| 4	| 34	|
| 5	| 64		| 0	| 0	| 5	| 5	|
| 6	| 25		| 0	| 0	| 2	| 2	|
| 7	| 20		| 0	| 0	| 0	| 1	|
| 8	| 23		| 0	| 0	| 0	| 2	|
| total	| 584		| 39	| 105	| 156	| 189	|


In [16]:
# Top-5 summary table; the trailing ';' hides the repr of the returned tuple.
show_performance(5);

|                         Top  5                        |
|-------------------------------------------------------|
| #APIs	| #Examples	| Full	| Half	| 1/3	| 1/4	|
|-------------------------------------------------------|
| 1	| 127		| 49	| 49	| 49	| 49	|
| 2	| 143		| 21	| 96	| 96	| 96	|
| 3	| 108		| 1	| 13	| 75	| 75	|
| 4	| 74		| 0	| 20	| 20	| 51	|
| 5	| 64		| 0	| 3	| 21	| 21	|
| 6	| 25		| 0	| 1	| 7	| 7	|
| 7	| 20		| 0	| 0	| 3	| 7	|
| 8	| 23		| 0	| 0	| 3	| 8	|
| total	| 584		| 71	| 182	| 274	| 314	|


In [17]:
# Top-10 summary table; keep the returned per-example results and the
# buckets grouped by number of expected APIs.
result, count_to_p = show_performance(10);

|                         Top 10                        |
|-------------------------------------------------------|
| #APIs	| #Examples	| Full	| Half	| 1/3	| 1/4	|
|-------------------------------------------------------|
| 1	| 127		| 68	| 68	| 68	| 68	|
| 2	| 143		| 40	| 114	| 114	| 114	|
| 3	| 108		| 3	| 36	| 93	| 93	|
| 4	| 74		| 1	| 33	| 33	| 64	|
| 5	| 64		| 1	| 8	| 35	| 35	|
| 6	| 25		| 0	| 4	| 12	| 12	|
| 7	| 20		| 0	| 2	| 4	| 14	|
| 8	| 23		| 0	| 1	| 7	| 16	|
| total	| 584		| 113	| 266	| 366	| 416	|
