In [1]:
import requests
import fim
import time
import json

from datasets import load_dataset
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# --- Evaluation settings (read by the cells below) ---

# Inference endpoint that receives one POST request per sample.
api_url = "http://192.168.1.73:8192/star/inference"
# Path handed to AutoTokenizer.from_pretrained.
model_path = './starcoderbase'

# JSON file holding the test samples (passed to load_dataset as data_files).
dataset_name = './code_for_generation.json'
# Field of each sample that is sent to the model as its input text.
input_column = 'starcoder_inputs'

In [3]:
# Load the tokenizer from `model_path`, then ask the `fim` helper module for the
# four special token ids it knows about (FIM presumably stands for
# fill-in-the-middle -- confirm against the `fim` module).
tokenizer = AutoTokenizer.from_pretrained(model_path, use_auth_token=True)

# The helper returns the ids in this order: suffix, prefix, middle, pad.
suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = fim.get_fim_token_ids(tokenizer)

In [4]:
def dataloader(dataset):
    """Query the inference server for each sample and yield tokenized fields.

    For every sample, the text stored under ``input_column`` is POSTed to
    ``api_url``; the ``generated_text`` field of the JSON reply and several
    fields of the sample itself are then encoded with the module-level
    ``tokenizer``.

    Args:
        dataset: Iterable of mapping-like samples that provide the
            ``input_column``, ``'cropped_inputs'`` and ``'inputs'`` fields.

    Yields:
        dict: ``tokenizer.encode`` results under these keys:
            ``'inputs'``          -- the text that was sent to the server,
            ``'cropped_inputs'``  -- ``sample['cropped_inputs']``,
            ``'outputs'``         -- the server's ``generated_text``,
            ``'labels'``          -- ``sample['inputs']``.

    Raises:
        requests.HTTPError: If the server replies with a 4xx/5xx status.
    """
    for sample in dataset:
        prompt = sample[input_column]
        payload = {
            'inputs': prompt,
            "parameters": {},
        }
        response = requests.post(api_url, json=payload)
        # Surface server-side failures as an HTTP error right here instead of
        # as a confusing JSON-decoding error or KeyError a few lines below.
        response.raise_for_status()

        encode_input = tokenizer.encode(prompt)
        encode_output = tokenizer.encode(response.json()['generated_text'])

        yield {
            'inputs': encode_input,
            'cropped_inputs': tokenizer.encode(sample['cropped_inputs']),
            'outputs': encode_output,
            # NOTE: 'labels' is built from sample['inputs'], while the 'inputs'
            # key above holds sample[input_column]. The key names are easy to
            # mix up. TODO: rename them once all consumers can be updated.
            'labels': tokenizer.encode(sample['inputs']),
        }

In [5]:
def calculate_similarity(text_a, text_b):
    """Return the TF-IDF cosine similarity between two texts.

    A fresh ``TfidfVectorizer`` with default settings is fitted on exactly the
    two given texts, and the cosine similarity between the two resulting
    TF-IDF rows is returned.

    Args:
        text_a: First text.
        text_b: Second text.

    Returns:
        The cosine similarity of the two TF-IDF vectors. If neither text
        contains anything the vectorizer recognises as a token, ``1.0`` is
        returned when the texts are identical and ``0.0`` otherwise.

    Raises:
        ValueError: Any vectorizer error other than the empty-vocabulary case.
    """
    vectorizer = TfidfVectorizer()

    try:
        # Fit and transform the texts into TF-IDF vectors.
        tfidf_matrix = vectorizer.fit_transform([text_a, text_b])
    except ValueError as err:
        # The vectorizer refuses to fit when no token is found in either text
        # (e.g. both are empty or contain only punctuation). Without this
        # guard a single such sample aborts the whole evaluation run.
        if 'empty vocabulary' not in str(err):
            raise
        return 1.0 if text_a == text_b else 0.0

    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    return cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

In [None]:
# Evaluate the inference server on the test set: for every sample compare the
# generated tokens with the reference tokens via TF-IDF cosine similarity and
# report the mean of (1 - similarity).
dataset = load_dataset('json', data_files=dataset_name, split='train', streaming=False, num_proc=None)
print(len(dataset))

test_dataloader = dataloader(dataset)

# Running sum of (1 - similarity); divided by the dataset size after the loop.
test_loss = 0.
for sample in test_dataloader:
    # Pause between samples. The request for the current sample was already
    # sent while the generator produced it, so this delays the next request.
    time.sleep(10)
    inputs = sample['inputs']
    outputs = sample['outputs']

    # Number of tokens by which the tokenized output exceeds the tokenized input.
    num_new_tokens = len(outputs) - len(inputs)
    if suffix_tok_id not in inputs or suffix_tok_id not in outputs:
        raise ValueError(
            f'suffix token id {suffix_tok_id} is missing from the tokenized '
            'inputs or outputs of a sample'
        )

    # Take `num_new_tokens` tokens starting right after the first suffix token.
    # NOTE(review): this assumes the generated text sits directly after the
    # suffix token in the server's output -- confirm against the server format.
    suffix_token_index = outputs.index(suffix_tok_id)
    generated_contents = outputs[suffix_token_index + 1:suffix_token_index + num_new_tokens + 1]
    print(suffix_token_index, generated_contents)

    labels = sample['labels']
    cropped_inputs = sample['cropped_inputs']

    # Difference in token count between the full and the cropped text;
    # presumably the length of the removed span -- TODO confirm.
    cropped_length = len(labels) - len(cropped_inputs)

    # NOTE(review): the start offset is one less than the index found in
    # `outputs`, presumably because `labels` carries one special token fewer
    # before this position -- confirm.
    cropped_label_contents = labels[suffix_token_index-1:suffix_token_index-1+cropped_length]

    similarity = calculate_similarity(tokenizer.decode(generated_contents), tokenizer.decode(cropped_label_contents))
    print(similarity)

    test_loss += (1-similarity)

print(30 * '*', 'total test loss: ', test_loss / len(dataset))
# Recorded output of an earlier run:
# ****************************** total test loss:  0.7949232750467062

In [None]:
# Repeat of the evaluation in the previous cell (same procedure, run again):
# for every sample compare the generated tokens with the reference tokens via
# TF-IDF cosine similarity and report the mean of (1 - similarity).
dataset = load_dataset('json', data_files=dataset_name, split='train', streaming=False, num_proc=None)
print(len(dataset))

test_dataloader = dataloader(dataset)

# Running sum of (1 - similarity); divided by the dataset size after the loop.
test_loss = 0.
for sample in test_dataloader:
    # Pause between samples. The request for the current sample was already
    # sent while the generator produced it, so this delays the next request.
    time.sleep(10)
    inputs = sample['inputs']
    outputs = sample['outputs']

    # Number of tokens by which the tokenized output exceeds the tokenized input.
    num_new_tokens = len(outputs) - len(inputs)
    if suffix_tok_id not in inputs or suffix_tok_id not in outputs:
        raise ValueError(
            f'suffix token id {suffix_tok_id} is missing from the tokenized '
            'inputs or outputs of a sample'
        )

    # Take `num_new_tokens` tokens starting right after the first suffix token.
    # NOTE(review): this assumes the generated text sits directly after the
    # suffix token in the server's output -- confirm against the server format.
    suffix_token_index = outputs.index(suffix_tok_id)
    generated_contents = outputs[suffix_token_index + 1:suffix_token_index + num_new_tokens + 1]
    print(suffix_token_index, generated_contents)

    labels = sample['labels']
    cropped_inputs = sample['cropped_inputs']

    # Difference in token count between the full and the cropped text;
    # presumably the length of the removed span -- TODO confirm.
    cropped_length = len(labels) - len(cropped_inputs)

    # NOTE(review): the start offset is one less than the index found in
    # `outputs`, presumably because `labels` carries one special token fewer
    # before this position -- confirm.
    cropped_label_contents = labels[suffix_token_index-1:suffix_token_index-1+cropped_length]

    similarity = calculate_similarity(tokenizer.decode(generated_contents), tokenizer.decode(cropped_label_contents))
    print(similarity)

    test_loss += (1-similarity)

print(30 * '*', 'total test loss: ', test_loss / len(dataset))
# Recorded output of an earlier run:
# ****************************** total test loss:  0.8067387531195809