In [None]:
import os
from datasets import load_dataset
import sqlite3
import pandas as pd


# Local SQLite cache for the 'js' configuration of bigcode/humanevalpack.
# When the cache file exists the frame is read back from it; otherwise the
# 'test' split is loaded via `load_dataset` and written to the cache.
SQLITE_PATH = 'humanevalpack.db'

if os.path.exists(SQLITE_PATH):
    con = sqlite3.connect(SQLITE_PATH)
    try:
        # `to_sql` below stores the DataFrame index in a column named
        # 'index'; restore it as the index when reading the cache back.
        ds_df = pd.read_sql_query("SELECT * FROM humanevalpack", con).set_index('index')
    finally:
        # Release the file handle even if the query fails.
        con.close()
else:
    ds = load_dataset(
        'bigcode/humanevalpack',
        'js',
    )
    # Materialise every row of the 'test' split; a missing split raises
    # KeyError here instead of a confusing "NoneType is not iterable".
    ds_df = pd.DataFrame(list(ds['test']))

    # Connect only after the download succeeded so that a failed download
    # does not leave an empty cache file behind for the next run.
    con = sqlite3.connect(SQLITE_PATH)
    try:
        ds_df.to_sql('humanevalpack', con)
    finally:
        con.close()


In [None]:
import random
# Pick one row position uniformly at random (both bounds inclusive) and
# expose that row as a plain dict keyed by column name.
last_position = len(ds_df) - 1
random_sample_index = random.randint(0, last_position)
sampled_row = ds_df.iloc[random_sample_index]
random_sample = sampled_row.to_dict()

# Bare expression so the notebook renders the chosen sample.
random_sample

In [None]:
from modules.models import CodeT5

# Restore the model weights from the saved checkpoint.
model = CodeT5.load_from_checkpoint('Model Checkpoints/CodeT5JS_v0-v1.ckpt')

# Always move the model to the CPU. Guarding this with
# `model.device != 'cpu'` is unreliable: assuming `model.device` is a
# torch.device (TODO confirm), it never compares equal to a plain string,
# so such a guard is always true. `.to('cpu')` is harmless when the model
# is already on the CPU.
model.to('cpu')

# `mode` is an attribute defined by the project's CodeT5 class; its effect
# is not visible from this notebook.
model.mode = 'test'

In [None]:
import torch
from transformers import RobertaTokenizer

from modules.datasets import CodeT5Dataset
# Pretrained tokenizer loaded by name.
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-small')

# The buggy source and the reference fix are encoded with identical settings:
# padded/truncated to 470 tokens and returned as PyTorch tensors.
_encode_kwargs = dict(
    max_length=470,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)

# Model input: the buggy implementation of the sampled task.
encoded_samples = tokenizer(random_sample['buggy_solution'], **_encode_kwargs)

# Target: the canonical (correct) implementation of the same task.
encoded_labels = tokenizer(random_sample['canonical_solution'], **_encode_kwargs)


In [None]:
# Batch dict handed to the model: encoded buggy code as input, encoded
# canonical solution as labels.
s = {
    'input_ids': encoded_samples.input_ids,
    'attention_mask': encoded_samples.attention_mask,
    'labels': encoded_labels.input_ids,
}

# Inference only: disable autograd so no computation graph is built.
# The module is called directly rather than via `.forward` so that any
# registered hooks run as well.
# NOTE(review): element [1] of the result is treated as the logits
# (presumably the model returns (loss, logits)) - confirm against CodeT5.
with torch.no_grad():
    model_output = model(s)[1]

# Greedy choice: the highest-scoring token id along the last dimension.
generated_token_ids = torch.argmax(model_output, dim=-1)

# Decode the first (only) sequence of the batch back into source text.
generated_code = tokenizer.batch_decode(generated_token_ids, skip_special_tokens=True)[0]
print(generated_code)
print(random_sample)

In [None]:
# Dump the model output, the buggy input and the reference solution to
# separate .js files so they can be inspected or run outside the notebook.
_js_dumps = {
    'random_output.js': generated_code,
    'random_bug.js': random_sample['buggy_solution'],
    'random_solution.js': random_sample['canonical_solution'],
}

for _js_path, _js_text in _js_dumps.items():
    # Explicit UTF-8: without it the platform default encoding is used, which
    # can fail on source text containing non-ASCII characters.
    with open(_js_path, 'w', encoding='utf-8') as f:
        f.write(_js_text)

In [None]:
# Bare expression: renders the current value of `model_output` as this
# cell's output for inspection.
model_output

In [None]:
# Run configuration constants.

# Must be a plain string: a trailing comma after the literal would turn this
# into the 1-tuple ('./logs',) and break anything expecting a path.
LOG_PATH = './logs'
MODEL_DIR = 'CodeT5JS'
CPKT_PATH = 'Model Checkpoints/CodeT5JS'

# Asked interactively on every run; int() raises ValueError on non-integer
# input.
VERSION = int(input('Training Version: '))
DEBUG = True
DB_PATH = '/content/drive/MyDrive/Thesis/commitpack-datasets.db'
BATCH_SIZE = 16
TOKENIZER_MAX_LENGTH = 420