In [None]:
# One-time environment setup; uncomment to install the notebook's dependencies.
# %pip install -q datasets "transformers[sentencepiece]" sacrebleu peft

In [None]:
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MultiLabelBinarizer
from tqdm.auto import tqdm
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

In [2]:
# Device used for the model and for every encoded input batch.
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without a GPU instead of
# failing when the model is moved to the device.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [3]:
# Read the dataset from JSON into a DataFrame. Later cells use its "Text",
# "ContainsCode" and "CodeList" columns.
data = pd.read_json(path_or_buf="../data/datafinal.json")

In [4]:
# Fit a multi-label binarizer on the "CodeList" column so that `mlb.classes_`
# lists every distinct label element found in the dataset.
mlb = MultiLabelBinarizer().fit(data["CodeList"])
mlb

In [5]:
# Keep only the rows whose "ContainsCode" value is NOT a bool, as an independent
# copy of those rows.
# NOTE(review): presumably these are the unlabeled rows to predict - confirm.
test_data = data.loc[
    [not isinstance(flag, bool) for flag in data["ContainsCode"]]
].copy()

In [7]:
# Checkpoint path handed to `from_pretrained` for both the tokenizer and the
# seq2seq model.
model_checkpoint = "../codet5-basenew_model/checkpoint-4200"

In [8]:
# Load the tokenizer from the same checkpoint path that the model is loaded from.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
# Check that every label in `mlb.classes_` survives an encode -> decode round
# trip through the tokenizer. `any` stops at the first mismatch, so the message
# is printed at most once.
if any(
    symbol != tokenizer.decode(tokenizer(symbol)["input_ids"], skip_special_tokens=True)
    for symbol in mlb.classes_
):
    print("tokenizer not supported")

In [10]:
# Load the seq2seq model from the checkpoint and move it onto `device` in one step.
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint).to(device)

In [72]:
# Build one "CodeList" entry per row of `data`, in row order:
#   * rows whose "ContainsCode" value is a string get a model prediction
#     generated from their "Text";
#   * every other row keeps its existing "CodeList" value.
# The columns are iterated directly with zip(), which avoids building a Series
# for every row the way DataFrame.iterrows() does.
code_list = []
for contains_code, text, known_codes in tqdm(
    zip(data["ContainsCode"], data["Text"], data["CodeList"]), total=len(data)
):
    if not isinstance(contains_code, str):
        code_list.append(known_codes)
        continue
    # Call the tokenizer object directly; `encode_plus` is deprecated in favour
    # of `__call__`, and the direct call returns the same encoding.
    encoded = tokenizer(text, return_tensors="pt").to(device)
    output = model.generate(**encoded, max_length=512)
    result = tokenizer.batch_decode(output, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
    # Store real newlines and tabs as literal backslash escapes.
    code_list.append(result.replace('\n', '\\n').replace('\t', '\\t'))

  0%|          | 0/1908 [00:00<?, ?it/s]

In [73]:
# Start from a fresh binarizer and fit it on the dataset's "CodeList" column.
# `t` is the resulting indicator matrix; the cell displays the learned classes.
s1 = data["CodeList"]
mlb = MultiLabelBinarizer()
t = mlb.fit_transform(s1)
mlb.classes_

array([' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',',
       '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F',
       'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
       'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '_', '`', 'a',
       'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
       'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{',
       '|', '}'], dtype=object)

In [74]:
# Encode the final labels with the binarizer that is already fitted on
# data["CodeList"]. transform() (instead of fit_transform()) keeps the column
# set and order fixed to that fitted label space: a character that appears only
# in a model prediction is ignored (scikit-learn emits a warning) rather than
# silently adding a column or shifting the positions of the existing ones.
t1 = mlb.transform(code_list)
mlb.classes_

array([' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',',
       '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
       ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F',
       'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S',
       'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '_', '`', 'a',
       'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n',
       'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{',
       '|', '}'], dtype=object)

In [75]:
# Display the shape of the encoded label matrix: (rows, label columns).
t1.shape

(1908, 93)

In [76]:
# Wrap the indicator matrix in a DataFrame (default integer column names) and
# write it to CSV without the index column.
submission = pd.DataFrame(data=t1)
submission.to_csv(path_or_buf="submission.csv", index=False)

In [78]:
# Overwrite the "CodeList" column with the combined list of existing values and
# model predictions.
# NOTE(review): this mutates `data` in place, so re-running an earlier cell that
# reads data["CodeList"] afterwards will see predictions instead of the values
# loaded from disk - confirm this is intended.
data["CodeList"] = code_list

In [None]:
# Write the current contents of `data` to CSV without the index column.
data.to_csv(path_or_buf="raw.csv", index=False)