# Load data

In [7]:
from datasets import load_dataset

In [8]:
dataset = load_dataset("code_search_net", "python", trust_remote_code=True, split="train")

In [9]:
train = dataset.data.to_pandas()

In [1]:
DATA_SAVE_PATH = "../data/train_sample.parquet"

In [None]:
train.sample(1000).reset_index(drop=True).to_parquet(DATA_SAVE_PATH)

# Tree splitter

In [2]:
import pandas as pd

In [3]:
from tree_sitter import Language, Parser
import tree_sitter_python as tspython

# Загрузка языка Python
PY_LANGUAGE = Language(tspython.language())

parser = Parser()
parser.language = PY_LANGUAGE

def extract_function_parts(code: str):
    tree = parser.parse(bytes(code, "utf8"))
    root_node = tree.root_node

    # Запрос для извлечения имени функции и её тела
    query = PY_LANGUAGE.query("""
        (function_definition
            name: (identifier) @function.name
            body: (block) @function.body)
    """)
    captures = query.captures(root_node)

    # Извлечение имени функции
    func_name = captures[0][0].text.decode("utf8") if captures else None

    # Извлечение тела функции (все узлы блока)
    body_nodes = [capture[0] for capture in captures if capture[1] == "function.body"]
    body_text = "\n".join([node.text.decode("utf8") for node in body_nodes]) if body_nodes else ""

    # Тело функции без комментариев и docstring
    body_no_comments = "\n".join([
        node.text.decode("utf8") for node in body_nodes[0].children
        if node.type not in ["comment", "expression_statement"] or (node.type == "expression_statement" and not (node.children and node.children[0].type == "string"))
    ]) if body_nodes else ""

    return pd.Series({
        "function_name": func_name,
        "body_with_comments": body_text,
        "body_without_comments": body_no_comments,
    })


In [4]:
train = pd.read_parquet(DATA_SAVE_PATH)

In [5]:
concatenated_train = pd.concat([
    train,
    train["whole_func_string"].apply(extract_function_parts)
], axis=1)

In [6]:
concatenated_train[["func_name", "function_name"]].assign(
    func_name_contains_function_name=pd.Series([x in y for x, y in zip(concatenated_train["function_name"], concatenated_train["func_name"], strict=True)])
).sample(10)

Unnamed: 0,func_name,function_name,func_name_contains_function_name
10,dump_cookie,dump_cookie,True
775,cli,cli,True
436,parse_seq,parse_seq,True
260,Resolver.read_registry,read_registry,True
761,stop_message_live_location,stop_message_live_location,True
276,load_config_key,load_config_key,True
733,imt,imt,True
879,_LDAPUser._get_groups,_get_groups,True
112,SemI.from_dict,from_dict,True
374,get_metar_from_mission,get_metar_from_mission,True


In [7]:
concatenated_train.columns

Index(['repository_name', 'func_path_in_repository', 'func_name',
       'whole_func_string', 'language', 'func_code_string', 'func_code_tokens',
       'func_documentation_string', 'func_documentation_tokens', 'split_name',
       'func_code_url', 'function_name', 'body_with_comments',
       'body_without_comments'],
      dtype='object')

In [None]:
QUANTILE_75_FUNC_NAME_LEN = int(concatenated_train["function_name"].apply(len).quantile(0.75))

# Load model

In [9]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

checkpoint = "Salesforce/codet5p-220m"
device = "cuda" # for GPU usage or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint).to(device)


In [10]:
import numpy as np
from tqdm import tqdm

In [None]:
def make_predictions(code_list: list[str]) -> list[str]:
    result = []
    for chunk in tqdm(np.array_split(
        [
            f"def <extra_id_1>:{code}"
            for code in code_list
        ],
        100,
    )):
        inputs = tokenizer(
            chunk.tolist(),
            padding=True,
            truncation=True,
            return_tensors="pt",
        ).to(device)
        outputs = model.generate(
            **inputs,
            max_length=QUANTILE_75_FUNC_NAME_LEN,
            num_beams=4,       # Beam search for potentially better quality
            early_stopping=True,
        )
        decoded_outputs = [
            tokenizer.decode(
                seq.to("cpu"),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )
            for seq in outputs
        ]
        result.extend([x.split("(")[0].replace(" ", "").replace("\n", "").replace("\t", "") for x in decoded_outputs])
    return result

In [12]:
import evaluate

In [13]:
em_metric = evaluate.load("exact_match")
rouge_metric = evaluate.load("rouge")

# Run predictions wout comments

In [14]:
concatenated_train["predicted_function_name"] = make_predictions(concatenated_train["body_without_comments"])

100%|██████████| 100/100 [00:49<00:00,  2.01it/s]


## Evaluate

In [15]:
em_results = em_metric.compute(predictions=concatenated_train["predicted_function_name"], references=concatenated_train["function_name"])
print(f"EM Score: {em_results['exact_match']:.2f}")

EM Score: 0.09


In [16]:
rouge_results = rouge_metric.compute(predictions=concatenated_train["predicted_function_name"], references=concatenated_train["function_name"])
print(f"ROUGE-1 F1 Score: {rouge_results['rouge1']:.2f}")
print(f"ROUGE-L F1 Score: {rouge_results['rougeL']:.2f}")

ROUGE-1 F1 Score: 0.27
ROUGE-L F1 Score: 0.27


# Run predictions with comments

In [17]:
concatenated_train["predicted_function_name"] = make_predictions(concatenated_train["body_with_comments"])

100%|██████████| 100/100 [00:43<00:00,  2.31it/s]


## Evaluate

In [18]:
em_results = em_metric.compute(predictions=concatenated_train["predicted_function_name"], references=concatenated_train["function_name"])
print(f"EM Score: {em_results['exact_match']:.2f}")

EM Score: 0.18


In [19]:
rouge_results = rouge_metric.compute(predictions=concatenated_train["predicted_function_name"], references=concatenated_train["function_name"])
print(f"ROUGE-1 F1 Score: {rouge_results['rouge1']:.2f}")
print(f"ROUGE-L F1 Score: {rouge_results['rougeL']:.2f}")

ROUGE-1 F1 Score: 0.43
ROUGE-L F1 Score: 0.42


# Find worst

In [None]:
ASSUMED_WORST_SCORE = 0.15
TOP_K = 5

In [26]:
with open("worst_cases.txt", "w") as f:
    i = 0
    for prediction, reference, body in zip(
        concatenated_train["predicted_function_name"],
        concatenated_train["function_name"],
        concatenated_train["body_with_comments"],
        strict=True,
    ):
        rouge_results = rouge_metric.compute(predictions=[prediction], references=[reference])
        if rouge_results["rougeL"] < ASSUMED_WORST_SCORE and i < 5:
            print(f"ROGUE-L: {rouge_results['rougeL']:.2f}", file=f)
            print(f"Prediction: {prediction}", file=f)
            print(f"Reference: {reference}", file=f)
            print(f"Body with comments: {body}", file=f)
            print("####################################################", file=f)
            i += 1
        elif i == 5:
            break