In [2]:
import torch.utils.data
from transformers import BartConfig, Seq2SeqTrainingArguments, IntervalStrategy, SchedulerType, TrainingArguments


from data.dataset import init_dataset



Checking the attributes of the pre_train pickle file

In [3]:
import pickle

# Location of the pickled pre-training dataset on the local machine.
filename = '/home/user1-selab3/Documents/pre_train.pk'

# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open(filename, 'rb') as pickle_file:
    dataset = pickle.load(pickle_file)

# Every name reachable on the loaded object, dunder entries included.
attributes = dir(dataset)
print("Attributes in dataset:", attributes)

# Keep only the names that are neither dunder entries nor callables,
# i.e. the attributes that hold data rather than behaviour.
data_attributes = []
for attr_name in attributes:
    if attr_name.startswith('__'):
        continue
    if callable(getattr(dataset, attr_name)):
        continue
    data_attributes.append(attr_name)
print("Data-related attributes in dataset:", data_attributes)


Attributes in dataset: ['__add__', '__annotations__', '__class__', '__class_getitem__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__len__', '__lt__', '__module__', '__ne__', '__new__', '__orig_bases__', '__parameters__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__slots__', '__str__', '__subclasshook__', '__weakref__', '_is_protocol', 'args', 'asts', 'codes', 'codes_wo_name', 'dataset_dir', 'dataset_name', 'docs', 'languages', 'mode', 'names', 'names_wo_name', 'only_names', 'paths', 'save', 'set_task', 'size', 'sources', 'split', 'subset', 'task']
Data-related attributes in dataset: ['_is_protocol', 'args', 'asts', 'codes', 'codes_wo_name', 'dataset_dir', 'dataset_name', 'docs', 'languages', 'mode', 'names', 'names_wo_name', 'only_names', 'paths', 'size', 'sources', 'split', 'task']


Check and print information about the codes and ASTs, if available

In [4]:

# Report the overall size, then peek at one fixed window of the AST and code lists.
print("Total dataset length:", len(dataset))

# Window bounds; the original note labels them as the Java code lines
# (not verifiable from this cell alone). Adjust as needed.
index_start, index_end = 45423, 226484

# ASTs: a missing attribute and a non-list attribute share one message.
asts_value = getattr(dataset, 'asts', None)
if not isinstance(asts_value, list):
    print("The 'asts' attribute does not exist or is not a list.")
else:
    sliced_asts = asts_value[index_start:index_end]
    print("Length of sliced AST data:", len(sliced_asts))
    if len(sliced_asts) == 0:
        print("Sliced AST data is empty.")
    else:
        print("Sample AST from sliced data:", sliced_asts[0])


# Codes: a missing attribute and a non-list attribute are reported separately.
if not hasattr(dataset, 'codes'):
    print("The 'codes' attribute does not exist.")
elif not isinstance(dataset.codes, list):
    print("The 'codes' attribute exists but is not a list, it is:", type(dataset.codes))
else:
    sliced_codes = dataset.codes[index_start:index_end]
    print("Length of sliced Code data:", len(sliced_codes))
    if len(sliced_codes) == 0:
        print("Sliced Code data is empty.")
    else:
        print("Sample Code from sliced data:", sliced_codes[0])


Total dataset length: 654393
Length of sliced AST data: 181061
Sample AST from sliced data: local_variable_declaration local_variable_declaration if_statement__ parenthesized_expression__ binary_expression__ binary_expression __binary_expression __parenthesized_expression if_statement__ parenthesized_expression expression_statement if_statement__ parenthesized_expression__ binary_expression__ unary_expression __binary_expression __parenthesized_expression return_statement __if_statement expression_statement __if_statement expression_statement if_statement__ parenthesized_expression__ unary_expression __parenthesized_expression return_statement __if_statement __if_statement expression_statement
Length of sliced Code data: 181061
Sample Code from sliced data: protected final void fastPathOrderedEmit ( U value , boolean delayError , Disposable disposable ) { final Observer < ? super V > observer = downstream ; final SimplePlainQueue < U > q = queue ; if ( wip . get ( ) == 0 && wip . compa

Saving the ASTs and their corresponding codes to a JSONL file

In [5]:
# Deduplicate the (AST, code) pairs of the loaded dataset and persist them as
# JSON Lines: one {"ast": ..., "code": ...} object per line, no header line.
import json

if hasattr(dataset, 'asts') and hasattr(dataset, 'codes') and dataset.asts and dataset.codes:
    if len(dataset.asts) != len(dataset.codes):
        # zip() would silently truncate to the shorter list, so refuse to pair them.
        print("The lengths of 'asts' and 'codes' do not match.")
    else:
        ast_code_pairs = list(zip(dataset.asts, dataset.codes))

        # dict.fromkeys drops duplicates while keeping first-seen order, so the
        # file content is identical from run to run (iterating a set of strings
        # is not, because string hashing is randomised per process).
        unique_ast_code_pairs = list(dict.fromkeys(ast_code_pairs))

        with open('unique_ast_code_pairs.jsonl', 'w', encoding='utf-8') as file:
            for ast, code in unique_ast_code_pairs:
                # default=str keeps the cell working if an entry is not a plain string.
                file.write(json.dumps({"ast": ast, "code": code}, default=str) + "\n")

        print("Unique AST and Code pairs have been written to 'unique_ast_code_pairs.jsonl'")
else:
    print("Either 'asts' or 'codes' attribute is empty or does not exist.")


Unique AST and Code pairs have been written to 'unique_ast_code_pairs.txt'


Checking unique components in the fine-tuning files:

In [None]:

# Load one saved fine-tuning split and list the distinct whitespace-separated
# tokens that occur anywhere in its ASTs.
filename = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/dataset_saved/fine_tune.summarization.java.test.pk'
# filename = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/dataset_saved/pre_train.pk'

# NOTE(security): pickle.load can run arbitrary code while deserializing;
# only open files that come from a trusted source.
with open(filename, 'rb') as file:
    dataset = pickle.load(file)
    print(len(dataset))

if hasattr(dataset, 'asts'):
    print("Type of 'asts':", type(dataset.asts))
    if not isinstance(dataset.asts, list):
        # Previously reported as "list is empty", which was misleading.
        print("The 'asts' attribute is not a list.")
    elif len(dataset.asts) > 0:
        print("Sample of 'asts':", dataset.asts[0])
    else:
        print("The 'asts' list is empty.")
else:
    print("The 'asts' attribute does not exist.")

if hasattr(dataset, 'asts') and dataset.asts:
    # Collect tokens AST by AST instead of joining everything into one huge
    # string first; the resulting set of tokens is the same.
    unique_tokens = set()
    for ast in dataset.asts:
        unique_tokens.update(str(ast).split())
    unique_components = sorted(unique_tokens)

    print("Unique components in 'asts':")
    for component in unique_components:
        print(component)
else:
    print("The 'asts' attribute is empty or does not exist.")

In [5]:
# NOTE(review): disabled cell. Print-only variant of the cell that writes the
# unique (AST, code) pairs to 'unique_ast_code_pairs.jsonl'; nothing here runs.
# if hasattr(dataset, 'asts') and hasattr(dataset, 'codes') and dataset.asts and dataset.codes:
#     if len(dataset.asts) != len(dataset.codes):
#         print("The lengths of 'asts' and 'codes' do not match.")
#     else:
#         ast_code_pairs = list(zip(dataset.asts, dataset.codes))
        
#         unique_ast_code_pairs = set(ast_code_pairs)
        
#         print("Unique ASTs with corresponding Codes:")
#         for ast, code in unique_ast_code_pairs:
#             print("AST:", ast, "Code:", code)
# else:
#     print("Either 'asts' or 'codes' attribute is empty or does not exist.")


In [None]:
# NOTE(review): disabled cell. Earlier draft that reloads pre_train.pk, samples
# the 45423:226484 AST slice and lists unique AST components; nothing here runs.
# If re-enabled, note the length guard (>= 226483) is one short of the slice end (226484).
# import pickle

# filename = '/home/user1-selab3/Documents/pre_train.pk'
# # filename = '/home/user1-selab3/Documents/research-shradha/CODE-SPT-Code/dataset/dataset_saved/pre_train.pk'

# with open(filename, 'rb') as file:
#     dataset = pickle.load(file)
#     print(len(dataset))

# if hasattr(dataset, 'asts') and isinstance(dataset.asts, list) and len(dataset.asts) >= 226483:
#     sliced_data = dataset.asts[45423:226484]  

#     print("Length of sliced data:", len(sliced_data))
#     if sliced_data:
#         print("Sample of 'asts' from sliced data:", sliced_data[0])
#     else:
#         print("Sliced data is empty.")
# else:
#     print("The 'asts' attribute does not exist or does not contain enough data.")

# if hasattr(dataset, 'asts') and dataset.asts:
#     extracted_data = ' '.join(str(ast) for ast in dataset.asts)
#     components = extracted_data.split() 
#     unique_components = list(set(components))  
#     unique_components.sort()  

#     print("Unique components in 'asts':")
#     for component in unique_components:
#         print(component)
# else:
#     print("The 'asts' attribute is empty or does not exist.")