# Data format parsing - JSON to CSV

In [1]:
import pandas as pd
import json
import os

In [2]:
# Root directory that is walked recursively for '*.json' sample files.
data_directory = './data/train'  # Update this path

In [3]:
# Keys shared by the 'focal_method' and 'test_case' sections, in output column order.
_METHOD_KEYS = (
    'identifier', 'parameters', 'modifiers', 'return', 'body',
    'signature', 'full_signature', 'class_method_signature',
    'testcase', 'constructor',
)


def _method_columns(prefix, method):
    """Return the ``<prefix>_*`` columns for one method-like JSON section.

    Every key in ``_METHOD_KEYS`` is required (``KeyError`` if absent);
    'invocations' is optional and defaults to an empty list.
    """
    columns = {f'{prefix}_{key}': method[key] for key in _METHOD_KEYS}
    columns[f'{prefix}_invocations'] = method.get('invocations', [])
    return columns


def _class_columns(prefix, cls, keys):
    """Return the ``<prefix>_*`` columns for one class-like JSON section (all keys required)."""
    return {f'{prefix}_{key}': cls[key] for key in keys}


def _flatten_sample(content):
    """Flatten one parsed JSON sample into a single-level dict (one CSV row).

    Raises:
        KeyError: if a required section or field is missing from ``content``.
    """
    repository = content['repository']
    record = {
        # Repository features
        'repo_id': repository['repo_id'],
        'url': repository['url'],
        'language': repository.get('language', ''),
        'fork_count': repository.get('fork_count', 0),
        # The column was previously filled only from the key 'stars', and the
        # preview saved in this notebook shows 0 for every row. Prefer a key that
        # matches the column name and keep 'stars' as a fallback so inputs that
        # do use it behave as before.
        # NOTE(review): presumably the source key is 'stargazer_count' - confirm
        # against the dataset schema.
        'stargazer_count': repository.get('stargazer_count', repository.get('stars', 0)),
    }
    # Dicts preserve insertion order, so the update order below is the column order.
    record.update(_class_columns(
        'focal_class', content['focal_class'],
        ('identifier', 'superclass', 'interfaces', 'fields', 'methods', 'file'),
    ))
    record.update(_method_columns('focal_method', content['focal_method']))
    record.update(_class_columns(
        'test_class', content['test_class'],
        ('identifier', 'superclass', 'interfaces', 'fields', 'file'),
    ))
    record.update(_method_columns('test_case', content['test_case']))
    return record


def _load_records(directory):
    """Recursively read every '*.json' file under ``directory`` into flat records.

    Directories and files are visited in sorted order so the resulting row order
    is reproducible across runs and filesystems.

    Raises:
        KeyError: if a file lacks a required field; the message names the file.
        json.JSONDecodeError: if a file is not valid JSON.
    """
    records = []
    for root, dirs, files in os.walk(directory):
        dirs.sort()  # in-place sort steers os.walk's (top-down) traversal order
        for file in sorted(files):
            if not file.endswith('.json'):
                continue
            file_path = os.path.join(root, file)
            # Explicit encoding: do not depend on the platform's locale default.
            with open(file_path, 'r', encoding='utf-8') as f:
                content = json.load(f)
            try:
                records.append(_flatten_sample(content))
            except KeyError as exc:
                # Same exception type as before, but say which file is malformed.
                raise KeyError(f'{file_path}: missing required key {exc}') from exc
    return records


data = _load_records(data_directory)

# Convert to DataFrame
df = pd.DataFrame(data)

# Save to CSV
csv_file_path = 'java_train_dataset.csv'  # Update this path
df.to_csv(csv_file_path, index=False)


In [7]:
# Preview the first five rows of the DataFrame.
# NOTE(review): the saved output below has an 'Unnamed: 0' column, which the
# frame built by pd.DataFrame(data) in the preceding cell would not contain, and
# the execution count jumps from 3 to 7 - the stored output looks like it came
# from a different `df` (e.g. one re-read from a CSV written with its index).
# Re-run from a fresh kernel to confirm.
df.head()

Unnamed: 0,repo_id,url,language,fork_count,stargazer_count,focal_class_identifier,focal_class_superclass,focal_class_interfaces,focal_class_fields,focal_class_methods,...,test_case_parameters,test_case_modifiers,test_case_return,test_case_body,test_case_signature,test_case_full_signature,test_case_class_method_signature,test_case_testcase,test_case_constructor,test_case_invocations
0,58314354,https://github.com/bytefish/JavaElasticSearchE...,Java,11,0,IgnoreMissingValuesConverter,,implements ITypeConverter<Float>,[{'original_string': 'private List<String> mis...,[{'identifier': 'IgnoreMissingValuesConverter'...,...,(),@Test public,void,@Test\n public void returns_null_if_value_i...,void returns_null_if_value_is_missing(),@Test public void returns_null_if_value_is_mis...,IgnoreMissingValuesConverterTest.returns_null_...,True,False,"[assertEquals, convert, assertEquals, convert,..."
1,58314354,https://github.com/bytefish/JavaElasticSearchE...,Java,11,0,DateUtilities,,,[],"[{'identifier': 'from', 'parameters': '(LocalD...",...,(),@Test public,void,@Test\n public void generated_date_has_utc_...,void generated_date_has_utc_offset_when_none_i...,@Test public void generated_date_has_utc_offse...,DateUtilitiesTest.generated_date_has_utc_offse...,True,False,"[of, of, atTime, atOffset, from, assertEquals,..."
2,58314354,https://github.com/bytefish/JavaElasticSearchE...,Java,11,0,DateUtilities,,,[],"[{'identifier': 'from', 'parameters': '(LocalD...",...,(),@Test public,void,@Test\n public void generated_date_has_give...,void generated_date_has_given_offset_when_offs...,@Test public void generated_date_has_given_off...,DateUtilitiesTest.generated_date_has_given_off...,True,False,"[of, of, ofHours, atTime, atOffset, from, asse..."
3,58314354,https://github.com/bytefish/JavaElasticSearchE...,Java,11,0,DateUtilities,,,[],"[{'identifier': 'from', 'parameters': '(LocalD...",...,(),@Test public,void,@Test\n public void generated_date_has_give...,void generated_date_has_given_timezone_when_gi...,@Test public void generated_date_has_given_tim...,DateUtilitiesTest.generated_date_has_given_tim...,True,False,"[of, of, atTime, ofHours, atOffset, from, asse..."
4,58314354,https://github.com/bytefish/JavaElasticSearchE...,Java,11,0,LocalWeatherDataConverter,,,[],"[{'identifier': 'convert', 'parameters': '(csv...",...,(),@Test public,void,@Test\n public void testConvert() throws Ex...,void testConvert(),@Test public void testConvert(),LocalWeatherDataConverterTest.testConvert(),True,False,"[setWban, setDate, of, setTime, of, setDryBulb..."


# Function assembly
Extracting all function pieces and assembling the main function body and test cases

In [None]:
# Read the flattened dataset from disk.
file_path = 'data/java_test_dataset.csv'
df = pd.read_csv(file_path)

# Columns carried over into the reduced dataset, grouped by the JSON section
# they were flattened from. Trim or extend these two lists as needed.
focal_class_columns = [
    'focal_class_identifier',
    'focal_class_superclass',
    'focal_class_interfaces',
    'focal_class_fields',
    'focal_class_methods',
]
test_case_columns = [
    'test_case_signature',
    'test_case_full_signature',
    'test_case_class_method_signature',
    'test_case_testcase',
    'test_case_constructor',
    'test_case_invocations',
]
columns_of_interest = focal_class_columns + test_case_columns

# Keep only the selected columns (raises KeyError if any are absent).
new_df = df[columns_of_interest]

# Persist the reduced dataset without the index column and report where it went.
new_file_path = 'new_dataset.csv'
new_df.to_csv(new_file_path, index=False)
print(f"New dataset created and saved to {new_file_path}")
