In [1]:
import json
import os
import pandas as pd

In [2]:
# Root folder of the raw result files. The conversion loop appends
# "<context_path><dataset>/" to this prefix, so the trailing slash matters.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
raw_result_path = (
    '/Users/brain/Documents/GitHub/LLMs-topic-classification/'
    'results/results_raw/'
)

In [3]:
# Root folder that receives the generated CSV files. The conversion loop
# appends "<context_path>" to this prefix, so the trailing slash matters.
# NOTE(review): absolute, machine-specific path -- adjust when running elsewhere.
csv_result_path = (
    '/Users/brain/Documents/GitHub/LLMs-topic-classification/'
    'results/results_csv/'
)

In [4]:
# Dataset names. Each one is used both as the sub-folder that is read from
# and as the stem of the CSV file that is written.
dataset_list = [
    "Education_expenditure_and_indicators",
    "Health_expectancy",
    "Listed_monuments",
    "Livestock",
    "Milk_supply_and_dairy_production",
    "Mobility",
    "Plant_protection_products",
    "Population_dynamics",
    "Social_security",
    "Trade_and_industry",
]

In [5]:
# Relative "<setting>/<model>/" sub-paths, appended to both the input and the
# output root. Each entry must keep its trailing slash.
context_path_list = [
    "context/ChatGPT/",
    "no-context/ChatGPT/",
    "no-context/GoogleBard/",
]

In [6]:
# Replacement dictionary: maps a label exactly as it appears in a raw result
# cell (key) to the label written to the CSV (value). Lookup is by exact,
# case-sensitive string match, which is why some keys appear in two casings.
replacement_dict = {
    # Labels normalised to an upper-case form
    # (presumably the top-level topic names of the vocabulary -- TODO confirm).
    "Demography": "DEMOGRAPHY (POPULATION VITAL STATISTICS AND CENSUSES)",
    # Was "Economics" -> "Economics", an identity mapping that left this label
    # un-normalised while every sibling entry is upper-cased.
    "Economics": "ECONOMICS",
    "Education": "EDUCATION",
    "Health": "HEALTH",
    "History": "HISTORY",
    "Housing and Land Use": "HOUSING AND LAND USE",
    "Labour and Employment": "LABOUR AND EMPLOYMENT",
    "Law crime and legal systems": "LAW, CRIME AND LEGAL SYSTEMS",
    "Media, Communication and Language": "MEDIA, COMMUNICATION AND LANGUAGE",
    "Natural Environment": "NATURAL ENVIRONMENT",
    "Politics": "POLITICS",
    "Psychology": "PSYCHOLOGY",
    "Science and Technology": "SCIENCE AND TECHNOLOGY",
    "Social Stratification and Groupings": "SOCIAL STRATIFICATION AND GROUPINGS",
    "Social Welfare Policy and Systems": "SOCIAL WELFARE POLICY AND SYSTEMS",
    "Social welfare policy and systems": "SOCIAL WELFARE POLICY AND SYSTEMS",
    "Society and Culture": "SOCIETY AND CULTURE",
    "Trade, Industry and Markets": "TRADE, INDUSTRY AND MARKETS",
    "Transport and Travel": "TRANSPORT AND TRAVEL",
    "Other": "OTHER",
    # Labels normalised to a sentence-case form.
    "Social Welfare Systems/Structures": "Social welfare systems/structures",
    "Social Welfare Policy": "Social welfare policy",
    "Health Care Services and Policies": "Health care services and policies",
    "Time Use": "Time use",
    # Topics not in CESSDA: every one of these collapses to the NOT_FOUND marker.
    "Geography": "NOT_FOUND",
    "Contract Research": "NOT_FOUND",
    "Mental Health": "NOT_FOUND",
    "Mental health": "NOT_FOUND",
    "Total On Education Institutions": "NOT_FOUND",
    "Government": "NOT_FOUND",
    "Economics and indicators": "NOT_FOUND",
    "Time": "NOT_FOUND",
    "Social welfare systems/structures: use and availability": "NOT_FOUND",
    "Food composition": "NOT_FOUND",
    "Health | Diet and nutrition": "NOT_FOUND",
    "Nutrition and food consumption": "NOT_FOUND",
    "Nutrition": "NOT_FOUND",
    "Health -> Diet and nutrition": "NOT_FOUND",
    "Food products": "NOT_FOUND",
    "Food and nutrition": "NOT_FOUND",
    "Education - Expenditure": "NOT_FOUND",
    "Economics - Economic conditions and indicators": "NOT_FOUND",
    "Education - Expenditure and financing": "NOT_FOUND",
    "Education - Expenditure/financing": "NOT_FOUND",
    "Science and Technology - Biotechnology": "NOT_FOUND",
    "Economics - Economic policy, public expenditure and revenue": "NOT_FOUND",
    "Science and Technology - Research and development": "NOT_FOUND",
    "Education and Compulsory and pre-school education": "NOT_FOUND",
    "Education - Compulsory and pre-school education": "NOT_FOUND",
    "Social welfare policy - Specific social services: use and availability": "NOT_FOUND",
    "Education - Higher and further education": "NOT_FOUND",
    "Social and cultural identity": "NOT_FOUND",
    "Economic instruments": "NOT_FOUND",
    "Transport and Travel - Passenger transport": "NOT_FOUND",
    "Government - Public finance": "NOT_FOUND",
    "Government - Public expenditure": "NOT_FOUND",
    "Government - Government, political systems and organisations": "NOT_FOUND",
    "Government budget/expenditure": "NOT_FOUND",
    "Politics - Government, political systems and organisations": "NOT_FOUND",
    "Social stratification and groupings - Equality, inequality and social exclusion": "NOT_FOUND",
    "Social Stratification and Groupings - Income, expenditure and wealth": "NOT_FOUND",
}

In [7]:
def _load_runs(directory):
    """Parse every ``.txt`` file directly inside *directory* as JSON.

    Returns a dict keyed by the file name with every ``-`` and every ``.txt``
    removed; each value is whatever ``json.load`` produced for that file.
    Sub-directories and files with another extension are skipped.
    """
    runs = {}
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the resulting CSV reproducible. The sort is lexicographic, so
    # e.g. "run10" comes before "run2".
    for filename in sorted(os.listdir(directory)):
        source_path = os.path.join(directory, filename)
        if not (os.path.isfile(source_path) and filename.endswith('.txt')):
            continue
        run_str = filename.replace('-', '').replace('.txt', '')
        # The context manager closes the handle deterministically, and the
        # explicit encoding avoids depending on the platform default.
        with open(source_path, 'r', encoding='utf-8') as handle:
            runs[run_str] = json.load(handle)
    return runs


def _normalise_label(value, mapping):
    """Return ``mapping[value]`` on an exact match, otherwise *value* unchanged."""
    try:
        return mapping.get(value, value)
    except TypeError:
        # Unhashable cell (e.g. a nested list or dict): it can never equal a
        # key, so it is left untouched instead of aborting the whole run.
        return value


# For every (context, dataset) pair: read the raw runs, normalise the labels
# with replacement_dict and write one CSV per dataset under csv_result_path.
for context_path in context_path_list:
    for dataset in dataset_list:
        directory = os.path.join(raw_result_path, context_path, dataset)

        # One row per run file; the run name moves from the index into a
        # regular 'run_index' column.
        df = (
            pd.DataFrame.from_dict(_load_runs(directory), orient='index')
            .reset_index()
            .rename(columns={'index': 'run_index'})
        )

        # Replace each cell that exactly matches a key of replacement_dict.
        for col in df.columns:
            df[col] = df[col].map(
                lambda value: _normalise_label(value, replacement_dict)
            )

        # Ensure the output directory exists, creating it if necessary.
        output_directory_path = os.path.join(csv_result_path, context_path)
        os.makedirs(output_directory_path, exist_ok=True)

        # Full path of the CSV written for this dataset.
        output_file_name = dataset + ".csv"
        file_path = os.path.join(output_directory_path, output_file_name)

        # Written without the positional index; 'run_index' identifies the row.
        df.to_csv(file_path, index=False)