In [1]:
import pandas as pd
import os

In [2]:
# Root of the LLMs-topic-classification checkout. Set the environment variable
# LLM_TOPIC_CLASSIFICATION_ROOT to run the notebook from another location;
# when it is unset (or empty) the original hard-coded location is used.
repo_root = (
    os.environ.get("LLM_TOPIC_CLASSIFICATION_ROOT")
    or '/Users/brain/Documents/GitHub/LLMs-topic-classification'
).rstrip('/')

# Path to results csv (trailing slash kept: sub-folder names are appended to it)
csv_result_path = repo_root + '/results/results_csv/'

# Path to CESSDA Topic Classification CSV
cessda_path = repo_root + '/controlled-vocabs/cessda.csv'

# Path to output (trailing slash kept: sub-folder names are appended to it)
mapped_csv_path = repo_root + '/results/mapped_csv/'

In [3]:
# Sub-directories (each with a trailing slash) that are appended to both
# csv_result_path and mapped_csv_path when files are read and written.
context_path_list = [
    "context/ChatGPT/",
    "context/GoogleGemini/",
    "no-context/ChatGPT/",
    "no-context/GoogleBard/",
    "no-context/GoogleGemini/",
]

# File names that are looked up inside every sub-directory listed above.
csv_list = [
    "Education_expenditure_and_indicators.csv",
    "Health_expectancy.csv",
    "Listed_monuments.csv",
    "Livestock.csv",
    "Milk_supply_and_dairy_production.csv",
    "Mobility.csv",
    "Plant_protection_products.csv",
    "Population_dynamics.csv",
    "Social_security.csv",
    "Trade_and_industry.csv",
]

In [4]:
# First code of every group, in ascending order. Each group is the contiguous
# run of integer codes from its first code up to (but not including) the first
# code of the next group; the last group ends at code 95.
_group_first_codes = [1, 5, 11, 17, 30, 31, 34, 42, 45, 50,
                      55, 62, 63, 66, 76, 80, 90, 94, 95]
_last_code = 95

# group key -> list of all codes in that group (the key itself is a member).
soft_consistency_mapping = {
    first: list(range(first, next_first))
    for first, next_first in zip(_group_first_codes,
                                 _group_first_codes[1:] + [_last_code + 1])
}

In [5]:
# Load the CESSDA Topic Classification CSV located at cessda_path into a DataFrame.
cessda_df = pd.read_csv(filepath_or_buffer=cessda_path)

In [6]:
# Lookup used for the strict mapping: both the 'Code descriptive term' and the
# 'Code value' of a CESSDA row resolve to that row's 'Topic code'.
# Built once here because it does not depend on the result file being processed.
topic_to_code = dict(zip(cessda_df['Code descriptive term'], cessda_df['Topic code']))
topic_to_code.update(dict(zip(cessda_df['Code value'], cessda_df['Topic code'])))

# Lookup used for the soft mapping: member code -> key of its group in
# soft_consistency_mapping. setdefault keeps the first group that lists a
# code, matching a first-match scan over the groups in definition order.
child_to_parent = {}
for parent_code, member_codes in soft_consistency_mapping.items():
    for member_code in member_codes:
        child_to_parent.setdefault(member_code, parent_code)


def map_function(x):
    """Return the group key of ``x`` in ``soft_consistency_mapping``.

    Values that are not a member of any group (for example labels that could
    not be mapped to a code, or missing values) are returned unchanged.
    """
    return child_to_parent.get(x, x)


for context_path in context_path_list:
    for csv in csv_list:
        file_path = os.path.join(csv_result_path, context_path, csv)

        df = pd.read_csv(file_path)

        # Every column except 'run_index' is mapped. 'run_index' must be left
        # alone: its small integers would otherwise be rewritten by the soft
        # mapping below (e.g. 2 -> 1), corrupting the run identifiers.
        topic_columns = [col for col in df.columns if col != 'run_index']

        ### STRICT CODE MAPPING
        # Replace every value found in topic_to_code by its topic code;
        # values without an entry are kept as they are.
        for col in topic_columns:
            df[col] = df[col].replace(topic_to_code)

        # Ensure the directory exists, create it if necessary
        output_directory_path = os.path.join(mapped_csv_path, context_path, "strict_consistency")
        os.makedirs(output_directory_path, exist_ok=True)

        # Specify the full file path including the filename
        out_file_path = os.path.join(output_directory_path, csv)

        # Save the DataFrame to a CSV file in the specified directory
        df.to_csv(out_file_path, index=False)

        ### SOFT CODE MAPPING
        # Collapse each strict code to the key of its group. Work on a copy so
        # that df keeps holding the strict result.
        df_soft = df.copy()
        for col in topic_columns:
            df_soft[col] = df[col].map(map_function)

        # Ensure the directory exists, create it if necessary for soft mapping
        output_soft_mapping_directory_path = os.path.join(mapped_csv_path, context_path, "soft_consistency")
        os.makedirs(output_soft_mapping_directory_path, exist_ok=True)

        # Specify the full file path including the filename for soft mapping
        out_soft_mapping_file_path = os.path.join(output_soft_mapping_directory_path, csv)

        # Save the DataFrame to a CSV file in the specified directory
        df_soft.to_csv(out_soft_mapping_file_path, index=False)