In [3]:
import os
import re
import warnings

import pandas as pd


In [37]:

def extract_filename_info(filename):
    """Parse experiment parameters out of a results CSV file name.

    The expected shape of the name is::

        results_all_methods_<theme>_hierarchy_t<t>_maxsub<max_sub>_depth<depth>
            [_synonyms<synonyms>][_<branching>].csv

    where ``<branching>`` is one of ``increasing``, ``decreasing``,
    ``constant`` or ``random``.

    Args:
        filename: Bare file name (no directory component); matching is
            anchored at the start of the string.

    Returns:
        A dict with the keys ``theme``, ``t``, ``max_sub``, ``depth``,
        ``synonyms`` and ``branching``. All values are strings exactly as
        they appear in the name. ``synonyms`` falls back to ``"0"`` and
        ``branching`` to ``"constant"`` when the optional part is absent.
        Returns ``None`` when the name does not follow the pattern.
    """
    # The dot before "csv" is escaped so that only a literal ".csv"
    # extension is accepted (an unescaped "." would match any character).
    pattern = (r"results_all_methods_(?P<theme>.+)_hierarchy_t(?P<t>[\d\.]+)_"
               r"maxsub(?P<max_sub>\d+)_depth(?P<depth>\d+)"
               r"(?:_synonyms(?P<synonyms>\d+))?"
               r"(?:_(?P<branching>increasing|decreasing|constant|random))?\.csv$")

    match = re.match(pattern, filename)
    if match is None:
        return None

    info = match.groupdict()
    # Optional groups come back as None when missing; substitute defaults.
    info["synonyms"] = info["synonyms"] or "0"
    info["branching"] = info["branching"] or "constant"
    return info

def process_files(folder_path, string_filters):
    """Combine matching result CSVs from a folder into one DataFrame and save it.

    A file in ``folder_path`` is selected when its name starts with
    ``results_all_methods_``, ends with ``.csv`` and contains at least one
    of ``string_filters`` as a substring. Each selected file is read with
    ``pd.read_csv`` and the fields returned by ``extract_filename_info``
    are added as constant columns. The combined frame is written to
    ``processed_results_<filters joined by "_">.csv`` inside ``folder_path``.

    Args:
        folder_path: Directory containing the result CSV files.
        string_filters: Iterable of substrings; a single string is treated
            as one filter. With no filters, no file is selected.

    Returns:
        Tuple ``(combined_df, processed_files)``: the concatenated
        DataFrame (empty if nothing was selected) and the list of file
        names that contributed to it, in processing order.
    """
    # A bare string would otherwise be iterated character by character,
    # both in the substring check and when building the output name.
    if isinstance(string_filters, str):
        string_filters = [string_filters]
    else:
        string_filters = list(string_filters)

    frames = []
    processed_files = []

    # os.listdir returns entries in arbitrary order; sort for reproducible
    # row order and processed_files content.
    for file in sorted(os.listdir(folder_path)):
        if not (file.endswith(".csv") and file.startswith("results_all_methods_")):
            continue
        if not any(s in file for s in string_filters):
            continue

        file_info = extract_filename_info(file)
        if file_info is None:
            # Prefix and extension match but the rest of the name does not
            # follow the expected pattern; skip it rather than crash.
            warnings.warn(
                f"Skipping {file!r}: file name does not match the expected pattern",
                stacklevel=2,
            )
            continue

        df = pd.read_csv(os.path.join(folder_path, file))

        # Add extracted filename info as new columns
        for key, value in file_info.items():
            df[key] = value

        frames.append(df)
        processed_files.append(file)

    # Concatenate once instead of growing a DataFrame inside the loop;
    # pd.concat rejects an empty list, so fall back to an empty frame.
    combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    output_filename = "processed_results_" + "_".join(string_filters) + ".csv"
    output_path = os.path.join(folder_path, output_filename)
    combined_df.to_csv(output_path, index=False)
    return combined_df, processed_files

In [38]:

# Usage example
folder_path = "bertopic_results"  # Change this to the actual folder path
# Pass the variable (not a repeated literal) so that editing folder_path
# above actually changes which folder is processed.
combined_df, processed_files = process_files(folder_path, ['synonyms10'])

