In [8]:
import pandas as pd

In [14]:
import os
import glob

# Path to output directory
output_dir = "../outputs/final"
results_dir = "../outputs/results"


def load_result_csvs(directory, pattern):
    """Read every CSV in ``directory`` whose file name matches ``pattern``.

    Args:
        directory: Folder to search (the search is not recursive).
        pattern: Glob pattern for the file names, e.g. ``"results_*.csv"``.

    Returns:
        A ``(paths, frames)`` tuple. ``paths`` is the sorted list of matching
        file paths; ``frames`` maps each base file name to the DataFrame read
        from it, in the same sorted order. Both are empty when nothing matches.
    """
    # glob.glob gives no ordering guarantee (it depends on the file system),
    # so sort the matches: anything that later iterates over the returned dict
    # then sees the files in the same order on every machine.
    paths = sorted(glob.glob(os.path.join(directory, pattern)))
    frames = {os.path.basename(path): pd.read_csv(path) for path in paths}
    return paths, frames


# Load all classification result CSV files
classification_files, classification_dfs = load_result_csvs(
    output_dir, "results_classification*.csv"
)
print(f"Loaded {len(classification_dfs)} classification CSV files")

# Load all topic modelling result CSV files
topic_modelling_files, topic_modelling_dfs = load_result_csvs(
    output_dir, "results_topic_modelling*.csv"
)
print(f"Loaded {len(topic_modelling_dfs)} topic modelling CSV files")

Loaded 12 classification CSV files
Loaded 3 topic modelling CSV files


In [19]:
# Stack the per-file topic modelling tables into a single dataframe with a
# fresh 0..n-1 index.
all_topic_modelling_df = pd.concat(topic_modelling_dfs.values(), ignore_index=True)

# Print the shape to verify
print(f"Concatenated topic modeling dataframe shape: {all_topic_modelling_df.shape}")
print(all_topic_modelling_df.head())

# to_csv / to_excel do not create missing parent folders, so make sure the
# results directory exists before writing (no-op when it is already there).
os.makedirs(results_dir, exist_ok=True)

# Save the concatenated dataframe to the results directory (CSV and Excel)
topic_modelling_output_path = os.path.join(results_dir, "results_topic_modelling.csv")
topic_modelling_output_path_excel = os.path.join(results_dir, "results_topic_modelling.xlsx")
all_topic_modelling_df.to_csv(topic_modelling_output_path, index=False)
all_topic_modelling_df.to_excel(topic_modelling_output_path_excel, index=False)

print(f"Saved concatenated topic modeling results to {topic_modelling_output_path}")

Concatenated topic modeling dataframe shape: (63, 7)
              Dataset                Noise  Model  ARI Score  \
0  20 Newsgroups Full  AddRandomCharsNoise  LDA_4       0.00   
1  20 Newsgroups Full  AddRandomCharsNoise  LSI_4       0.01   
2  20 Newsgroups Full  AddRandomCharsNoise  NMF_4       0.06   
3  20 Newsgroups Full  AddRandomWordsNoise  LDA_4       0.00   
4  20 Newsgroups Full  AddRandomWordsNoise  LSI_4       0.00   

   Topics Coherence  Cosine Similarity  Reconstruction Error  
0              0.47               0.34                   NaN  
1              0.60               0.64                   NaN  
2              0.74               0.55             62.555435  
3              0.40               0.98                   NaN  
4              0.59               0.66                   NaN  
Saved concatenated topic modeling results to ../outputs/results/results_topic_modelling.csv


In [20]:
# Stack the per-file classification tables into a single dataframe with a
# fresh 0..n-1 index.
all_classification_df = pd.concat(classification_dfs.values(), ignore_index=True)

# Print the shape to verify
print(f"Concatenated classification dataframe shape: {all_classification_df.shape}")
print(all_classification_df.head())

# to_csv / to_excel do not create missing parent folders, so make sure the
# results directory exists before writing (no-op when it is already there).
os.makedirs(results_dir, exist_ok=True)

# Save the concatenated dataframe to the results directory (CSV and Excel)
classification_output_path = os.path.join(results_dir, "results_classification.csv")
classification_output_path_excel = os.path.join(results_dir, "results_classification.xlsx")
all_classification_df.to_csv(classification_output_path, index=False)
all_classification_df.to_excel(classification_output_path_excel, index=False)

print(f"Saved concatenated classification results to {classification_output_path}")

Concatenated classification dataframe shape: (378, 7)
        Dataset                Noise                Model  Accuracy  F1 Score  \
0  AG News Full  AddRandomCharsNoise             LightGBM      0.25      0.40   
1  AG News Full  AddRandomCharsNoise      LightGBMRoberta      0.74      0.74   
2  AG News Full  AddRandomCharsNoise         RandomForest      0.50      0.50   
3  AG News Full  AddRandomCharsNoise  RandomForestRoberta      0.78      0.77   
4  AG News Full  AddRandomCharsNoise                  SVM      0.62      0.62   

   Precision  Recall  
0       1.00    0.25  
1       0.75    0.74  
2       0.56    0.50  
3       0.78    0.78  
4       0.63    0.62  
Saved concatenated classification results to ../outputs/results/results_classification.csv


In [22]:
# Read the two merged result tables back from disk (classification first,
# then topic modelling) so the row-count cells that follow inspect the saved files.
results_classification, results_topic_modeling = (
    pd.read_csv(saved_csv)
    for saved_csv in (
        '../outputs/results/results_classification.csv',
        '../outputs/results/results_topic_modelling.csv',
    )
)

In [25]:
# Expected number of classification rows:
#   supervised: 8 functions * 7 noises * 6 datasets (LLM + Full) = 336
#   zero-shot:  2 models    * 7 noises * 3 datasets (only full)  =  42
#   total:      336 + 42 = 378
results_classification.shape[0]

378

In [None]:
# Topic Modelling: 3 models * 7 noises * 3 datasets = 63
len(results_topic_modeling) 

63