In [30]:
import json
import re
import pandas as pd
import numpy as np

In [41]:
# Load the JSON file. Notebook JSON is UTF-8, so the encoding is pinned rather
# than left to the platform default.
file_path = "ml_project_train_and_evaluation.json"  # Replace with your file path
with open(file_path, "r", encoding="utf-8") as file:
    json_data = json.load(file)


def _as_text(multiline):
    """Return an nbformat multi-line field as a single string.

    ``source`` and stream ``text`` are stored either as one string or as a
    list of lines that already carry their own line endings, so the pieces
    are concatenated with no separator. Joining with a newline would double
    every line break and would split the plain-string form into characters.
    """
    if isinstance(multiline, str):
        return multiline
    return "".join(multiline)


# Extract all outputs and their corresponding sources: one (source, text)
# pair for every text-bearing output of every code cell.
outputs_with_sources = []
for cell in json_data["cells"]:
    if cell["cell_type"] != "code" or "outputs" not in cell:
        continue
    source = _as_text(cell["source"])
    for output in cell["outputs"]:
        # Only outputs that carry a "text" field are kept.
        if "text" in output:
            outputs_with_sources.append((source, _as_text(output["text"])))

# Columns are left unnamed, so they are saved under the default labels 0 and 1.
df_summary = pd.DataFrame(outputs_with_sources)

# Display the DataFrame
print(df_summary)

# Save to CSV
output_csv_path = "parsed_metrics_with_source.csv"
df_summary.to_csv(output_csv_path, index=False)
print(f"Results saved to: {output_csv_path}")

                                                    0  \
0   # 1. Logistic Regression on W100_O25_Features\...   
1   # 2. Decision Tree Classifier on W100_O25_Feat...   
2   # 3. Random Forest Classifier on W100_O25_Feat...   
3   # 4. Gaussian Naive Bayes on W100_O25_Features...   
4   # 5. Support Vector Classifier on W100_O25_Fea...   
..                                                ...   
95  # 6. K-Nearest Neighbors on W500_O50_Features\...   
96  # 7. AdaBoost Classifier on W500_O50_Features\...   
97  # 8. Gradient Boost on W500_O50_Features\n\nev...   
98  # 9. XGBoost Classifier on W500_O50_Features\n...   
99  # 10. Artificial Neural Network on W500_O50_Fe...   

                                                    1  
0   #------------------- #1. Logistic Regression M...  
1   #-------------------- #2. Decission Tree Class...  
2   #-------------------- #3. Random Forest Classi...  
3   #-------------------- #4. Gaussian Naive Bias ...  
4   #-------------------- #5. Suppo

In [42]:
# Read the saved source/output pairs back from disk and preview the first rows.
source_file = pd.read_csv(filepath_or_buffer="parsed_metrics_with_source.csv")
source_file.head(n=5)


Unnamed: 0,0,1
0,# 1. Logistic Regression on W100_O25_Features\...,#------------------- #1. Logistic Regression M...
1,# 2. Decision Tree Classifier on W100_O25_Feat...,#-------------------- #2. Decission Tree Class...
2,# 3. Random Forest Classifier on W100_O25_Feat...,#-------------------- #3. Random Forest Classi...
3,# 4. Gaussian Naive Bayes on W100_O25_Features...,#-------------------- #4. Gaussian Naive Bias ...
4,# 5. Support Vector Classifier on W100_O25_Fea...,#-------------------- #5. Support Vector Class...


In [43]:
import re

# Accumulator for parsed records; parse_metrics_flexible appends one dict per call.
results_list = list()


# Function to extract pipeline prefix
def extract_source(command):
    """Return the word that precedes ``_Features`` after "on" in ``command``.

    Args:
        command: Text to search, e.g. a cell's leading header comment.

    Returns:
        The captured prefix (for example ``"W100_O25"``), or ``"Unknown"``
        when the text contains no ``on <prefix>_Features`` phrase.
    """
    found = re.search(r"on\s+(\w+)_Features", command)
    if found is None:
        return "Unknown"
    return found.group(1)


def _mean_of_scores(pattern, text):
    """Average the numbers in the bracketed list captured by ``pattern``.

    Returns None when the label is not found or the brackets hold no numbers,
    so an empty list never causes a division by zero.
    """
    found = re.search(pattern, text)
    if found is None:
        return None
    # Values may be separated by whitespace and/or commas; float() also
    # understands scientific notation such as 2.5e-01.
    scores = [float(token) for token in found.group(1).replace(",", " ").split()]
    if not scores:
        return None
    return sum(scores) / len(scores)


def parse_metrics_flexible(source, text):
    """Parse one (cell source, cell output) pair into a metrics record.

    The model name and feature set are read from a header of the form
    ``# <n>. <model> on <feature set>_Features`` in ``source``. Accuracy, the
    weighted-average precision/recall/F1 and the two cross-validation score
    lists are read from ``text``. Any metric that is not found is stored as
    None, and each score list is reduced to its mean.

    Args:
        source: Source code of the notebook cell.
        text: Text output produced by that cell.

    Returns:
        The record dict that was appended to the module-level
        ``results_list``, or None if parsing raised (the error is printed and
        nothing is appended).
    """
    try:
        # Define regex patterns
        model_pattern = r"#\s*\d+\.\s*(.*?)\s*on\s*(\w+)_Features"  # Extract model name and feature set
        accuracy_pattern = r"accuracy\s+([\d.]+)"  # Extract accuracy
        weighted_avg_pattern = r"weighted avg\s+([\d.]+)\s+([\d.]+)\s+([\d.]+)"  # Extract weighted avg precision, recall, f1-score
        # Bracketed list of numbers: digits, dots, whitespace, commas, exponents.
        score_list = r"\[([\d.\s,eE+\-]*)\]"
        kfold_pattern = r"K-fold cross[- ]validation scores:\s*" + score_list
        # "Strat?ified" matches the misspelled label "Straified" as well as
        # the correctly spelled "Stratified".
        stratified_pattern = r"Strat?ified cross[- ]validation scores:\s*" + score_list

        # Extract model name and feature set from the source
        model_match = re.search(model_pattern, source)
        if model_match:
            model = model_match.group(1).strip()
            feature_set = model_match.group(2).strip()
        else:
            model = "Unknown Model"
            feature_set = "Unknown Features"

        # Extract accuracy
        accuracy_match = re.search(accuracy_pattern, text)
        accuracy = float(accuracy_match.group(1)) if accuracy_match else None

        # Extract weighted avg precision, recall, and f1-score
        weighted_avg_match = re.search(weighted_avg_pattern, text)
        if weighted_avg_match:
            precision, recall, f1_score = (float(value) for value in weighted_avg_match.groups())
        else:
            precision, recall, f1_score = None, None, None

        record = {
            "Source": f"{model} on {feature_set}",
            "Model": model,
            "Accuracy": accuracy,
            "Precision": precision,
            "Recall": recall,
            "F1 Score": f1_score,
            "K-fold score": _mean_of_scores(kfold_pattern, text),
            "Stratified score": _mean_of_scores(stratified_pattern, text),
        }

        # Append the extracted results to the results list
        results_list.append(record)
        return record
    except Exception as e:
        # Best-effort parsing: report the failing row and carry on with the rest.
        print(f"Error processing source '{source}': {e}")
        return None

# Parse all rows with improved regex.
# The cell source and its output are taken as the LAST two columns of the
# frame. Selecting from the right keeps the loop correct when the CSV also
# carries a leading saved-index column ("Unnamed: 0"), which would otherwise
# shift positional access onto the wrong fields.
source_output_pairs = source_file.iloc[:, -2:].itertuples(index=False, name=None)
for source_text, output_text in source_output_pairs:
    parse_metrics_flexible(source_text, output_text)

# Convert parsed results to a DataFrame
df_parsed_metrics = pd.DataFrame(results_list)
print(df_parsed_metrics.head())

# Written with pandas' default index column (no index=False here).
df_parsed_metrics.to_csv("evaluation_metrics.csv")

                                  Source                      Model  Accuracy  \
0        Logistic Regression on W100_O25        Logistic Regression      0.35   
1   Decision Tree Classifier on W100_O25   Decision Tree Classifier      0.53   
2   Random Forest Classifier on W100_O25   Random Forest Classifier      0.65   
3       Gaussian Naive Bayes on W100_O25       Gaussian Naive Bayes      0.22   
4  Support Vector Classifier on W100_O25  Support Vector Classifier      0.59   

   Precision  Recall  F1 Score  K-fold score  Stratified score  
0       0.34    0.35      0.32      0.319709          0.314451  
1       0.53    0.53      0.53      0.620816          0.618992  
2       0.65    0.65      0.64      0.734933          0.736941  
3       0.24    0.22      0.18      0.200148          0.200095  
4       0.57    0.59      0.58      0.168098          0.169947  
