In [1]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from utils.retrieve_stats import *

def compute_results_table(path):
    """
    Summarise best-epoch metrics over every run stored under ``path``.

    Every entry directly under ``path`` is treated as one run. For each run the
    best epoch is chosen by ``find_bestepoch`` from the statistics returned by
    ``getstats``, and the metric files inside ``<run>/Epoch <best_epoch>`` are
    read:

    * ``<split>-loss.csv`` / ``<split>-acc.csv`` are read without a header row
      and reduced to the mean of their first column.
    * ``<split>-preds.csv`` is read with a header row; its ``actual`` and
      ``preds`` columns are passed to ``roc_auc_score`` to obtain the AUC.

    Parameters:
        path (str | os.PathLike): Directory containing one sub-folder per run.

    Returns:
        pandas.DataFrame: One row per metric with the columns ``Metric``,
        ``Min``, ``Max`` and ``Mean``, aggregated across runs while ignoring
        NaN entries. A metric with no usable value (no runs found, or every
        file missing) yields NaN in all three columns instead of raising.
    """
    path = Path(path)
    folders = sorted(path.glob("*"))

    # Get statistics for each run and pick the epoch to report for it
    results = [getstats(folder, rolling_window=20, endofepoch=False) for folder in folders]
    best_epochs = {
        folder.name: find_bestepoch(result, avoid_premature=10)
        for folder, result in zip(folders, results)
    }

    # Metrics to extract
    metrics = ["train-loss", "train-acc", "train-auc", "test-loss", "test-acc", "test-auc"]
    results_dict = {metric: [] for metric in metrics}

    # Process each run
    for run_name, best_epoch in best_epochs.items():
        epoch_dir = path / run_name / f"Epoch {best_epoch}"

        if not epoch_dir.exists():
            # The whole run is skipped: nothing is appended for any metric.
            print(f"Warning: {epoch_dir} not found!")
            continue

        for metric in metrics:
            is_auc = "auc" in metric
            # AUC is not stored directly; it is derived from the predictions file.
            file_name = f"{metric.replace('-auc', '-preds')}.csv" if is_auc else f"{metric}.csv"
            file_path = epoch_dir / file_name

            if not file_path.exists():
                print(f"Warning: {file_path} not found!")
                results_dict[metric].append(np.nan)  # Use NaN for missing values
                continue

            # Prediction files carry a header row; plain metric files do not.
            df = pd.read_csv(file_path, header=0 if is_auc else None)

            if is_auc:
                # NOTE(review): roc_auc_score is not imported explicitly in this
                # notebook; presumably it arrives via the star import of
                # utils.retrieve_stats -- confirm.
                try:
                    auc_value = roc_auc_score(df["actual"], df["preds"])
                except Exception as e:
                    # Best effort: a failing AUC is reported and recorded as NaN
                    # so the remaining runs still contribute to the summary.
                    print(f"Error computing AUC for {file_path}: {e}")
                    auc_value = np.nan
                results_dict[metric].append(auc_value)
            else:
                results_dict[metric].append(df.mean().values[0])  # Assume single value per CSV

    # Convert to DataFrame
    summary = {
        "Metric": metrics,
        "Min": [_nan_aggregate(np.nanmin, results_dict[m]) for m in metrics],
        "Max": [_nan_aggregate(np.nanmax, results_dict[m]) for m in metrics],
        "Mean": [_nan_aggregate(np.nanmean, results_dict[m]) for m in metrics],
    }

    return pd.DataFrame(summary)


def _nan_aggregate(reducer, values):
    """Apply a NaN-ignoring NumPy reducer, returning NaN when nothing can be reduced."""
    arr = np.asarray(values, dtype=float)
    # np.nanmin / np.nanmax raise ValueError on an empty input, and all three
    # reducers warn on all-NaN input; short-circuit both cases to a plain NaN.
    if arr.size == 0 or np.isnan(arr).all():
        return np.nan
    return reducer(arr)



In [2]:
from IPython.display import display

# Root directory holding one sub-folder of runs per encoding. An earlier set of
# runs uses the same layout under Path.cwd() / "results"; point RESULTS_ROOT
# there to summarise those instead.
RESULTS_ROOT = Path.cwd() / "results-new"

# The order of this list is the order in which the tables are shown below.
paths = [
    RESULTS_ROOT / "symbolic" / "atchley",
    RESULTS_ROOT / "symbolic" / "kidera",
    RESULTS_ROOT / "symbolic" / "aaprop",
    RESULTS_ROOT / "symbolic" / "random",
    RESULTS_ROOT / "tcr-bert",
    RESULTS_ROOT / "sceptr",
]

for path in paths:
    print(f"Results for {path}")
    results = compute_results_table(path)
    # Display the table
    display(results)


Results for /Users/janp/Desktop/fyp/tcr-cancer-prediction/results-new/symbolic/atchley


Unnamed: 0,Metric,Min,Max,Mean
0,train-loss,0.688618,0.691822,0.690622
1,train-acc,0.584229,0.666667,0.621505
2,train-auc,0.552906,0.636003,0.603572
3,test-loss,0.688631,0.69185,0.690077
4,test-acc,0.6,0.657143,0.634286
5,test-auc,0.680448,0.762238,0.717449


Results for /Users/janp/Desktop/fyp/tcr-cancer-prediction/results-new/symbolic/kidera


Unnamed: 0,Metric,Min,Max,Mean
0,train-loss,0.690295,0.69279,0.691705
1,train-acc,0.519713,0.594982,0.565591
2,train-auc,0.53092,0.664085,0.587652
3,test-loss,0.688541,0.692858,0.691451
4,test-acc,0.5,0.685714,0.594286
5,test-auc,0.581633,0.754529,0.681069


Results for /Users/janp/Desktop/fyp/tcr-cancer-prediction/results-new/symbolic/aaprop


Unnamed: 0,Metric,Min,Max,Mean
0,train-loss,0.685945,0.689867,0.687681
1,train-acc,0.548387,0.605735,0.585663
2,train-auc,0.611785,0.645174,0.630501
3,test-loss,0.685076,0.694331,0.68852
4,test-acc,0.442857,0.657143,0.568571
5,test-auc,0.590194,0.736888,0.675747


Results for /Users/janp/Desktop/fyp/tcr-cancer-prediction/results-new/symbolic/random


Unnamed: 0,Metric,Min,Max,Mean
0,train-loss,0.688908,0.692767,0.690774
1,train-acc,0.512545,0.716846,0.602867
2,train-auc,0.55827,0.738416,0.650108
3,test-loss,0.687509,0.693821,0.690833
4,test-acc,0.514286,0.714286,0.611429
5,test-auc,0.599445,0.758152,0.657886


Results for /Users/janp/Desktop/fyp/tcr-cancer-prediction/results-new/tcr-bert


Unnamed: 0,Metric,Min,Max,Mean
0,train-loss,0.202796,0.229251,0.216024
1,train-acc,0.928315,0.9319,0.930108
2,train-auc,0.969697,0.979203,0.97445
3,test-loss,0.296123,0.330003,0.313063
4,test-acc,0.828571,0.871429,0.85
5,test-auc,0.923913,0.951531,0.937722


Results for /Users/janp/Desktop/fyp/tcr-cancer-prediction/results-new/sceptr


Unnamed: 0,Metric,Min,Max,Mean
0,train-loss,0.472772,0.654375,0.582046
1,train-acc,0.734848,0.954545,0.863636
2,train-auc,0.909136,0.997354,0.946723
3,test-loss,0.490476,0.672127,0.618702
4,test-acc,0.606061,0.969697,0.748485
5,test-auc,0.657025,1.0,0.800282


In [3]:
import numpy as np

def generate_latex_table(encoding_names, results_dict):
    """
    Generates LaTeX code for a train table and a test table from the computed results.

    Parameters:
        encoding_names (list): Encoding keys (e.g. "atchley", "kidera"); one
            table row is written per entry, in the given order. Keys without a
            known display name are printed as-is.
        results_dict (dict): Nested mapping where
            ``results_dict[encoding][metric]`` holds numeric ``"Min"``,
            ``"Max"`` and ``"Mean"`` entries for each of the metrics
            ``train-loss``, ``train-acc``, ``train-auc``, ``test-loss``,
            ``test-acc`` and ``test-auc``.

    Returns:
        str: LaTeX source for two ``table`` environments (train first, then
        test) separated by a blank line.

    Raises:
        KeyError: If an encoding, metric or statistic is missing from
            ``results_dict``.
    """
    encoding_headers = {
        "atchley": "Atchley",
        "kidera": "Kidera",
        "aaprop": "AA Properties",
        "random": "Random",
        "tcr-bert": "TCR-BERT",
        "sceptr": "SCEPTR"
    }

    tables = [
        _render_split_table(split, encoding_names, results_dict, encoding_headers)
        for split in ("train", "test")
    ]
    # Every table already ends with a newline, so joining on "\n" leaves
    # exactly one blank line between the two environments.
    return "\n".join(tables)


def _format_stat_range(stats):
    """Format one metric as ``min - max ($\\mu$: mean)`` with three decimals."""
    return f"{stats['Min']:.3f} - {stats['Max']:.3f} ($\\mu$: {stats['Mean']:.3f})"


def _render_split_table(split, encoding_names, results_dict, encoding_headers):
    """Render the LaTeX table for one data split (``"train"`` or ``"test"``)."""
    title = split.capitalize()
    headers = ["Encoding", f"{title} Loss", f"{title} Accuracy", f"{title} AUC"]

    lines = [
        "\\begin{table}[h]",
        "    \\centering",
        "    \\begin{tabular}{|l|c|c|c|}",
        "        \\hline",
        "        " + " & ".join("\\textbf{" + h + "}" for h in headers) + " \\\\",
        "        \\hline",
    ]

    for encoding in encoding_names:
        cells = [
            _format_stat_range(results_dict[encoding][f"{split}-{suffix}"])
            for suffix in ("loss", "acc", "auc")
        ]
        # Fall back to the raw key so an unlisted encoding does not abort the table.
        label = encoding_headers.get(encoding, encoding)
        lines.append(f"        {label} & " + " & ".join(cells) + " \\\\")

    lines += [
        "        \\hline",
        "    \\end{tabular}",
        f"    \\caption{{Results for the best-performing checkpoint on the {split} set}}",
        f"    \\label{{tab:{split}-results}}",
        "\\end{table}",
    ]
    return "\n".join(lines) + "\n"

# Summary statistics per encoding, keyed by the encoding's folder name:
# results_dict[encoding][metric] -> {"Min": ..., "Max": ..., "Mean": ...}
results_dict = {}

for path in paths:
    encoding_name = path.name  # The last path component names the encoding
    df_summary = compute_results_table(path)  # Min/Max/Mean table for this encoding

    # Index the table by metric once so each statistic is a direct lookup
    by_metric = df_summary.set_index("Metric")

    per_metric = {}
    for metric in ["train-loss", "train-acc", "train-auc", "test-loss", "test-acc", "test-auc"]:
        per_metric[metric] = {
            "Min": by_metric.at[metric, "Min"],
            "Max": by_metric.at[metric, "Max"],
            "Mean": by_metric.at[metric, "Mean"],
        }
    results_dict[encoding_name] = per_metric

# Show which encodings were collected, then build the LaTeX source
print(results_dict.keys())
latex_code = generate_latex_table(list(results_dict.keys()), results_dict)

# Print the LaTeX table code
print(latex_code)


dict_keys(['atchley', 'kidera', 'aaprop', 'random', 'tcr-bert', 'sceptr'])
\begin{table}[h]
    \centering
    \begin{tabular}{|l|c|c|c|}
        \hline
        \textbf{Encoding} & \textbf{Train Loss} & \textbf{Train Accuracy} & \textbf{Train AUC} \\
        \hline
        Atchley & 0.689 - 0.692 ($\mu$: 0.691) & 0.584 - 0.667 ($\mu$: 0.622) & 0.553 - 0.636 ($\mu$: 0.604) \\
        Kidera & 0.690 - 0.693 ($\mu$: 0.692) & 0.520 - 0.595 ($\mu$: 0.566) & 0.531 - 0.664 ($\mu$: 0.588) \\
        AA Properties & 0.686 - 0.690 ($\mu$: 0.688) & 0.548 - 0.606 ($\mu$: 0.586) & 0.612 - 0.645 ($\mu$: 0.631) \\
        Random & 0.689 - 0.693 ($\mu$: 0.691) & 0.513 - 0.717 ($\mu$: 0.603) & 0.558 - 0.738 ($\mu$: 0.650) \\
        TCR-BERT & 0.203 - 0.229 ($\mu$: 0.216) & 0.928 - 0.932 ($\mu$: 0.930) & 0.970 - 0.979 ($\mu$: 0.974) \\
        SCEPTR & 0.473 - 0.654 ($\mu$: 0.582) & 0.735 - 0.955 ($\mu$: 0.864) & 0.909 - 0.997 ($\mu$: 0.947) \\
        \hline
    \end{tabular}
    \caption{Results for 