In [None]:
import pandas as pd
import os
import re

In [10]:
# Numeric token as it appears in the summaries: digits, a decimal point and an
# optional exponent with either sign and either case (e.g. "0.001", "1e-05",
# "1.50e+01").
_NUMBER = r"[\d.eE+-]+"


def _first_group(pattern, text):
    """Return capture group 1 of the first match of `pattern` in `text`, or None."""
    match = re.search(pattern, text)
    return match.group(1) if match else None


def _all_groups(pattern, text):
    """Return all capture groups of the first match, or one None per group."""
    compiled = re.compile(pattern)
    match = compiled.search(text)
    return match.groups() if match else (None,) * compiled.groups


def _stripped(value):
    """Strip surrounding whitespace from `value`, passing None through unchanged."""
    return value.strip() if value is not None else None


def parse_txt_file(file_path):
    """Extract the fields of one experiment summary .txt file into a flat dict.

    Each field is located with a regular expression over the whole file text.
    A field whose pattern does not match is reported as None instead of
    aborting the parse, so a single incomplete summary cannot break the
    aggregation of a whole folder.

    Parameters
    ----------
    file_path : str
        Path of the summary file to read.

    Returns
    -------
    dict
        28 entries: "file_name" (base name of `file_path`) plus one entry per
        extracted field. Every extracted value is the matched text (str), not a
        converted number, or None when the field is absent from the file.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    with open(file_path, 'r') as file:
        content = file.read()

    # "Best Hyperparameters" and "Best Results" are multi-line sections; each
    # is matched as a whole so the values are taken from that section only.
    best_seed, best_data_size_pct, best_reg_val, best_lr = _all_groups(
        r"Best Hyperparameters:\s+Seed: (\d+)"
        r"\s+Data Size Percentage: (\d+)%"
        rf"\s+Regularization Value: ({_NUMBER})"
        rf"\s+Learning Rate: ({_NUMBER})",
        content,
    )
    train_loss, train_accuracy, test_loss, test_accuracy = _all_groups(
        rf"Best Results:\s+Train Loss: ({_NUMBER})"
        r"\s+Train Accuracy: ([\d.]+)%"
        rf"\s+Test Loss: ({_NUMBER})"
        r"\s+Test Accuracy: ([\d.]+)%",
        content,
    )

    # Notes run until the first blank line, or until end of file when the
    # Notes section is the last thing in the summary.
    notes = _stripped(_first_group(r"Notes:\s+([\s\S]+?)(?:\n\n|\Z)", content))

    # The architecture listing spans several lines and ends where the
    # "Optimizer:" line begins.
    model_architecture = _stripped(
        _first_group(r"Model Architecture:\s+([\s\S]+?)\n\s*Optimizer:", content)
    )

    return {
        "file_name": os.path.basename(file_path),
        # General information
        "date_time": _first_group(r"Date and Time: (.+)", content),
        "dataset_name": _first_group(r"Dataset Name: (.+)", content),
        "reg_type": _first_group(r"Regularization Type: (.+)", content),
        "run_time": _first_group(r"Total Run Time: (.+)", content),
        # Experiment parameters (list contents without the surrounding brackets)
        "seeds": _first_group(r"Random Seeds: \[(.+)\]", content),
        "data_size": _first_group(r"Dataset Sizes: \[(.+)\]", content),
        "reg_values": _first_group(r"Regularization Values: \[(.+)\]", content),
        "learn_rates": _first_group(r"Learning Rates: \[(.+)\]", content),
        "combinations": _first_group(r"Total Combinations: (\d+)", content),
        # Training details
        "batch_size": _first_group(r"Batch Size: (\d+)", content),
        "epochs": _first_group(r"Epochs: (\d+)", content),
        # Best configuration and its results
        "best_seed": best_seed,
        "best_data_size": best_data_size_pct,
        "best_reg_value": best_reg_val,
        "best_learn_rate": best_lr,
        "train_loss": train_loss,
        "train_acc": train_accuracy,
        "test_loss": test_loss,
        "test_acc": test_accuracy,
        # Dataset details
        "total_samples": _first_group(r"Total Samples: (\d+)", content),
        "train_split": _first_group(r"Training Split: ([\d.]+)%", content),
        "test_split": _first_group(r"Testing Split: ([\d.]+)%", content),
        "notes": notes,
        # Model details
        "model_type": _first_group(r"Model Type: (.+)", content),
        "model_arch": model_architecture,
        "optimizer": _first_group(r"Optimizer: (.+)", content),
        "loss_function": _first_group(r"Loss Function: (.+)", content),
    }


In [None]:
# Folder containing the experiment summary .txt files.
# Set the EXPERIMENTS_DIR environment variable to point the notebook at a
# different folder; when it is unset or empty, the original local path is used.
folder_path = os.environ.get("EXPERIMENTS_DIR") or (
    r"C:\Users\canel\OneDrive\Desktop\SRP\hypothesis-testing3\hypothesis-testing\Variation_vs_Seeds\dna"
)
# File name of the aggregated CSV, created inside the same folder as the summaries.
output_name = 'experiments.csv'

output_path = os.path.join(folder_path, output_name)
output_path

'C:\\Users\\canel\\OneDrive\\Desktop\\SRP\\hypothesis-testing3\\hypothesis-testing\\Variation_vs_Seeds\\dna\\experiments.csv'

In [12]:
# Collect one parsed record per .txt summary in the folder.
# os.listdir returns entries in arbitrary order, so the names are sorted to
# keep the row order of the dataframe (and of the exported CSV) reproducible.
data = []
for file_name in sorted(os.listdir(folder_path)):
    file_path = os.path.join(folder_path, file_name)
    # Skip anything that is not a regular .txt file (e.g. a directory whose
    # name happens to end in ".txt").
    if not (file_name.endswith('.txt') and os.path.isfile(file_path)):
        continue
    data.append(parse_txt_file(file_path))

# Every selected file contributes exactly one record.
txt_file_count = len(data)

# Print the number of files read
print(f"Number of .txt files read: {txt_file_count}")

# Create a dataframe with one row per summary file
df = pd.DataFrame(data)

# Display the dataframe
df.head()

Number of .txt files read: 15


Unnamed: 0,file_name,date_time,dataset_name,reg_type,run_time,seeds,data_size,reg_values,learn_rates,combinations,...,test_loss,test_acc,total_samples,train_split,test_split,notes,model_type,model_arch,optimizer,loss_function
0,dna_l2_summary_20241129_192553.txt,2024-11-29 19:25:53,dna,l2,04:29:39,"1, 2, 3","1, 5, 10, 25, 50, 75, 100","0.0, 1e-05, 0.001, 0.1","1e-05, 0.001, 0.1",252,...,1.2747,93.18,3186,60,40,Write any comments about the experiment here.,MLP,- Linear(180 -> 2048)\n - ReLU\n - Linea...,Adam,CrossEntropyLoss
1,dna_l2_summary_20241130_045257.txt,2024-11-30 04:52:57,dna,l2,00:09:49,"1, 2, 3","100, 75, 50, 25, 10, 5, 1",1e-05,0.001,21,...,0.7272,93.33,3186,60,40,Write any comments about the experiment here.,MLP,- Linear(180 -> 2048)\n - ReLU\n - Linea...,Adam,CrossEntropyLoss
2,dna_l2_summary_20241130_050400.txt,2024-11-30 05:04:00,dna,l2,00:06:49,"4, 5","100, 75, 50, 25, 10, 5, 1",1e-05,0.001,14,...,0.7814,92.63,3186,60,40,Write any comments about the experiment here.,MLP,- Linear(180 -> 2048)\n - ReLU\n - Linea...,Adam,CrossEntropyLoss
3,dna_l2_summary_20241130_051151.txt,2024-11-30 05:11:51,dna,l2,00:16:22,"6, 7, 8, 9, 10","100, 75, 50, 25, 10, 5, 1",1e-05,0.001,35,...,1.3668,92.63,3186,60,40,Write any comments about the experiment here.,MLP,- Linear(180 -> 2048)\n - ReLU\n - Linea...,Adam,CrossEntropyLoss
4,dna_l2_summary_20241130_053407.txt,2024-11-30 05:34:07,dna,l2,00:32:13,"11, 12, 13, 14, 15, 16, 17, 18, 19, 20","100, 75, 50, 25, 10, 5, 1",1e-05,0.001,70,...,0.4301,93.73,3186,60,40,Write any comments about the experiment here.,MLP,- Linear(180 -> 2048)\n - ReLU\n - Linea...,Adam,CrossEntropyLoss


In [None]:
# Write the aggregated table to disk. The frame carries only the default
# 0..n-1 row index, so it is left out of the file; written by default, it
# becomes an unnamed first column that read_csv loads back as "Unnamed: 0".
df.to_csv(output_path, index=False)