In [4]:
import os
from pathlib import Path
import numpy as np

In [None]:
cwd = Path(os.getcwd())
PROJETC_ROOT = cwd.parent
DATA_PATH = PROJETC_ROOT / "data"

basis = np.load(DATA_PATH / "basis.npz")["matrix"]

In [18]:
# RESULTS_DIR = PROJECT_ROOT / "results" / "pca" / "non-normalized"

## Dataset size

In [17]:
num_samples = basis.shape[0]
original_num_features = basis.shape[1]

print(f"Number of samples: {num_samples}")
print(f"Number of features: {original_num_features}")

Number of samples: 1505141
Number of features: 844


In [21]:
X_NORM = np.load(DATA_PATH / 'X_norm.npz')['X_norm']
X = np.load(DATA_PATH / 'X.npz')['X']

## Experiments options

### Gathering results

In [30]:
RESULTS_DIR = PROJETC_ROOT/ "results"

In [130]:
methods = ["PCA", "PaCMAP", "tSNE", "TriMAP", "UMAP"]
methods =[m.lower() for m in methods]

archs = ["CPU", "GPU"]
archs = [a.lower() for a in archs]

dataset_options = ["normalized", "non_normalized", "non-normalized"]

In [42]:
# ...existing code...
RESULTS_DIR = PROJETC_ROOT/ "results"

# Iterate through the top-level items in RESULTS_DIR
print(f"Contents of {RESULTS_DIR}:")
for item in RESULTS_DIR.iterdir():
    if item.is_dir():
        print(f"  Directory: {item.name}")
    elif item.is_file():
        print(f"  File: {item.name}")

# If you want to iterate recursively through all subdirectories and files:
print(f"\nRecursive contents of {RESULTS_DIR}:")
for path_object in RESULTS_DIR.rglob('*'): # rglob for recursive globbing
    if path_object.is_file():
        print(f"  File: {path_object}") # Prints the full path
    elif path_object.is_dir():
        print(f"  Directory: {path_object}") # Prints the full path

Contents of c:\Users\basia\Desktop\Semestr_letni_8\LCS\projekt_2\LSC_Dimensionality_Reduction\results:
  Directory: pacmap
  Directory: pacmap_cpu
  Directory: parampacmap_gpu
  Directory: pca
  Directory: pca_gpu
  Directory: test
  Directory: trimap
  Directory: trimap_cpu
  Directory: tsne
  Directory: tsne_cpu
  Directory: tsne_gpu
  Directory: umap
  Directory: umap_cpu
  Directory: umap_gpu

Recursive contents of c:\Users\basia\Desktop\Semestr_letni_8\LCS\projekt_2\LSC_Dimensionality_Reduction\results:
  Directory: c:\Users\basia\Desktop\Semestr_letni_8\LCS\projekt_2\LSC_Dimensionality_Reduction\results\pacmap
  Directory: c:\Users\basia\Desktop\Semestr_letni_8\LCS\projekt_2\LSC_Dimensionality_Reduction\results\pacmap_cpu
  Directory: c:\Users\basia\Desktop\Semestr_letni_8\LCS\projekt_2\LSC_Dimensionality_Reduction\results\parampacmap_gpu
  Directory: c:\Users\basia\Desktop\Semestr_letni_8\LCS\projekt_2\LSC_Dimensionality_Reduction\results\pca
  Directory: c:\Users\basia\Desktop\

In [71]:
from dataclasses import dataclass
import re
from typing import Optional


@dataclass
class LogMetrics:
    timestamp: Optional[str] = None
    method_details: Optional[str] = None
    explained_variance_first_2: Optional[tuple[float, float]] = None
    total_explained_variance_100_comps: Optional[float] = None
    components_for_90_variance: Optional[int] = None
    wall_time_s: Optional[float] = None
    cpu_time_user_s: Optional[float] = None
    cpu_time_sys_s: Optional[float] = None
    cpu_time_total_s: Optional[float] = None
    gpu_kernel_time_s: Optional[float] = None
    gpu_memory_used_mb: Optional[float] = None
    gpu_memory_total_mb: Optional[float] = None
    gpu_model: Optional[str] = None
    gpu_driver: Optional[str] = None
    points: Optional[int] = None
    original_dims: Optional[int] = None
    pca_dims_saved: Optional[int] = None
    hostname: Optional[str] = None
    # Add other fields if necessary for other log types

def parse_log_metrics(log_content: str) -> LogMetrics:
    data = LogMetrics()
    for line in log_content.splitlines():
        line = line.strip()
        if not line:
            continue

        if data.timestamp is None and line.startswith("["):
            match_header = re.match(r"\[(.*?)\]\s*(.*)", line)
            if match_header:
                data.timestamp = match_header.group(1)
                data.method_details = match_header.group(2)
                continue
        
        m = re.search(r"Wall time: ([\d.]+) s", line)
        if m: data.wall_time_s = float(m.group(1)); continue
        
        m = re.search(r"CPU times: user ([\d.]+) s, sys ([\d.]+) s, total ([\d.]+) s", line)
        if m:
            data.cpu_time_user_s = float(m.group(1))
            data.cpu_time_sys_s = float(m.group(2))
            data.cpu_time_total_s = float(m.group(3))
            continue
        
        m = re.search(r"Explained variance \(first 2\): ([\d.]+)%, ([\d.]+)%", line)
        if m: data.explained_variance_first_2 = (float(m.group(1)), float(m.group(2))); continue
        
        m = re.search(r"Total explained variance \((\d+) comps\): ([\d.]+)%", line)
        if m: data.total_explained_variance_100_comps = float(m.group(2)); continue
            
        m = re.search(r"Components needed for >=90% variance: (\d+)", line)
        if m: data.components_for_90_variance = int(m.group(1)); continue

        m = re.search(r"GPU kernel time: ([\d.]+) s", line)
        if m: data.gpu_kernel_time_s = float(m.group(1)); continue

        m = re.search(r"GPU memory used: ([\d.]+) MB / ([\d.]+) MB", line)
        if m:
            data.gpu_memory_used_mb = float(m.group(1))
            data.gpu_memory_total_mb = float(m.group(2))
            continue

        m = re.search(r"GPU model: (.*?), Driver: (.*)", line)
        if m:
            data.gpu_model = m.group(1).strip()
            data.gpu_driver = m.group(2).strip()
            continue

        m = re.search(r"Points: (\d+), Original dims: (\d+), PCA dims saved: (\d+)", line)
        if m:
            data.points = int(m.group(1))
            data.original_dims = int(m.group(2))
            data.pca_dims_saved = int(m.group(3))
            continue
        
        m = re.search(r"Hostname: (.*)", line)
        if m: data.hostname = m.group(1).strip(); continue
    return data

In [137]:
from typing import List


@dataclass
class ExperimentResult:
    method: str
    architecture: str
    dataset_type: str
    metrics: LogMetrics
    precomputed_pca: str | None
    original_method_arch_string: str

all_experiment_results: List[ExperimentResult] = []


In [108]:
RESULTS_DIR

WindowsPath('c:/Users/basia/Desktop/Semestr_letni_8/LCS/projekt_2/LSC_Dimensionality_Reduction/results')

In [205]:
log_metrics_data = []
all_experiment_results = []

for time_file_path in RESULTS_DIR.rglob("*_time.txt"):
    if time_file_path.is_file():
        # Store log metrics          
        log_content = time_file_path.read_text(encoding='utf-8')
        log_metrics = parse_log_metrics(log_content)
        log_metrics_data.append(log_metrics)

        # Parse path to extract method, architecture, dataset type, and precomputed PCA
        relative_to_project_root = time_file_path.relative_to(RESULTS_DIR)
        path = os.sep + str(relative_to_project_root)
        path = path[1:] # Remove first slash

        parts_with_os_sep = path.split(os.sep)

        method = None
        normalized = "non-normalized"
        arch = "cpu"
        precomputed_pca = None

        for part in parts_with_os_sep:
            part = part.lower() # just to make sure

            if "_" in part:
                original_part = part
                parts = part.split("_") # Output: ['pacmap', 'cpu']
                for subpart in parts:
                    if "cmlu" in subpart:
                        arch = "gpu"
                    if subpart in methods:
                        method = subpart
                    elif subpart in dataset_options:
                        if subpart == "raw":
                            normalized = "non-normalized"
                        normalized = subpart
                    elif subpart in archs:
                        arch = subpart
                    elif "pca" in subpart:
                        precomputed_pca = True

            if "cmlu" in part:
                arch = "gpu"
            if part in methods:
                method = part
            elif part in dataset_options:
                if part == "raw":
                    normalized = "non-normalized"
                normalized = part
            elif part in archs:
                arch = part
            elif "pca" in part:
                precomputed_pca = True

        experiment = ExperimentResult(
                method=method,
                architecture=arch,
                dataset_type=normalized,
                metrics=log_metrics,
                precomputed_pca=precomputed_pca,
                original_method_arch_string=original_part
            )
        all_experiment_results.append(experiment)


In [207]:
import pandas as pd

In [198]:
all_experiment_results[40]

ExperimentResult(method='umap', architecture='gpu', dataset_type='non-normalized', metrics=[LogMetrics(timestamp='2025-05-30T17:34:28.319348', method_details='PaCMAP', explained_variance_first_2=None, total_explained_variance_100_comps=None, components_for_90_variance=None, wall_time_s=1720.77, cpu_time_user_s=1796.82, cpu_time_sys_s=7.65, cpu_time_total_s=1804.47, gpu_kernel_time_s=None, gpu_memory_used_mb=None, gpu_memory_total_mb=None, gpu_model=None, gpu_driver=None, points=None, original_dims=None, pca_dims_saved=None, hostname='ac0625'), LogMetrics(timestamp='2025-06-08T17:29:16.170107', method_details='PaCMAP (CPU)', explained_variance_first_2=None, total_explained_variance_100_comps=None, components_for_90_variance=None, wall_time_s=1941.74, cpu_time_user_s=2024.34, cpu_time_sys_s=4.77, cpu_time_total_s=2029.11, gpu_kernel_time_s=None, gpu_memory_used_mb=None, gpu_memory_total_mb=None, gpu_model=None, gpu_driver=None, points=None, original_dims=None, pca_dims_saved=None, hostna

In [206]:
vars(all_log_metrics[40])

{'timestamp': '2025-06-07T23:22:50.388660',
 'method_details': 'cuML‑UMAP',
 'explained_variance_first_2': None,
 'total_explained_variance_100_comps': None,
 'components_for_90_variance': None,
 'wall_time_s': 24.37,
 'cpu_time_user_s': None,
 'cpu_time_sys_s': None,
 'cpu_time_total_s': None,
 'gpu_kernel_time_s': None,
 'gpu_memory_used_mb': None,
 'gpu_memory_total_mb': None,
 'gpu_model': None,
 'gpu_driver': None,
 'points': None,
 'original_dims': None,
 'pca_dims_saved': None,
 'hostname': 't0011'}

In [84]:
import pandas as pd

In [209]:
df_metrics = pd.DataFrame([vars(metric) for metric in all_log_metrics])

In [208]:
df_results = pd.DataFrame([vars(res) for res in all_experiment_results])

In [166]:
df_results

Unnamed: 0,method,architecture,dataset_type,metrics,precomputed_pca,original_method_arch_string
0,pacmap,cpu,non-normalized,[LogMetrics(timestamp='2025-05-30T17:34:28.319...,,pacmap_time.txt
1,pacmap,cpu,non_normalized,[LogMetrics(timestamp='2025-05-30T17:34:28.319...,,pacmap_time.txt
2,pacmap,cpu,normalized,[LogMetrics(timestamp='2025-05-30T17:34:28.319...,,pacmap_time.txt
3,pacmap,cpu,normalized,[LogMetrics(timestamp='2025-05-30T17:34:28.319...,True,pacmap_time.txt
4,pacmap,cpu,normalized,[LogMetrics(timestamp='2025-05-30T17:34:28.319...,True,pacmap_time.txt
...,...,...,...,...,...,...
163,umap,gpu,non-normalized,[LogMetrics(timestamp='2025-05-30T17:34:28.319...,,umap_gpu_time.txt
164,umap,gpu,normalized,[LogMetrics(timestamp='2025-05-30T17:34:28.319...,,umap_gpu_time.txt
165,umap,gpu,non-normalized,[LogMetrics(timestamp='2025-05-30T17:34:28.319...,True,umap_gpu_time.txt
166,umap,gpu,non-normalized,[LogMetrics(timestamp='2025-05-30T17:34:28.319...,True,umap_gpu_time.txt


In [181]:
from dataclasses import fields

In [202]:
all_experiment_results

[ExperimentResult(method='pacmap', architecture='cpu', dataset_type='non-normalized', metrics=[LogMetrics(timestamp='2025-05-30T17:34:28.319348', method_details='PaCMAP', explained_variance_first_2=None, total_explained_variance_100_comps=None, components_for_90_variance=None, wall_time_s=1720.77, cpu_time_user_s=1796.82, cpu_time_sys_s=7.65, cpu_time_total_s=1804.47, gpu_kernel_time_s=None, gpu_memory_used_mb=None, gpu_memory_total_mb=None, gpu_model=None, gpu_driver=None, points=None, original_dims=None, pca_dims_saved=None, hostname='ac0625'), LogMetrics(timestamp='2025-06-08T17:29:16.170107', method_details='PaCMAP (CPU)', explained_variance_first_2=None, total_explained_variance_100_comps=None, components_for_90_variance=None, wall_time_s=1941.74, cpu_time_user_s=2024.34, cpu_time_sys_s=4.77, cpu_time_total_s=2029.11, gpu_kernel_time_s=None, gpu_memory_used_mb=None, gpu_memory_total_mb=None, gpu_model=None, gpu_driver=None, points=None, original_dims=None, pca_dims_saved=None, hos

In [217]:
if all_experiment_results:
    experiment_data_for_df = []
    for exp in all_experiment_results:
        exp_vars = vars(exp).copy()
        # print(exp_vars)

        if 'metrics' in exp_vars:
            metrics_data = exp_vars.pop('metrics')
            metrics_vars = metrics_data.__dict__
            combined_vars = {**exp_vars, **metrics_vars}
        else:
            combined_vars = exp_vars
        experiment_data_for_df.append(combined_vars)
    
    df_experiments = pd.DataFrame(experiment_data_for_df)


else:
    print("'all_experiment_results' is empty.")

In [218]:
df_experiments

Unnamed: 0,method,architecture,dataset_type,precomputed_pca,original_method_arch_string,timestamp,method_details,explained_variance_first_2,total_explained_variance_100_comps,components_for_90_variance,...,cpu_time_total_s,gpu_kernel_time_s,gpu_memory_used_mb,gpu_memory_total_mb,gpu_model,gpu_driver,points,original_dims,pca_dims_saved,hostname
0,pacmap,cpu,non-normalized,,pacmap_time.txt,2025-05-30T17:34:28.319348,PaCMAP,,,,...,1804.47,,,,,,,,,ac0625
1,pacmap,cpu,non_normalized,,pacmap_time.txt,2025-06-08T17:29:16.170107,PaCMAP (CPU),,,,...,2029.11,,,,,,,,,t0016
2,pacmap,cpu,normalized,,pacmap_time.txt,2025-06-08T15:42:32.155650,PaCMAP (CPU),,,,...,1965.12,,,,,,,,,t0011
3,pacmap,cpu,normalized,True,pacmap_time.txt,2025-06-08T15:39:51.778189,PaCMAP (CPU),,,,...,1766.17,,,,,,,,,t0016
4,pacmap,cpu,normalized,True,pacmap_time.txt,2025-06-08T17:26:16.031911,PaCMAP (CPU),,,,...,1842.09,,,,,,,,,t0016
5,,gpu,non_normalized,,parampacmap_gpu_time.txt,2025-06-08T00:02:56.257684,Parametric PaCMAP (GPU),,,,...,2375.59,,,,,,,,,t0003
6,,gpu,normalized,,parampacmap_gpu_time.txt,2025-06-08T00:04:56.206260,Parametric PaCMAP (GPU),,,,...,2476.85,,,,,,,,,t0003
7,pca,gpu,normalized,True,parampacmap_gpu_time.txt,2025-06-08T12:01:58.166864,Parametric PaCMAP (GPU),,,,...,2243.02,,,,,,,,,t0007
8,pca,gpu,normalized,True,parampacmap_gpu_time.txt,2025-06-08T12:02:07.991668,Parametric PaCMAP (GPU),,,,...,2266.05,,,,,,,,,t0016
9,pca,cpu,non-normalized,True,pca_time.txt,2025-05-31T23:59:39.218420,PCA (AFDB + ESMAtlas + MIP),,,,...,41.11,,,,,,,,,t0011


In [None]:
# ...existing code...
RESULTS_DIR = PROJETC_ROOT/ "results"

categories = {}

# # Iterate through the top-level items in RESULTS_DIR
# print(f"Contents of {RESULTS_DIR}:")
# for item in RESULTS_DIR.iterdir():
#     if item.is_dir():
#         print(f"  Directory: {item.name}")
#     elif item.is_file():
#         print(f"  File: {item.name}")

# If you want to iterate recursively through all subdirectories and files:
print(f"\nRecursive contents of {RESULTS_DIR}:")
for path_object in RESULTS_DIR.rglob('*'): # rglob for recursive globbing
    relative_to_project_root = path_object.relative_to(RESULTS_DIR)
    path = os.sep + str(relative_to_project_root)
    path = path[1:] # Remove first slash
    # if path_object.is_file():
    #     print(f"  File: {path_object}") # Prints the full path
    if path_object.is_dir():
        # print(f"  Directory: {path_object}") # Prints the full path
        print(path)
        parts = path.split(os.sep)
        cleaned_parts = parts[1:] if parts and not parts[0] else parts
        print(f"Cleaned parts: {cleaned_parts}")

    
        


Recursive contents of c:\Users\basia\Desktop\Semestr_letni_8\LCS\projekt_2\LSC_Dimensionality_Reduction\results:
pacmap
Cleaned parts: ['pacmap']
pacmap_cpu
Cleaned parts: ['pacmap_cpu']
parampacmap_gpu
Cleaned parts: ['parampacmap_gpu']
pca
Cleaned parts: ['pca']
pca_gpu
Cleaned parts: ['pca_gpu']
test
Cleaned parts: ['test']
trimap
Cleaned parts: ['trimap']
trimap_cpu
Cleaned parts: ['trimap_cpu']
tsne
Cleaned parts: ['tsne']
tsne_cpu
Cleaned parts: ['tsne_cpu']
tsne_gpu
Cleaned parts: ['tsne_gpu']
umap
Cleaned parts: ['umap']
umap_cpu
Cleaned parts: ['umap_cpu']
umap_gpu
Cleaned parts: ['umap_gpu']
pacmap_cpu\non_normalized
Cleaned parts: ['pacmap_cpu', 'non_normalized']
pacmap_cpu\normalized
Cleaned parts: ['pacmap_cpu', 'normalized']
pacmap_cpu\pca_non_normalized
Cleaned parts: ['pacmap_cpu', 'pca_non_normalized']
pacmap_cpu\pca_normalized
Cleaned parts: ['pacmap_cpu', 'pca_normalized']
parampacmap_gpu\non_normalized
Cleaned parts: ['parampacmap_gpu', 'non_normalized']
parampacma

In [36]:
import re

log_content = """
[2025-06-06T18:44:14.128470] PCA (AFDB, cuML GPU)
Explained variance (first 2): 51.92%, 18.80%
Total explained variance (100 comps): 98.43%
Components needed for >=90% variance: 10
Wall time: 1.23 s
CPU times: user 0.60 s, sys 0.09 s, total 0.69 s
GPU kernel time: 1.042 s
GPU memory used: 654.3 MB / 42600.3 MB
GPU model: NVIDIA A100-SXM4-40GB, Driver: 570.133.20
Points: 1505141, Original dims: 844, PCA dims saved: 10
Hostname: t0025
"""

wall_time = None
cpu_user_time = None
cpu_sys_time = None
cpu_total_time = None

for line in log_content.splitlines():
    if line.startswith("Wall time:"):
        match = re.search(r"Wall time: ([\d.]+) s", line)
        if match:
            wall_time = float(match.group(1))
    elif line.startswith("CPU times:"):
        match = re.search(r"user ([\d.]+) s, sys ([\d.]+) s, total ([\d.]+) s", line)
        if match:
            cpu_user_time = float(match.group(1))
            cpu_sys_time = float(match.group(2))
            cpu_total_time = float(match.group(3))

print(f"Wall time: {wall_time} s")
print(f"CPU user time: {cpu_user_time} s")
print(f"CPU sys time: {cpu_sys_time} s")
print(f"CPU total time: {cpu_total_time} s")

Wall time: 1.23 s
CPU user time: 0.6 s
CPU sys time: 0.09 s
CPU total time: 0.69 s
