TFG part 4 - Sofía Valle López

Código para comprobar que la media es una buena representación de los datos

In [1]:
import os
import numpy as np
import plotly.graph_objects as go
from scipy.stats import shapiro
from analysis_functions_tfg import load_signal_and_metadata, extract_peak_to_peak_values

# Folder the filtered fragments are read from, and folder the per-fragment
# histogram reports are written to (created here if it does not exist yet).
input_folder = "C:\\Users\\sofia\\OneDrive\\Escritorio\\TFGPython\\all_patients_fragments_filtered"
output_folder = "C:\\Users\\sofia\\OneDrive\\Escritorio\\TFGPython\\analysis_htmls\\statistics_per_fragment"
os.makedirs(output_folder, exist_ok=True)

# Window lengths (seconds) evaluated for every fragment
window_sizes = [8, 15, 30]

# Names of the .txt files found directly inside the input folder,
# in whatever order os.listdir reports them
fragment_files = list(
    filter(lambda entry: entry.endswith(".txt"), os.listdir(input_folder))
)

# Shapiro-Wilk outcomes: one (initially empty) list per window size
results_summary = {size: [] for size in window_sizes}

# Thresholds of the normality check, named once so that the stored flag, the
# text shown on the figure and the skip rule cannot drift apart.
SHAPIRO_ALPHA = 0.05          # significance level: p above this -> "normal"
MIN_WINDOWS_FOR_TEST = 5      # fragments yielding fewer windows are skipped
SHAPIRO_MAX_SAMPLES = 5000    # only the first N amplitudes are passed to the test

# Loop through each file and each window size.
# For every (fragment, window size) pair this:
#   1. extracts the per-window peak-to-peak amplitudes,
#   2. runs a Shapiro-Wilk test on them and records the verdict in
#      results_summary[win_s],
#   3. writes a histogram with a statistics box to
#      <output_folder>/<fragment>_win<win_s>s.html.
for file_name in fragment_files:
    file_path = os.path.join(input_folder, file_name)
    fragment_name = os.path.splitext(file_name)[0]

    # A file that cannot be loaded is reported and skipped so that the
    # remaining fragments are still processed.
    try:
        signal, fs, start_time, unit = load_signal_and_metadata(file_path, verbose=False)
    except Exception as e:
        print(f"Error loading {file_name}: {e}")
        continue

    for win_s in window_sizes:
        # Any failure for one window size is reported without aborting the run.
        try:
            amplitudes = extract_peak_to_peak_values(signal, fs, win_s)
            if len(amplitudes) < MIN_WINDOWS_FOR_TEST:
                print(f"Skipping {fragment_name} with {win_s}s window: not enough windows.")
                continue

            amplitudes = np.array(amplitudes)

            # A slice already returns the whole array when it is shorter than
            # the cap, so no explicit length check is needed.
            _w_stat, p_value = shapiro(amplitudes[:SHAPIRO_MAX_SAMPLES])

            # Single source of truth for the verdict (plain bool, not np.bool_)
            is_normal = bool(p_value > SHAPIRO_ALPHA)

            # Store result for summary
            results_summary[win_s].append({
                "fragment": fragment_name,
                "p_value": p_value,
                "normal": is_normal
            })

            # Create histogram figure (uses ALL amplitudes, not just the tested subset)
            fig = go.Figure()

            fig.add_trace(go.Histogram(
                x=amplitudes,
                nbinsx=50,
                marker_color="cornflowerblue",
                showlegend=False
            ))

            fig.update_xaxes(title_text="Amplitude (mmHg)")
            fig.update_yaxes(title_text="Count")

            # Statistical summary text
            interpretation = (
                f"Distribution is likely normal (p > {SHAPIRO_ALPHA})"
                if is_normal else
                f"Distribution is not normal (p ≤ {SHAPIRO_ALPHA})"
            )
            annotation_text = (
                f"<b>Shapiro-Wilk p-value:</b> {p_value:.5f}<br>"
                f"{interpretation}<br>"
                f"<b>Num windows:</b> {len(amplitudes)}<br>"
                f"<b>Mean:</b> {np.mean(amplitudes):.3f} mmHg &nbsp;&nbsp; "
                f"<b>Median:</b> {np.median(amplitudes):.3f} mmHg &nbsp;&nbsp; "
                # Std shares the unit of the amplitudes; label it like Mean/Median
                f"<b>Std:</b> {np.std(amplitudes):.3f} mmHg"
            )

            # Add annotation below the plot (negative paper-y places it under
            # the axes; the enlarged bottom margin below leaves room for it)
            fig.add_annotation(
                text=annotation_text,
                xref="paper", yref="paper",
                x=0.5, y=-0.45,
                showarrow=False,
                align="center",
                font=dict(size=13, color="black"),
                bgcolor="rgba(255,255,255,0.95)",
                bordercolor="gray",
                borderwidth=1,
                borderpad=6
            )

            # Final layout
            fig.update_layout(
                height=550,
                width=900,
                title_text=f"Statistical Analysis – {fragment_name} – {win_s}s windows",
                title_x=0.05,
                margin=dict(b=200, l=50, r=50, t=50),
                plot_bgcolor='white',
                paper_bgcolor='white'
            )

            # Save as HTML
            output_name = f"{fragment_name}_win{win_s}s.html"
            output_path = os.path.join(output_folder, output_name)
            fig.write_html(output_path)
            print(f"Saved: {output_name}")

        except Exception as e:
            print(f"Error processing {fragment_name} (win {win_s}s): {e}")

# === SUMMARY OF NORMALITY ===
# Per window size: how many fragments passed / failed the Shapiro-Wilk test,
# and which fragments fall in each group.
print("\n=== SUMMARY OF NORMALITY TEST ===")
total_all = normal_all = not_normal_all = 0

for win_s in window_sizes:
    print(f"\nWindow size: {win_s} s")
    window_results = results_summary[win_s]

    # Split the fragment names by test verdict, keeping their stored order
    normal_names = [entry["fragment"] for entry in window_results if entry["normal"]]
    non_normal_names = [entry["fragment"] for entry in window_results if not entry["normal"]]

    # Accumulate the global counters reported at the end
    total_all += len(window_results)
    normal_all += len(normal_names)
    not_normal_all += len(non_normal_names)

    print(f" Normal: {len(normal_names)}")
    print(f" Not normal: {len(non_normal_names)}")

    print("  → Use median for:")
    for name in non_normal_names:
        print(f"     - {name}")
    print("  → Use mean for:")
    for name in normal_names:
        print(f"     - {name}")

# === GLOBAL NORMALITY STATISTICS ===
# Percentages over every (fragment, window size) test that produced a result
print("\n=== GLOBAL NORMALITY STATISTICS ===")
if total_all <= 0:
    print("No data processed.")
else:
    pct_normal = (normal_all / total_all) * 100
    pct_not_normal = (not_normal_all / total_all) * 100
    print(f"Total tests: {total_all}")
    print(f"Normal: {normal_all} ({pct_normal:.2f}%)")
    print(f"Not normal: {not_normal_all} ({pct_not_normal:.2f}%)")


Saved: paciente10_fragmento1_filtered_win8s.html
Saved: paciente10_fragmento1_filtered_win15s.html
Saved: paciente10_fragmento1_filtered_win30s.html
Saved: paciente10_fragmento2_filtered_win8s.html
Saved: paciente10_fragmento2_filtered_win15s.html
Saved: paciente10_fragmento2_filtered_win30s.html
Saved: paciente10_fragmento3_filtered_win8s.html
Saved: paciente10_fragmento3_filtered_win15s.html
Saved: paciente10_fragmento3_filtered_win30s.html
Saved: paciente11_fragmento1_filtered_win8s.html
Saved: paciente11_fragmento1_filtered_win15s.html
Saved: paciente11_fragmento1_filtered_win30s.html
Saved: paciente11_fragmento2_filtered_win8s.html
Saved: paciente11_fragmento2_filtered_win15s.html
Saved: paciente11_fragmento2_filtered_win30s.html
Saved: paciente11_fragmento3_filtered_win8s.html
Saved: paciente11_fragmento3_filtered_win15s.html
Saved: paciente11_fragmento3_filtered_win30s.html
Saved: paciente12_fragmento1_filtered_win8s.html
Saved: paciente12_fragmento1_filtered_win15s.html
Saved: 

Código del análisis final

In [3]:
# --- Standard libs
import os
import re
import numpy as np
import pandas as pd
from datetime import datetime

# --- Plotting
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# --- Analysis functions (local module)
from analysis_functions_tfg import (
    load_signal_and_metadata,
    time_peak_to_peak_amp,
    first_harmonic_amp,
    rms_of_harmonics,
    save_result_to_txt
)

# ========================================================================
# Configuration
# ========================================================================

# Directory holding ALL filtered fragments (.txt exported from LabChart)
data_dir = r"C:\Users\sofia\OneDrive\Escritorio\TFGPython\all_patients_fragments_filtered"

# Window lengths (seconds) every analysis is run with
window_sizes = [8, 15, 30]

# First row of each window-size block inside the 9-row shared figure:
#   8s -> rows 1–3, 15s -> rows 4–6, 30s -> rows 7–9
subplot_map = dict(zip(window_sizes, (1, 4, 7)))

# (name, callable) pairs executed, in this order, for every window size.
# The name is also used as the output sub-folder and in the figure title.
analysis_functions = list({
    "time_peak_to_peak_amp": time_peak_to_peak_amp,
    "first_harmonic_amp": first_harmonic_amp,
    "rms_of_harmonics": rms_of_harmonics,
}.items())

# ========================================================================
# Collect all .txt files (they are NATURALLY sorted afterwards)
#   Lexicographic sort puts paciente10 before paciente2; we want numeric.
# ========================================================================

# Full paths of the entries in data_dir whose name ends in .txt
# (extension compared case-insensitively)
file_paths = []
for entry in os.listdir(data_dir):
    if entry.lower().endswith(".txt"):
        file_paths.append(os.path.join(data_dir, entry))

def natural_sort_key(path: str):
    """
    Build a key that sorts fragment files by their numeric ids.

    The lower-cased base name of *path* is searched for
    'paciente<N>_fragmento<M>' (e.g. 'paciente12_fragmento3_... .txt').

    Returns a tuple (patient_number, fragment_number, lowercase_filename).
    Names without that pattern get 10**9 for both numbers, so they end up
    after every well-formed name and are ordered among themselves by name.
    """
    unmatched_rank = 10**9
    base_name = os.path.basename(path).lower()

    found = re.search(r'paciente(\d+)_fragmento(\d+)', base_name)
    if found is None:
        return (unmatched_rank, unmatched_rank, base_name)

    patient_id, fragment_id = map(int, found.groups())
    return (patient_id, fragment_id, base_name)

# Order the files by patient number, then fragment number (numeric, not
# lexicographic), so they are processed in that order.
file_paths.sort(key=natural_sort_key)

# ========================================================================
# Run all analyses
# ========================================================================

# One dict per (fragment, window size), accumulated across every file
all_results_global = list()

print(f"Found {len(file_paths)} .txt files in {data_dir}\n")

# For every fragment file: run each analysis at each window size, append the
# resulting rows to the results .txt, and write one HTML figure per analysis.
for file_path in file_paths:
    fragment_name = os.path.splitext(os.path.basename(file_path))[0]
    print(f"--> Processing: {fragment_name}")

    # Load PIC signal and metadata
    signal, fs, start_time, unit = load_signal_and_metadata(file_path)

    # A fresh 9-row figure per analysis for this file; each analysis call
    # below receives the figure and hands back the (updated) figure.
    shared_figs = {}
    for analysis_name, _ in analysis_functions:
        shared_figs[analysis_name] = make_subplots(
            rows=9, cols=1,
            shared_xaxes=False,
            subplot_titles=[
                "8s – First", "8s – Last", "8s – Summary",
                "15s – First", "15s – Last", "15s – Summary",
                "30s – First", "30s – Last", "30s – Summary"
            ],
            vertical_spacing=0.07
        )

    # One row (dict) per window size for this file
    all_results_this_file = []

    for win_s in window_sizes:
        # Identification columns first, so they lead the final table
        row = {
            "fragment_name": fragment_name,
            "start_time": start_time,
            "unit": unit,
            "window_size_s": win_s,
        }

        for analysis_name, analysis_function in analysis_functions:
            metrics, updated_fig = analysis_function(
                signal=signal,
                fs=fs,
                window_size_s=win_s,
                fragment_name=fragment_name,
                start_time=start_time,
                html_fig=shared_figs[analysis_name],
                subplot_row_start=subplot_map[win_s]
            )
            shared_figs[analysis_name] = updated_fig
            # Fold the metrics of this analysis into the row for this window
            row.update(metrics)

        all_results_this_file.append(row)

    # Only after every window size succeeded: append one line per window
    # size to the rolling results .txt
    for saved_row in all_results_this_file:
        save_result_to_txt(
            saved_row,
            fragment_name=saved_row["fragment_name"],
            start_time=saved_row["start_time"],
            unit=saved_row["unit"],
            window_size_s=saved_row["window_size_s"]
        )

    # Figure height: 3 rows (First, Last, Summary) per window size, 350 px each
    rows_per_window_size = 3
    height_per_row = 350
    figure_height = len(window_sizes) * rows_per_window_size * height_per_row

    # Save one HTML per analysis (for this file) aggregating all 3 window sizes
    for analysis_name, fig in shared_figs.items():
        # Relative path: resolved against the current working directory
        output_dir = os.path.join("analysis_htmls", analysis_name)
        os.makedirs(output_dir, exist_ok=True)

        fig.update_layout(
            height=figure_height,
            width=1100,
            title_text=f"{analysis_name.replace('_', ' ').title()}: {fragment_name}",
            showlegend=False
        )
        fig.write_html(os.path.join(output_dir, f"{fragment_name}_ALL_WINDOWS.html"))

    # Add this file's rows to the global table
    all_results_global += all_results_this_file

# ========================================================================
# Global summary DataFrame — NATURALLY sorted
# ========================================================================

# One row per (fragment, window size) collected by the analysis loop
df = pd.DataFrame(all_results_global)

# An empty results list produces a DataFrame without a "fragment_name"
# column, so the natural sort is only attempted when there are rows.
if not df.empty:
    # Rank given to names lacking a 'paciente<N>' / 'fragmento<M>' id: they
    # sort last instead of breaking the integer conversion on NaN. This is the
    # same fallback natural_sort_key applies to the file list.
    unmatched_rank = 10**9

    fragment_names = df["fragment_name"].astype(str)

    # Extract numeric patient/fragment to sort dataframe naturally
    # (expand=False -> a Series; case-insensitive like natural_sort_key)
    df["patient"] = (
        pd.to_numeric(
            fragment_names.str.extract(r'paciente(\d+)', flags=re.IGNORECASE, expand=False)
        )
        .fillna(unmatched_rank)
        .astype(int)
    )
    df["fragment"] = (
        pd.to_numeric(
            fragment_names.str.extract(r'fragmento(\d+)', flags=re.IGNORECASE, expand=False)
        )
        .fillna(unmatched_rank)
        .astype(int)
    )

    # The helper columns exist only for ordering and are dropped afterwards
    df = df.sort_values(["patient", "fragment", "window_size_s"]) \
           .drop(columns=["patient", "fragment"]) \
           .reset_index(drop=True)

df  # display


Found 56 .txt files in C:\Users\sofia\OneDrive\Escritorio\TFGPython\all_patients_fragments_filtered

--> Processing: paciente1_fragmento1_filtered

--- File loaded: C:\Users\sofia\OneDrive\Escritorio\TFGPython\all_patients_fragments_filtered\paciente1_fragmento1_filtered.txt
Extracted 96000 samples | fs = 200.00 Hz | Start: 2024-11-18 15:35:00
Duration: 8.00 min
Unit: mmHg
First PIC values: [0.171104 0.183496 0.196939]
--> Processing: paciente1_fragmento2_filtered

--- File loaded: C:\Users\sofia\OneDrive\Escritorio\TFGPython\all_patients_fragments_filtered\paciente1_fragmento2_filtered.txt
Extracted 96000 samples | fs = 200.00 Hz | Start: 2024-11-18 19:35:00
Duration: 8.00 min
Unit: mmHg
First PIC values: [-0.086376 -0.098071 -0.109315]
--> Processing: paciente1_fragmento3_filtered

--- File loaded: C:\Users\sofia\OneDrive\Escritorio\TFGPython\all_patients_fragments_filtered\paciente1_fragmento3_filtered.txt
Extracted 96000 samples | fs = 200.00 Hz | Start: 2024-11-19 01:20:00
Duratio

Unnamed: 0,fragment_name,start_time,unit,window_size_s,num_windows,window_means,time_peak_to_peak_amp,first_harmonic_peak,first_harmonic_p2p,multi_harmonics_peak
0,paciente1_fragmento1_filtered,2024-11-18 15:35:00,mmHg,8,60,"[0.285, 0.198, 0.259, 0.262, 0.345, 0.264, 0.3...",0.800,0.259,0.517,0.271
1,paciente1_fragmento1_filtered,2024-11-18 15:35:00,mmHg,15,32,"[0.218, 0.219, 0.256, 0.246, 0.175, 0.343, 0.2...",0.799,0.227,0.455,0.237
2,paciente1_fragmento1_filtered,2024-11-18 15:35:00,mmHg,30,16,"[0.142, 0.207, 0.163, 0.206, 0.147, 0.125, 0.1...",0.798,0.161,0.321,0.167
3,paciente1_fragmento2_filtered,2024-11-18 19:35:00,mmHg,8,60,"[0.25, 0.277, 0.287, 0.308, 0.287, 0.244, 0.30...",0.787,0.246,0.491,0.263
4,paciente1_fragmento2_filtered,2024-11-18 19:35:00,mmHg,15,32,"[0.276, 0.249, 0.18, 0.19, 0.138, 0.208, 0.12,...",0.786,0.211,0.423,0.222
...,...,...,...,...,...,...,...,...,...,...
163,paciente20_fragmento2_filtered,2023-07-10 18:24:00,mmHg,15,32,"[1.919, 1.709, 1.869, 2.033, 1.954, 1.902, 1.9...",5.877,1.531,3.061,1.932
164,paciente20_fragmento2_filtered,2023-07-10 18:24:00,mmHg,30,16,"[2.01, 1.751, 2.024, 1.421, 1.525, 1.597, 1.71...",5.878,1.491,2.982,1.786
165,paciente20_fragmento3_filtered,2023-07-11 04:38:00,mmHg,8,60,"[1.944, 1.613, 1.702, 1.817, 2.009, 1.732, 1.6...",5.014,1.403,2.806,1.761
166,paciente20_fragmento3_filtered,2023-07-11 04:38:00,mmHg,15,32,"[1.652, 1.737, 1.619, 1.598, 1.518, 1.625, 1.6...",5.016,1.395,2.791,1.711
