In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [7]:
# Directory searched for the per-category "<category>_preprocessing_report.csv"
# files. The path is relative, so it resolves against the kernel's current
# working directory.
log_dir = Path('../../results/logs/preprocessing-logs')

In [8]:
def load_logs(log_dir, category):
    """Load the preprocessing report(s) for one category into one DataFrame.

    Searches ``log_dir`` for files named ``{category}_preprocessing_report.csv``
    and concatenates every match. ``category`` is interpolated into a glob
    pattern, so wildcard characters in it (e.g. ``"*"``) match several reports.

    Parameters
    ----------
    log_dir : str or pathlib.Path
        Directory that contains the report CSV files.
    category : str
        Category prefix of the report file name.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching reports, stacked with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If no file in ``log_dir`` matches the pattern.
    """
    pattern = f"{category}_preprocessing_report.csv"

    # glob() yields matches in filesystem order; sort so that the row order of
    # the combined frame is reproducible across machines and runs.
    csv_files = sorted(Path(log_dir).glob(pattern))

    # pd.concat() on an empty list fails with "No objects to concatenate",
    # which hides the real problem (wrong directory or category name).
    if not csv_files:
        raise FileNotFoundError(
            f"No preprocessing report matching '{pattern}' found in '{log_dir}'"
        )

    return pd.concat(
        (pd.read_csv(csv_file) for csv_file in csv_files),
        ignore_index=True,
    )

In [9]:
def analyze_logs(log_dir, category):
    """Print a summary of one category's preprocessing report and return it.

    The report is obtained from ``load_logs(log_dir, category)``. Its
    "GC Content (%)" and "Avg Length" columns are cast to float, then four
    sections are printed: genome counts split into KEPT / not-KEPT, summed
    contig counts before and after filtering, statistics over the KEPT rows,
    and the number of rows per distinct status value.

    Parameters
    ----------
    log_dir
        Passed through unchanged to ``load_logs``.
    category : str
        Passed to ``load_logs``; also shown upper-cased in the summary header.

    Returns
    -------
    pandas.DataFrame
        The loaded report, with the two numeric columns cast to float.
    """
    report = load_logs(log_dir, category)

    # Make sure both measurement columns are floats before aggregating.
    for numeric_col in ("GC Content (%)", "Avg Length"):
        report[numeric_col] = report[numeric_col].astype(float)

    # Every row whose status is not exactly "KEPT" is reported as deleted.
    kept_rows = report[report["Status"] == "KEPT"]
    dropped_rows = report[report["Status"] != "KEPT"]

    print(
        f"\n===== Summary for {category.upper()} =====",
        f"Total genomes processed: {len(report)}",
        f" - KEPT: {len(kept_rows)}",
        f" - DELETED: {len(dropped_rows)}",
        sep="\n",
    )

    # Contig totals are taken over all rows, kept or not.
    contigs_before = report["Original Contigs"].sum()
    contigs_after = report["After Filtering"].sum()

    print(
        "\nContig Counts:",
        f" - Original contigs: {contigs_before:,}",
        f" - After filtering: {contigs_after:,}",
        sep="\n",
    )

    # With no KEPT rows the means are reported as 0.0 rather than NaN.
    if kept_rows.empty:
        mean_gc = mean_len = 0.0
    else:
        mean_gc = kept_rows["GC Content (%)"].mean()
        mean_len = kept_rows["Avg Length"].mean()

    # Per-row product of "After Filtering" and "Avg Length", summed over KEPT rows.
    retained_bases = kept_rows["After Filtering"].mul(kept_rows["Avg Length"]).sum()

    print(
        "\nStatistics for KEPT files:",
        f" - Mean GC %: {mean_gc:.2f}",
        f" - Mean Avg Length: {mean_len:.1f} bp",
        f" - Total bases retained: {int(retained_bases):,}",
        sep="\n",
    )

    print("\nStatus breakdown:")
    print(report["Status"].value_counts())

    return report


In [10]:
# Print the summary for the protozoa report and keep the returned DataFrame.
protozoa_df = analyze_logs(log_dir, 'protozoa') 


===== Summary for PROTOZOA =====
Total genomes processed: 173
 - KEPT: 171
 - DELETED: 2

Contig Counts:
 - Original contigs: 54,770
 - After filtering: 30,670

Statistics for KEPT files:
 - Mean GC %: 37.81
 - Mean Avg Length: 691550.8 bp
 - Total bases retained: 5,496,258,912

Status breakdown:
Status
KEPT                        171
DELETED (empty or error)      2
Name: count, dtype: int64
