In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [25]:
# Root folder of the preprocessing report CSVs (one sub-folder per category),
# given relative to the notebook's working directory.
log_dir = Path("../../results/logs/preprocessing-logs")

In [26]:
def load_logs(log_dir, category):
    """Load and concatenate every batch report CSV of one category.

    Reads all files matching
    ``<log_dir>/<category>/<category>_batch_*_report.csv`` and stacks them
    into a single DataFrame with a fresh ``0..n-1`` index.

    Parameters
    ----------
    log_dir : str or pathlib.Path
        Root directory that contains one sub-directory per category.
    category : str
        Sub-directory name; also the filename prefix of the batch reports.

    Returns
    -------
    pandas.DataFrame
        Rows of all matching reports, concatenated in sorted path order.

    Raises
    ------
    ValueError
        If no report file matches. The message names the pattern and the
        directory that were searched.
    """
    root = Path(log_dir)  # also accept a plain string path
    pattern = f"{category}/{category}_batch_*_report.csv"

    # glob() yields paths in arbitrary, filesystem-dependent order; sorting
    # makes the row order of the result reproducible across runs and machines.
    csv_files = sorted(root.glob(pattern))
    if not csv_files:
        # Without this guard pd.concat([]) raises the opaque
        # "No objects to concatenate"; keep ValueError but say what was searched.
        raise ValueError(
            f"No report files matching '{pattern}' found under '{root}'"
        )

    frames = [pd.read_csv(csv_file) for csv_file in csv_files]
    return pd.concat(frames, ignore_index=True)

In [27]:
def analyze_logs(log_dir, category):
    """Print a preprocessing summary for one category and return its reports.

    The batch reports are fetched with ``load_logs``; the ``GC %`` and
    ``Avg Length`` columns are cast to float before anything is computed.
    Four sections are printed, in this order:

    1. number of rows in total, with ``Status == "KEPT"``, and with any
       other status;
    2. column sums of ``Original Contigs``, ``After Filtering`` and
       ``After CD-HIT`` over all rows;
    3. for the KEPT rows only: mean ``GC %``, mean ``Avg Length`` and the sum
       of ``After CD-HIT * Avg Length``;
    4. ``value_counts()`` of the ``Status`` column.

    Parameters
    ----------
    log_dir
        Passed through to ``load_logs``.
    category : str
        Passed through to ``load_logs``; its upper-cased form is shown in the
        summary header.

    Returns
    -------
    pandas.DataFrame
        All loaded report rows, with ``GC %`` and ``Avg Length`` as float.
    """
    # Make sure the two numeric columns used below really are floats.
    reports = load_logs(log_dir, category).astype(
        {"GC %": float, "Avg Length": float}
    )

    # Split rows into KEPT and everything else.
    status = reports["Status"]
    kept_rows = reports.loc[status.eq("KEPT")]
    other_rows = reports.loc[status.ne("KEPT")]

    print(f"\n===== Summary for {category.upper()} =====")
    print(f"Total genomes processed: {len(reports)}")
    print(f" - KEPT: {len(kept_rows)}")
    print(f" - DELETED/FAILED: {len(other_rows)}")

    # Contig totals over every row, regardless of status.
    n_original = reports["Original Contigs"].sum()
    n_filtered = reports["After Filtering"].sum()
    n_clustered = reports["After CD-HIT"].sum()

    print(f"\nContig Counts:")
    print(f" - Original contigs: {n_original:,}")
    print(f" - After filtering: {n_filtered:,}")
    print(f" - After CD-HIT: {n_clustered:,}")

    # GC / length figures, restricted to KEPT rows.
    gc_mean = kept_rows["GC %"].mean()
    length_mean = kept_rows["Avg Length"].mean()
    # Per-row contig count times per-row average length, summed over KEPT rows.
    bases_retained = kept_rows["After CD-HIT"].mul(kept_rows["Avg Length"]).sum()

    print(f"\nStatistics for KEPT files:")
    print(f" - Mean GC %: {gc_mean:.2f}")
    print(f" - Mean Avg Length: {length_mean:.1f} bp")
    print(f" - Total bases retained: {int(bases_retained):,}")

    # Row count for every distinct status value.
    print(f"\nStatus breakdown:")
    print(status.value_counts())

    return reports

In [28]:
# Print the archaea preprocessing summary and keep the combined report table.
archaea_df = analyze_logs(log_dir, "archaea")


===== Summary for ARCHAEA =====
Total genomes processed: 5883
 - KEPT: 5001
 - DELETED/FAILED: 882

Contig Counts:
 - Original contigs: 852,831
 - After filtering: 790,276
 - After CD-HIT: 788,172

Statistics for KEPT files:
 - Mean GC %: 45.86
 - Mean Avg Length: 21557.0 bp
 - Total bases retained: 7,660,900,037

Status breakdown:
Status
KEPT                               5001
DELETED (too few contigs)           626
FAILED (CD-HIT)                     250
DELETED (empty after filtering)       6
Name: count, dtype: int64
