In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [32]:
# Root directory of the preprocessing report CSVs. load_logs() looks for a
# sub-directory per category underneath it. The path is relative, so it is
# resolved against the notebook's current working directory.
log_dir = Path('../../results/logs/preprocessing-logs')

In [33]:
def load_logs(log_dir, category):
    """Load every batch report CSV of one category into a single DataFrame.

    Reports are matched with the pattern
    ``<log_dir>/<category>/<category>_batch_*_report.csv`` and concatenated
    row-wise with a fresh 0..n-1 index.

    Parameters
    ----------
    log_dir : pathlib.Path or str
        Directory containing one sub-directory per category.
    category : str
        Category name; used both as the sub-directory name and as the
        file-name prefix.

    Returns
    -------
    pandas.DataFrame
        All report rows, files taken in sorted path order.

    Raises
    ------
    FileNotFoundError
        If no report file matches the pattern.
    """
    pattern = f"{category}/{category}_batch_*_report.csv"
    # glob() yields files in filesystem order; sorting keeps the row order
    # identical from one run (and one machine) to the next.
    csv_files = sorted(Path(log_dir).glob(pattern))
    if not csv_files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise FileNotFoundError(
            f"No report files matching '{pattern}' under {log_dir}"
        )
    return pd.concat((pd.read_csv(csv_file) for csv_file in csv_files), ignore_index=True)

In [34]:
def analyze_logs(log_dir, category):
    """Print a preprocessing summary for one category and return its report rows.

    The reports are loaded with ``load_logs`` and the following columns are
    read: "Status", "GC %", "Avg Length", "Original Contigs",
    "After Filtering" and "After CD-HIT".

    Printed sections:
      * number of rows whose "Status" is exactly "KEPT" versus all others,
      * column totals of the three contig-count columns over ALL rows,
      * mean "GC %" and mean "Avg Length" over the KEPT rows only,
      * total bases retained, computed per KEPT row as
        "After CD-HIT" * "Avg Length" and summed,
      * ``value_counts`` of "Status".

    Parameters
    ----------
    log_dir : pathlib.Path
        Root directory of the preprocessing logs (passed to ``load_logs``).
    category : str
        Category to summarise (passed to ``load_logs``; upper-cased in the
        header line).

    Returns
    -------
    pandas.DataFrame
        The concatenated report rows, with "GC %" and "Avg Length" cast to
        float.
    """
    df = load_logs(log_dir, category)

    # Convert numerical columns to float if necessary
    df["GC %"] = df["GC %"].astype(float)
    df["Avg Length"] = df["Avg Length"].astype(float)

    # Summary counts: anything that is not exactly "KEPT" counts as deleted/failed
    is_kept = df["Status"] == "KEPT"
    kept = df[is_kept]
    deleted = df[~is_kept]

    print(f"\n===== Summary for {category.upper()} =====")
    print(f"Total genomes processed: {len(df)}")
    print(f" - KEPT: {len(kept)}")
    print(f" - DELETED/FAILED: {len(deleted)}")

    # Aggregate contig stats (over every row, not only KEPT)
    total_contigs_before = df["Original Contigs"].sum()
    total_after_filter = df["After Filtering"].sum()
    total_after_cdhit = df["After CD-HIT"].sum()

    print("\nContig Counts:")
    print(f" - Original contigs: {total_contigs_before:,}")
    print(f" - After filtering: {total_after_filter:,}")
    print(f" - After CD-HIT: {total_after_cdhit:,}")

    # GC and Length stats (for KEPT)
    mean_gc = kept["GC %"].mean()
    mean_len = kept["Avg Length"].mean()
    # NOTE(review): presumably "Avg Length" describes the contigs remaining
    # after CD-HIT, which makes this product a base count — confirm.
    total_bases = (kept["After CD-HIT"] * kept["Avg Length"]).sum()

    # The mean of an empty (or all-NaN) selection is NaN; report that
    # explicitly instead of printing "nan" as if it were a measurement.
    gc_text = "n/a" if pd.isna(mean_gc) else f"{mean_gc:.2f}"
    len_text = "n/a" if pd.isna(mean_len) else f"{mean_len:.1f} bp"

    print("\nStatistics for KEPT files:")
    print(f" - Mean GC %: {gc_text}")
    print(f" - Mean Avg Length: {len_text}")
    # .sum() skips NaN and returns 0 for an empty selection, so int() is safe.
    print(f" - Total bases retained: {int(total_bases):,}")

    # Status breakdown
    print("\nStatus breakdown:")
    print(df["Status"].value_counts())

    return df

In [35]:
# Print the archaea preprocessing summary and keep the combined report rows.
archaea_df = analyze_logs(log_dir, 'archaea') 

In [36]:
# Print the viral preprocessing summary and keep the combined report rows.
# NOTE(review): the output recorded below shows an "After Clustering" line,
# while analyze_logs() prints "After CD-HIT" — the recorded output looks like
# it came from a different version of the code; re-run to confirm.
viral_df = analyze_logs(log_dir, 'viral')


===== Summary for VIRAL =====
Total genomes processed: 151090
 - KEPT: 0
 - DELETED/FAILED: 151090

Contig Counts:
 - Original contigs: 786,043
 - After filtering: 773,459
 - After Clustering: 0

Statistics for KEPT files:
 - Mean GC %: nan
 - Mean Avg Length: nan bp
 - Total bases retained: 0

Status breakdown:
Status
FAILED (MMseqs2)                   150326
DELETED (empty after filtering)       764
Name: count, dtype: int64
