In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [13]:
# Directory holding the per-category preprocessing report CSVs (relative to the notebook).
log_dir = Path('..', '..', 'results', 'logs', 'preprocessing-logs')

In [14]:
def load_logs(log_dir, category):
    """Load the preprocessing report CSV(s) of one category into a single DataFrame.

    Parameters
    ----------
    log_dir : str or pathlib.Path
        Directory that is searched (non-recursively) for report files.
    category : str
        Category prefix. It is interpolated into the glob pattern
        ``{category}_preprocessing_report.csv``, so glob metacharacters in
        ``category`` are honoured and may match several files.

    Returns
    -------
    pandas.DataFrame
        Rows of every matching report, concatenated in sorted path order
        with a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If no file in ``log_dir`` matches the pattern.
    """
    # Accept plain strings as well as Path objects.
    log_dir = Path(log_dir)
    pattern = f"{category}_preprocessing_report.csv"

    # Glob order is filesystem-dependent; sorting keeps the row order reproducible.
    csv_files = sorted(log_dir.glob(pattern))
    if not csv_files:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # keep the exception type but say what was actually missing.
        raise ValueError(
            f"No preprocessing report matching '{pattern}' found in '{log_dir}'"
        )

    all_dfs = [pd.read_csv(csv_file) for csv_file in csv_files]
    return pd.concat(all_dfs, ignore_index=True)

In [15]:
def analyze_logs(log_dir, category):
    """Print a text summary of one category's preprocessing report and return it.

    The report is loaded through ``load_logs(log_dir, category)``. The
    "GC Content (%)" and "Avg Length" columns are cast to float, then four
    sections are printed: genome counts (rows whose "Status" is exactly
    "KEPT" versus all other rows), summed contig counts, statistics over
    the KEPT rows, and the per-status row counts.

    Parameters
    ----------
    log_dir
        Forwarded unchanged to ``load_logs``.
    category : str
        Forwarded to ``load_logs``; its upper-cased form is used in the header.

    Returns
    -------
    pandas.DataFrame
        The loaded report with the two measurement columns cast to float.
    """
    report = load_logs(log_dir, category)

    # Cast the measurement columns to float before aggregating them.
    for column in ("GC Content (%)", "Avg Length"):
        report[column] = report[column].astype(float)

    # Anything whose status is not exactly "KEPT" is counted as deleted.
    status = report["Status"]
    kept_rows = report[status == "KEPT"]
    deleted_rows = report[status != "KEPT"]

    print(
        f"\n===== Summary for {category.upper()} =====",
        f"Total genomes processed: {len(report)}",
        f" - KEPT: {len(kept_rows)}",
        f" - DELETED: {len(deleted_rows)}",
        sep="\n",
    )

    # Contig totals are taken over every row, kept or not.
    contigs_before = report["Original Contigs"].sum()
    contigs_after = report["After Filtering"].sum()

    print("\nContig Counts:")
    print(f" - Original contigs: {contigs_before:,}")
    print(f" - After filtering: {contigs_after:,}")

    # Means fall back to 0.0 when no row was kept.
    if kept_rows.empty:
        mean_gc = 0.0
        mean_len = 0.0
    else:
        mean_gc = kept_rows["GC Content (%)"].mean()
        mean_len = kept_rows["Avg Length"].mean()

    # Per-row contig count times per-row average length, summed over KEPT rows.
    retained_bases = (kept_rows["After Filtering"] * kept_rows["Avg Length"]).sum()

    print("\nStatistics for KEPT files:")
    print(f" - Mean GC %: {mean_gc:.2f}")
    print(f" - Mean Avg Length: {mean_len:.1f} bp")
    print(f" - Total bases retained: {int(retained_bases):,}")

    print("\nStatus breakdown:")
    print(status.value_counts())

    return report


In [16]:
# Print the preprocessing summary for the 'archaea' report and keep the returned frame.
archaea_df = analyze_logs(log_dir=log_dir, category="archaea")


===== Summary for ARCHAEA =====
Total genomes processed: 3243
 - KEPT: 3239
 - DELETED: 4

Contig Counts:
 - Original contigs: 391,542
 - After filtering: 374,747

Statistics for KEPT files:
 - Mean GC %: 45.64
 - Mean Avg Length: 378032.4 bp
 - Total bases retained: 6,176,019,445

Status breakdown:
Status
KEPT                        3239
DELETED (empty or error)       4
Name: count, dtype: int64


In [17]:
# Print the preprocessing summary for the 'viral' report and keep the returned frame.
viral_df = analyze_logs(log_dir=log_dir, category="viral")


===== Summary for VIRAL =====
Total genomes processed: 129831
 - KEPT: 129677
 - DELETED: 154

Contig Counts:
 - Original contigs: 704,353
 - After filtering: 692,902

Statistics for KEPT files:
 - Mean GC %: 44.24
 - Mean Avg Length: 16836.4 bp
 - Total bases retained: 3,139,870,091

Status breakdown:
Status
KEPT                        129677
DELETED (empty or error)       154
Name: count, dtype: int64


In [18]:
# Print the preprocessing summary for the 'protozoa' report and keep the returned frame.
protozoa_df = analyze_logs(log_dir=log_dir, category="protozoa")


===== Summary for PROTOZOA =====
Total genomes processed: 879
 - KEPT: 877
 - DELETED: 2

Contig Counts:
 - Original contigs: 7,716,447
 - After filtering: 3,388,244

Statistics for KEPT files:
 - Mean GC %: 44.58
 - Mean Avg Length: 213361.0 bp
 - Total bases retained: 51,936,515,609

Status breakdown:
Status
KEPT                        877
DELETED (empty or error)      2
Name: count, dtype: int64
