In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [51]:
# Root of the preprocessing reports (relative path); load_logs looks for
# "<split>/<category>_preprocessing_report.csv" underneath it.
log_dir = Path("../../results/logs/preprocessing-logs")

In [52]:
def load_logs(log_dir, split, category):
    """Load the preprocessing report(s) for one split/category as a DataFrame.

    Prints a banner containing the upper-cased split name, then reads every
    CSV matching ``<log_dir>/<split>/<category>_preprocessing_report.csv``
    and concatenates the rows under a fresh 0..n-1 index.

    Args:
        log_dir: Root directory of the preprocessing logs (``str`` or ``Path``).
        split: Sub-directory of ``log_dir`` to read from (the notebook uses
            ``'train'``, ``'val'`` and ``'test'``).
        category: File-name prefix of the report (e.g. ``'archaea'``).

    Returns:
        pandas.DataFrame: the rows of all matching report files.

    Raises:
        ValueError: if no report file matches. ``pd.concat`` would raise the
            same exception type on an empty list, but without saying which
            split/category/path was missing.
    """
    print(f'----------{split.upper()}----------')
    pattern = f"{split}/{category}_preprocessing_report.csv"
    root = Path(log_dir)
    # Sorted so the row order is deterministic if the pattern matches
    # more than one file (e.g. when a wildcard is passed in).
    csv_files = sorted(root.glob(pattern))
    if not csv_files:
        raise ValueError(
            f"No preprocessing report found for split={split!r}, "
            f"category={category!r} (looked for {root / pattern})"
        )
    all_dfs = [pd.read_csv(csv_file) for csv_file in csv_files]
    return pd.concat(all_dfs, ignore_index=True)

In [53]:
def analyze_logs(log_dir, category, split):
    """Print a summary of one preprocessing report and return its DataFrame.

    The report is loaded through ``load_logs(log_dir, split, category)``;
    note that this function takes ``category`` before ``split``. Rows whose
    ``Status`` equals ``"KEPT"`` are treated as kept; every other row is
    counted as deleted. Contig totals are summed over all rows, while the
    GC / length / base statistics use the kept rows only.

    Args:
        log_dir: Root directory of the preprocessing logs.
        category: Report category, forwarded to ``load_logs``.
        split: Data split, forwarded to ``load_logs``.

    Returns:
        The loaded DataFrame, with ``"GC Content (%)"`` and ``"Avg Length"``
        cast to float.
    """
    df = load_logs(log_dir, split, category)

    # Make sure the two float-valued columns really are floats.
    for float_column in ("GC Content (%)", "Avg Length"):
        df[float_column] = df[float_column].astype(float)

    # One boolean mask splits the report into kept / not-kept rows.
    is_kept = df["Status"] == "KEPT"
    kept = df[is_kept]
    deleted = df[~is_kept]

    print(
        f"\n===== Summary for {category.upper()} =====",
        f"Total genomes processed: {len(df)}",
        f" - KEPT: {len(kept)}",
        f" - DELETED: {len(deleted)}",
        sep="\n",
    )

    # Contig totals across every row of the report.
    total_contigs_before = df["Original Contigs"].sum()
    total_after_filter = df["After Filtering"].sum()

    print(
        "\nContig Counts:",
        f" - Original contigs: {total_contigs_before:,}",
        f" - After filtering: {total_after_filter:,}",
        sep="\n",
    )

    # Kept-only statistics; the means fall back to 0.0 when nothing was kept.
    if kept.empty:
        mean_gc = 0.0
        mean_len = 0.0
    else:
        mean_gc = kept["GC Content (%)"].mean()
        mean_len = kept["Avg Length"].mean()
    # Bases retained = sum over kept rows of (contigs after filtering x average length).
    total_bases = (kept["After Filtering"] * kept["Avg Length"]).sum()

    print(
        "\nStatistics for KEPT files:",
        f" - Mean GC %: {mean_gc:.2f}",
        f" - Mean Avg Length: {mean_len:.1f} bp",
        f" - Total Bases Retained: {int(total_bases):,}",
        sep="\n",
    )

    # Per-status row counts.
    print("\nStatus breakdown:")
    print(df["Status"].value_counts())

    return df


In [54]:
archaea_df = analyze_logs(log_dir, category="archaea", split="train")

----------TRAIN----------

===== Summary for ARCHAEA =====
Total genomes processed: 3243
 - KEPT: 3238
 - DELETED: 5

Contig Counts:
 - Original contigs: 391,542
 - After filtering: 359,833

Statistics for KEPT files:
 - Mean GC %: 45.69
 - Mean Avg Length: 377777.2 bp
 - Total Bases Retained: 6,072,132,376

Status breakdown:
Status
KEPT                        3238
DELETED (empty or error)       5
Name: count, dtype: int64


In [55]:
viral_df = analyze_logs(log_dir, category="viral", split="train")

----------TRAIN----------

===== Summary for VIRAL =====
Total genomes processed: 128160
 - KEPT: 127974
 - DELETED: 186

Contig Counts:
 - Original contigs: 702,682
 - After filtering: 691,051

Statistics for KEPT files:
 - Mean GC %: 44.18
 - Mean Avg Length: 16422.3 bp
 - Total Bases Retained: 3,057,903,987

Status breakdown:
Status
KEPT                        127974
DELETED (empty or error)       186
Name: count, dtype: int64


In [56]:
protozoa_df = analyze_logs(log_dir, category="protozoa", split="train")

----------TRAIN----------

===== Summary for PROTOZOA =====
Total genomes processed: 879
 - KEPT: 873
 - DELETED: 6

Contig Counts:
 - Original contigs: 7,716,447
 - After filtering: 3,260,052

Statistics for KEPT files:
 - Mean GC %: 44.76
 - Mean Avg Length: 205546.4 bp
 - Total Bases Retained: 49,117,730,953

Status breakdown:
Status
KEPT                        873
DELETED (empty or error)      6
Name: count, dtype: int64


In [57]:
fungi_df = analyze_logs(log_dir, category="fungi", split="train")

----------TRAIN----------

===== Summary for FUNGI =====
Total genomes processed: 6895
 - KEPT: 6889
 - DELETED: 6

Contig Counts:
 - Original contigs: 22,885,431
 - After filtering: 7,154,918

Statistics for KEPT files:
 - Mean GC %: 46.49
 - Mean Avg Length: 404045.3 bp
 - Total Bases Retained: 219,024,695,767

Status breakdown:
Status
KEPT                        6889
DELETED (empty or error)       6
Name: count, dtype: int64


In [58]:
viral_val_df = analyze_logs(log_dir, category="viral", split="val")

----------VAL----------

===== Summary for VIRAL =====
Total genomes processed: 1730
 - KEPT: 1728
 - DELETED: 2

Contig Counts:
 - Original contigs: 4,484
 - After filtering: 4,457

Statistics for KEPT files:
 - Mean GC %: 45.92
 - Mean Avg Length: 33024.9 bp
 - Total Bases Retained: 69,955,905

Status breakdown:
Status
KEPT                        1728
DELETED (empty or error)       2
Name: count, dtype: int64


In [59]:
archaea_val_df = analyze_logs(log_dir, category="archaea", split="val")

----------VAL----------

===== Summary for ARCHAEA =====
Total genomes processed: 566
 - KEPT: 566
 - DELETED: 0

Contig Counts:
 - Original contigs: 93,128
 - After filtering: 93,013

Statistics for KEPT files:
 - Mean GC %: 46.24
 - Mean Avg Length: 50170.7 bp
 - Total Bases Retained: 769,393,372

Status breakdown:
Status
KEPT    566
Name: count, dtype: int64


In [60]:
protozoa_val_df = analyze_logs(log_dir, category="protozoa", split="val")

----------VAL----------

===== Summary for PROTOZOA =====
Total genomes processed: 14
 - KEPT: 14
 - DELETED: 0

Contig Counts:
 - Original contigs: 385,323
 - After filtering: 196,283

Statistics for KEPT files:
 - Mean GC %: 43.02
 - Mean Avg Length: 308442.3 bp
 - Total Bases Retained: 720,581,149

Status breakdown:
Status
KEPT    14
Name: count, dtype: int64


In [61]:
fungi_val_df = analyze_logs(log_dir, category="fungi", split="val")

----------VAL----------

===== Summary for FUNGI =====
Total genomes processed: 299
 - KEPT: 299
 - DELETED: 0

Contig Counts:
 - Original contigs: 365,026
 - After filtering: 156,083

Statistics for KEPT files:
 - Mean GC %: 47.98
 - Mean Avg Length: 450098.2 bp
 - Total Bases Retained: 10,141,426,788

Status breakdown:
Status
KEPT    299
Name: count, dtype: int64


In [62]:
viral_test_df = analyze_logs(log_dir, category="viral", split="test")

----------TEST----------

===== Summary for VIRAL =====
Total genomes processed: 22047
 - KEPT: 21585
 - DELETED: 462

Contig Counts:
 - Original contigs: 96,619
 - After filtering: 96,023

Statistics for KEPT files:
 - Mean GC %: 41.94
 - Mean Avg Length: 58710.1 bp
 - Total Bases Retained: 1,391,947,835

Status breakdown:
Status
KEPT                        21585
DELETED (empty or error)      462
Name: count, dtype: int64


In [63]:
archaea_test_df = analyze_logs(log_dir, category="archaea", split="test")

----------TEST----------

===== Summary for ARCHAEA =====
Total genomes processed: 2076
 - KEPT: 2075
 - DELETED: 1

Contig Counts:
 - Original contigs: 368,163
 - After filtering: 367,183

Statistics for KEPT files:
 - Mean GC %: 47.53
 - Mean Avg Length: 125850.9 bp
 - Total Bases Retained: 3,110,350,636

Status breakdown:
Status
KEPT                        2075
DELETED (empty or error)       1
Name: count, dtype: int64


In [64]:
protozoa_test_df = analyze_logs(log_dir, category="protozoa", split="test")

----------TEST----------

===== Summary for PROTOZOA =====
Total genomes processed: 385
 - KEPT: 384
 - DELETED: 1

Contig Counts:
 - Original contigs: 986,885
 - After filtering: 729,262

Statistics for KEPT files:
 - Mean GC %: 43.62
 - Mean Avg Length: 101282.5 bp
 - Total Bases Retained: 6,565,240,907

Status breakdown:
Status
KEPT                        384
DELETED (empty or error)      1
Name: count, dtype: int64


In [65]:
fungi_test_df = analyze_logs(log_dir, category="fungi", split="test")

----------TEST----------

===== Summary for FUNGI =====
Total genomes processed: 2564
 - KEPT: 2564
 - DELETED: 0

Contig Counts:
 - Original contigs: 4,386,266
 - After filtering: 1,848,270

Statistics for KEPT files:
 - Mean GC %: 46.80
 - Mean Avg Length: 406296.1 bp
 - Total Bases Retained: 75,433,547,894

Status breakdown:
Status
KEPT    2564
Name: count, dtype: int64


In [66]:
plasmid_df = analyze_logs(log_dir, category="plasmid", split="train")

----------TRAIN----------

===== Summary for PLASMID =====
Total genomes processed: 81509
 - KEPT: 76374
 - DELETED: 5135

Contig Counts:
 - Original contigs: 81,509
 - After filtering: 76,374

Statistics for KEPT files:
 - Mean GC %: 47.04
 - Mean Avg Length: 85062.8 bp
 - Total Bases Retained: 6,496,585,913

Status breakdown:
Status
KEPT                        76374
DELETED (empty or error)     5135
Name: count, dtype: int64


In [67]:
plasmid_val_df = analyze_logs(log_dir, category="plasmid", split="val")

----------VAL----------

===== Summary for PLASMID =====
Total genomes processed: 4794
 - KEPT: 4497
 - DELETED: 297

Contig Counts:
 - Original contigs: 4,794
 - After filtering: 4,497

Statistics for KEPT files:
 - Mean GC %: 47.01
 - Mean Avg Length: 84854.6 bp
 - Total Bases Retained: 381,590,941

Status breakdown:
Status
KEPT                        4497
DELETED (empty or error)     297
Name: count, dtype: int64


In [68]:
plasmid_test_df = analyze_logs(log_dir, category="plasmid", split="test")

----------TEST----------

===== Summary for PLASMID =====
Total genomes processed: 9591
 - KEPT: 8981
 - DELETED: 610

Contig Counts:
 - Original contigs: 9,591
 - After filtering: 8,981

Statistics for KEPT files:
 - Mean GC %: 47.06
 - Mean Avg Length: 83905.9 bp
 - Total Bases Retained: 753,558,532

Status breakdown:
Status
KEPT                        8981
DELETED (empty or error)     610
Name: count, dtype: int64


In [69]:
bacteria_df = analyze_logs(log_dir, category="bacteria", split="train")

----------TRAIN----------

===== Summary for BACTERIA =====
Total genomes processed: 180772
 - KEPT: 180116
 - DELETED: 656

Contig Counts:
 - Original contigs: 23,504,784
 - After filtering: 17,359,293

Statistics for KEPT files:
 - Mean GC %: 48.54
 - Mean Avg Length: 604095.8 bp
 - Total Bases Retained: 669,746,996,127

Status breakdown:
Status
KEPT                        180116
DELETED (empty or error)       656
Name: count, dtype: int64


In [70]:
bacteria_val_df = analyze_logs(log_dir, category="bacteria", split="val")

----------VAL----------

===== Summary for BACTERIA =====
Total genomes processed: 12096
 - KEPT: 12082
 - DELETED: 14

Contig Counts:
 - Original contigs: 2,083,961
 - After filtering: 1,899,769

Statistics for KEPT files:
 - Mean GC %: 48.86
 - Mean Avg Length: 547053.1 bp
 - Total Bases Retained: 40,661,519,996

Status breakdown:
Status
KEPT                        12082
DELETED (empty or error)       14
Name: count, dtype: int64


In [71]:
bacteria_test_df = analyze_logs(log_dir, category="bacteria", split="test")

----------TEST----------

===== Summary for BACTERIA =====
Total genomes processed: 74827
 - KEPT: 74780
 - DELETED: 47

Contig Counts:
 - Original contigs: 14,444,862
 - After filtering: 12,984,304

Statistics for KEPT files:
 - Mean GC %: 48.78
 - Mean Avg Length: 338347.6 bp
 - Total Bases Retained: 216,587,938,910

Status breakdown:
Status
KEPT                        74780
DELETED (empty or error)       47
Name: count, dtype: int64
