In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [2]:
# Directory that the loader functions search for the
# `*_preprocessing_report.csv` files. The path is relative, so it is resolved
# against the kernel's current working directory.
log_dir = Path('../../results/logs/preprocessing-logs')

In [3]:
def load_training_logs(log_dir, category):
    """Load the training-split preprocessing report(s) for one category.

    Searches ``log_dir`` for files matching
    ``{category}_preprocessing_report.csv`` and concatenates every match into
    a single DataFrame with a fresh 0..n-1 index.

    Args:
        log_dir: Directory holding the report CSVs (``pathlib.Path`` or a
            path string).
        category: Prefix of the report file name, e.g. ``'archaea'``. It is
            interpolated into a glob pattern, so any wildcard characters it
            contains are expanded.

    Returns:
        pandas.DataFrame containing the rows of all matching reports.

    Raises:
        ValueError: If no report file matches the pattern.
    """
    pattern = f"./{category}_preprocessing_report.csv"
    # Sort the matches so the concatenated row order is reproducible when
    # more than one file matches (glob order depends on the filesystem).
    csv_files = sorted(Path(log_dir).glob(pattern))
    if not csv_files:
        # pd.concat([]) would fail with the opaque "No objects to
        # concatenate"; keep the exception type but name the missing file.
        raise ValueError(
            f"No preprocessing report matching {pattern!r} found in {log_dir}"
        )
    frames = [pd.read_csv(csv_file) for csv_file in csv_files]
    return pd.concat(frames, ignore_index=True)

In [4]:
def load_val_logs(log_dir, category):
    """Load the validation-split preprocessing report(s) for one category.

    Searches ``log_dir`` for files matching
    ``{category}_val_preprocessing_report.csv`` and concatenates every match
    into a single DataFrame with a fresh 0..n-1 index.

    Args:
        log_dir: Directory holding the report CSVs (``pathlib.Path`` or a
            path string).
        category: Prefix of the report file name, e.g. ``'viral'``. It is
            interpolated into a glob pattern, so any wildcard characters it
            contains are expanded.

    Returns:
        pandas.DataFrame containing the rows of all matching reports.

    Raises:
        ValueError: If no report file matches the pattern.
    """
    pattern = f"./{category}_val_preprocessing_report.csv"
    # Sort the matches so the concatenated row order is reproducible when
    # more than one file matches (glob order depends on the filesystem).
    csv_files = sorted(Path(log_dir).glob(pattern))
    if not csv_files:
        # pd.concat([]) would fail with the opaque "No objects to
        # concatenate"; keep the exception type but name the missing file.
        raise ValueError(
            f"No preprocessing report matching {pattern!r} found in {log_dir}"
        )
    frames = [pd.read_csv(csv_file) for csv_file in csv_files]
    return pd.concat(frames, ignore_index=True)

In [5]:
def load_test_logs(log_dir, category):
    """Load the test-split preprocessing report(s) for one category.

    Searches ``log_dir`` for files matching
    ``{category}_test_preprocessing_report.csv`` and concatenates every match
    into a single DataFrame with a fresh 0..n-1 index.

    Args:
        log_dir: Directory holding the report CSVs (``pathlib.Path`` or a
            path string).
        category: Prefix of the report file name, e.g. ``'fungi'``. It is
            interpolated into a glob pattern, so any wildcard characters it
            contains are expanded.

    Returns:
        pandas.DataFrame containing the rows of all matching reports.

    Raises:
        ValueError: If no report file matches the pattern.
    """
    pattern = f"./{category}_test_preprocessing_report.csv"
    # Sort the matches so the concatenated row order is reproducible when
    # more than one file matches (glob order depends on the filesystem).
    csv_files = sorted(Path(log_dir).glob(pattern))
    if not csv_files:
        # pd.concat([]) would fail with the opaque "No objects to
        # concatenate"; keep the exception type but name the missing file.
        raise ValueError(
            f"No preprocessing report matching {pattern!r} found in {log_dir}"
        )
    frames = [pd.read_csv(csv_file) for csv_file in csv_files]
    return pd.concat(frames, ignore_index=True)

In [6]:
def analyze_logs(log_dir, category, set):
    """Print a summary of one preprocessing report and return its rows.

    The report must contain the columns ``"Status"``, ``"GC Content (%)"``,
    ``"Avg Length"``, ``"Original Contigs"`` and ``"After Filtering"``.

    Args:
        log_dir: Directory containing the preprocessing report CSVs.
        category: Report category, e.g. ``'archaea'``; used to pick the file
            and upper-cased in the printed heading.
        set: Which split to load: ``'training'``, ``'validation'`` or
            ``'test'``. Any other value still loads the test report (the
            historical behaviour) but now emits a warning, because a typo
            such as ``'train'`` would otherwise silently summarize the wrong
            split. The name shadows the ``set`` builtin inside this function;
            it is kept so keyword callers keep working.

    Returns:
        The loaded pandas.DataFrame, with ``"GC Content (%)"`` and
        ``"Avg Length"`` cast to float.
    """
    import warnings

    if set == 'training':
        df = load_training_logs(log_dir, category)
        print('---------- TRAINING ----------')
    elif set == 'validation':
        df = load_val_logs(log_dir, category)
        print('---------- VALIDATION ----------')
    else:
        if set != 'test':
            warnings.warn(
                f"Unknown set {set!r}; expected 'training', 'validation' or "
                "'test'. Falling back to the test logs.",
                stacklevel=2,
            )
        df = load_test_logs(log_dir, category)
        print('---------- TEST ----------')

    # Convert numeric columns
    df["GC Content (%)"] = df["GC Content (%)"].astype(float)
    df["Avg Length"] = df["Avg Length"].astype(float)

    # Summary counts: every status other than "KEPT" is counted as deleted.
    is_kept = df["Status"] == "KEPT"
    kept = df[is_kept]
    deleted = df[~is_kept]

    print(f"\n===== Summary for {category.upper()} =====")
    print(f"Total genomes processed: {len(df)}")
    print(f" - KEPT: {len(kept)}")
    print(f" - DELETED: {len(deleted)}")

    # Aggregate contig stats over all rows (kept and deleted alike).
    total_contigs_before = df["Original Contigs"].sum()
    total_after_filter = df["After Filtering"].sum()

    print("\nContig Counts:")
    print(f" - Original contigs: {total_contigs_before:,}")
    print(f" - After filtering: {total_after_filter:,}")

    # GC and Length stats (for KEPT); report 0.0 instead of NaN when nothing
    # was kept.
    mean_gc = kept["GC Content (%)"].mean() if not kept.empty else 0.0
    mean_len = kept["Avg Length"].mean() if not kept.empty else 0.0
    # Estimate: per-row contig count times per-row average length, so it is
    # only as exact as the stored averages.
    # NOTE(review): presumably "Avg Length" is the mean contig length after
    # filtering -- confirm against the preprocessing script.
    total_bases = (kept["After Filtering"] * kept["Avg Length"]).sum()

    print("\nStatistics for KEPT files:")
    print(f" - Mean GC %: {mean_gc:.2f}")
    print(f" - Mean Avg Length: {mean_len:.1f} bp")
    print(f" - Total bases retained: {int(total_bases):,}")

    # Status breakdown
    print("\nStatus breakdown:")
    print(df["Status"].value_counts())

    return df


In [7]:
# Print the archaea training-split summary and bind the loaded rows to archaea_df.
archaea_df = analyze_logs(log_dir, category='archaea', set='training')

---------- TRAINING ----------

===== Summary for ARCHAEA =====
Total genomes processed: 3243
 - KEPT: 3239
 - DELETED: 4

Contig Counts:
 - Original contigs: 391,542
 - After filtering: 374,747

Statistics for KEPT files:
 - Mean GC %: 45.64
 - Mean Avg Length: 378032.4 bp
 - Total bases retained: 6,176,019,445

Status breakdown:
Status
KEPT                        3239
DELETED (empty or error)       4
Name: count, dtype: int64


In [8]:
# Print the viral training-split summary and bind the loaded rows to viral_df.
viral_df = analyze_logs(log_dir, category='viral', set='training')

---------- TRAINING ----------

===== Summary for VIRAL =====
Total genomes processed: 129831
 - KEPT: 129677
 - DELETED: 154

Contig Counts:
 - Original contigs: 704,353
 - After filtering: 692,902

Statistics for KEPT files:
 - Mean GC %: 44.24
 - Mean Avg Length: 16836.4 bp
 - Total bases retained: 3,139,870,091

Status breakdown:
Status
KEPT                        129677
DELETED (empty or error)       154
Name: count, dtype: int64


In [9]:
# Print the protozoa training-split summary and bind the loaded rows to protozoa_df.
protozoa_df = analyze_logs(log_dir, category='protozoa', set='training')

---------- TRAINING ----------

===== Summary for PROTOZOA =====
Total genomes processed: 879
 - KEPT: 877
 - DELETED: 2

Contig Counts:
 - Original contigs: 7,716,447
 - After filtering: 3,388,244

Statistics for KEPT files:
 - Mean GC %: 44.58
 - Mean Avg Length: 213361.0 bp
 - Total bases retained: 51,936,515,609

Status breakdown:
Status
KEPT                        877
DELETED (empty or error)      2
Name: count, dtype: int64


In [10]:
# Print the fungi training-split summary and bind the loaded rows to fungi_df.
fungi_df = analyze_logs(log_dir, category='fungi', set='training')

---------- TRAINING ----------

===== Summary for FUNGI =====
Total genomes processed: 6897
 - KEPT: 6896
 - DELETED: 1

Contig Counts:
 - Original contigs: 22,887,089
 - After filtering: 7,332,353

Statistics for KEPT files:
 - Mean GC %: 46.43
 - Mean Avg Length: 403193.2 bp
 - Total bases retained: 224,498,241,515

Status breakdown:
Status
KEPT                        6896
DELETED (empty or error)       1
Name: count, dtype: int64


In [11]:
# Print the viral validation-split summary and bind the loaded rows to viral_val_df.
viral_val_df = analyze_logs(log_dir, category='viral', set='validation')

---------- VALIDATION ----------

===== Summary for VIRAL =====
Total genomes processed: 1730
 - KEPT: 1730
 - DELETED: 0

Contig Counts:
 - Original contigs: 4,484
 - After filtering: 4,459

Statistics for KEPT files:
 - Mean GC %: 45.93
 - Mean Avg Length: 33073.6 bp
 - Total bases retained: 70,106,312

Status breakdown:
Status
KEPT    1730
Name: count, dtype: int64


In [12]:
# Print the archaea validation-split summary and bind the loaded rows to archaea_val_df.
archaea_val_df = analyze_logs(log_dir, category='archaea', set='validation')

---------- VALIDATION ----------

===== Summary for ARCHAEA =====
Total genomes processed: 566
 - KEPT: 566
 - DELETED: 0

Contig Counts:
 - Original contigs: 93,128
 - After filtering: 93,075

Statistics for KEPT files:
 - Mean GC %: 46.23
 - Mean Avg Length: 50167.9 bp
 - Total bases retained: 769,953,467

Status breakdown:
Status
KEPT    566
Name: count, dtype: int64


In [13]:
# Print the protozoa validation-split summary and bind the loaded rows to protozoa_val_df.
protozoa_val_df = analyze_logs(log_dir, category='protozoa', set='validation')

---------- VALIDATION ----------

===== Summary for PROTOZOA =====
Total genomes processed: 14
 - KEPT: 14
 - DELETED: 0

Contig Counts:
 - Original contigs: 385,323
 - After filtering: 196,395

Statistics for KEPT files:
 - Mean GC %: 43.02
 - Mean Avg Length: 308440.0 bp
 - Total bases retained: 721,011,537

Status breakdown:
Status
KEPT    14
Name: count, dtype: int64


In [14]:
# Print the fungi validation-split summary and bind the loaded rows to fungi_val_df.
fungi_val_df = analyze_logs(log_dir, category='fungi', set='validation')

---------- VALIDATION ----------

===== Summary for FUNGI =====
Total genomes processed: 299
 - KEPT: 299
 - DELETED: 0

Contig Counts:
 - Original contigs: 365,026
 - After filtering: 157,454

Statistics for KEPT files:
 - Mean GC %: 47.98
 - Mean Avg Length: 449333.7 bp
 - Total bases retained: 10,171,948,258

Status breakdown:
Status
KEPT    299
Name: count, dtype: int64


In [15]:
# Print the viral test-split summary and bind the loaded rows to viral_test_df.
viral_test_df = analyze_logs(log_dir, category='viral', set='test')

---------- TEST ----------

===== Summary for VIRAL =====
Total genomes processed: 22047
 - KEPT: 21884
 - DELETED: 163

Contig Counts:
 - Original contigs: 96,619
 - After filtering: 96,376

Statistics for KEPT files:
 - Mean GC %: 41.79
 - Mean Avg Length: 60469.0 bp
 - Total bases retained: 1,448,101,969

Status breakdown:
Status
KEPT                        21884
DELETED (empty or error)      163
Name: count, dtype: int64


In [16]:
# Print the archaea test-split summary and bind the loaded rows to archaea_test_df.
archaea_test_df = analyze_logs(log_dir, category='archaea', set='test')

---------- TEST ----------

===== Summary for ARCHAEA =====
Total genomes processed: 2074
 - KEPT: 2073
 - DELETED: 1

Contig Counts:
 - Original contigs: 368,161
 - After filtering: 367,499

Statistics for KEPT files:
 - Mean GC %: 47.53
 - Mean Avg Length: 122947.9 bp
 - Total bases retained: 3,106,062,737

Status breakdown:
Status
KEPT                        2073
DELETED (empty or error)       1
Name: count, dtype: int64


In [17]:
# Print the protozoa test-split summary and bind the loaded rows to protozoa_test_df.
protozoa_test_df = analyze_logs(log_dir, category='protozoa', set='test')

---------- TEST ----------

===== Summary for PROTOZOA =====
Total genomes processed: 385
 - KEPT: 385
 - DELETED: 0

Contig Counts:
 - Original contigs: 986,885
 - After filtering: 733,045

Statistics for KEPT files:
 - Mean GC %: 43.52
 - Mean Avg Length: 106907.4 bp
 - Total bases retained: 6,984,905,693

Status breakdown:
Status
KEPT    385
Name: count, dtype: int64


In [18]:
# Print the fungi test-split summary and bind the loaded rows to fungi_test_df.
fungi_test_df = analyze_logs(log_dir, category='fungi', set='test')

---------- TEST ----------

===== Summary for FUNGI =====
Total genomes processed: 2564
 - KEPT: 2564
 - DELETED: 0

Contig Counts:
 - Original contigs: 4,386,266
 - After filtering: 1,860,771

Statistics for KEPT files:
 - Mean GC %: 46.76
 - Mean Avg Length: 405658.8 bp
 - Total bases retained: 76,102,766,806

Status breakdown:
Status
KEPT    2564
Name: count, dtype: int64
