In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [41]:
# Directory holding the preprocessing report CSVs. The path is relative, so it
# is resolved against the working directory the notebook kernel runs in.
log_dir = Path('../../results/logs/preprocessing-logs')

In [42]:
def load_logs(log_dir, file_name='*.csv'):
    """Load every CSV under ``log_dir`` that matches ``file_name`` into one frame.

    Parameters
    ----------
    log_dir : str or Path
        Directory that is searched recursively with ``Path.rglob``.
    file_name : str, default '*.csv'
        Glob pattern passed to ``rglob``.

    Returns
    -------
    pandas.DataFrame or None
        All matched files concatenated with a fresh integer index, plus two
        helper columns: ``__source_file`` (the file's name) and ``__log_type``
        ('deduplication', 'near_identical' or 'preprocessing', inferred from
        the file name). ``None`` is returned, after printing a notice, when no
        file matches.
    """
    log_dir = Path(log_dir)
    # Sort the matches so the row order of the result does not depend on the
    # order in which the filesystem happens to enumerate the files.
    csv_files = sorted(log_dir.rglob(file_name))
    if not csv_files:
        print('No CSV files found')
        return None

    all_dfs = []
    for csv_file in csv_files:
        df = pd.read_csv(csv_file)
        df['__source_file'] = csv_file.name
        # The log type is derived purely from the file name; anything that is
        # not a deduplication or near-identical report counts as preprocessing.
        df['__log_type'] = (
            'deduplication' if 'deduplication' in csv_file.name else
            'near_identical' if 'near_identical' in csv_file.name else
            'preprocessing'
        )
        all_dfs.append(df)
    return pd.concat(all_dfs, ignore_index=True)

In [43]:
def analyze_preprocessing_logs(log_dir, category):
    """Print a summary of the preprocessing report rows for one category.

    Report files matching ``{category}/{category}_*_report.csv`` are loaded
    through ``load_logs`` and reduced to the rows whose ``__log_type`` is
    'preprocessing'. The summary covers KEPT / non-KEPT counts, contig totals
    before and after filtering, mean GC content and mean average length of the
    KEPT rows, the sum of ``After Filtering * Avg Length`` over the KEPT rows,
    and the ``Status`` value counts.

    The preprocessing rows are expected to provide the columns "Status",
    "GC Content (%)", "Avg Length", "Original Contigs" and "After Filtering".

    Parameters
    ----------
    log_dir : str or Path
        Directory handed to ``load_logs``.
    category : str
        Category name used both as sub-directory and as file-name prefix.

    Returns
    -------
    pandas.DataFrame or None
        The preprocessing rows (possibly empty), or ``None`` when
        ``load_logs`` finds no matching file.
    """
    df = load_logs(log_dir, f"./{category}/{category}_*_report.csv")

    if df is None:
        return None

    # Take an explicit copy: columns are assigned below, and assigning on a
    # boolean-filtered slice triggers pandas' chained-assignment warning.
    df = df[df['__log_type'] == 'preprocessing'].copy()

    if df.empty:
        # The glob also matches deduplication / near-identical reports, so the
        # preprocessing columns may be missing entirely when no such rows exist.
        print(f"No preprocessing rows found for {category}")
        return df

    # Convert numeric columns
    df["GC Content (%)"] = df["GC Content (%)"].astype(float)
    df["Avg Length"] = df["Avg Length"].astype(float)

    # Summary counts
    kept = df[df["Status"] == "KEPT"]
    deleted = df[df["Status"] != "KEPT"]

    print(f"\n===== Summary for {category.upper()} =====")
    print(f"Total genomes processed: {len(df)}")
    print(f" - KEPT: {len(kept)}")
    print(f" - DELETED: {len(deleted)}")

    # Aggregate contig stats. Concatenating with the other report types can
    # leave these count columns as floats (missing values become NaN), so the
    # sums are cast back to int to print as whole numbers.
    total_contigs_before = int(df["Original Contigs"].sum())
    total_after_filter = int(df["After Filtering"].sum())

    print("\nContig Counts:")
    print(f" - Original contigs: {total_contigs_before:,}")
    print(f" - After filtering: {total_after_filter:,}")

    # GC and Length stats (for KEPT); fall back to 0.0 when nothing was kept
    mean_gc = kept["GC Content (%)"].mean() if not kept.empty else 0.0
    mean_len = kept["Avg Length"].mean() if not kept.empty else 0.0
    total_bases = (kept["After Filtering"] * kept["Avg Length"]).sum()

    print("\nStatistics for KEPT files:")
    print(f" - Mean GC %: {mean_gc:.2f}")
    print(f" - Mean Avg Length: {mean_len:.1f} bp")
    print(f" - Total Bases Retained: {int(total_bases):,}")

    # Status breakdown
    print("\nStatus breakdown:")
    print(df["Status"].value_counts())

    return df


In [44]:
def analyze_deduplication_logs(log_dir, category):
    """Print value counts for the deduplication report rows of one category.

    Loads the category's ``*_report.csv`` files through ``load_logs``, keeps
    the rows whose ``__log_type`` is 'deduplication', and prints the value
    counts of the "Split" and "Reason" columns when they are present.

    Returns the filtered DataFrame, or ``None`` when ``load_logs`` finds no
    matching file.
    """
    all_logs = load_logs(log_dir, f"./{category}/{category}_*_report.csv")
    if all_logs is None:
        return None

    is_dedup = all_logs['__log_type'] == 'deduplication'
    dedup_rows = all_logs[is_dedup]

    print(f"\n===== Deduplication Summary for {category.upper()} =====")

    # Each optional column gets its own heading followed by its value counts.
    reports = (
        ("Split", "\nRemovals per split:"),
        ("Reason", "\nReasons for removal:"),
    )
    for column, heading in reports:
        if column not in dedup_rows.columns:
            continue
        print(heading)
        print(dedup_rows[column].value_counts())

    return dedup_rows


In [45]:
def analyze_near_identical_logs(log_dir, category):
    """Print the per-split counts of the near-identical report for one category.

    Loads ``{category}/{category}_near_identical_report.csv`` through
    ``load_logs``, keeps the rows whose ``__log_type`` is 'near_identical',
    and prints the value counts of the "Split" column when it is present.

    Returns the filtered DataFrame, or ``None`` when ``load_logs`` finds no
    matching file.
    """
    pattern = f"{category}/{category}_near_identical_report.csv"
    loaded = load_logs(log_dir, pattern)
    if loaded is None:
        return None

    is_near_identical = loaded["__log_type"] == "near_identical"
    near_identical_rows = loaded[is_near_identical]

    print(f"\n===== Near-Identical Summary for {category.upper()} =====")
    has_split = "Split" in near_identical_rows.columns
    if has_split:
        split_counts = near_identical_rows["Split"].value_counts()
        print("\nRemovals per split:")
        print(split_counts)

    return near_identical_rows


In [46]:
def analyze_log_per_category(log_dir, category):
    """Run the three log analyses for one category and collect their results.

    The analyses run (and print) in the order preprocessing, deduplication,
    near-identical. The returned dict maps 'preprocessing', 'deduplication'
    and 'near_identical' to whatever the corresponding analysis returned.
    """
    # Dict values are evaluated top to bottom, so the printed summaries keep
    # the preprocessing -> deduplication -> near-identical order.
    return {
        'preprocessing': analyze_preprocessing_logs(log_dir, category),
        'deduplication': analyze_deduplication_logs(log_dir, category),
        'near_identical': analyze_near_identical_logs(log_dir, category),
    }

In [47]:
# Run all three log analyses for one category. The summaries are printed below
# the cell; `results` is a dict keyed by 'preprocessing', 'deduplication' and
# 'near_identical' holding what each analysis returned.
category = 'archaea'

results = analyze_log_per_category(log_dir, category)


===== Summary for ARCHAEA =====
Total genomes processed: 5885
 - KEPT: 5879
 - DELETED: 6

Contig Counts:
 - Original contigs: 852,833.0
 - After filtering: 820,029.0

Statistics for KEPT files:
 - Mean GC %: 46.39
 - Mean Avg Length: 257319.3 bp
 - Total Bases Retained: 9,951,876,385

Status breakdown:
Status
KEPT                        5879
DELETED (empty or error)       6
Name: count, dtype: int64

===== Deduplication Summary for ARCHAEA =====

Removals per split:
Split
test    4
Name: count, dtype: int64

Reasons for removal:
Reason
Exact duplicate (MD5, Train vs Test)    4
Name: count, dtype: int64

===== Near-Identical Summary for ARCHAEA =====

Removals per split:
Split
test    725
val      58
Name: count, dtype: int64
