In [18]:
# IMPORTS AND VERSION CHECK
from pathlib import Path
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from dotenv import load_dotenv
import sys
from time import strftime
import requests

# PyDESeq2
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats

# Reproducibility: seed NumPy's legacy global random state.
# NOTE(review): this only affects code that draws from np.random's global state —
# confirm whether anything downstream actually depends on it.
np.random.seed(16)

In [19]:
# ENV AND PATHS

# Load environment variables from the .env file at the repository root.
# NOTE(review): assumes the kernel's working directory is a direct sub-folder of the
# repository (so that its parent is the repo root) — confirm for other launch setups.
repo_root = Path.cwd().parent
load_dotenv(repo_root / ".env")

# Fail early with an actionable message: Path(None) would otherwise raise an opaque TypeError.
_missing_env = [var for var in ("INPUT_DIR", "OUTPUT_DIR") if not os.getenv(var)]
if _missing_env:
    raise EnvironmentError(
        f"Missing environment variable(s) {_missing_env}; "
        f"define them in {repo_root / '.env'} or in the shell environment."
    )

# Paths exactly as configured (joined onto repo_root below)
INPUT_DIR = Path(os.environ["INPUT_DIR"])
OUTPUT_DIR = Path(os.environ["OUTPUT_DIR"])

# Inputs
RAW_DIR = (repo_root / INPUT_DIR / "raw" / "functional_annotation").resolve()
META_PATH = (repo_root / INPUT_DIR / "metadata" / "metadata.csv").resolve()

# Outputs: one folder per analysis, with csv/ and plots/ sub-folders created up front
analysis_name = "01_deseq_study"
OUT_ROOT = (repo_root / OUTPUT_DIR / analysis_name).resolve()
OUT_CSV = OUT_ROOT / "csv"
OUT_PLOTS = OUT_ROOT / "plots"
for d in [OUT_ROOT, OUT_CSV, OUT_PLOTS]:
    d.mkdir(parents=True, exist_ok=True)

# Check paths
print(f"Repo root: {repo_root}")
print(f"Input dir: {INPUT_DIR}")
print(f"Output dir: {OUTPUT_DIR}")

Repo root: c:\Users\Andre\OneDrive\Escritorio\tfm_andrea_ramos
Input dir: data
Output dir: results


In [20]:
# FUNCTIONS

def read_emapper(raw_dir, pattern="ERR*.emapper.annotations"):
    """
    Read all eggNOG-mapper annotation files under `raw_dir` matching `pattern`
    and return a single long DataFrame with one row per (sample, query, module).

    Parameters
    ----------
    raw_dir : str or Path
        Directory that contains the annotation files.
    pattern : str
        Glob pattern used to select files inside `raw_dir`.

    Returns
    -------
    pandas.DataFrame
        Columns ['sample_id', 'query', 'KEGG_Module']. Comma-separated module
        lists are exploded to one module per row; '-' and empty entries are dropped.

    Raises
    ------
    FileNotFoundError
        If no file matches `pattern`.
    RuntimeError
        If none of the matched files could be parsed.

    Notes
    -----
    Files without a '#query' header line, or whose header lacks 'query' /
    'KEGG_Module', are skipped with a printed message.
    """
    raw_dir = Path(raw_dir)
    files = sorted(raw_dir.glob(pattern))
    if not files:
        raise FileNotFoundError(f"No files matching '{pattern}' found in {raw_dir}")

    wanted = ["query", "KEGG_Module"]
    all_dfs = []
    for fp in files:
        # Sample id = ERR accession found in the file name, else the first dot-separated token
        m = re.search(r"(ERR\d+)", fp.name, flags=re.I)
        sample_id = m.group(1).upper() if m else fp.stem.split(".")[0]

        with open(fp, "r", encoding="utf-8", errors="ignore") as f:
            # Advance the handle to just after the '#query' header line.
            header = None
            for line in iter(f.readline, ""):
                if line.startswith("#query"):
                    header = line.lstrip("#").strip().split("\t")
                    break
            if header is None:
                print(f"Skipping {fp.name}: no '#query' header found.")
                continue

            missing = [c for c in wanted if c not in header]
            if missing:
                print(f"Skipping {fp.name}: missing column(s) {missing}.")
                continue

            # Parse the remainder of the same handle instead of using comment="#":
            # the comment option also truncates data rows that merely contain '#'
            # (e.g. in a description), which silently dropped their KEGG_Module.
            df = pd.read_csv(
                f,
                sep="\t",
                header=None,
                names=header,
                usecols=wanted,
                dtype=str,
                low_memory=False,
            )

        df = df[wanted].dropna()
        # Remaining comment/footer lines ('## ...') start with '#'; '-' means "no module".
        keep = ~df["query"].str.startswith("#", na=False) & (df["KEGG_Module"] != "-")
        df = (
            df.loc[keep]
            .assign(KEGG_Module=lambda d: d["KEGG_Module"].str.split(","))
            .explode("KEGG_Module", ignore_index=True)
        )
        df["KEGG_Module"] = df["KEGG_Module"].str.strip()
        df = df[df["KEGG_Module"] != ""].copy()

        df.insert(0, "sample_id", sample_id)
        all_dfs.append(df[["sample_id", "query", "KEGG_Module"]])

    if not all_dfs:
        raise RuntimeError("No valid annotation files could be read.")

    out = pd.concat(all_dfs, ignore_index=True)
    # Report the number of files actually parsed, not merely matched by the glob.
    print(f"Loaded {len(all_dfs)} files — {out['sample_id'].nunique()} samples, {len(out):,} rows.")
    return out

def summarize_modules(df):
    """
    Collapse the long annotation table into per-sample module counts.

    Returns a tuple (long_counts, matrix):
      - long_counts: one row per (sample_id, KEGG_Module) with 'n_proteins',
        the number of distinct `query` values annotated with that module.
      - matrix: wide integer table, samples as rows and modules as columns;
        (sample, module) pairs absent from long_counts are filled with 0.
    """
    # Distinct queries per (sample, module) pair
    per_pair = df.groupby(["sample_id", "KEGG_Module"])["query"].nunique()
    long_counts = per_pair.rename("n_proteins").reset_index()

    # Long -> wide; missing pairs become 0 before casting back to int
    wide = long_counts.pivot(index="sample_id", columns="KEGG_Module", values="n_proteins")
    matrix = wide.fillna(0).astype(int)
    return long_counts, matrix

def fetch_kegg_reference():
    """
    Fetch the KEGG module list from the KEGG REST API and return it as a DataFrame.

    Returns
    -------
    pandas.DataFrame
        Columns ['KEGG_Module', 'Description', 'Module_name', 'Module_description'],
        one row per module id (first occurrence kept):
          - KEGG_Module: module id with any 'md:' / 'module:' prefix removed
          - Description / Module_description: the full description text
          - Module_name: the description up to its first comma
        An empty frame (same columns) is returned when the response has no usable line.

    Raises
    ------
    requests.RequestException
        On network failure, timeout (60 s) or a non-2xx HTTP status.
    """
    url = "https://rest.kegg.jp/list/module"
    r = requests.get(url, timeout=60)
    r.raise_for_status()

    # Split each line on the FIRST tab only, and ignore blank/malformed lines,
    # so the two-column frame below can always be built.
    rows = []
    for line in r.text.splitlines():
        module_id, sep, description = line.partition("\t")
        module_id = module_id.strip()
        if not sep or not module_id:
            continue
        rows.append((module_id, description.strip()))

    df = pd.DataFrame(rows, columns=["KEGG_Module", "Description"])
    df["KEGG_Module"] = (
        df["KEGG_Module"].str.replace(r"^(?:md|module):", "", regex=True).str.strip()
    )
    df["Module_name"] = df["Description"].str.split(",", n=1).str[0]
    df["Module_description"] = df["Description"]
    return df.drop_duplicates(subset=["KEGG_Module"])

def percent_unmapped(modules, reference_df):
    """
    Share (in percent, rounded to 2 decimals) of the distinct entries of `modules`
    that do not occur in reference_df['KEGG_Module']; 0.0 when `modules` is empty.
    """
    observed = set(modules)
    known = set(reference_df["KEGG_Module"])
    if not observed:
        return 0.0
    n_unknown = len(observed - known)
    return round(100 * n_unknown / len(observed), 2)

def matrix_sparsity(matrix):
    """
    Percentage of cells equal to zero in a counts matrix, rounded to 2 decimals.
    Returns 0.0 for a matrix with no cells.
    """
    n_cells = matrix.size
    if not n_cells:
        return 0.0
    n_zero = (matrix == 0).sum().sum()
    return round(100 * n_zero / n_cells, 2)

# Read metadata

def load_metadata(meta_path):
    """
    Load the metadata CSV and return a DataFrame with columns ['sample_id', 'group'].

    Parameters
    ----------
    meta_path : str, Path or file-like
        CSV with at least the columns 'NCBI_accession' and 'study_condition'.

    Returns
    -------
    pandas.DataFrame
        - sample_id: upper-cased ERR accession extracted (case-insensitively)
          from 'NCBI_accession'
        - group: 'study_condition' stripped and lower-cased
        Rows without an ERR accession or without a condition are dropped, and
        only the first row per sample_id is kept.

    Raises
    ------
    ValueError
        If a required column is missing.
    """
    meta = pd.read_csv(meta_path)
    if "NCBI_accession" not in meta.columns or "study_condition" not in meta.columns:
        raise ValueError("Metadata must have columns 'NCBI_accession' and 'study_condition'.")

    # Case-insensitive match, consistent with how read_emapper derives sample ids.
    meta["sample_id"] = (
        meta["NCBI_accession"]
        .astype(str)
        .str.extract(r"(ERR\d+)", flags=re.IGNORECASE, expand=False)
        .str.upper()
    )

    # astype(str) turns missing values into the literal "nan"; mask them (and blank
    # strings) back to NaN so the dropna below really removes rows without a group.
    raw_group = meta["study_condition"]
    group = raw_group.astype(str).str.strip().str.lower()
    meta["group"] = group.where(raw_group.notna() & group.ne(""))

    meta = meta.dropna(subset=["sample_id", "group"]).drop_duplicates("sample_id")
    return meta[["sample_id", "group"]]

# Results 

def save_csv(df, out_dir, name, stamp=False):
    """
    Save a DataFrame as '<name>.csv' inside `out_dir` (created if missing).

    Parameters
    ----------
    df : pandas.DataFrame
        Table to write; the index is written as the first column.
    out_dir : str or Path
        Destination directory.
    name : str
        File name without extension.
    stamp : bool, default False
        When True, append '_YYYYMMDD_HHMMSS' (local time) to the file name.
        Off by default so that other cells can reload files by their fixed name.

    Returns
    -------
    Path
        Path of the written file.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    suffix = f"_{strftime('%Y%m%d_%H%M%S')}" if stamp else ""
    file_path = out_dir / f"{name}{suffix}.csv"
    df.to_csv(file_path, index=True)
    print(f"CSV saved in: {file_path}")
    return file_path

def save_plot(out_dir, name, dpi=300, stamp=False):
    """
    Save the current Matplotlib figure as '<name>.png' inside `out_dir`
    (created if missing), then close it.

    Parameters
    ----------
    out_dir : str or Path
        Destination directory.
    name : str
        File name without extension.
    dpi : int, default 300
        Resolution passed to `plt.savefig`.
    stamp : bool, default False
        When True, append '_YYYYMMDD_HHMMSS' (local time) to the file name.

    Returns
    -------
    Path
        Path of the written file.
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    suffix = f"_{strftime('%Y%m%d_%H%M%S')}" if stamp else ""
    file_path = out_dir / f"{name}{suffix}.png"
    plt.savefig(file_path, dpi=dpi, bbox_inches="tight")
    # Close the current figure after saving it
    plt.close()
    print(f"FIG saved in: {file_path}")
    return file_path

In [21]:
# LOAD ANNOTATIONS AND BUILD ABUNDANCE MATRIX (E-MAPPER -> LONG AND MATRIX)
# Defines the kernel globals `annotations_long`, `long_counts` and `matrix` used by later cells.

# Read all emapper annotations into one long table (one row per sample/query/module)
annotations_long = read_emapper(RAW_DIR) 
print(f"Total KEGG_Module annotations: {len(annotations_long):,}")

# Summarize to long_counts (unique queries per sample and module) and counts matrix (samples x modules)
long_counts, matrix = summarize_modules(annotations_long)
print(f"Matrix shape (samples x modules): {matrix.shape} | Sparsity: {matrix_sparsity(matrix)}%")

# Save CSV outputs; counts_matrix.csv is the file a later cell reloads into `matrix`.
# The last call is the cell's final expression, so its returned path is displayed as the cell output.
save_csv(annotations_long, OUT_CSV, "annotations_long")
save_csv(long_counts, OUT_CSV, "counts_long")
save_csv(matrix, OUT_CSV, "counts_matrix")

Loaded 171 files — 171 samples, 9,237,561 rows.
Total KEGG_Module annotations: 9,237,561
Matrix shape (samples x modules): (171, 757) | Sparsity: 30.07%
CSV saved in: C:\Users\Andre\OneDrive\Escritorio\tfm_andrea_ramos\results\01_deseq_study\csv\annotations_long.csv
CSV saved in: C:\Users\Andre\OneDrive\Escritorio\tfm_andrea_ramos\results\01_deseq_study\csv\counts_long.csv
CSV saved in: C:\Users\Andre\OneDrive\Escritorio\tfm_andrea_ramos\results\01_deseq_study\csv\counts_matrix.csv


WindowsPath('C:/Users/Andre/OneDrive/Escritorio/tfm_andrea_ramos/results/01_deseq_study/csv/counts_matrix.csv')

In [40]:
# Reload the samples x modules count matrix from counts_matrix.csv (first CSV column becomes the index),
# replacing whatever `matrix` currently holds in the kernel.
# NOTE(review): presumably added so the KEGG filter cell can be re-run from the full matrix after
# `matrix` was overwritten; the non-sequential execution counts suggest out-of-order runs — confirm it is still needed.
matrix = pd.read_csv(OUT_CSV / "counts_matrix.csv", index_col=0)

In [41]:
# KEGG MODULE ANNOTATION AND FILTER

# Only the download is allowed to fail softly. Errors in the merge/filter/save steps
# below must surface as real errors instead of being reported as "KEGG fetch failed".
try:
    kmap = fetch_kegg_reference()
except (requests.RequestException, ValueError) as e:
    # RequestException: network/HTTP problem; ValueError: response could not be parsed
    print("KEGG fetch failed; continuing without annotation:", e)
    kmap = None

if kmap is not None and kmap.empty:
    # An empty reference would filter every module out of `matrix`; treat it as unavailable.
    print("KEGG reference is empty; continuing without annotation.")
    kmap = None

if kmap is not None:
    pct_unmapped = percent_unmapped(matrix.columns, kmap)
    print(f"% of matrix modules not found in KEGG: {pct_unmapped}%")

    # Annotate long table and keep only modules that exist in KEGG
    long_counts_annot = long_counts.merge(kmap, on="KEGG_Module", how="inner")
    valid_mods = set(long_counts_annot["KEGG_Module"].unique())
    matrix = matrix.loc[:, sorted([c for c in matrix.columns if c in valid_mods])]
    print(f"Modules retained with KEGG description: {matrix.shape[1]}")

    # Save
    save_csv(kmap, OUT_CSV, "kegg_reference")
    save_csv(long_counts_annot, OUT_CSV, "counts_long_annotated")
else:
    # Fallback: no reference, so `matrix` keeps ALL modules. Say so explicitly,
    # because every downstream result then differs from a run with the KEGG filter.
    print("WARNING: `matrix` was NOT restricted to modules present in KEGG.")
    long_counts_annot = long_counts.copy()
    long_counts_annot["Module_name"] = long_counts_annot["KEGG_Module"]
    long_counts_annot["Module_description"] = "NA"

    # Save fallback
    save_csv(long_counts_annot, OUT_CSV, "counts_long_annotated_fallback")

% of matrix modules not found in KEGG: 55.88%
Modules retained with KEGG description: 334
CSV saved in: C:\Users\Andre\OneDrive\Escritorio\tfm_andrea_ramos\results\01_deseq_study\csv\kegg_reference.csv
CSV saved in: C:\Users\Andre\OneDrive\Escritorio\tfm_andrea_ramos\results\01_deseq_study\csv\counts_long_annotated.csv


In [23]:
# METADATA AND MERGE

# Minimum total count (summed over all samples) a module needs to be kept for DESeq2
MIN_MODULE_TOTAL = 10

# Load metadata and keep only the groups compared in this study
meta = load_metadata(META_PATH)
TARGET_GROUPS = ["control", "schizophrenia"]
meta = meta[meta["group"].isin([g.lower() for g in TARGET_GROUPS])].copy()

# Set index to sample_id
meta = meta.set_index("sample_id").sort_index()
matrix = matrix.sort_index()

# The inner join below discards unmatched samples silently; report them first.
only_in_counts = matrix.index.difference(meta.index)
only_in_meta = meta.index.difference(matrix.index)
if len(only_in_counts) or len(only_in_meta):
    print(
        "Samples dropped by the join: "
        f"{len(only_in_counts)} with counts but no metadata in the target groups, "
        f"{len(only_in_meta)} with metadata but no counts."
    )

# Inner join
joined = matrix.join(meta, how="inner")

if joined.empty:
    raise RuntimeError("Inner join produced an empty set: no overlapping samples between counts and metadata.")

# Split back into DESeq2 objects (counts_df and metadata); both share the joined index
metadata = joined[["group"]].copy()
counts_df = joined.drop(columns=["group"]).astype(int)

# Save unfiltered count matrix
save_csv(counts_df, OUT_CSV, "counts_unfiltered")
print(f"Unfiltered counts saved: {counts_df.shape}")

# Filter low-abundance modules
counts_filtered = counts_df.loc[:, (counts_df.sum(axis=0) >= MIN_MODULE_TOTAL)]

# Save the filtered count matrix
save_csv(counts_filtered, OUT_CSV, "counts_filtered")
save_csv(metadata, OUT_CSV, "metadata_aligned")

# Report summary
print("DESeq2 inputs:")
print("  counts_unfiltered:", counts_df.shape)
print("  counts_filtered:  ", counts_filtered.shape)
print("  metadata:         ", metadata.shape)
print("Groups:", metadata["group"].value_counts().to_dict())

CSV saved in: C:\Users\Andre\OneDrive\Escritorio\tfm_andrea_ramos\results\01_deseq_study\csv\counts_unfiltered.csv
Unfiltered counts saved: (171, 334)
CSV saved in: C:\Users\Andre\OneDrive\Escritorio\tfm_andrea_ramos\results\01_deseq_study\csv\counts_filtered.csv
CSV saved in: C:\Users\Andre\OneDrive\Escritorio\tfm_andrea_ramos\results\01_deseq_study\csv\metadata_aligned.csv
DESeq2 inputs:
  counts_unfiltered: (171, 334)
  counts_filtered:   (171, 305)
  metadata:          (171, 1)
Groups: {'schizophrenia': 90, 'control': 81}


In [37]:
# COUNT MATRIX + METADATA
# Join the aligned sample metadata (`group`) with the current `matrix` (samples x modules) and save.
# NOTE(review): `matrix` is reassigned by the KEGG filter and sort steps in other cells, so it is
# not necessarily the original matrix written to counts_matrix.csv — confirm which one is intended.
counts_matrix_with_meta = metadata.join(matrix, how="inner")
save_csv(counts_matrix_with_meta, OUT_CSV, "counts_matrix_with_metadata")

CSV saved in: C:\Users\Andre\OneDrive\Escritorio\tfm_andrea_ramos\results\01_deseq_study\csv\counts_matrix_with_metadata.csv


WindowsPath('C:/Users/Andre/OneDrive/Escritorio/tfm_andrea_ramos/results/01_deseq_study/csv/counts_matrix_with_metadata.csv')

In [24]:
# DIFFERENTIAL ANALYSIS (PyDESeq2)
# Fits the model on `counts_filtered` with `group` as the only design term and
# tests schizophrenia against control.

# Configure groups and set reference: 'control' is listed first in the categories.
# NOTE(review): ordered=True marks the categorical as ordered — confirm that PyDESeq2
# encodes it the same way as an unordered factor.
metadata = metadata.copy()
metadata["group"] = pd.Categorical(
    metadata["group"].str.lower(),
    categories=["control", "schizophrenia"],
    ordered=True
)

# Initialize DESeqDataSet (counts and metadata come from the same join, so they share one index)
dds = DeseqDataSet(
    counts=counts_filtered,
    metadata=metadata,      
    design="~ group",
    refit_cooks=True
)
# Runs the fitting steps reported in the cell output
# (size factors, dispersions, LFCs, Cook's distances).
dds.deseq2()

# Contrast: schizophrenia vs control (positive log2FoldChange = higher in schizophrenia)
ds = DeseqStats(
    dds, 
    contrast=["group", "schizophrenia", "control"],
    alpha=0.05,
    cooks_filter=True,
    independent_filter=True
)
# Runs the Wald tests and prints the per-module results table
ds.summary()

Fitting size factors...
... done in 0.02 seconds.



Using None as control genes, passed at DeseqDataSet initialization


Fitting dispersions...
... done in 0.34 seconds.

Fitting dispersion trend curve...
  self._fit_parametric_dispersion_trend(vst)
... done in 0.06 seconds.

Fitting MAP dispersions...
... done in 0.48 seconds.

Fitting LFCs...
... done in 0.26 seconds.

Calculating cook's distance...
... done in 0.02 seconds.

Replacing 0 outlier genes.

Running Wald tests...


Log2 fold change & Wald test p-value: group schizophrenia vs control
          baseMean  log2FoldChange     lfcSE      stat    pvalue      padj
M00001  755.155210        0.008188  0.009872  0.829474  0.406836  0.892698
M00002  405.801700        0.004106  0.014211  0.288897  0.772660  0.999723
M00003  525.165444        0.014409  0.010788  1.335679  0.181654  0.688905
M00004  451.492611        0.006004  0.010786  0.556618  0.577789  0.999723
M00005   66.487499        0.020617  0.026422  0.780303  0.435213  0.909177
...            ...             ...       ...       ...       ...       ...
M00841  186.060300       -0.021877  0.016232 -1.347745  0.177740  0.688905
M00842   67.482778       -0.024876  0.026574 -0.936131  0.349206  0.875098
M00843   67.491975       -0.024243  0.026568 -0.912475  0.361519  0.875098
M00844  162.463244        0.009870  0.017051  0.578824  0.562708  0.999723
M00845  264.751661       -0.004839  0.013454 -0.359665  0.719098  0.999723

[305 rows x 6 columns]


... done in 0.25 seconds.



In [None]:
# GET RESULTS AND SIGNIFICANT MODULES

# Contrast labels and cut-off, defined once and reused for file names, messages and
# direction labels. They must match the contrast/alpha passed to DeseqStats.
case_group = "schizophrenia"
ref_group  = "control"
PADJ_CUTOFF = 0.05

res_df = ds.results_df.copy()
res_df = res_df.reset_index()
if "KEGG_Module" not in res_df.columns:
    # First column after reset_index() is the module IDs
    res_df = res_df.rename(columns={res_df.columns[0]: "KEGG_Module"})

# Keep only modules with both an LFC and an adjusted p-value
res_clean = res_df.dropna(subset=["log2FoldChange", "padj"]).copy()

# Significant list (padj below the cut-off), most significant first
sig = res_clean[res_clean["padj"] < PADJ_CUTOFF].sort_values("padj").copy()
n_total = res_clean.shape[0]
n_sig   = sig.shape[0]
n_up    = int((sig["log2FoldChange"] > 0).sum())
n_down  = int((sig["log2FoldChange"] < 0).sum())

print(f"Total tested modules: {n_total}")
print(f"Significant (padj < {PADJ_CUTOFF}): {n_sig} | Up: {n_up} | Down: {n_down}")

# Save significant table
save_csv(sig, OUT_CSV, f"significant_modules__{case_group}_vs_{ref_group}")

# Save significant results with annotation.
# `kmap` comes from the KEGG cell; it is None (or undefined) when the reference was not fetched.
if "kmap" in globals() and isinstance(kmap, pd.DataFrame) and not kmap.empty:
    sig_annot = sig.merge(kmap, on="KEGG_Module", how="left")
else:
    print("KEGG reference not available, skipping annotation.")
    sig_annot = sig.copy()
    sig_annot["Module_name"] = sig_annot["KEGG_Module"]
    sig_annot["Module_description"] = "NA"

# Add direction column (up/down regulation), labelled after the case group
sig_annot["direction"] = np.where(
    sig_annot["log2FoldChange"] > 0, f"up_in_{case_group}", f"down_in_{case_group}"
)

# Save annotated table
save_csv(sig_annot, OUT_CSV, f"significant_modules_annotated__{case_group}_vs_{ref_group}")

Total tested modules: 305
Significant (padj < 0.05): 5 | Up: 2 | Down: 3
CSV saved in: C:\Users\Andre\OneDrive\Escritorio\tfm_andrea_ramos\results\01_deseq_study\csv\significant_modules__schizophrenia_vs_control.csv
CSV saved in: C:\Users\Andre\OneDrive\Escritorio\tfm_andrea_ramos\results\01_deseq_study\csv\significant_modules_annotated__schizophrenia_vs_control.csv


WindowsPath('C:/Users/Andre/OneDrive/Escritorio/tfm_andrea_ramos/results/01_deseq_study/csv/significant_modules_annotated__schizophrenia_vs_control.csv')

In [36]:
# LFC distribution: seaborn histogram with KDE, plus vertical lines at the mean and at mean ± 1 SD
lfc = res_clean["log2FoldChange"].astype(float)
mu  = float(lfc.mean())
sd  = float(lfc.std(ddof=1))  # sample standard deviation

# Explicit figure/axes; the new figure is also the current one, which save_plot writes and closes
fig, ax = plt.subplots()
sns.histplot(lfc, bins=50, kde=True, ax=ax)
ax.axvline(mu, ls="-", lw=2, color="black", label=f"Mean = {mu:.2f}")
# Only the lower SD line carries a legend label so the legend shows a single "Mean ± SD" entry
ax.axvline(mu - sd, ls="--", lw=1.5, color="gray", label=f"Mean ± SD ({sd:.2f})")
ax.axvline(mu + sd, ls="--", lw=1.5, color="gray")
ax.set_xlabel("log2 Fold Change")
ax.set_title(f"LFC distribution ({case_group} vs {ref_group}) with Mean ± SD")
ax.legend()
save_plot(OUT_PLOTS, f"hist_log2fc_mean_pm_sd__{case_group}_vs_{ref_group}")

FIG saved in: C:\Users\Andre\OneDrive\Escritorio\tfm_andrea_ramos\results\01_deseq_study\plots\hist_log2fc_mean_pm_sd__schizophrenia_vs_control.png


WindowsPath('C:/Users/Andre/OneDrive/Escritorio/tfm_andrea_ramos/results/01_deseq_study/plots/hist_log2fc_mean_pm_sd__schizophrenia_vs_control.png')