In [10]:
import sys
from pathlib import Path
# ensure project root and src are importable inside the notebook environment
ROOT = Path.cwd().parent  # notebooks/ -> project root
print('Notebook cwd is', ROOT)
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
if str(ROOT / 'src') not in sys.path:
    sys.path.insert(0, str(ROOT / 'src'))

print('sys.path includes ROOT', str(ROOT) in sys.path)
print('sys.path includes ROOT/src', str(ROOT / 'src') in sys.path)
print('src dir exists', (ROOT / 'src').exists())

import importlib
importlib.invalidate_caches()

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
import gzip

# Data directories used in notebook
RAW_DIR = Path('data/raw')
PROCESSED_DIR = Path('data/processed')
FIG_DIR = Path('figures/qc_preprocessing')
# Ensure dirs exist
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
FIG_DIR.mkdir(parents=True, exist_ok=True)


Notebook cwd is d:\OneDrive\Documents\My Docs\CMU\Fall '25\Model & Sims\Project\ODE-to-Circadian-Clocks
sys.path includes ROOT True
sys.path includes ROOT/src True
src dir exists True


In [12]:
from preprocessing.gse48113 import parse_meta

files = sorted(RAW_DIR.glob("*.txt.gz"))
if not files:
    raise RuntimeError("No .txt.gz files found in data/raw; did you extract GSE48113_RAW.tar?")

meta = pd.DataFrame([parse_meta(f) for f in files]).sort_values(["subject", "condition", "t_idx"])
meta.to_csv(PROCESSED_DIR / "sample_metadata.csv", index=False)
print("Number of samples:", len(meta))
display(meta.head())

# Quick QC: sample counts per time index and condition
plt.figure(figsize=(6, 4))
sns.countplot(data=meta, x="t_idx", hue="condition")
plt.title("Sample counts by time index and condition")
plt.tight_layout()
plt.savefig(FIG_DIR / "sample_counts_by_time_condition.png", dpi=200)
plt.close()

Number of samples: 287


Unnamed: 0,gsm,subject,condition,t_idx,file
0,GSM1168586,BB0012,R,1,data\raw\GSM1168586_BB0012_R_1.txt.gz
1,GSM1168587,BB0012,R,2,data\raw\GSM1168587_BB0012_R_2.txt.gz
2,GSM1168588,BB0012,R,3,data\raw\GSM1168588_BB0012_R_3.txt.gz
3,GSM1168589,BB0012,R,4,data\raw\GSM1168589_BB0012_R_4.txt.gz
4,GSM1168590,BB0012,R,5,data\raw\GSM1168590_BB0012_R_5.txt.gz


In [13]:
# Debug: can we import 'preprocessing' modules?
import importlib
try:
    m = importlib.import_module('preprocessing')
    print('preprocessing module:', m)
    print('preprocessing attributes:', [k for k in dir(m) if not k.startswith('__')][:20])
except Exception as e:
    print('could not import preprocessing:', e)

try:
    p = importlib.import_module('preprocessing')
    print('preprocessing:', p)
    print('preprocessing path:', p.__path__)
except Exception as e:
    print('could not import preprocessing:', e)

try:
    q = importlib.import_module('src.preprocessing')
    print('src.preprocessing module found via importlib (should not be necessary in notebooks):', q)
except Exception as e:
    print('src.preprocessing not found (expected)', e)

from importlib import invalidate_caches
invalidate_caches()


preprocessing module: <module 'preprocessing' from "d:\\OneDrive\\Documents\\My Docs\\CMU\\Fall '25\\Model & Sims\\Project\\ODE-to-Circadian-Clocks\\src\\preprocessing\\__init__.py">
preprocessing attributes: ['gse48113']
preprocessing: <module 'preprocessing' from "d:\\OneDrive\\Documents\\My Docs\\CMU\\Fall '25\\Model & Sims\\Project\\ODE-to-Circadian-Clocks\\src\\preprocessing\\__init__.py">
preprocessing path: ["d:\\OneDrive\\Documents\\My Docs\\CMU\\Fall '25\\Model & Sims\\Project\\ODE-to-Circadian-Clocks\\src\\preprocessing"]
src.preprocessing module found via importlib (should not be necessary in notebooks): <module 'src.preprocessing' from "d:\\OneDrive\\Documents\\My Docs\\CMU\\Fall '25\\Model & Sims\\Project\\ODE-to-Circadian-Clocks\\src\\preprocessing\\__init__.py">


In [14]:
# Read Agilent FE files, QC, normalize, collapse probes->genes, and produce plots

AGILENT_CANDIDATE_COLS = [
    "ProbeName", "SystematicName", "ControlType", "GeneName",
    "gProcessedSignal", "gMeanSignal", "gBGMedianSignal",
    "gIsWellAboveBG", "gIsPosAndSignif", "gNumPix"
]

INT_CHOICES  = ["gProcessedSignal", "gMeanSignal", "ProcessedSignal", "Signal"]
PROBE_KEYS   = ["ProbeName", "SystematicName", "FeatureNum"]
GENE_KEYS    = ["GeneName", "Gene Symbol", "GENE_SYMBOL"]


def find_header_row_gz(path: Path, max_lines: int = 600) -> int | None:
    opener = gzip.open if str(path).endswith('.gz') else open
    with opener(path, 'rt', errors='ignore') as f:
        for i, line in enumerate(f):
            if i > max_lines:
                break
            cols = line.rstrip('\n').split('\t')
            if 'ProbeName' in cols and any(c in cols for c in ('gProcessedSignal', 'gMeanSignal')):
                return i
    return None


def read_agilent_fe(path: Path) -> pd.DataFrame:
    hdr = find_header_row_gz(path)
    if hdr is None:
        raise RuntimeError(f"Could not locate FEATURES header in {path}")
    df = pd.read_csv(
        path,
        sep='\t',
        compression='gzip' if str(path).endswith('.gz') else None,
        skiprows=hdr,
        dtype=str,
        low_memory=False,
    )
    return df


def extract_expression(df: pd.DataFrame) -> pd.DataFrame:
    int_col   = next((c for c in INT_CHOICES if c in df.columns), None)
    probe_col = next((c for c in PROBE_KEYS   if c in df.columns), None)
    gene_col  = next((c for c in GENE_KEYS    if c in df.columns), None)
    if int_col is None or probe_col is None:
        raise ValueError('Missing intensity or probe column in FE file')
    if 'ControlType' in df.columns:
        df = df[df['ControlType'].astype(str) == '0']
    out = df[[probe_col, int_col]].rename(columns={probe_col: 'probe', int_col: 'intensity'})
    if gene_col:
        out['gene'] = df[gene_col]
    return out.reset_index(drop=True)

# Peek one file
from preprocessing.gse48113 import read_agilent_fe, extract_expression, quantile_normalize

# Example FE
if len(files) > 0:
    df0 = read_agilent_fe(files[0])
    print('Example FE file shape:', df0.shape)
    print('Example FE columns:', df0.columns[:15].tolist())
    ex0 = extract_expression(df0)
    print('Extracted expression example:')
    display(ex0.head())

    plt.figure(figsize=(5, 4))
    sns.histplot(ex0['intensity'].astype(float), bins=80, log_scale=(False, True))
    plt.xlabel('Raw intensity (gProcessedSignal)')
    plt.ylabel('Count (log scale)')
    plt.title('Raw intensity distribution (single array)')
    plt.tight_layout()
    plt.savefig(FIG_DIR / 'raw_intensity_hist_single_array.png', dpi=200)
    plt.close()

Example FE file shape: (43758, 63)
Example FE columns: ['FEATURES', 'FeatureNum', 'Row', 'Col', 'chr_coord', 'accessions', 'SubTypeMask', 'SubTypeName', 'Start', 'Sequence', 'ProbeUID', 'ControlType', 'ProbeName', 'GeneName', 'SystematicName']
Extracted expression example:


Unnamed: 0,probe,intensity,gene
0,A_23_P67299,13.61813,DOCK6
1,A_23_P49021,783.3045,WDR61
2,A_24_P315975,4.684536,KRTAP4-9
3,A_24_P109191,73.38431,A_24_P109191
4,A_24_P269814,1367.563,PLEKHA1
