In [1]:
# Colab bootstrap: mount Google Drive, make sure the project repo is cloned
# under BASE, switch the working directory into it, pull, and list the data dir.
from google.colab import drive
drive.mount('/content/drive')

import os, getpass
from pathlib import Path

# BASE is the Drive folder holding the project; the repo lives in BASE / PROJ.
BASE = Path("/content/drive/MyDrive/Colab Notebooks/BioModSim-FinalProject_F25")
PROJ = "ODE-to-Circadian-Clocks"          # folder name for the repo
REPO_HTTPS = "https://github.com/<YOU>/<REPO>.git"   # <-- replace

BASE.mkdir(parents=True, exist_ok=True)
proj_path = BASE / PROJ

# Clone once if missing (presence of a .git folder is the "already cloned" test)
if not (proj_path / ".git").exists():
    # If private, prompt for a token; if public, skip this block and use plain REPO_HTTPS
    # NOTE(review): IS_PRIVATE is hard-coded here, so the public branch below is
    # only reachable by editing this cell.
    IS_PRIVATE = True
    if IS_PRIVATE:
        token = getpass.getpass("GitHub token: ")
        # The token is spliced into the URL as-is (no URL-encoding).
        # NOTE(review): git normally records the clone URL, token included, as the
        # `origin` remote in .git/config on Drive - confirm that is acceptable.
        auth_url = REPO_HTTPS.replace("https://", f"https://{token}@")
        !git clone "{auth_url}" "{proj_path}"
    else:
        !git clone "{REPO_HTTPS}" "{proj_path}"
else:
    print("Repo already present.")

# Change directory using Python (avoid %cd)
# Later cells use relative paths such as "data/raw", so they depend on this chdir.
os.chdir(proj_path)
print("PWD:", Path.cwd())
# Configure pulls to merge rather than rebase, then bring the checkout up to date.
!git config pull.rebase false
!git pull

# Data dir (inside repo)
DATA_DIR = proj_path / "data"
DATA_DIR.mkdir(exist_ok=True)
print("Data dir:", DATA_DIR)
!ls -al "{DATA_DIR}"


Mounted at /content/drive
Repo already present.
PWD: /content/drive/MyDrive/Colab Notebooks/BioModSim-FinalProject_F25/ODE-to-Circadian-Clocks
Already up to date.
Data dir: /content/drive/MyDrive/Colab Notebooks/BioModSim-FinalProject_F25/ODE-to-Circadian-Clocks/data
total 2524544
-rw------- 1 root root 2585128960 Nov 10 17:59 GSE48113_RAW.tar
drwx------ 2 root root       4096 Nov 10 19:35 raw


In [2]:
from pathlib import Path
import tarfile

# Archive of per-sample files and the directory it is unpacked into
# (paths are relative to the repo root, i.e. the current working directory).
RAW_TAR = Path("data/GSE48113_RAW.tar")
RAW_DIR = Path("data/raw")

# Only treat the archive as extracted when RAW_DIR actually contains something.
# Checking mere existence would report "Already extracted." forever after an
# extraction that failed right after the directory was created.
if RAW_DIR.is_dir() and any(RAW_DIR.iterdir()):
    print("Already extracted.")
else:
    RAW_DIR.mkdir(parents=True, exist_ok=True)
    with tarfile.open(RAW_TAR) as tar:
        if hasattr(tarfile, "data_filter"):
            # Extraction filters are available: refuse members that would
            # escape RAW_DIR (absolute paths, "..", unsafe links).
            tar.extractall(RAW_DIR, filter="data")
        else:
            # Older Python without extraction filters: plain extraction.
            tar.extractall(RAW_DIR)
    print("Extracted.")


Already extracted.


In [3]:
# Imports
from pathlib import Path
import pandas as pd
import numpy as np
import re
import gzip

In [4]:

# Check out the columns and contents of one of the .gz files

# Read only the first 50 rows of the first raw file to inspect its layout.
one = sorted(Path("data/raw").glob("*.txt.gz"))[0]
df0 = pd.read_csv(one, sep="\t", compression="gzip", nrows=50, low_memory=False)
# A bare expression in the middle of a cell is discarded by Jupyter, so print
# the column names explicitly; only the final expression is auto-displayed.
print(df0.columns.tolist()[:20])
df0.head(3)

Unnamed: 0,TYPE,text,text.1,text.2,text.3,integer,float,float.1,text.4,text.5,...,float.63,integer.58,integer.59,float.64,text.23,integer.60,integer.61,integer.62,integer.63,integer.64
0,FEPARAMS,Protocol_Name,Protocol_date,Scan_Date,Scan_ScannerName,Scan_NumChannels,Scan_MicronsPerPixelX,Scan_MicronsPerPixelY,Scan_OriginalGUID,Grid_Name,...,QCMetrics_MinReproducibility,QCMetrics_Formulation,QCMetrics_EnableDyeFlip,QCMetrics_PercentileValueForSignal,FeatureExtractor_Version,FeatureExtractor_SingleTextFileOutput,FeatureExtractor_JPEGDownSampleFactor,FeatureExtractor_ColorMode,FeatureExtractor_QCReportType,FeatureExtractor_OutputQCReportGraphText
1,DATA,GE1_107_Sep09 (Read Only),29-Sep-2009 12:35,11-23-2010 10:33:09,Agilent Technologies Scanner G2505B US45102984,1,5,5,b78b5053-6369-4e90-91da-2b2b004b82b5,026817_D_F_20100112ANNOTATED,...,50,2,0,75,10.7.1.1,1,4,0,0,0
2,*,,,,,,,,,,...,,,,,,,,,,


In [5]:

def parse_meta(fn: Path):
    """
    Parse sample metadata out of a raw file name.

    Expected pattern: GSM<digits>_<subject>_<R|S>_<index>.txt[.gz],
    e.g. GSM1168586_BB0012_R_1.txt.gz

    Parameters
    ----------
    fn : Path (a plain string path is accepted too)

    Returns
    -------
    dict with keys: gsm, subject, condition ("R" or "S"), t_idx (int), file (str path)

    Raises
    ------
    ValueError
        If the file name does not follow the expected pattern.
    """
    fn = Path(fn)
    name = fn.stem  # removes .gz
    if name.endswith(".txt"):
        name = name[:-4]
    m1 = re.match(r"(GSM\d+)_([A-Za-z0-9]+)_([RS])_(\d+)$", name)
    if m1 is None:
        # Fail with a readable message instead of AttributeError on None.groups()
        raise ValueError(
            f"Unrecognized sample file name {fn.name!r}; "
            "expected GSM<id>_<subject>_<R|S>_<index>.txt[.gz]"
        )
    gsm, subj, cond, t_idx = m1.groups()
    return dict(gsm=gsm, subject=subj, condition=cond, t_idx=int(t_idx), file=str(fn))

# Build one metadata row per raw sample file, ordered by subject/condition/time index.
files = sorted(Path("data/raw").glob("*.txt.gz"))
if not files:
    # Without this guard an empty folder surfaces as a KeyError from sort_values.
    raise FileNotFoundError("No *.txt.gz files found in data/raw")
meta = pd.DataFrame([parse_meta(f) for f in files]).sort_values(["subject","condition","t_idx"])
print("n samples:", len(meta))
# Keep the preview as the last expression so Jupyter actually displays it.
meta.head()


n samples: 287


In [7]:
# Checklist of column names expected in an Agilent one-color FE table;
# compared against the columns of a loaded file as a sanity check.
AGILENT_CANDIDATE_COLS = [
    # probe identity / annotation
    "ProbeName",
    "SystematicName",
    "ControlType",
    "GeneName",
    # signal columns
    "gProcessedSignal",
    "gMeanSignal",
    "gBGMedianSignal",
    # quality indicators
    "gIsWellAboveBG",
    "gIsPosAndSignif",
    "gNumPix",
]

def find_header_row_gz(path, max_lines=500):
    """
    Scan a (possibly gzipped) text file for the first line that looks like the
    FEATURES header by checking for typical Agilent FE columns.

    Parameters
    ----------
    path : str or Path
        File to scan; opened with gzip when the name ends in ".gz".
    max_lines : int
        Maximum number of leading lines to examine.

    Returns
    -------
    int or None
        Zero-based line index of the header row, or None when no matching
        line is found within the first `max_lines` lines.
    """
    opener = gzip.open if str(path).endswith(".gz") else open
    with opener(path, "rt", errors="ignore") as f:
        for i, line in enumerate(f):
            # `>=` so that at most `max_lines` lines are examined
            # (`>` would examine max_lines + 1).
            if i >= max_lines:  # safety
                break
            cols = line.rstrip("\n").split("\t")
            # Heuristic: must contain 'ProbeName' and at least one g* signal column
            if "ProbeName" in cols and ("gProcessedSignal" in cols or "gMeanSignal" in cols):
                return i
    return None

def read_agilent_fe(path: str | Path) -> pd.DataFrame:
    """
    Load an Agilent Feature Extraction export (.txt or .txt.gz) as a DataFrame.

    The FEATURES header line is located with find_header_row_gz and every line
    above it is skipped. All columns are read as strings; numeric casting is
    left to the caller.

    Raises RuntimeError when no header line can be located.
    """
    fe_path = Path(path)

    n_skip = find_header_row_gz(fe_path)
    if n_skip is None:
        raise RuntimeError(f"Could not locate an Agilent FEATURES header in {fe_path}")

    is_gzipped = str(fe_path).endswith(".gz")
    read_options = dict(
        sep="\t",
        compression="gzip" if is_gzipped else None,
        # skipping the lines above the header makes the header line the column names
        skiprows=n_skip,
        # strings first (mixed types are common); cast later
        dtype=str,
        low_memory=False,
    )
    return pd.read_csv(fe_path, **read_options)

# Example: load the first raw file in full through the header-aware reader
one = sorted(Path("data/raw").glob("*.txt.gz"))[0]
df0 = read_agilent_fe(one)

# Sanity check: split the candidate list into columns found / not found in this file
present = [col for col in AGILENT_CANDIDATE_COLS if col in df0.columns]
missing = [col for col in AGILENT_CANDIDATE_COLS if col not in present]
print(f"Present FE columns: {present}")
print(f"Missing FE columns: {missing}")
df0.head(3)

Present FE columns: ['ProbeName', 'SystematicName', 'ControlType', 'GeneName', 'gProcessedSignal', 'gMeanSignal', 'gBGMedianSignal', 'gIsWellAboveBG', 'gIsPosAndSignif', 'gNumPix']
Missing FE columns: []


Unnamed: 0,FEATURES,FeatureNum,Row,Col,chr_coord,accessions,SubTypeMask,SubTypeName,Start,Sequence,...,SpotExtentX,SpotExtentY,gNetSignal,gMultDetrendSignal,gProcessedBackground,gProcessedBkngError,IsUsedBGAdjust,gInterpolatedNegCtrlSub,gIsInNegCtrlRange,gIsUsedInMD
0,DATA,1,1,1,,,260,BrightCorner,0,,...,57.259,57.259,139029.0,0.810563,49.3102,9.17273,0,138987.0,0,0
1,DATA,2,1,2,,,66,Structural,0,,...,57.5363,57.5363,40.4795,0.813782,51.2922,7.81943,0,-1.22966,0,0
2,DATA,3,1,3,,,66,Structural,0,,...,48.5334,48.5334,40.2987,0.816876,52.1925,7.48548,0,-1.40499,0,0


| Column                                                                                         | What it is                                                    | How we might use it                                                 |
| ---------------------------------------------------------------------------------------------- | ------------------------------------------------------------- | ----------------------------------------------------------------- |
| `ProbeName` / `SystematicName`                                                                 | Probe identifier on the array                                 | Key for joining to platform (GPL) and for aligning across samples |
| `ControlType`                                                                                  | -1 (neg ctrl), 0 (real probe), 1 (pos ctrl)                   | Keep `ControlType == 0`                                           |
| `GeneName` (sometimes `Gene Symbol`)                                                           | FE’s gene symbol annotation                                   | Quick mapping to genes (you can cross-check with GPL later)       |
| **`gProcessedSignal`**                                                                         | **Background-corrected intensity** (green channel, one-color) | This is the expression signal you’ll analyze                      |
| `gMeanSignal`, `gBGMedianSignal`                                                               | Raw spot and background                                       | Usually not needed once you use `gProcessedSignal`                |
| QC flags: `gIsWellAboveBG`, `gIsPosAndSignif`, `gIsFeatNonUnifOL`, `gIsBGNonUnifOL`, `gNumPix` | Quality indicators                                            | Optional filtering (e.g., drop non-uniform outliers)              |


**What are we modeling (features vs response)?**

The expression measurement per probe is the gProcessedSignal column (one value per probe, per sample).

After you log2-transform, normalize across samples, and collapse probes → genes, you’ll get a gene × sample matrix.

Temporal dynamics: each subject was sampled repeatedly (t=1..7 per day/condition). For a chosen set of genes (e.g., PER2, BMAL1, REV-ERB), their expression across time is your state vector $x(t)$.

What’s the “response”? Depends on analysis tier:

Rhythmicity analysis: estimate amplitude and phase of each gene (no “label”; output = parameters).

Mechanistic ODE: fit parameters $\theta$ of a known circadian model so simulated $x_\theta(t)$ matches observed time courses (response = fitted parameters / prediction error).

SINDy: learn a sparse ODE $\dot{x} = f(x)$; response = discovered terms/coefficients + forecast quality.

Neural ODE: learn $\dot{x} = f_\theta(x, t)$; response = $f_\theta$ and its ability to predict held-out times/subjects.

So: features = time-series of gene expression, response = dynamics/parameters/forecasts, not a single supervised label.

In [14]:
# Candidate column names, in priority order; the first one present in a frame wins.
INT_CHOICES  = ["gProcessedSignal","gMeanSignal","ProcessedSignal","Signal"]
PROBE_KEYS   = ["ProbeName","SystematicName","FeatureNum"]
GENE_KEYS    = ["GeneName","Gene Symbol","GENE_SYMBOL"]

def extract_expression(df):
    """
    From a full Agilent FE DataFrame, pull out only the useful columns:
      - intensity (first match in INT_CHOICES, normally gProcessedSignal)
      - probe ID  (first match in PROBE_KEYS)
      - gene symbol (first match in GENE_KEYS, if present)
    and filter to real probes (ControlType == 0) when that column exists.

    Returns a new DataFrame with columns "probe", "intensity" and, when a gene
    column was found, "gene", with a fresh 0..n-1 index. Values are passed
    through unchanged (no numeric cast of the intensity is done here).

    Raises ValueError when no intensity column or no probe column is found.
    """
    int_col  = next((c for c in INT_CHOICES if c in df.columns), None)
    probe_col= next((c for c in PROBE_KEYS   if c in df.columns), None)
    gene_col = next((c for c in GENE_KEYS    if c in df.columns), None)
    if int_col is None or probe_col is None:
        raise ValueError("Missing intensity or probe column")

    # keep only real (non-control) probes if ControlType exists.
    # Compare numerically: a string comparison against "0" silently drops every
    # row when the column was parsed as float (0.0 -> "0.0").
    if "ControlType" in df.columns:
        control_type = pd.to_numeric(df["ControlType"], errors="coerce")
        df = df.loc[control_type == 0]

    # Select and rename in one step so nothing is assigned into a derived frame.
    renames = {probe_col: "probe", int_col: "intensity"}
    if gene_col:
        renames[gene_col] = "gene"
    out = df[list(renames)].rename(columns=renames)
    return out.reset_index(drop=True)

# Try the extractor on the example file loaded above.
dtest = extract_expression(df0)
# A bare `dtest.shape` in the middle of a cell is discarded; print it instead.
print("shape:", dtest.shape)
dtest.head()




Unnamed: 0,probe,intensity,gene
0,A_23_P67299,13.61813,DOCK6
1,A_23_P49021,783.3045,WDR61
2,A_24_P315975,4.684536,KRTAP4-9
3,A_24_P109191,73.38431,A_24_P109191
4,A_24_P269814,1367.563,PLEKHA1


# **ONLY FOR PUSHING CHANGES TO GITHUB**

In [17]:


import getpass, urllib.parse, os

username = "skothare"  # your GitHub username
token = getpass.getpass('GitHub token: ')

# URL-encode the token in case it has special characters
token_enc = urllib.parse.quote(token, safe='')

origin = f"https://{username}:{token_enc}@github.com/skothare/ODE-to-Circadian-Clocks.git"

# Set identity (ok if already set)
!git config user.name "{username}"
!git config user.email "21262952+skothare@users.noreply.github.com"

# Update remote to include username:token
!git remote set-url origin "{origin}"

# Push
!git add -A
!git commit -m "Update: preprocessing on Colab" || true  # don't fail if no changes
!git push origin main


GitHub token: ¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑¬∑
[main 3d59497] Update: preprocessing on Colab
 1 file changed, 1 insertion(+), 1 deletion(-)
remote: Permission to skothare/ODE-to-Circadian-Clocks.git denied to skothare.
fatal: unable to access 'https://github.com/skothare/ODE-to-Circadian-Clocks.git/': The requested URL returned error: 403
