In [1]:
%%capture
!pip install import_ipynb --no-cache
import import_ipynb
m = __import__("Methods")

In [2]:
import pandas as pd
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
from urllib.request import urlretrieve
from sklearn.preprocessing import StandardScaler
import numpy as np
from scipy.stats import boxcox
import pandas as pd
import zipfile
import urllib.request
import os

In [3]:
# Per-dataset containers, filled in by the loading cell below and keyed by the
# dataset id string (e.g. "47").
rna_seq, phenotype, meta = {}, {}, {}

# RR-1 Data

In [4]:
# Dataset identifiers
datasets = ["47", "48", "137", "168"]

# Phenotype (histology) file name per dataset id. There is no entry for "168";
# its RNA-seq data is paired with the "137" phenotype table further below.
pheno_sources = {
    "47": "LSDS-29_Histology_OSD_47_Histology_TRANSFORMED.csv",
    "48": "LSDS-2_Histology_OSD_48_Histology_TRANSFORMED.csv",
    "137": "LSDS-28_Histology_OSD_137_Histology_TRANSFORMED.csv"
}


def _load_tables(label, paths, target, **read_csv_kwargs):
    """Read every existing file in `paths` into `target`, keyed by dataset id.

    Parameters
    ----------
    label : str
        Human-readable table kind used in the progress messages.
    paths : dict[str, str]
        Mapping of dataset id -> file path.
    target : dict
        Dictionary that receives one DataFrame per dataset id that was found.
    **read_csv_kwargs
        Forwarded unchanged to ``pd.read_csv``.

    Missing files are skipped (best effort), but a warning is printed so that
    an incomplete download is visible instead of silently shrinking the
    analysis.
    """
    for ds, path in paths.items():
        if not os.path.exists(path):
            print(f"WARNING: {label} file for OSD-{ds} not found, skipping: {path}")
            continue
        print(f"Loading {label} for OSD-{ds}...")
        target[ds] = pd.read_csv(path, **read_csv_kwargs)


# --- Load Metadata --- (tab-separated ISA sample tables)
_load_tables(
    "metadata",
    {ds: f"../meta_data/OSD-{ds}_metadata_OSD-{ds}-ISA/s_OSD-{ds}.txt" for ds in datasets},
    meta,
    sep="\t",
    header=0,
)

# --- Load RNA-seq Data ---
_load_tables(
    "RNA-seq data",
    {ds: f"../raw_data/GLDS-{ds}_rna_seq_Normalized_Counts.csv" for ds in datasets},
    rna_seq,
)

# --- Load Phenotype Data ---
_load_tables(
    "phenotype data",
    {ds: f"../raw_data/{fname}" for ds, fname in pheno_sources.items()},
    phenotype,
)

# --- Filter RNA-seq Data ---
# Keyword options forwarded to m.filter_data; what each threshold means is
# defined in the Methods module, not here.
FILTER_OPTIONS = dict(dropnans=True, dropgenes=True, droplowcvs=0.5, droplowcount=10)

for ds, loaded in list(rna_seq.items()):
    print(f"Filtering RNA-seq data for OSD-{ds}...")

    # Filter a copy; the dictionary entry is replaced by the result afterwards.
    counts = loaded.copy()

    if ds == "168":
        # Move the index into an 'Unnamed: 0' column, the column name the later
        # transpose step passes as cur_index_col.
        # NOTE(review): presumably the OSD-168 CSV is parsed with its first
        # column as the index, unlike the other datasets -- confirm.
        counts = counts.reset_index().rename(columns={"index": "Unnamed: 0"})

    rna_seq[ds] = m.filter_data(counts, **FILTER_OPTIONS)


# --- Transpose RNA-seq Data ---
# Replace each filtered table with its transposed form. The alignment step
# below expects the result to expose a "Sample name" column.
for ds, filtered in list(rna_seq.items()):
    print(f"Transposing RNA-seq data for OSD-{ds}...")
    rna_seq[ds] = m.transpose_df(
        filtered,
        cur_index_col="Unnamed: 0",
        new_index_col="Sample name",
    )


def _align_common_samples(rna, pheno, sample_col="Sample name", target_col="ORO Positivity (%)"):
    """Restrict both tables to their shared samples, in one common row order.

    Parameters
    ----------
    rna : pd.DataFrame
        RNA-seq table with a `sample_col` column.
    pheno : pd.DataFrame
        Phenotype table with `sample_col` and `target_col` columns.

    Returns
    -------
    (rna_common, pheno_common) : tuple of pd.DataFrame
        Rows whose sample name occurs in both inputs. `pheno_common` keeps
        only `sample_col` and `target_col`. Both frames are sorted by
        `sample_col` and get a fresh 0..n-1 index.

    Filtering with ``isin`` alone keeps each table's own row order, so the
    saved X and y files could disagree row by row. Sorting both by sample name
    gives them the same order; the sort is stable, so repeated sample names
    keep their relative order.
    """
    common = set(rna[sample_col]) & set(pheno[sample_col])
    rna_common = (
        rna[rna[sample_col].isin(common)]
        .sort_values(sample_col, kind="stable")
        .reset_index(drop=True)
    )
    pheno_common = (
        pheno.loc[pheno[sample_col].isin(common), [sample_col, target_col]]
        .sort_values(sample_col, kind="stable")
        .reset_index(drop=True)
    )
    return rna_common, pheno_common


def _save_aligned(rna_common, pheno_common, out_id):
    """Write the aligned tables to ../x_variables and ../y_variables as CSV."""
    # Create the output folders on first use so a fresh checkout does not fail.
    os.makedirs("../x_variables", exist_ok=True)
    os.makedirs("../y_variables", exist_ok=True)
    rna_common.to_csv(f"../x_variables/rna_seq-{out_id}.csv", index=False)
    pheno_common.to_csv(f"../y_variables/pheno-{out_id}.csv", index=False)


# --- Align RNA-seq and Phenotype by Common Samples (47, 48, 137) ---
for ds in ["47", "48", "137"]:
    if ds in rna_seq and ds in phenotype:
        print(f"Aligning RNA-seq and phenotype data for OSD-{ds}...")
        rna_common, pheno_common = _align_common_samples(rna_seq[ds], phenotype[ds])

        print(f"Saving aligned data for OSD-{ds}...")
        _save_aligned(rna_common, pheno_common, ds)

# --- Special Case: phenotype["137"] aligned with rna_seq["168"] ---
# OSD-168 has no phenotype table of its own, so it is matched against the
# OSD-137 phenotype table by sample name.
if "168" in rna_seq and "137" in phenotype:
    print("Aligning RNA-seq data from OSD-168 with phenotype data from OSD-137...")
    rna_common, pheno_common = _align_common_samples(rna_seq["168"], phenotype["137"])

    print("Saving aligned data for OSD-168 (RNA) and OSD-137 (phenotype)...")
    _save_aligned(rna_common, pheno_common, "168")


Loading metadata for OSD-47...
Loading metadata for OSD-48...
Loading metadata for OSD-137...
Loading metadata for OSD-168...
Loading RNA-seq data for OSD-47...
Loading RNA-seq data for OSD-48...
Loading RNA-seq data for OSD-137...
Loading RNA-seq data for OSD-168...
Loading phenotype data for OSD-47...
Loading phenotype data for OSD-48...
Loading phenotype data for OSD-137...
Filtering RNA-seq data for OSD-47...
Number of protein coding genes: 15059
2520
Filtering RNA-seq data for OSD-48...
Number of protein coding genes: 15991
5244
Filtering RNA-seq data for OSD-137...
Number of protein coding genes: 15567
3004
Filtering RNA-seq data for OSD-168...
Number of protein coding genes: 18551
6661
Transposing RNA-seq data for OSD-47...
Transposing RNA-seq data for OSD-48...
Transposing RNA-seq data for OSD-137...
Transposing RNA-seq data for OSD-168...
Aligning RNA-seq and phenotype data for OSD-47...
Saving aligned data for OSD-47...
Aligning RNA-seq and phenotype data for OSD-48...
Saving