In [1]:
import pandas as pd
import tqdm
import pickle
import numpy as np
from sklearn.model_selection import train_test_split

Data source: PsychAD on Synapse (https://www.synapse.org/Synapse:syn26720956)

In [43]:
# Phenotype columns pulled from the clinical metadata table.
phen = [
    "c02x",
    "r05x",
]

In [33]:
# Per-cell observation table; index it by barcode key while keeping the
# `barcodekey` column itself available.
meta_obs = pd.read_csv("/home/ubuntu/MSSM_meta_obs.csv").set_index(
    "barcodekey", drop=False
)

In [44]:
# Load donor-level clinical metadata and keep only donors that have a value
# for the second phenotype listed in `phen`.
clinical_meta = pd.read_csv("/home/ubuntu/metadata_latest_oct5.csv", low_memory=False)
clinical_meta = clinical_meta.dropna(subset=[phen[1]])
donors = clinical_meta["SubID"].to_list()

# Restrict the per-cell table to cells whose donor survived the filter above.
meta_obs = meta_obs[meta_obs["SubID"].isin(donors)]

# Attach the phenotype columns to every cell.  The barcode index is carried
# through the merge as a regular column and restored afterwards, so the
# cell <-> phenotype alignment is by key and never depends on the row order
# that `merge` happens to return.
donors_meta = (
    meta_obs[["SubID"]]
    .rename_axis("barcodekey")
    .reset_index()
    .merge(clinical_meta[phen + ["SubID"]], on="SubID")
    .set_index("barcodekey")
)

# Every cell must match exactly one clinical row; more rows than cells means
# a donor appears several times in the clinical table.
if len(donors_meta) != len(meta_obs):
    raise ValueError(
        f"Length mismatch: merge produced {len(donors_meta)} rows for "
        f"{len(meta_obs)} cells; expected exactly one clinical_meta row per cell"
    )

In [45]:
# The donor identifier is everything before the first "-" of the barcode key.
donors_meta["donor"] = donors_meta.index.str.split("-", n=1).str.get(0)

In [46]:
# Phenotype column used for stratification, label recoding and the output file name.
subset_phen = "r05x"

In [47]:
# Step 1 - collapse the per-cell table to one row per donor: how many cells
# the donor contributes and the first phenotype value recorded for it.
all_donors = (
    donors_meta
    .groupby("donor")
    .agg(
        cell_count=pd.NamedAgg(column="donor", aggfunc="size"),
        phenotype=pd.NamedAgg(column=subset_phen, aggfunc="first"),
    )
    .reset_index()
)

# Step 2 - bucket donors by cell count into quartiles.  `labels=False` yields
# integer bin codes; `duplicates="drop"` merges bins that share an edge.
all_donors["count_bin"] = pd.qcut(
    all_donors["cell_count"], q=4, labels=False, duplicates="drop"
)

# Step 3 - the stratification key is "<phenotype>_<count_bin>", so both the
# label mix and the cell-count mix are balanced across the two splits.
all_donors["stratify_grp"] = (
    all_donors["phenotype"].astype(str)
    .str.cat(all_donors["count_bin"].astype(str), sep="_")
)

# Split at donor level (80/20) so that no donor has cells in both splits.
train_donors, val_donors = train_test_split(
    all_donors,
    test_size=0.2,
    random_state=42,
    stratify=all_donors["stratify_grp"],
)

# Step 4 - map the donor assignment back onto the individual cells.
train_df = donors_meta.loc[donors_meta["donor"].isin(train_donors["donor"])]
val_df = donors_meta.loc[donors_meta["donor"].isin(val_donors["donor"])]

In [48]:
# Report split sizes and the per-cell phenotype distribution (ratio, then raw
# counts) for each split.
print("Cells (train):", len(train_df), "  Cells (val):", len(val_df))
print("Phenotype ratio (train):")
print(train_df[subset_phen].value_counts(normalize=True), train_df[subset_phen].value_counts())
print("Phenotype ratio (val):")
print(val_df[subset_phen].value_counts(normalize=True), val_df[subset_phen].value_counts())

# Keep only the exported columns and expose the barcode index as a column.
# `.assign` builds a new frame, so nothing is written into a slice of
# `donors_meta` (the original in-place column assignment raised
# SettingWithCopyWarning on pandas < 3).
train_df = train_df[["SubID", subset_phen]].assign(barcodekey=lambda frame: frame.index)
val_df = val_df[["SubID", subset_phen]].assign(barcodekey=lambda frame: frame.index)

Cells (train): 1685413   Cells (val): 431442
Phenotype ratio (train):
r05x
3.0    0.233629
0.0    0.200337
5.0    0.140744
0.5    0.111353
2.0    0.109961
4.0    0.104096
1.0    0.099882
Name: proportion, dtype: float64 r05x
3.0    393761
0.0    337650
5.0    237211
0.5    187675
2.0    185329
4.0    175445
1.0    168342
Name: count, dtype: int64
Phenotype ratio (val):
r05x
3.0    0.226417
0.0    0.203661
5.0    0.154533
4.0    0.119295
0.5    0.116746
2.0    0.107683
1.0    0.071664
Name: proportion, dtype: float64 r05x
3.0    97686
0.0    87868
5.0    66672
4.0    51469
0.5    50369
2.0    46459
1.0    30919
Name: count, dtype: int64


In [49]:
# Recode the raw phenotype score into an ordinal class id.
#
# For each supported phenotype the table holds two stages:
#   1. raw score  -> human-readable class name
#   2. class name -> integer class id
# Scores missing from stage 1 become NaN, exactly as a plain `.map` would do.
# Phenotypes that are not listed are left untouched.
#
# NOTE: this cell is not idempotent.  Raw scores and class ids overlap (for
# r05x a class id of 1 would be read as raw score 1 and turned into 2), so
# re-run the split cells before running this cell a second time.
PHENOTYPE_RECODING = {
    "r01x": (
        {0: "Early", 1: "Early", 2: "Early", 3: "Mid", 4: "Mid", 5: "Late", 6: "Late"},
        {"Early": 0, "Mid": 1, "Late": 2},
    ),
    "r05x": (
        {
            0: "No AD",
            0.5: "MCI",
            1: "Dementia",
            2: "Dementia",
            3: "Dementia",
            4: "Dementia",
            5: "Dementia",
        },
        {"No AD": 0, "MCI": 1, "Dementia": 2},
    ),
}

if subset_phen in PHENOTYPE_RECODING:
    # `map_phen` ends up as the class-name -> class-id table, as before.
    score_to_name, map_phen = PHENOTYPE_RECODING[subset_phen]

    # Compose both stages so each frame is mapped in a single pass.
    score_to_class = {score: map_phen[name] for score, name in score_to_name.items()}

    # `.assign` returns new frames, avoiding in-place writes into sliced data.
    train_df = train_df.assign(**{subset_phen: train_df[subset_phen].map(score_to_class)})
    val_df = val_df.assign(**{subset_phen: val_df[subset_phen].map(score_to_class)})

In [50]:
# Bundle both splits for export; note the validation split is stored under "test".
data = dict(train=train_df, test=val_df)

In [51]:
# Bare last expression: renders the notebook's preview of the final training
# split (SubID, recoded phenotype, barcodekey) as a visual sanity check.
train_df

Unnamed: 0_level_0,SubID,r05x,barcodekey
barcodekey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
M10001-1-AAACCCACAGATGCGA-0,M10001,1,M10001-1-AAACCCACAGATGCGA-0
M10001-1-AAACCCACAGTGTGCC-0,M10001,1,M10001-1-AAACCCACAGTGTGCC-0
M10001-1-AAACCCAGTGGGAGAG-0,M10001,1,M10001-1-AAACCCAGTGGGAGAG-0
M10001-1-AAACCCATCAGAGCAG-0,M10001,1,M10001-1-AAACCCATCAGAGCAG-0
M10001-1-AAACCCATCTACTGCC-0,M10001,1,M10001-1-AAACCCATCTACTGCC-0
...,...,...,...
M99891-2-TTTGGTTGTTGTCATG-1,M99891,2,M99891-2-TTTGGTTGTTGTCATG-1
M99891-2-TTTGTTGAGGCCTTGC-1,M99891,2,M99891-2-TTTGTTGAGGCCTTGC-1
M99891-2-TTTGTTGGTCGCATCG-1,M99891,2,M99891-2-TTTGTTGGTCGCATCG-1
M99891-2-TTTGTTGGTGTAGCAG-1,M99891,2,M99891-2-TTTGTTGGTGTAGCAG-1


In [52]:
# Persist the {"train", "test"} split bundle; the file name records the
# phenotype and the seed used for the split.
split_path = f"/home/ubuntu/scripts/Train_cv_to_send/Phenotype_classification_files/{subset_phen}_split_seed42.pkl"
with open(split_path, mode="wb") as handle:
    pickle.dump(data, handle)