## Train/Test Data Split

In [7]:
import pandas as pd
from pathlib import Path

DATA_PATH = Path("../data/biomarker-raw.csv")
TRAIN_PATH = Path("../data/biomarker-train.csv")
TEST_PATH = Path("../data/biomarker-test.csv")
TARGET_COLUMN = "Group"
TRAIN_FRACTION = 0.80
RANDOM_SEED = 197

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,Group,Target Full Name,E3 ubiquitin-protein ligase CHIP,CCAAT/enhancer-binding protein beta,Gamma-enolase,E3 SUMO-protein ligase PIAS4,Interleukin-10 receptor subunit alpha,Signal transducer and activator of transcription 3,Interferon regulatory factor 1,Transcription factor AP-1,...,Transgelin-2,"ATP synthase subunit O, mitochondrial",Pro-opiomelanocortin,Quinone oxidoreductase-like protein 1,Pigment epithelium-derived factor,Cathepsin F,Formimidoyltransferase-cyclodeaminase,Ubiquitin carboxyl-terminal hydrolase 25,Plexin-B2,ADOS Total Score
0,,Target,CHIP,CEBPB,NSE,PIAS4,IL-10 Ra,STAT3,IRF1,c-Jun,...,Transgelin-2,ATPO,Corticotropin-lipotropin,QORL1,PEDF,CATF,FTCD,UBP25,PLXB2,
1,ASD,,618.6,1489.3,732.7,1229.6,1647,467,1041,3114.2,...,3016.2,2156.9,895.7,2313.6,24904.5,2048.7,9942.4,1462.6,2024.1,8.0
2,ASD,,512.2,1697.8,2628.3,1484.3,1711.9,548.3,1213.8,3188,...,3296.2,1813.6,555,1345,24201.3,2273.2,1918.9,1708,2655.9,21.0
3,ASD,,438.5,1121.7,857.3,1419.4,1926.3,412.6,1222.3,2373.1,...,2875.7,1482.6,543,1980.2,20143.1,4092.6,501.2,1386.4,3091.6,12.0
4,ASD,,505,1209.7,1394,1036.1,1551.6,523.3,1982.2,2652.5,...,3096.2,1399.8,1178.8,1711.1,27553.1,2979.8,1040.4,1508.9,2166.9,20.0


In [8]:
train_index = (
    df.groupby(TARGET_COLUMN)
      .sample(frac=TRAIN_FRACTION, random_state=RANDOM_SEED)
      .index
)
train_df = df.loc[train_index].reset_index(drop=True)
test_df = df.drop(train_index).reset_index(drop=True)

train_df.head()

Unnamed: 0,Group,Target Full Name,E3 ubiquitin-protein ligase CHIP,CCAAT/enhancer-binding protein beta,Gamma-enolase,E3 SUMO-protein ligase PIAS4,Interleukin-10 receptor subunit alpha,Signal transducer and activator of transcription 3,Interferon regulatory factor 1,Transcription factor AP-1,...,Transgelin-2,"ATP synthase subunit O, mitochondrial",Pro-opiomelanocortin,Quinone oxidoreductase-like protein 1,Pigment epithelium-derived factor,Cathepsin F,Formimidoyltransferase-cyclodeaminase,Ubiquitin carboxyl-terminal hydrolase 25,Plexin-B2,ADOS Total Score
0,ASD,,458.9,1367.2,780.8,905.7,2244.0,450.2,1469.7,3401.5,...,3365.9,1424.5,601.4,2608.2,24093.8,2600.0,1780.3,1663.8,2629.7,22
1,ASD,,618.6,1489.3,732.7,1229.6,1647.0,467.0,1041.0,3114.2,...,3016.2,2156.9,895.7,2313.6,24904.5,2048.7,9942.4,1462.6,2024.1,8
2,ASD,,916.8,1716.5,877.2,978.6,1970.6,501.0,1434.8,3013.8,...,4124.4,1488.3,688.7,2466.8,24585.4,3737.1,2421.2,1670.7,2525.2,22
3,ASD,,458.3,1021.1,1142.3,740.1,1669.8,372.4,1022.4,2623.5,...,2962.8,1731.0,525.3,1579.9,30119.9,2453.0,307.7,1444.9,1648.8,7
4,ASD,,395.6,1356.4,638.7,1044.6,1821.8,326.2,1085.7,2696.7,...,3327.2,1708.2,585.0,2193.7,23211.6,2667.9,1032.4,1478.1,1853.7,21


In [9]:
train_df.to_csv(TRAIN_PATH, index=False)
test_df.to_csv(TEST_PATH, index=False)

summary = {
    "train_size": len(train_df),
    "test_size": len(test_df),
    "train_group_counts": train_df[TARGET_COLUMN].value_counts().to_dict(),
    "test_group_counts": test_df[TARGET_COLUMN].value_counts().to_dict(),
}
summary

{'train_size': 123,
 'test_size': 33,
 'train_group_counts': {'TD': 62, 'ASD': 61},
 'test_group_counts': {'TD': 16, 'ASD': 15}}