In [None]:
%pip install pandas numpy matplotlib seaborn scikit-learn

In [12]:
import pandas as pd
import numpy as np
import matplotlib as plt
import seaborn as sns
import os

In [18]:
# read in data
data = pd.read_csv("../data/biomarker-raw.csv")

In [19]:
data.head()

Unnamed: 0,Group,Target Full Name,E3 ubiquitin-protein ligase CHIP,CCAAT/enhancer-binding protein beta,Gamma-enolase,E3 SUMO-protein ligase PIAS4,Interleukin-10 receptor subunit alpha,Signal transducer and activator of transcription 3,Interferon regulatory factor 1,Transcription factor AP-1,...,Transgelin-2,"ATP synthase subunit O, mitochondrial",Pro-opiomelanocortin,Quinone oxidoreductase-like protein 1,Pigment epithelium-derived factor,Cathepsin F,Formimidoyltransferase-cyclodeaminase,Ubiquitin carboxyl-terminal hydrolase 25,Plexin-B2,ADOS Total Score
0,,Target,CHIP,CEBPB,NSE,PIAS4,IL-10 Ra,STAT3,IRF1,c-Jun,...,Transgelin-2,ATPO,Corticotropin-lipotropin,QORL1,PEDF,CATF,FTCD,UBP25,PLXB2,
1,ASD,,618.6,1489.3,732.7,1229.6,1647,467,1041,3114.2,...,3016.2,2156.9,895.7,2313.6,24904.5,2048.7,9942.4,1462.6,2024.1,8.0
2,ASD,,512.2,1697.8,2628.3,1484.3,1711.9,548.3,1213.8,3188,...,3296.2,1813.6,555,1345,24201.3,2273.2,1918.9,1708,2655.9,21.0
3,ASD,,438.5,1121.7,857.3,1419.4,1926.3,412.6,1222.3,2373.1,...,2875.7,1482.6,543,1980.2,20143.1,4092.6,501.2,1386.4,3091.6,12.0
4,ASD,,505,1209.7,1394,1036.1,1551.6,523.3,1982.2,2652.5,...,3096.2,1399.8,1178.8,1711.1,27553.1,2979.8,1040.4,1508.9,2166.9,20.0


In [20]:
# Basic info
print("=" * 50)
print("BASIC INFORMATION")
print("=" * 50)
print(f"Shape: {data.shape} (rows × columns)")
print(f"Total cells: {data.size}")
print(f"\nColumn names: {data.columns.tolist()[:10]}...")  # First 10 columns
print(f"Total columns: {len(data.columns)}")

# Data types
print("\n" + "=" * 50)
print("DATA TYPES")
print("=" * 50)
print(data.dtypes.value_counts())

# Missing values
print("\n" + "=" * 50)
print("MISSING VALUES")
print("=" * 50)
print(f"Total missing values: {data.isnull().sum().sum()}")
print(f"Percentage missing: {(data.isnull().sum().sum() / data.size * 100):.2f}%")

# Group distribution
print("\n" + "=" * 50)
print("GROUP DISTRIBUTION")
print("=" * 50)
print(data['Group'].value_counts())

# Statistical summary
print("\n" + "=" * 50)
print("STATISTICAL SUMMARY")
print("=" * 50)
print(data.describe())

# First few rows
print("\n" + "=" * 50)
print("FIRST 5 ROWS")
print("=" * 50)
print(data.head())

BASIC INFORMATION
Shape: (156, 1320) (rows × columns)
Total cells: 205920

Column names: ['Group', 'Target Full Name', 'E3 ubiquitin-protein ligase CHIP', 'CCAAT/enhancer-binding protein beta', 'Gamma-enolase', 'E3 SUMO-protein ligase PIAS4', 'Interleukin-10 receptor subunit alpha', 'Signal transducer and activator of transcription 3', 'Interferon regulatory factor 1', 'Transcription factor AP-1']...
Total columns: 1320

DATA TYPES
object     1319
float64       1
Name: count, dtype: int64

MISSING VALUES
Total missing values: 1476
Percentage missing: 0.72%

GROUP DISTRIBUTION
Group
TD     78
ASD    76
Name: count, dtype: int64

STATISTICAL SUMMARY
        Protein 4.1
count    155.000000
mean    3048.512903
std     1650.073753
min       41.000000
25%     2038.150000
50%     2647.800000
75%     3457.750000
max    13685.600000

FIRST 5 ROWS
  Group Target Full Name E3 ubiquitin-protein ligase CHIP  \
0   NaN           Target                             CHIP   
1   ASD              NaN    

In [21]:
print(data['Group'].value_counts())

Group
TD     78
ASD    76
Name: count, dtype: int64


In [16]:
# drop nuisance cols
data.drop(0, inplace=True)
data.drop(columns=["Target Full Name"], inplace=True)

# convert protein level data to float
group_col = data["Group"]
data.drop(columns=["Group"], inplace=True)
data = data.replace('-', pd.NA)
data.dropna(inplace=True)
data = data.astype(float)
data.head()

data.insert(0, "Group", group_col)

data.head()

Unnamed: 0,Group,E3 ubiquitin-protein ligase CHIP,CCAAT/enhancer-binding protein beta,Gamma-enolase,E3 SUMO-protein ligase PIAS4,Interleukin-10 receptor subunit alpha,Signal transducer and activator of transcription 3,Interferon regulatory factor 1,Transcription factor AP-1,Induced myeloid leukemia cell differentiation protein Mcl-1,...,Transgelin-2,"ATP synthase subunit O, mitochondrial",Pro-opiomelanocortin,Quinone oxidoreductase-like protein 1,Pigment epithelium-derived factor,Cathepsin F,Formimidoyltransferase-cyclodeaminase,Ubiquitin carboxyl-terminal hydrolase 25,Plexin-B2,ADOS Total Score
1,ASD,618.6,1489.3,732.7,1229.6,1647.0,467.0,1041.0,3114.2,1247.7,...,3016.2,2156.9,895.7,2313.6,24904.5,2048.7,9942.4,1462.6,2024.1,8.0
2,ASD,512.2,1697.8,2628.3,1484.3,1711.9,548.3,1213.8,3188.0,868.5,...,3296.2,1813.6,555.0,1345.0,24201.3,2273.2,1918.9,1708.0,2655.9,21.0
3,ASD,438.5,1121.7,857.3,1419.4,1926.3,412.6,1222.3,2373.1,1203.0,...,2875.7,1482.6,543.0,1980.2,20143.1,4092.6,501.2,1386.4,3091.6,12.0
4,ASD,505.0,1209.7,1394.0,1036.1,1551.6,523.3,1982.2,2652.5,764.1,...,3096.2,1399.8,1178.8,1711.1,27553.1,2979.8,1040.4,1508.9,2166.9,20.0
5,ASD,440.7,1120.2,885.0,925.8,1518.5,523.9,1422.4,2373.8,647.9,...,3282.5,1439.8,619.6,1497.2,27775.2,2720.1,1312.6,1445.9,2887.4,22.0


Number of rows where Group is not ASD: 0
