# Question 3: Analysis Modification - Dataset Testing Differentiation

In [1]:
# import dependencies
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
import math
import pyreadr
import requests

In [31]:
# read in data
data = pd.read_csv(
  "https://raw.githubusercontent.com/pstat197/module-1-biomarker-data-table_ten/refs/heads/main/data/biomarker-raw.csv"
  )

# drop nuisance row/col: the row labelled 0 and the "Target Full Name" column
data = data.drop(index=0).drop(columns=["Target Full Name"])

# keep the group labels aside so they are not coerced to numbers below
group_col = data["Group"]

# convert protein level data to float
# '-' is treated as a missing-value marker: it becomes NaN and the row is kept.
# Anything else that cannot be parsed as a number is coerced to NaN as well.
data = (
    data.drop(columns=["Group"])
        .replace('-', pd.NA)
        .apply(pd.to_numeric, errors='coerce')
        .astype(float)
)

# put the labels back as the first column (aligned on the row index)
data.insert(0, "Group", group_col)
biomarker_clean = data.copy()

Repeat the analysis, but carry out the entire selection procedure on a training partition. In other words, set aside some testing data at the very beginning and don't use it until you are evaluating accuracy at the very end.

In [41]:
# --- Assumes your preprocessing just ran and you have: biomarker_clean = data.copy() ---

import numpy as np
import pandas as pd
from scipy import stats
from scipy.special import digamma

from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score, recall_score

# --------------- Helpers ---------------

def welch_t_two_sided(col, df, group_col='Group', order=('ASD','TD')):
    """Two-sided p-value of Welch's unequal-variance t-test for one column.

    Parameters
    ----------
    col : column label in ``df`` holding the measurements to compare.
    df : DataFrame containing ``col`` and ``group_col``.
    group_col : column whose values identify the two groups.
    order : pair of group labels; the first is tested against the second.

    Returns
    -------
    The p-value reported by ``scipy.stats.ttest_ind`` with
    ``equal_var=False``; NaN measurements are omitted by the test.
    """
    first_label, second_label = order
    labels = df[group_col]
    # One float sample per group, in the order requested by the caller.
    samples = [
        df.loc[labels == label, col].astype(float)
        for label in (first_label, second_label)
    ]
    result = stats.ttest_ind(*samples, equal_var=False, nan_policy='omit')
    return result.pvalue

def by_adjust(pvals):
    """Benjamini-Yekutieli adjustment using the same approximation your R used:
       hm = log(m) + 1/(2m) - digamma(1)

    Each p-value is scaled by ``m * hm / rank`` and capped at 1, where ``m`` is
    the number of p-values and ``rank`` is the 1-based position of that p-value
    in ascending order. No cumulative-minimum (step-up monotonicity) pass is
    applied, so the result is the plain rank-scaled value.

    Parameters
    ----------
    pvals : 1-D sequence of p-values, in any order. NaNs are ranked last and
        stay NaN in the output, but still count towards ``m``.

    Returns
    -------
    (p_adj, hm) : ``p_adj`` is a float array aligned with the input order;
        ``hm`` is the harmonic-number approximation used as the BY constant.
        For empty input an empty array and ``0.0`` are returned.
    """
    p = np.asarray(pvals, dtype=float)
    m = p.size
    if m == 0:
        # Nothing to adjust; the harmonic sum over zero terms is 0 and the
        # approximation below would divide by zero.
        return p.copy(), 0.0

    hm = np.log(m) + 1.0/(2*m) - digamma(1)

    # Rank internally instead of trusting the caller to pre-sort. The stable
    # sort keeps tied values in input order, so already-sorted input gets
    # ranks 1..m exactly as before.
    ascending = np.argsort(p, kind='stable')
    ranks = np.empty(m, dtype=float)
    ranks[ascending] = np.arange(1, m+1, dtype=float)

    p_adj = np.minimum(1.0, (m * hm * p) / ranks)
    return p_adj, hm

# --------------- Dataset Manipulation + Split ---------------
# Keep only rows that carry a group label and strip stray whitespace from it.
biomarker_fix = (
    biomarker_clean
    .dropna(subset=['Group'])
    .assign(Group=lambda frame: frame['Group'].astype(str).str.strip())
)

# Every column except the label is a candidate predictor.
feature_cols = biomarker_fix.columns.drop('Group').tolist()
X = biomarker_fix.loc[:, feature_cols]
y = biomarker_fix.loc[:, 'Group']

# Hold out 20% of the rows, stratified on the label, with a fixed seed.
# The test partition is not used again until the final evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=101422, stratify=y
)

# Training features with the labels re-attached, for the group-wise t-tests.
df_train = X_train.assign(Group=y_train.values)

# --------------- T-tests + BY ---------------

# One Welch two-sided p-value per protein, computed on the training rows only
# (the helper omits NaNs).
ttest_rows = [
    {
        'protein': protein,
        'p_value': welch_t_two_sided(protein, df_train, 'Group', ('ASD','TD')),
    }
    for protein in X_train.columns
]

# Smallest p-value first; by_adjust is fed the values in this order.
ttests_out = pd.DataFrame(ttest_rows)
ttests_out = ttests_out.sort_values('p_value', ascending=True)
ttests_out = ttests_out.reset_index(drop=True)

# BY correction (R-style approximation for harmonic number)
m = len(ttests_out)
p_adj, hm = by_adjust(ttests_out['p_value'].values)
ttests_out = ttests_out.assign(
    m=m,
    hm=hm,
    rank=np.arange(1, m+1),
    p_adj=p_adj,
)

# Selection set 1: the ten proteins with the smallest adjusted p-values
top_by_p_adj = ttests_out.nsmallest(10, 'p_adj')
proteins_s1 = top_by_p_adj['protein'].tolist()

# --------------- Random Forest (+ impute, OOB confusion) ---------------
predictors = X_train.copy()
response = y_train.copy()

# Two-step pipeline: fill missing values with the column median, then fit a
# 1000-tree bootstrap forest that keeps out-of-bag (OOB) predictions.
median_imputer = SimpleImputer(strategy='median')
forest = RandomForestClassifier(
    n_estimators=1000,
    bootstrap=True,
    oob_score=True,        # mirror R's rf_out$confusion using OOB
    n_jobs=-1,
    random_state=101422,
)
rf_pipe = make_pipeline(median_imputer, forest)
rf_pipe.fit(predictors, response)

# Confusion matrix built from the OOB class probabilities of the fitted forest
rf = rf_pipe.named_steps['randomforestclassifier']
if getattr(rf, 'oob_decision_function_', None) is not None:
    oob_prob = rf.oob_decision_function_
    # Most probable class per training row, mapped back to its label
    oob_pred = np.take(rf.classes_, np.argmax(oob_prob, axis=1))
    class_order = ['ASD','TD']
    cm_oob = pd.DataFrame(
        confusion_matrix(response, oob_pred, labels=class_order),
        index=pd.Index(class_order, name='True'),
        columns=pd.Index(class_order, name='Pred'),
    )
    print("Random Forest (OOB) confusion:\n", cm_oob, "\n")

# Selection set 2: the ten proteins with the largest forest importances
imp = pd.DataFrame({
    'protein': predictors.columns,
    'importance': rf.feature_importances_,
})
imp = imp.sort_values('importance', ascending=False)
proteins_s2 = imp['protein'].head(10).tolist()

# --------------- Logistic Regression on intersection (+ impute) ---------------

# Final panel: proteins picked by BOTH the t-test ranking and the RF importances.
proteins_sstar = sorted(set(proteins_s1) & set(proteins_s2))
if not proteins_sstar:
    # Stop here with an actionable message instead of handing the pipeline
    # a frame with zero columns.
    raise ValueError(
        "No protein was selected by both the t-tests and the random forest; "
        "the logistic regression has no predictors to fit."
    )

## Use the SAME original split with the selected features
Xtr = X_train[proteins_sstar]
Xte = X_test[proteins_sstar]
# Binary response: 1 = ASD, 0 = everything else (TD)
ytr = (y_train == 'ASD').astype(int)
yte = (y_test == 'ASD').astype(int)

# Pipeline: impute (median) → logistic regression
# The imputer is fitted on the training rows only, via the pipeline's fit.
logit_pipe = make_pipeline(
    SimpleImputer(strategy='median'),
    LogisticRegression(max_iter=1000, solver='lbfgs')
)

logit_pipe.fit(Xtr, ytr)

#Test Probs
# Column 1 of predict_proba is P(class 1) = P(ASD); classify at a 0.5 cutoff.
probs = logit_pipe.predict_proba(Xte)[:, 1]
pred = (probs > 0.5).astype(int)

#Train Probs
probs_train = logit_pipe.predict_proba(Xtr)[:, 1]
train_pred = (probs_train > 0.5).astype(int)


def _asd_td_confusion(y_true, y_pred):
    """Return a labelled 2x2 confusion matrix with ASD (coded 1) before TD (coded 0)."""
    return pd.DataFrame(
        confusion_matrix(y_true, y_pred, labels=[1, 0]),
        index=pd.Index(['ASD', 'TD'], name='True'),
        columns=pd.Index(['ASD', 'TD'], name='Pred')
    )


cm_train = _asd_td_confusion(ytr, train_pred)
cm_test = _asd_td_confusion(yte, pred)
print("Training Confusion Matrix")
print(cm_train)
print("Testing Confusion Matrix")
print(cm_test)

#test metrics
# labels=[0, 1] pins the 2x2 layout (TD=0 first, ASD=1 second), so ravel()
# always yields (tn, fp, fn, tp) even if a class is absent from the fold.
tn, fp, fn, tp = confusion_matrix(yte, pred, labels=[0, 1]).ravel()
accuracy = accuracy_score(yte, pred)
sensitivity = recall_score(yte, pred)  # ASD==1
specificity = tn / (tn + fp) if (tn + fp) else np.nan  # NaN when there are no true TD rows
roc_auc = roc_auc_score(yte, probs)

# Report the held-out metrics alongside the confusion matrices printed above.
print("Testing Metrics")
print(pd.Series({
    'accuracy': accuracy,
    'sensitivity': sensitivity,
    'specificity': specificity,
    'roc_auc': roc_auc,
}).round(3))

Random Forest (OOB) confusion:
 Pred  ASD  TD
True         
ASD    46  15
TD     20  42 

Training Confusion Matrix
Pred  ASD  TD
True         
ASD    46  15
TD     18  44
Testing Confusion Matrix
Pred  ASD  TD
True         
ASD    12   3
TD      6  10
