In [4]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [5]:
data = pd.read_excel('dataset1-copy.xlsx')
# 10374 rows, 29 columns
print(data.shape)
data.head()

(10374, 29)


Unnamed: 0,uid,sex,fdr,any_fdr,source,screening_info,sample_date,age_at_sample,elisa,lips_unin_iaa,...,ia2_status_fu,zink_status_fu,all_reports_screening,final_report_screening,reevaluated,hemolyzed,sample_empty,lab,last_contact_or_t1d_date,last_contact_date_type
0,00032F0C-5916-4671-BB7B-A48314D14AF6,0.0,,,screening,,2016-11-11,3.78,132.32,,...,,,negative,negative,,,,Munich,2016-11-11,last_contact_date_without_t1d
1,0033DB11-56A2-4E3D-9B8F-0556819CF005,0.0,,,screening,,2021-05-11,3.1,1.335415,19.007826,...,,,"call_for_2nd_sample,single_positive",single_positive,,,,Munich,2023-06-19,last_contact_date_without_t1d
2,0033DB11-56A2-4E3D-9B8F-0556819CF005,0.0,,,screening,,2021-06-17,3.21,,,...,,,"call_for_2nd_sample,single_positive",single_positive,,,,Munich,2023-06-19,last_contact_date_without_t1d
3,0033DB11-56A2-4E3D-9B8F-0556819CF005,0.0,,,follow_up,,2022-07-01,4.24,,,...,0.0,0.0,,,,,,Munich,2023-06-19,last_contact_date_without_t1d
4,005B2E43-F96E-406F-AB19-BECB1692A4C6,1.0,,,screening,,2015-09-09,3.92,26.92,,...,,,negative,negative,,,,Munich,2015-09-09,last_contact_date_without_t1d


### Implement date-dependent baseline cutoffs
These are established cutoffs that we use a base for establishing future cutoffs

In [8]:
# Assign labels to rows based on established cutoffs
# We have already run this and saved the new columns, no need to run again

# ELISA cutoffs
elisa_conditions = [
    (data['sample_date'] <= pd.Timestamp('2022-01-31')) & (data['elisa'] >= 25),
    (data['sample_date'] > pd.Timestamp('2022-01-31')) & 
    (data['sample_date'] <= pd.Timestamp('2024-12-31')) & 
    (data['elisa'] >= 40),
    (data['sample_date'] > pd.Timestamp('2024-12-31')) & (data['elisa'] >= 35)
]
elisa_choices = [True, True, True]
data['elisa_pos'] = np.select(elisa_conditions, elisa_choices, default=False)

# M_iaa cutoffs
data['m_iaa_pos'] = (data['m_iaa'] >= 1.5)

# Gada_trunc cutoffs
gada_trunc_conditions = [
    ((data['sample_date'] <= pd.Timestamp('2016-12-12')) & (data['gada_trunc'] >= 22)),
    ((data['sample_date'] > pd.Timestamp('2016-12-12')) & (data['gada_trunc'] >= 30))
]

gada_trunc_choices = [True, True]
data['gada_trunc_pos'] = np.select(gada_trunc_conditions, gada_trunc_choices, default=False)

# Ia2 cutoffs
ia2_conditions = [
    (data['sample_date'] <= pd.Timestamp('2016-12-12')) & (data['ia2'] >= 5),
    (data['sample_date'] >  pd.Timestamp('2016-12-12')) & (data['ia2'] >= 3)
]

ia2_choices = [True, True]
data['ia2_pos'] = np.select(ia2_conditions, ia2_choices, default=False)

# ZnT8 cutoffs
znt8_c_arg_conditions = [
    (data['sample_date'] <= pd.Timestamp('2016-12-12')) & (data['znt8_c_arg'] >= 16),
    (data['sample_date'] > pd.Timestamp('2016-12-12')) & (data['znt8_c_arg'] >= 30)
]
znt8_c_arg_choices = [True, True]
data['znt8_c_arg_pos'] = np.select(znt8_c_arg_conditions, znt8_c_arg_choices, default=False)

data['znt8_c_tryp_pos'] = data['znt8_c_tryp'] >= 30

data['znt8_pos'] = data['znt8_c_arg_pos'] | data['znt8_c_tryp_pos']
data['znt8_neg'] = ~(data['znt8_c_arg_pos'] | data['znt8_c_tryp_pos'])

# Lips_unin_iaa cutoffs
lips_unin_conditions = [
    ((data['elisa_pos'] == True) & (data['lips_unin_iaa'] >= 3)),
    ((data['elisa_pos'] == False) & (data['lips_unin_iaa'] >= 10))
]
lips_unin_choices = [True, True]
data['lips_unin_iaa_pos'] = np.select(lips_unin_conditions, lips_unin_choices, default=False)

# Lips_in_iaa cutoffs
lips_in_conditions = [
    (data['source'] == 'screening') & (data['lips_in_iaa'] >= 15) &
    (~data[['elisa_pos', 'gada_trunc_pos', 'ia2_pos', 'znt8_pos']].any(axis=1)),
    (data['source'] == 'confirmation') & (data['lips_in_iaa'] >= 4) & (data['m_iaa'] >= 1.5)
]

lips_in_choices = [True, True]
data['lips_in_iaa_pos'] = np.select(lips_in_conditions, lips_in_choices, default=False)

data['lips_iaa_pos'] = (data['lips_unin_iaa_pos'] | data['lips_in_iaa_pos'])


data.to_excel('dataset1-copy.xlsx', index=False)

### Clinical-like classification
We attempt a naive way of establishing cutoffs based on AB counts
We make a per-row classification using:
* If ELISA positive, and 2+ antibodies are positive → mark as early_stage_T1D
* If ELISA positive, and exactly 1 antibody positive (not ZnT8) → mark as single_AB_risk
* Otherwise → negative.

Some adjustments:
* Count IAA positivity (LIPS U or LIPS I)
* ELISA gate (don’t count ABs if ELISA below its date-dependent cutoff)
* ZnT8 cannot be the only single-positive
* Aggregate per child across screening/confirmation (take the max positives)

In [9]:
# Per-row IAA flag
data['iaa_pos'] = data['lips_unin_iaa_pos'] | data['lips_in_iaa_pos']

# Raw antibody count (continuous antibodies)
data['num_AB_positive_raw'] = data[['iaa_pos','m_iaa_pos','gada_trunc_pos','ia2_pos','znt8_pos']].sum(axis=1)

# ELISA-gated count
data['effective_AB_positive'] = np.where(data['elisa_pos'], data['num_AB_positive_raw'], 0)

# Row-level classification (pre-aggregation)
row_conditions = [
    data['effective_AB_positive'] >= 2,
    (data['effective_AB_positive'] == 1) & (~data['znt8_pos']),  # exclude ZnT8-only single
    data['effective_AB_positive'] == 0
]
row_choices = ['early_stage_T1D', 'single_AB_risk', 'negative']
data['rule_result_row'] = np.select(row_conditions, row_choices, default='negative')

# Aggregate to a per-child screening-summary (max over time)
agg = (data
       .groupby('uid', as_index=False)
       .agg(
           max_effective_AB_positive=('effective_AB_positive','max'),
           any_znt8_only=('znt8_pos', 'max'),
           any_elisa_pos=('elisa_pos','max'),
           final_report=('final_report_screening','last'), # or 'first'—pick consistent
       ))

# Derive per-UID classification from aggregated count
cond_uid = [
    agg['max_effective_AB_positive'] >= 2,
    (agg['max_effective_AB_positive'] == 1) & (~agg['any_znt8_only']),
    agg['max_effective_AB_positive'] == 0
]
agg['rule_result_uid'] = np.select(cond_uid, row_choices, default='negative')

### Evaluate this approach
Obviously this naive approach does not work and there are cases that are not classified correctly.
We do not that for multiple_positive and negative, the approach correctly predicts almost all of the cases, but other cases are misclassified heavily (e.g. single_positive).

In [10]:
pd.crosstab(agg['final_report'], agg['rule_result_uid'])

rule_result_uid,early_stage_T1D,negative,single_AB_risk
final_report,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
call_for_2nd_sample,69,44,25
multiple_positive,562,35,23
multiple_positive_after_single,7,8,4
negative,91,3098,763
negative_after_single,5,1,4
no_report_yet,17,29,12
resend_low_volume,3,45,19
single_positive,25,31,58


### Building a feature matrix for training

In [11]:
# Create binary label for training - early-stage T1D or not
def label_early_stage(row):
    fr = row['final_report_screening']
    t1d = row.get('last_contact_date_type') == 't1d_date'
    return int((fr in ['multiple_positive', 'multiple_positive_after_single']) or t1d)

data['label_early'] = data.apply(label_early_stage, axis=1)
data.to_excel('dataset1-copy.xlsx', index=False)

# Aggregate label per child
labels = data.groupby('uid', as_index=False)['label_early'].max()

In [14]:
# Aggregate continuous antibody titers
# Use maximum as strongest indicator
titers = data.groupby('uid', as_index=False).agg({
    'elisa': 'max',
    'gada_trunc': 'max',
    'ia2': 'max',
    'm_iaa': 'max',
    'znt8_c_arg': 'max',
    'znt8_c_tryp': 'max',
})

# Add more phenotypic and contextual features
# Age at first sample: Autoimmunity appears earlier in life
# Family history (fdr): Elevated risk
# Year of first sample: Control for drift in assays over time
other = data.groupby('uid', as_index=False).agg({
    'age_at_sample': 'min',  # youngest age at sampling
    'any_fdr': 'max',  # if ever fdr=1 for the child
    'sample_date': lambda x: x.min().year  # first year measured
})
other = other.rename(columns={'sample_date': 'first_sample_year'})


In [17]:
# Merge all features into one DataFrame
model_df = (
    labels
    .merge(titers, on='uid', how='left')
    .merge(other, on='uid', how='left')
)

# Rename columns for clarity
model_df = model_df.rename(columns={
    'label_early': 'early_stage_T1D'
})

model_df.head()

Unnamed: 0,uid,early_stage_T1D,elisa,gada_trunc,ia2,m_iaa,znt8_c_arg,znt8_c_tryp,age_at_sample,any_fdr,first_sample_year
0,00032F0C-5916-4671-BB7B-A48314D14AF6,0,132.32,8.5,0.1,1.0,0.1,0.1,3.78,,2016
1,0033DB11-56A2-4E3D-9B8F-0556819CF005,0,1.335415,0.1,0.1,30.6,0.1,0.1,3.1,,2021
2,005B2E43-F96E-406F-AB19-BECB1692A4C6,0,26.92,0.1,0.1,0.7,0.1,0.1,3.92,,2015
3,00612B9E-AB0E-4E25-9078-219711028F73,0,54.409,3.8,0.1,0.6,0.1,0.1,2.16,,2019
4,00731613-E23A-4DA4-849F-9D1762C9E3D7,0,130.64,0.1,0.1,,0.1,0.1,5.25,,2024


In [18]:
# Feature set
feature_cols = [
    'elisa', 'gada_trunc', 'ia2', 'm_iaa', 'znt8_c_arg', 'znt8_c_tryp',
    'age_at_sample', 'any_fdr'
]

X = model_df[feature_cols].fillna(0.0)  # Fill missing with zero for now
y = model_df['early_stage_T1D'].astype(int)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

# Fit logistic regression model
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train, y_train)

# Predictions
y_proba = lr.predict_proba(X_test)[:, 1]

# Metrics
auc = roc_auc_score(y_test, y_proba)
print(f"AUC: {auc:.3f}")
print(classification_report(y_test, (y_proba >= 0.5).astype(int)))

AUC: 0.977
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      1078
           1       0.87      0.73      0.79       167

    accuracy                           0.95      1245
   macro avg       0.91      0.86      0.88      1245
weighted avg       0.95      0.95      0.95      1245



  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
