In [6]:
# Load libraries and data
import pandas as pd
import numpy as np

In [7]:
data = pd.read_excel('dataset1-copy.xlsx')
# 10374 rows, 29 columns
print(data.shape)
data.head()

(10374, 29)


Unnamed: 0,uid,sex,fdr,any_fdr,source,screening_info,sample_date,age_at_sample,elisa,lips_unin_iaa,...,ia2_status_fu,zink_status_fu,all_reports_screening,final_report_screening,reevaluated,hemolyzed,sample_empty,lab,last_contact_or_t1d_date,last_contact_date_type
0,00032F0C-5916-4671-BB7B-A48314D14AF6,0.0,,,screening,,2016-11-11,3.78,132.32,,...,,,negative,negative,,,,Munich,2016-11-11,last_contact_date_without_t1d
1,0033DB11-56A2-4E3D-9B8F-0556819CF005,0.0,,,screening,,2021-05-11,3.1,1.335415,19.007826,...,,,"call_for_2nd_sample,single_positive",single_positive,,,,Munich,2023-06-19,last_contact_date_without_t1d
2,0033DB11-56A2-4E3D-9B8F-0556819CF005,0.0,,,screening,,2021-06-17,3.21,,,...,,,"call_for_2nd_sample,single_positive",single_positive,,,,Munich,2023-06-19,last_contact_date_without_t1d
3,0033DB11-56A2-4E3D-9B8F-0556819CF005,0.0,,,follow_up,,2022-07-01,4.24,,,...,0.0,0.0,,,,,,Munich,2023-06-19,last_contact_date_without_t1d
4,005B2E43-F96E-406F-AB19-BECB1692A4C6,1.0,,,screening,,2015-09-09,3.92,26.92,,...,,,negative,negative,,,,Munich,2015-09-09,last_contact_date_without_t1d


In [None]:
# Assign labels to rows based on established cutoffs
# We have already run this and saved the new columns, no need to run again

# ELISA cutoffs
elisa_conditions = [
    (data['sample_date'] <= pd.Timestamp('2022-01-31')) & (data['elisa'] >= 25),
    (data['sample_date'] > pd.Timestamp('2022-01-31')) & 
    (data['sample_date'] <= pd.Timestamp('2024-12-31')) & 
    (data['elisa'] >= 40),
    (data['sample_date'] > pd.Timestamp('2024-12-31')) & (data['elisa'] >= 35)
]
elisa_choices = [True, True, True]
data['elisa_pos'] = np.select(elisa_conditions, elisa_choices, default=False)

# Lips_unin_iaa cutoffs
lips_unin_conditions = [
    ((data['elisa_pos'] == True) & (data['lips_unin_iaa'] >= 3)),
    ((data['elisa_pos'] == False) & (data['lips_unin_iaa'] >= 10))
]
lips_unin_choices = [True, True]
data['lips_unin_iaa_pos'] = np.select(lips_unin_conditions, lips_unin_choices, default=False)

# Lips_in_iaa cutoffs
lips_in_conditions = [
    ((data['lips_in_iaa'] >= 15)),
    ((data['lips_in_iaa'] >= 4) & (data['m_iaa'] >= 1.5))
]
lips_in_choices = [True, True]
data['lips_in_iaa_pos'] = np.select(lips_in_conditions, lips_in_choices, default=False)

data['lips_iaa_pos'] = (data['lips_unin_iaa_pos'] | data['lips_in_iaa_pos'])

# M_iaa cutoffs
data['m_iaa_pos'] = (data['m_iaa'] > 1.5)

# Gada_trunc cutoffs
gada_trunc_conditions = [
    ((data['sample_date'] <= pd.Timestamp('2016-12-12')) & (data['gada_trunc'] >= 22)),
    ((data['sample_date'] > pd.Timestamp('2016-12-12')) & (data['gada_trunc'] >= 30))
]

gada_trunc_choices = [True, True]
data['gada_trunc_pos'] = np.select(gada_trunc_conditions, gada_trunc_choices, default=False)

# Ia2 cutoffs
ia2_conditions = [
    ((data['sample_date'] <= pd.Timestamp('2016-12-12')) & (data['ia2'] >= 5)),
    ((data['sample_date'] > pd.Timestamp('2016-12-12')) & (data['gada_trunc'] >= 3))
]

ia2_choices = [True, True]
data['ia2_pos'] = np.select(ia2_conditions, ia2_choices, default=False)

# ZnT8 cutoffs
znt8_c_arg_conditions = [
    (data['sample_date'] <= pd.Timestamp('2016-12-12')) & (data['znt8_c_arg'] >= 16),
    (data['sample_date'] > pd.Timestamp('2016-12-12')) & (data['znt8_c_arg'] >= 30)
]
znt8_c_arg_choices = [True, True]
data['znt8_c_arg_pos'] = np.select(znt8_c_arg_conditions, znt8_c_arg_choices, default=False)

data['znt8_c_tryp_pos'] = data['znt8_c_tryp'] >= 30

data['znt8_pos'] = data['znt8_c_arg_pos'] | data['znt8_c_tryp_pos']

data.to_excel('dataset1-copy.xlsx', index=False)


In [9]:
# Check correlation of antibodies (binary flags)
binary_flags = ['elisa_pos', 'lips_iaa_pos', 'm_iaa_pos', 'gada_trunc_pos', 'ia2_pos', 'znt8_pos']

# Compute Pearson correlation
pearson_corr = data[binary_flags].corr(method='pearson')
print(pearson_corr)

# Compute Spearman correlation
spearman_corr = data[binary_flags].corr(method='spearman')
print(spearman_corr)


                elisa_pos  lips_iaa_pos  m_iaa_pos  gada_trunc_pos   ia2_pos  \
elisa_pos        1.000000      0.015527  -0.272087       -0.341533 -0.342660   
lips_iaa_pos     0.015527      1.000000   0.156339        0.108204  0.116013   
m_iaa_pos       -0.272087      0.156339   1.000000        0.344609  0.296351   
gada_trunc_pos  -0.341533      0.108204   0.344609        1.000000  0.745513   
ia2_pos         -0.342660      0.116013   0.296351        0.745513  1.000000   
znt8_pos        -0.350104      0.064558   0.300944        0.472704  0.523652   

                znt8_pos  
elisa_pos      -0.350104  
lips_iaa_pos    0.064558  
m_iaa_pos       0.300944  
gada_trunc_pos  0.472704  
ia2_pos         0.523652  
znt8_pos        1.000000  
                elisa_pos  lips_iaa_pos  m_iaa_pos  gada_trunc_pos   ia2_pos  \
elisa_pos        1.000000      0.015527  -0.272087       -0.341533 -0.342660   
lips_iaa_pos     0.015527      1.000000   0.156339        0.108204  0.116013   
m_iaa_pos 

In [10]:
# Check correlation of antibodies (raw values)
raw_flags = ['elisa', 'lips_in_iaa', 'lips_unin_iaa', 'm_iaa', 'gada_trunc', 'ia2', 'znt8_c_arg', 'znt8_c_tryp']

# Compute Pearson correlation
pearson_corr = data[raw_flags].corr(method='pearson')
print(pearson_corr)

# Compute Spearman correlation
spearman_corr = data[raw_flags].corr(method='spearman')
print(spearman_corr)

                  elisa  lips_in_iaa  lips_unin_iaa     m_iaa  gada_trunc  \
elisa          1.000000     0.046523       0.065802  0.096522    0.642090   
lips_in_iaa    0.046523     1.000000       0.907465  0.605296    0.042985   
lips_unin_iaa  0.065802     0.907465       1.000000  0.408920    0.070980   
m_iaa          0.096522     0.605296       0.408920  1.000000    0.051792   
gada_trunc     0.642090     0.042985       0.070980  0.051792    1.000000   
ia2            0.580125     0.020651       0.054391  0.016370    0.193802   
znt8_c_arg     0.509764     0.026611       0.050338  0.019184    0.210927   
znt8_c_tryp    0.433116     0.017630       0.025325  0.022985    0.169616   

                    ia2  znt8_c_arg  znt8_c_tryp  
elisa          0.580125    0.509764     0.433116  
lips_in_iaa    0.020651    0.026611     0.017630  
lips_unin_iaa  0.054391    0.050338     0.025325  
m_iaa          0.016370    0.019184     0.022985  
gada_trunc     0.193802    0.210927     0.169616  


In [None]:
# With the binary flags we measure co-positivity, how often are antibodies positive in the same children or sample
# GADA–IA2 (0.75) - Very strong co-positivity, these two antibodies often appear together
# IA2–ZnT8 (0.52) - Moderate–strong co-positivity, part of the same immune cluster
# ADA–ZnT8 (0.47)- Moderate — common co-positivity
# IAA with others (0.30–0.34) - Weaker association, insulin autoantibody tends to appear more independently
# -> Most children who are IA2-positive are also GADA-positive, but many who are IAA-positive are not GADA/IA2 positive

# With the raw values we measure whether the levels of antibodies are correlated
# GADA–IA2 (0.46) - Moderate monotonic link, children with higher GADA tend to have higher IA2
# IA2–ZnT8 (0.64–0.76) - Strong, these two markers rise together; may measure overlapping autoimmune processes
# ZnT8_arg–ZnT8_tryp (0.76) - Very strong, expected, as they are two variants of the same antigen
# IAA with others (0.22–0.34) - Weak, insulin antibody behaves differently, confirming independence

In [None]:
# Count number of positive antibodies
data['num_AB_positive'] = data[['lips_iaa_pos', 'm_iaa_pos', 'gada_trunc_pos', 'ia2_pos', 'znt8_pos']].sum(axis=1)

# Apply ELISA gate
data['effective_AB_positive'] = np.where(data['elisa_pos'], data['num_AB_positive'], 0)

# Classification
conditions = [
    data['effective_AB_positive'] >= 2,
    (data['effective_AB_positive'] == 1) & (~data['znt8_pos']),  # exclude ZnT8-only
    data['effective_AB_positive'] == 0
]
choices = ['early_stage_T1D', 'single_AB_risk', 'negative']
data['result'] = np.select(conditions, choices, default='negative')

data.to_excel('dataset1-copy.xlsx', index=False)