In [1]:
import numpy as np
import pandas as pd
import os

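## Check Workspace Settings for Jupyter Notebooks

The cells below use paths relative to the workspace root, so the notebook's working directory must be the workspace folder. In VS Code, make sure the workspace `settings.json` contains: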

```json
"jupyter.notebookFileRoot": "${workspaceFolder}",
```

In [2]:
# Show the kernel's working directory; the relative data and report paths
# used further down are resolved against it.
current_dir = os.getcwd()
print(current_dir)

c:\Users\alan.mcdonagh\OneDrive - Milliman Inc\Projects\51. WiDS Datathon 2025\wids_2025_study\wids_2025_study


In [3]:

# Absolute path of the input data folder, resolved against the current working
# directory. A bare relative name is used instead of '.\\data' because a
# backslash is only a path separator on Windows; on other platforms it would
# become part of the file name.
fpath_input = os.path.abspath('data')
print(fpath_input)

def get_feats(mode='TRAIN_OLD'):
    """
    Load and merge every feature table for one dataset split.

    The quantitative metadata is read first and then left-joined, on
    ``participant_id``, with the categorical metadata, the functional
    connectome matrices and (training splits only) the training solutions.
    All files are read from the ``<fpath_input>/<mode>`` directory, where
    ``fpath_input`` is the module-level data folder.

    Parameters
    ----------
    mode : str, default 'TRAIN_OLD'
        Which split to load: 'TRAIN_OLD', 'TRAIN_NEW' or 'TEST'.

    Returns
    -------
    pandas.DataFrame
        The quantitative metadata with the columns of the other tables
        appended through successive left joins.

    Raises
    ------
    KeyError
        If ``mode`` is not one of the supported values.
    """
    print(f"Loading {mode} data...")

    # Filename suffix carried by the metadata files of each split
    kw_suffix = {
        'TRAIN_OLD' : '',
        'TRAIN_NEW' : '_new',
        'TEST'      : ''
    }
    suffix = kw_suffix[mode]  # an unsupported mode fails here with KeyError

    is_train = mode in ('TRAIN_OLD', 'TRAIN_NEW')
    prefix   = 'TRAIN' if is_train else 'TEST'

    # Build paths with os.path.join so they are valid on every platform,
    # not only where '\\' is the separator.
    data_dir = os.path.join(fpath_input, mode)

    # Load quantitative metadata
    feats = pd.read_excel(
        os.path.join(data_dir, f"{prefix}_QUANTITATIVE_METADATA{suffix}.xlsx")
    )

    # Load categorical metadata (the test file follows a different naming scheme)
    if is_train:
        cate_name = f"TRAIN_CATEGORICAL_METADATA{suffix}.xlsx"
    else:
        cate_name = "TEST_CATEGORICAL.xlsx"
    cate = pd.read_excel(os.path.join(data_dir, cate_name))

    # Merge quantitative and categorical data
    feats = pd.merge(feats, cate, on='participant_id', how='left')

    # Load functional connectome matrices (only TRAIN_NEW uses the longer name)
    if mode == 'TRAIN_NEW':
        func_name = f"{prefix}_FUNCTIONAL_CONNECTOME_MATRICES_new_36P_Pearson.csv"
    else:
        func_name = f"{prefix}_FUNCTIONAL_CONNECTOME_MATRICES.csv"
    func = pd.read_csv(os.path.join(data_dir, func_name))

    feats = pd.merge(feats, func, on='participant_id', how='left')

    # Load training solutions (only the training splits have them)
    if is_train:
        solution = pd.read_excel(os.path.join(data_dir, "TRAINING_SOLUTIONS.xlsx"))
        feats = pd.merge(feats, solution, on='participant_id', how='left')

    return feats

# Read the three splits, in this order: old training set, new training set, test set
train_old = get_feats('TRAIN_OLD')
train_new = get_feats('TRAIN_NEW')
test      = get_feats('TEST')

# Preview the top rows of the new training set
train_new.head()

c:\Users\alan.mcdonagh\OneDrive - Milliman Inc\Projects\51. WiDS Datathon 2025\wids_2025_study\wids_2025_study\data
Loading TRAIN_OLD data...
Loading TRAIN_NEW data...
Loading TEST data...


Unnamed: 0,participant_id,EHQ_EHQ_Total,ColorVision_CV_Score,APQ_P_APQ_P_CP,APQ_P_APQ_P_ID,APQ_P_APQ_P_INV,APQ_P_APQ_P_OPD,APQ_P_APQ_P_PM,APQ_P_APQ_P_PP,SDQ_SDQ_Conduct_Problems,...,195throw_198thcolumn,195throw_199thcolumn,196throw_197thcolumn,196throw_198thcolumn,196throw_199thcolumn,197throw_198thcolumn,197throw_199thcolumn,198throw_199thcolumn,ADHD_Outcome,Sex_F
0,00aIpNTbG5uh,100.0,13.0,3.0,15.0,44.0,14.0,20.0,27.0,3.0,...,-0.280312,0.03756,0.423037,0.242453,0.336213,0.402338,0.327915,0.539032,1,0
1,00fV0OyyoLfw,92.27,14.0,3.0,12.0,35.0,25.0,28.0,30.0,5.0,...,-0.332783,-0.332711,0.556939,0.475578,0.429196,0.45797,0.312571,0.595978,1,0
2,04X1eiS79T4B,86.67,14.0,3.0,21.0,37.0,18.0,26.0,28.0,3.0,...,-0.002132,-0.175586,0.679183,0.290292,0.48668,0.255208,0.575017,0.605182,0,1
3,05ocQutkURd6,93.34,14.0,3.0,11.0,42.0,15.0,20.0,28.0,0.0,...,-0.199576,-0.216457,0.519074,0.298586,0.415466,0.511607,0.361204,0.446613,0,1
4,06YUNBA9ZRLq,0.0,14.0,8.0,12.0,35.0,22.0,12.0,24.0,6.0,...,-0.141012,-0.002865,0.515169,0.336139,0.31643,0.44223,0.177079,0.378278,1,0


In [4]:
# Number of leading columns kept for the demo reports. Defined once so the
# printed message and the actual slice cannot drift apart.
N_DEMO_COLS = 10

print(f'There are {len(train_new.columns)} columns in total; this will take too long in this demo so we will reduce this to {N_DEMO_COLS}.')

# Leading feature columns (the first of them is participant_id)
cols_x = train_new.columns[:N_DEMO_COLS]
# Target columns, present only in the training data
cols_y = ['ADHD_Outcome', 'Sex_F']
# Full selection used for the training reports: features followed by targets
cols = [*cols_x, *cols_y]

There are 19930 columns in total; this will take too long in this demo so we will reduce this to 10.


In [5]:
import sweetviz as sv

# Force the two target columns to be treated as numerical features
my_feature_config = sv.FeatureConfig(force_num=['ADHD_Outcome', 'Sex_F'])

# Boolean Series used to split the rows into two groups (True where ADHD_Outcome is 1)
condition_series = train_new['ADHD_Outcome'] == 1

# Compare the two groups inside the same dataframe. The names are listed as
# [rows where the condition is True, rows where it is False].
# NOTE(review): the split condition is derived from the same column that is
# passed as target_feat, so the target is constant within each group —
# confirm this is intended.
my_report = sv.compare_intra(
    train_new[cols],
    condition_series,
    ['ADHD', 'No_ADHD'],
    feat_cfg=my_feature_config,
    target_feat='ADHD_Outcome',
    pairwise_analysis='off',
)

# Build the output path portably and make sure the folder exists before
# sweetviz writes the HTML file into it.
intra_report_path = os.path.join('.', 'notebooks', 'sweetviz_examples', 'new_compare_intra.html')
os.makedirs(os.path.dirname(intra_report_path), exist_ok=True)

# Generate and display the report
my_report.show_html(intra_report_path)

                                             |          | [  0%]   00:00 -> (? left)

Report .\notebooks\sweetviz_examples\new_compare_intra.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [6]:
# Compare the training selection against the test set. The test data has no
# target columns, so only the feature columns (cols_x) are taken from it.
# Pairwise analysis is explicitly turned ON here; with only a handful of
# columns selected the extra pairwise feature comparisons stay affordable.
report = sv.compare(
    [train_new[cols], "Train"],
    [test[cols_x], "Test"],
    pairwise_analysis='on',
)

# Build the output path portably and make sure the folder exists before
# sweetviz writes the HTML file into it.
compare_report_path = os.path.join('.', 'notebooks', 'sweetviz_examples', 'new_compare.html')
os.makedirs(os.path.dirname(compare_report_path), exist_ok=True)

report.show_html(compare_report_path)

                                             |          | [  0%]   00:00 -> (? left)

Report .\notebooks\sweetviz_examples\new_compare.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [7]:
# Profile the training selection on its own, reusing the feature config that
# forces the target columns to numerical.
# Pairwise analysis is explicitly turned ON here; with only a handful of
# columns selected the extra pairwise feature comparisons stay affordable.
# No target_feat is passed: sweetviz takes a single column name there, not
# the list of both targets.
report = sv.analyze(
    [train_new[cols], "Train"],
    feat_cfg=my_feature_config,
    pairwise_analysis='on',
)

# Build the output path portably and make sure the folder exists before
# sweetviz writes the HTML file into it.
analyze_report_path = os.path.join('.', 'notebooks', 'sweetviz_examples', 'new_analyse.html')
os.makedirs(os.path.dirname(analyze_report_path), exist_ok=True)

report.show_html(analyze_report_path)

                                             |          | [  0%]   00:00 -> (? left)

Report .\notebooks\sweetviz_examples\new_analyse.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
