# **WiDS Datathon 2025 - Unraveling the Mysteries of the Female Brain: Sex Patterns in ADHD**
## **Authors:** Sergiu Buhatel and Ramy El Gharbawy

## **Import the necessary libraries**

In [731]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.cbook import boxplot_stats
import seaborn as sns

# To scale the data using z-score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Algorithms to use
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Metrics to evaluate the model
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report,recall_score,precision_score, accuracy_score

# For tuning the model
from sklearn.model_selection import GridSearchCV
from IPython.display import display, HTML, Markdown

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")

## **Data Overview**

- Reading the dataset
- Understanding the shape of the dataset
- Checking the data types
- Checking for missing values

In [732]:
# Load the raw training tables; each one is re-indexed by participant_id so the
# frames can later be filtered and merged on their index.
# Paths are relative to the notebook's working directory.
df_train_categorical = pd.read_excel('./Data/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx').set_index("participant_id")
df_train_functional = pd.read_csv('./Data/TRAIN/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv').set_index("participant_id")
df_train_quantitative = pd.read_excel('./Data/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx').set_index("participant_id")
df_training_solutions = pd.read_excel('./Data/TRAIN/TRAINING_SOLUTIONS.xlsx').set_index("participant_id")

# Test-set counterparts of the three feature tables (no solutions file is loaded for the test set)
df_test_categorical = pd.read_excel('./Data/TEST/TEST_CATEGORICAL.xlsx').set_index("participant_id")
df_test_functional = pd.read_csv('./Data/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv').set_index("participant_id")
df_test_quantitative = pd.read_excel('./Data/TEST/TEST_QUANTITATIVE_METADATA.xlsx').set_index("participant_id")

In [None]:
# Plot a histogram of the functional connectivity values.
# .values.flatten() pools every cell of the training connectome table into one 1-D array,
# so the distribution is over all participants and all columns together.
plt.figure(figsize=(10, 5))
sns.histplot(df_train_functional.values.flatten(), bins=50, kde=True)
plt.xlabel("Connectivity Strength")
plt.ylabel("Frequency")
plt.title("Distribution of Functional Connectivity Values")
plt.show()

In [None]:
# Rebuild a square connectivity matrix from the first participant's flattened row
# and visualise it.
# NOTE(review): assumes the row stores the upper triangle (diagonal excluded) in the
# same order np.triu_indices produces — confirm against the dataset description.
values = df_train_functional.iloc[0, :].values

# An n x n symmetric matrix without its diagonal has n * (n - 1) / 2 upper-triangle
# entries, so n is recovered from the vector length instead of being hard-coded
# (19900 values -> 200 regions).
n = (1 + int(round(np.sqrt(1 + 8 * values.shape[0])))) // 2
if n * (n - 1) // 2 != values.shape[0]:
    # Fail with a clear message; previously `matrix` was left undefined in this
    # case and the cell died further down with a NameError.
    raise ValueError(
        f"Expected n*(n-1)/2 upper-triangle values, got {values.shape[0]} values"
    )

matrix = np.zeros((n, n))  # Create empty square matrix

# Get upper triangle indices (k=1 skips the diagonal)
triu_indices = np.triu_indices(n, k=1)

# Fill the upper triangle
matrix[triu_indices] = values

# Mirror to make it symmetric (the diagonal stays 0, so nothing is double counted)
matrix = matrix + matrix.T

# Sanity check: true by construction after the mirroring step above
print("Is the matrix symmetric?", np.allclose(matrix, matrix.T, atol=1e-5))

# Plot heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(matrix, cmap="coolwarm", center=0)
plt.title("Functional Connectivity Matrix")
plt.xlabel("Brain Region")
plt.ylabel("Brain Region")
plt.show()

# Plot histogram of values (includes the zero diagonal and each pair twice)
plt.figure(figsize=(8, 5))
sns.histplot(matrix.flatten(), bins=50, kde=True)
plt.title("Distribution of Functional Connectivity Values")
plt.xlabel("Connectivity Strength")
plt.ylabel("Frequency")
plt.show()

In [None]:
# Work on copies (data_*) so the frames loaded from disk (df_*) stay untouched
# and later cells can be re-run without re-reading the files.
data_train_categorical=df_train_categorical.copy()
data_train_functional=df_train_functional.copy()
data_train_quantitative=df_train_quantitative.copy()
data_training_solutions=df_training_solutions.copy()

# Same for the test-set tables
data_test_categorical=df_test_categorical.copy()
data_test_functional=df_test_functional.copy()
data_test_quantitative=df_test_quantitative.copy()

In [None]:
def display_head_with_scroll(data):
    """Show the first rows of `data` as an HTML table inside a horizontally scrollable box."""
    head_table = data.head().to_html()

    # The wrapping div keeps wide tables on a single line and adds a horizontal scrollbar
    display(HTML(f'<div style="overflow-x: auto; white-space: nowrap;">{head_table}</div>'))

def columns_containing_null(df):
    """Print the columns of `df` that hold at least one null, with their null counts.

    Nothing is returned; the filtered per-column counts are written to stdout.
    """
    nulls_per_column = df.isnull().sum()

    # Keep only the columns whose null count is strictly positive
    affected_columns = nulls_per_column.loc[nulls_per_column > 0]

    print("\nColumns containing null and how many values are null:\n" + str(affected_columns) + "\n")

def summary_statistics(data):
    """Display describe() for the numeric columns of `data`, one row per column, in a scrollable box."""
    numeric_columns = data.select_dtypes('number').columns

    # Transposed so that each feature is a row and each statistic a column
    stats_table = data[numeric_columns].describe().T.to_html()

    display(HTML(f'<div style="overflow-x: auto; white-space: nowrap;">{stats_table}</div>'))
    
def display_data(data, title = None):
    """Display an overview of `data`: optional title, head, shape, info, null counts and statistics.

    Parameters:
        data: DataFrame to summarise.
        title: optional heading rendered as an <h3> element above the overview.
    """
    if title is not None:
        # Render the title as an h3 header (closing tag fixed to match the opening tag)
        display(HTML(f'<h3>{title}</h3>'))

    display_head_with_scroll(data)

    # Shape of the dataset as (number of rows, number of columns)
    print("Shape:" + str(data.shape) + "\n")

    # Data types and non-null counts of the frame being displayed.
    # This used to call data_train_categorical.info(), which printed the
    # categorical training table's info no matter which frame was passed in.
    data.info()

    # Columns that contain nulls, with their null counts
    columns_containing_null(data)

    # Summary statistics of the numeric columns
    summary_statistics(data)

def get_participant_ids_with_nulls(df):
    """Return, as a list, the index labels of the rows of `df` that contain at least one null.

    In this notebook the index is participant_id, so the result is a list of participant ids.
    """
    # True for every row in which any column is null
    row_has_null = df.isnull().any(axis=1)

    return df.index[row_has_null.to_numpy()].tolist()
    
def display_shape_and_null_data(data, title = None):
    """Print the shape of `data` and how many of its rows contain at least one null.

    Parameters:
        data: DataFrame to inspect.
        title: optional heading rendered as an <h3> element above the printout.
    """
    if title is not None:
        # Render the title as an h3 header (closing tag fixed to match the opening tag)
        display(HTML(f'<h3>{title}</h3>'))

    # Shape of the dataset as (number of rows, number of columns)
    print("Shape:" + str(data.shape) + "\n")

    # Count the rows (participants) that still hold a null in any column.
    # The label used to say "columns", but the number has always been a row count.
    rows_with_nulls = int(data.isnull().any(axis=1).sum())
    print("Number of rows with null value:" + str(rows_with_nulls))

In [None]:
# Overview (head, shape, null counts, summary statistics) of the training categorical metadata
display_data(data_train_categorical, "TRAIN_CATEGORICAL_METADATA.xlsx")

In [None]:
#display_data(data_train_functional, "TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv")

In [None]:
# Overview (head, shape, null counts, summary statistics) of the training quantitative metadata
display_data(data_train_quantitative, "TRAIN_QUANTITATIVE_METADATA.xlsx")

In [None]:
# Overview of the label table, followed by the distinct values of each of the two targets
display_data(data_training_solutions, "TRAINING_SOLUTIONS.xlsx")
print("\nList of possible values for 'ADHD_Outcome': " + str(data_training_solutions['ADHD_Outcome'].unique()))
print("\nList of possible values for 'Sex_F': " + str(data_training_solutions['Sex_F'].unique()))

### **Observations**
- Two values are possible for '**ADHD_Outcome**': **1** and **0**. (**1** = **Yes** and **0** = **No**)
- Two values are possible for '**Sex_F**': **1** and **0**. (**1** means **Female**, and **0** means **Male**)

In [None]:
# Overview (head, shape, null counts, summary statistics) of the test categorical metadata
display_data(data_test_categorical, "TEST_CATEGORICAL.xlsx")

In [None]:
#display_data(data_test_functional, "TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv")

In [None]:
# Overview (head, shape, null counts, summary statistics) of the test quantitative metadata
display_data(data_test_quantitative, "TEST_QUANTITATIVE_METADATA.xlsx")

In [None]:
# Column dtypes and non-null counts of the training categorical metadata
data_train_categorical.info()

In [None]:
# Per training table: participant_ids of the rows that have at least one null column
participant_ids_with_null_categorical = get_participant_ids_with_nulls(data_train_categorical)
participant_ids_with_null_quantitative = get_participant_ids_with_nulls(data_train_quantitative)
participant_ids_with_null_solutions = get_participant_ids_with_nulls(data_training_solutions)

# Same for the two test metadata tables
participant_ids_with_null_test_categorical = get_participant_ids_with_nulls(data_test_categorical)
participant_ids_with_null_test_quantitative = get_participant_ids_with_nulls(data_test_quantitative)

# Union of the five lists: duplicates collapse, and the order of the resulting list is unspecified
aggregate_participant_ids_with_null = list(
    set().union(
        participant_ids_with_null_categorical,
        participant_ids_with_null_quantitative,
        participant_ids_with_null_solutions,
        participant_ids_with_null_test_categorical,
        participant_ids_with_null_test_quantitative,
    )
)

## **Data Preprocessing**

### **Remove null items**
Remove rows from all dataframes corresponding to any participant_id that has at least one null column.

In [None]:
def remove_items_based_on_participant_id_list(participant_id_list, df):
    """Return a copy of `df` without the listed participants and without any row containing a null.

    Parameters:
        participant_id_list: index labels (participant ids) to exclude.
        df: DataFrame indexed by participant_id; it is not modified.
    """
    # Rows whose index label is NOT in the exclusion list
    keep_row = ~df.index.isin(participant_id_list)

    # dropna() additionally removes rows that still hold a null in any column
    return df.loc[keep_row].dropna()

# Drop the participants collected in aggregate_participant_ids_with_null (and any row
# still holding a null) from the training categorical metadata, then report the result
data_train_categorical = remove_items_based_on_participant_id_list(aggregate_participant_ids_with_null, data_train_categorical)
display_shape_and_null_data(data_train_categorical, "TRAIN_CATEGORICAL_METADATA.xlsx")

In [None]:
# Same filtering for the training quantitative metadata
data_train_quantitative = remove_items_based_on_participant_id_list(aggregate_participant_ids_with_null, data_train_quantitative)
display_shape_and_null_data(data_train_quantitative, "TRAIN_QUANTITATIVE_METADATA.xlsx")

In [None]:
# Same filtering for the label table, so labels exist only for the retained participants
data_training_solutions = remove_items_based_on_participant_id_list(aggregate_participant_ids_with_null, data_training_solutions)
display_shape_and_null_data(data_training_solutions, "TRAINING_SOLUTIONS.xlsx")

In [None]:
# Same filtering for the test categorical metadata.
# NOTE(review): this removes test participants that have missing values, so no prediction
# can be produced for them later — confirm that dropping (rather than imputing) is intended.
data_test_categorical = remove_items_based_on_participant_id_list(aggregate_participant_ids_with_null, data_test_categorical)
display_shape_and_null_data(data_test_categorical, "TEST_CATEGORICAL.xlsx")

In [None]:
# Same filtering for the test quantitative metadata.
# NOTE(review): test participants with missing values are dropped here as well — confirm intended.
data_test_quantitative = remove_items_based_on_participant_id_list(aggregate_participant_ids_with_null, data_test_quantitative)
display_shape_and_null_data(data_test_quantitative, "TEST_QUANTITATIVE_METADATA.xlsx")

### TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv

In [None]:
# Keep the training connectome rows in step with the filtered metadata tables
# (also drops any connectome row that itself contains a null)
data_train_functional = remove_items_based_on_participant_id_list(aggregate_participant_ids_with_null, data_train_functional)

### TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv

In [None]:
# Keep the test connectome rows in step with the filtered test metadata tables
# (also drops any connectome row that itself contains a null)
data_test_functional = remove_items_based_on_participant_id_list(aggregate_participant_ids_with_null, data_test_functional)

### **Aggregate training data and unseen data**
- Aggregate training data into data, and aggregate test data into unseen data
- Define label data

In [None]:
# Training features: quantitative + categorical metadata, joined on the participant_id index
data_combined = pd.merge(data_train_quantitative, data_train_categorical, left_index=True, right_index=True, how="left")
# Snapshot of the metadata-only frame (before the connectome columns are added),
# used by the exploration cells that follow
data_combined_without_functional=data_combined.copy()
data_combined = pd.merge(data_combined, data_train_functional, left_index=True, right_index=True, how="left")

# Align the labels to the feature rows by participant_id. train_test_split pairs X and y
# by position, so a solutions file stored in a different row order would otherwise
# mislabel samples silently. .loc raises KeyError if a feature row has no label.
labels = data_training_solutions.loc[data_combined.index].copy()

# Unseen (test) features, built with the same sequence of joins
unseen_data_combined = pd.merge(data_test_quantitative, data_test_categorical, left_index=True, right_index=True, how="left")
unseen_data_combined = pd.merge(unseen_data_combined, data_test_functional, left_index=True, right_index=True, how="left")

In [None]:
# Shape of the merged training feature table; the full overview is left disabled below
#display_data(data_combined, "DATA COMBINED")
data_combined.shape

In [None]:
# Shape of the merged test feature table; the full overview is left disabled below
#display_data(unseen_data_combined, "UNSEEN DATA COMBINED")
unseen_data_combined.shape

In [None]:
# Shape of the label table; its row count should match data_combined
labels.shape

## **Data Exploration**

### **Numerical and Categorical Columns**

In [None]:
# Identify numerical and categorical columns of the metadata-only frame.
# Only int64/float64 count as numerical and only object dtype as categorical;
# columns of any other dtype end up in neither list.
numerical_cols_without_functional = data_combined_without_functional.select_dtypes(include=['int64', 'float64']).columns
categorical_cols_without_functional = data_combined_without_functional.select_dtypes(include=['object']).columns

print("Numerical Columns: ", numerical_cols_without_functional)
print("Categorical Columns: ", categorical_cols_without_functional)

In [None]:
# Same dtype-based split for the full feature table (metadata + connectome columns)
numerical_cols = data_combined.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = data_combined.select_dtypes(include=['object']).columns

### **Correlation Analysis**
For numerical features, it’s important to explore how they correlate with each other and with the target variables. This will help identify potentially strong predictors and check for multicollinearity.

In [None]:
def corelation_analysis(data_combined, labels, numerical_cols, first_half = None):
    """Plot a correlation heatmap of numerical features together with both target columns.

    Parameters:
        data_combined: feature DataFrame indexed by participant_id.
        labels: DataFrame with the "ADHD_Outcome" and "Sex_F" columns, joined on the index.
        numerical_cols: names of the numerical feature columns to consider.
        first_half: None plots all columns; True plots the first half of them;
            any other value plots the second half.
    """
    features_and_targets = pd.merge(data_combined, labels, left_index=True, right_index=True, how="left")
    selected_cols = list(numerical_cols)

    # Optionally restrict to one half of the columns to keep the heatmap readable
    if first_half is not None:
        midpoint = int(len(selected_cols)/2)
        selected_cols = selected_cols[:midpoint] if first_half is True else selected_cols[midpoint:]

    # The two targets are always appended so their correlations appear in the matrix
    selected_cols = selected_cols + ["ADHD_Outcome", "Sex_F"]

    correlation_matrix = features_and_targets[selected_cols].corr()

    plt.figure(figsize=(12, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f')
    plt.title("Correlation Matrix of Numerical Features")
    plt.show()

# Two heatmaps: first half of the metadata numerical columns, then the second half
for first_half in [True, False]:
    corelation_analysis(data_combined_without_functional, labels, numerical_cols_without_functional, first_half)

### **Observations**
The following predictors are not correlated with either **ADHD_Outcome** or **Sex_F**, and therefore can be dropped:
"**Basic_Demos_Study_Site**", "**MRI_Track_Scan_Location**", "**PreInt_Demos_Fam_Child_Ethnicity**", "**PreInt_Demos_Fam_Child_Race**", "**Barratt_Barratt_P1_Occ**", "**Barratt_Barratt_P2_Occ**".

### **Visualizing the Data**
Visualizing the relationship between features and target variables can provide deeper insights. Here are a few ideas for visualizations:

- Boxplots/Histograms: To explore the distribution of numerical features.
- Count Plots: For categorical variables, like Sex_F and ADHD_Outcome.

In [None]:
# Histograms of every numerical metadata feature
data_combined_without_functional[numerical_cols_without_functional].hist(bins=20, figsize=(12, 8))
plt.suptitle("Histograms of Numerical Features")
plt.show()

# One count plot per target column of the labels DataFrame
for target_column, plot_title in (
    ('ADHD_Outcome', 'Distribution of ADHD Outcome'),
    ('Sex_F', 'Distribution of Sex (Female = 1, Male = 0)'),
):
    sns.countplot(x=target_column, data=labels)
    plt.title(plot_title)
    plt.show()

## **Data Processing**

### **Feature Engineering**
Drop columns that are not correlated with the target variables, based on the observations in the correlation matrices.

In [None]:
# Columns judged uncorrelated with both targets in the correlation matrices above
drop_cols = [
    "Basic_Demos_Study_Site", "MRI_Track_Scan_Location", "PreInt_Demos_Fam_Child_Ethnicity",
    "PreInt_Demos_Fam_Child_Race", 'Barratt_Barratt_P1_Occ', 'Barratt_Barratt_P2_Occ'
]

# Non-inplace drop with errors="ignore" keeps this cell re-runnable: the previous
# inplace drop raised KeyError on a second run because the columns were already gone.
data_combined = data_combined.drop(columns=drop_cols, errors="ignore")
unseen_data_combined = unseen_data_combined.drop(columns=drop_cols, errors="ignore")

# Remove the dropped names while preserving the original column order
# (a set difference would return the columns in arbitrary order)
numerical_cols = [col for col in numerical_cols if col not in drop_cols]

### **Treating Outliers**
Check for outliers and, if any are found, handle them as follows:
Calculate the lower whisker and upper whisker for each numerical column, then replace any value smaller than the lower whisker with the lower whisker and any value larger than the upper whisker with the upper whisker. Only the number of outliers is printed.

In [None]:
def treating_outliers(data, num_columns):
    """Clip each listed column of `data` to its whisker range and report how many values were outside.

    Parameters:
        data: DataFrame to process; the listed columns are overwritten in place.
        num_columns: names of the numerical columns to treat.

    Returns:
        (data, outliers_info), where outliers_info maps each column name to a dict holding
        the rows below/above the whiskers and the two corresponding counts.

    NOTE(review): get_whiskers_by_column is not defined in the visible part of this
    notebook — confirm it exists before re-enabling the call to this function.
    """
    outliers_info = {}
    total_outliers = 0

    for num_column in num_columns:
        lower_whisker, upper_whisker = get_whiskers_by_column(data, num_column)

        # Rows lying outside the whiskers, captured before the column is clipped
        column_values = data[num_column]
        lower_outliers = data[column_values < lower_whisker]
        upper_outliers = data[column_values > upper_whisker]
        num_lower_outliers, num_upper_outliers = len(lower_outliers), len(upper_outliers)

        total_outliers += num_lower_outliers + num_upper_outliers

        # Only columns that actually have outliers are reported
        if num_lower_outliers > 0 or num_upper_outliers > 0:
            print(f"Outliers for column '{num_column}':")
            print(f"  - Number of lower outliers: {num_lower_outliers}")
            print(f"  - Number of upper outliers: {num_upper_outliers}")

        # Values below the lower whisker become the lower whisker, values above the
        # upper whisker become the upper whisker
        data[num_column] = column_values.clip(lower=lower_whisker, upper=upper_whisker)

        outliers_info[num_column] = dict(
            lower_outliers=lower_outliers,
            upper_outliers=upper_outliers,
            num_lower_outliers=num_lower_outliers,
            num_upper_outliers=num_upper_outliers,
        )

    print("\nTotal number of outliers across all columns:", total_outliers)
    return data, outliers_info

# num_columns = [col for col in data_combined.select_dtypes(include=['float64', 'int64']).columns]
# data_combined, outliers_info = treating_outliers(data_combined, num_columns)

### **Observations**
Since 467 outliers were found out of 845 items in the total data, we decided to leave the outliers in, as they represent more than half of the data.

### **Split data into training and testing**
- Split the data into 80% training and 20% testing, once per target: the split for the ADHD model is stratified on 'ADHD_Outcome', and the split for the sex model is stratified on 'Sex_F' (male or female).
- Set a random_state, so that it will generate the same random output each time. This controls the seed for the random number generator.

In [None]:
# X holds the features and Y the label table (plain aliases, no copy is made)
X = data_combined
Y = labels

# One target series per prediction task
labels_adhd = labels["ADHD_Outcome"]
labels_sex = labels["Sex_F"]

# 80/20 split for the ADHD model, stratified on ADHD_Outcome
x_train_adhd, x_test_adhd, y_train_adhd, y_test_adhd = train_test_split(
    X, labels_adhd, test_size = 0.2, random_state = 1, stratify = labels_adhd
)

# 80/20 split for the sex model, stratified on Sex_F
x_train_sex, x_test_sex, y_train_sex, y_test_sex = train_test_split(
    X, labels_sex, test_size = 0.2, random_state = 1, stratify = labels_sex
)

In [None]:
def display_dataframe_shapes():
    """Print the shape of every train/test split created for the two targets."""
    # Insertion order of the dict fixes the order of the printed lines
    named_splits = {
        'x_train_adhd': x_train_adhd,
        'x_test_adhd': x_test_adhd,
        'y_train_adhd': y_train_adhd,
        'y_test_adhd': y_test_adhd,
        'x_train_sex': x_train_sex,
        'x_test_sex': x_test_sex,
        'y_train_sex': y_train_sex,
        'y_test_sex': y_test_sex,
    }
    for name, split in named_splits.items():
        print(f"{name}.shape: {split.shape}")

display_dataframe_shapes()

## **Model Building**

### **Logistic Regression**

In [None]:
def generic_regression(x_train, y_train, x_test, y_test, target_names, model):
    """Fit `model` on the training split and report its performance on the test split.

    Prints the accuracy, the classification report and the confusion matrix, then
    plots the confusion matrix as a heatmap. Nothing is returned.

    Parameters:
        x_train, y_train: training features and labels.
        x_test, y_test: held-out features and labels used for evaluation.
        target_names: display names of the classes, used in the report and on the plot axes.
        model: estimator exposing fit() and predict().
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Text metrics
    print("Accuracy:", accuracy_score(y_test, y_pred))
    report = classification_report(y_test, y_pred, target_names=target_names)
    print("Classification Report:\n", report)
    cm = confusion_matrix(y_test, y_pred)
    print("Confusion Matrix:\n", cm)

    # Confusion matrix heatmap, with class names on both axes
    heatmap_options = dict(
        annot=True, fmt='d', cmap='Blues', cbar=False, annot_kws={'size': 16},
        xticklabels=target_names, yticklabels=target_names,
    )
    plt.figure(figsize=(6, 6))
    sns.heatmap(cm, **heatmap_options)
    plt.title("Confusion Matrix")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()


def logistic_regression(x_train, y_train, x_test, y_test, target_names):
    """Train and evaluate a LogisticRegression (max_iter=1000) via generic_regression."""
    generic_regression(
        x_train, y_train, x_test, y_test, target_names,
        model=LogisticRegression(max_iter=1000),
    )

# Logistic regression for the ADHD target
display(Markdown('### ADHD_Outcome'))
logistic_regression(x_train_adhd, y_train_adhd, x_test_adhd, y_test_adhd, target_names=['No', 'Yes'])

# Logistic regression for the sex target
display(Markdown('### Sex_F'))
logistic_regression(x_train_sex, y_train_sex, x_test_sex, y_test_sex, target_names=['Male', 'Female'])

### **Random Forest**

In [None]:
def random_forest(x_train, y_train, x_test, y_test, target_names):
    """Train and evaluate a RandomForestClassifier (100 trees, random_state=42) via generic_regression."""
    generic_regression(
        x_train, y_train, x_test, y_test, target_names,
        model=RandomForestClassifier(n_estimators=100, random_state=42),
    )

# Random Forest for the ADHD target
display(Markdown('### ADHD_Outcome'))
random_forest(x_train_adhd, y_train_adhd, x_test_adhd, y_test_adhd, target_names=['No', 'Yes'])

# Random Forest for the sex target
display(Markdown('### Sex_F'))
random_forest(x_train_sex, y_train_sex, x_test_sex, y_test_sex, target_names=['Male', 'Female'])


In [None]:
### **XGBoost**

In [None]:
def xgboost_model(x_train, y_train, x_test, y_test, target_names):
    """Train and evaluate an XGBClassifier (100 trees, random_state=42) via generic_regression."""
    generic_regression(
        x_train, y_train, x_test, y_test, target_names,
        model=xgb.XGBClassifier(n_estimators=100, random_state=42),
    )

# XGBoost for the ADHD target
display(Markdown('### ADHD_Outcome'))
xgboost_model(x_train_adhd, y_train_adhd, x_test_adhd, y_test_adhd, target_names=['No', 'Yes'])

# XGBoost for the sex target
display(Markdown('### Sex_F'))
xgboost_model(x_train_sex, y_train_sex, x_test_sex, y_test_sex, target_names=['Male', 'Female'])
