# WiDS Datathon 2025 - Unraveling the Mysteries of the Female Brain: Sex Patterns in ADHD

## **Import the necessary libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.cbook import boxplot_stats
import seaborn as sns

# To scale the data using z-score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Algorithms to use
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Metrics to evaluate the model
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report,recall_score,precision_score, accuracy_score

# For tuning the model
from sklearn.model_selection import GridSearchCV
from IPython.display import display, HTML

# To ignore warnings
import warnings
warnings.filterwarnings("ignore")

## **Data Overview**

- Reading the dataset
- Understanding the shape of the dataset
- Checking the data types
- Checking for missing values

In [2]:
# Read each metadata workbook and use participant_id as the row index.
# Training tables
df_train_categorical = (
    pd.read_excel('./Data/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx')
    .set_index("participant_id")
)
# The functional connectome matrices are not loaded at the moment:
#df_train_functional = pd.read_csv('./Data/TRAIN/TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv').set_index("participant_id")
df_train_quantitative = (
    pd.read_excel('./Data/TRAIN/TRAIN_QUANTITATIVE_METADATA.xlsx')
    .set_index("participant_id")
)
df_training_solutions = (
    pd.read_excel('./Data/TRAIN/TRAINING_SOLUTIONS.xlsx')
    .set_index("participant_id")
)

# Test (unseen) tables
df_test_categorical = (
    pd.read_excel('./Data/TEST/TEST_CATEGORICAL.xlsx')
    .set_index("participant_id")
)
#df_test_functional = pd.read_csv('./Data/TEST/TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv').set_index("participant_id")
df_test_quantitative = (
    pd.read_excel('./Data/TEST/TEST_QUANTITATIVE_METADATA.xlsx')
    .set_index("participant_id")
)

FileNotFoundError: [Errno 2] No such file or directory: './Data/TRAIN/TRAIN_CATEGORICAL_METADATA.xlsx'

In [None]:
# Work on deep copies so the frames loaded from disk are never modified
data_train_categorical = df_train_categorical.copy(deep=True)
#data_train_functional = df_train_functional.copy(deep=True)
data_train_quantitative = df_train_quantitative.copy(deep=True)
data_training_solutions = df_training_solutions.copy(deep=True)

data_test_categorical = df_test_categorical.copy(deep=True)
#data_test_functional = df_test_functional.copy(deep=True)
data_test_quantitative = df_test_quantitative.copy(deep=True)

In [None]:
def display_head_with_scroll(data):
    """Show the first rows of ``data`` (``DataFrame.head()``) as an HTML table.

    The table is wrapped in a ``<div>`` with horizontal scrolling so wide
    frames stay readable in the notebook output.
    """
    head_table = data.head().to_html()
    display(HTML(f'<div style="overflow-x: auto; white-space: nowrap;">{head_table}</div>'))

def columns_containing_null(df):
    """Print, for every column of ``df`` holding nulls, how many values are null."""
    # Per-column null counts, narrowed to the columns that have at least one
    nulls_per_column = df.isna().sum()
    only_affected = nulls_per_column.loc[nulls_per_column.gt(0)]

    print(f"\nColumns containing null and how many values are null:\n{only_affected}\n")

def summary_statistics(data):
    """Display descriptive statistics of the numeric columns of ``data``.

    The ``describe()`` output is transposed (one row per column) and rendered
    as an HTML table inside a horizontally scrollable ``<div>``.
    """
    numeric_columns = data.select_dtypes('number').columns
    stats_table = data[numeric_columns].describe().T.to_html()
    display(HTML(f'<div style="overflow-x: auto; white-space: nowrap;">{stats_table}</div>'))
    
def display_data(data, title = None):
    """Display an overview of ``data``: head, shape, dtypes, nulls and statistics.

    Parameters:
        data: the DataFrame to describe.
        title: optional heading shown (as an ``<h3>``) above the overview.
    """
    if title is not None:
        # Display the title as an h3 header
        display(HTML(f'<h3>{title}</h3>'))

    display_head_with_scroll(data)

    # Get shape of the dataset in terms of number of rows and number of columns
    print("Shape:" + str(data.shape) + "\n")

    # Check the data types of the frame that was passed in; the previous
    # version always reported on the global data_train_categorical instead
    data.info()

    # Check for null values
    columns_containing_null(data)

    # Summary statistics
    summary_statistics(data)

def get_participant_ids_with_nulls(df):
    """Return the index labels of the rows of ``df`` that contain any null.

    The frames in this notebook are indexed by participant_id, so the result
    is the list of affected participant ids (in row order).
    """
    has_null = df.isnull().any(axis=1)
    return df.index[has_null].tolist()
    
def display_shape_and_null_data(data, title = None):
    """Print the shape of ``data`` and how many of its rows contain a null.

    Parameters:
        data: the DataFrame to report on.
        title: optional heading shown (as an ``<h3>``) above the report.
    """
    if title is not None:
        # Display the title as an h3 header
        display(HTML(f'<h3>{title}</h3>'))

    # Get shape of the dataset in terms of number of rows and number of columns
    print("Shape:" + str(data.shape) + "\n")

    # Count the rows (participants) that have at least one null value; the
    # label previously said "columns" although the number is a row count
    rows_with_nulls = int(data.isnull().any(axis=1).sum())
    print("Number of rows with at least one null value:" + str(rows_with_nulls))

In [None]:
# Overview of the training categorical metadata
display_data(data_train_categorical, title="TRAIN_CATEGORICAL_METADATA.xlsx")

In [None]:
#display_data(data_train_functional, "TRAIN_FUNCTIONAL_CONNECTOME_MATRICES.csv")

In [None]:
# Overview of the training quantitative metadata
display_data(data_train_quantitative, title="TRAIN_QUANTITATIVE_METADATA.xlsx")

In [None]:
# Overview of the training labels
display_data(data_training_solutions, title="TRAINING_SOLUTIONS.xlsx")

In [None]:
# Overview of the test categorical metadata
display_data(data_test_categorical, title="TEST_CATEGORICAL.xlsx")

In [None]:
#display_data(data_test_functional, "TEST_FUNCTIONAL_CONNECTOME_MATRICES.csv")

In [None]:
# Overview of the test quantitative metadata
display_data(data_test_quantitative, title="TEST_QUANTITATIVE_METADATA.xlsx")

In [None]:
# Print the column dtypes and non-null counts of the training categorical metadata
data_train_categorical.info()

In [None]:
# For each of the five tables, list the participant_ids whose row has at least one null
participant_ids_with_null_categorical = get_participant_ids_with_nulls(data_train_categorical)
participant_ids_with_null_quantitative = get_participant_ids_with_nulls(data_train_quantitative)
participant_ids_with_null_solutions = get_participant_ids_with_nulls(data_training_solutions)

participant_ids_with_null_test_categorical = get_participant_ids_with_nulls(data_test_categorical)
participant_ids_with_null_test_quantitative = get_participant_ids_with_nulls(data_test_quantitative)

# Union of the five lists; going through sets removes duplicate ids
aggregate_participant_ids_with_null = list(
    set(participant_ids_with_null_categorical)
    | set(participant_ids_with_null_quantitative)
    | set(participant_ids_with_null_solutions)
    | set(participant_ids_with_null_test_categorical)
    | set(participant_ids_with_null_test_quantitative)
)

## **Data Preprocessing**

### **Remove null items**
Remove the rows of every dataframe that correspond to any participant_id with at least one null column.

In [None]:
def remove_items_based_on_participant_id_list(participant_id_list, df):
    """Return ``df`` without the listed participants and without any null rows.

    Parameters:
        participant_id_list: index labels (participant_ids) to exclude.
        df: DataFrame indexed by participant_id; it is not modified.

    Returns:
        A new DataFrame holding the remaining rows that have no null values.
    """
    # Keep only the rows whose index label is not in the exclusion list
    keep_mask = ~df.index.isin(participant_id_list)
    remaining = df.loc[keep_mask]

    # Then drop every row that still has a null in any column
    return remaining.dropna(axis=0, how="any")

# Filter the training categorical metadata, then report what is left
data_train_categorical = remove_items_based_on_participant_id_list(
    participant_id_list=aggregate_participant_ids_with_null,
    df=data_train_categorical,
)
display_shape_and_null_data(data_train_categorical, title="TRAIN_CATEGORICAL_METADATA.xlsx")

In [None]:
# Filter the training quantitative metadata, then report what is left
data_train_quantitative = remove_items_based_on_participant_id_list(
    participant_id_list=aggregate_participant_ids_with_null,
    df=data_train_quantitative,
)
display_shape_and_null_data(data_train_quantitative, title="TRAIN_QUANTITATIVE_METADATA.xlsx")

In [None]:
# Filter the training labels, then report what is left
data_training_solutions = remove_items_based_on_participant_id_list(
    participant_id_list=aggregate_participant_ids_with_null,
    df=data_training_solutions,
)
display_shape_and_null_data(data_training_solutions, title="TRAINING_SOLUTIONS.xlsx")

In [None]:
# Filter the test categorical metadata, then report what is left
# NOTE(review): this drops test participants that have missing values, so no
# features remain for them - confirm that is acceptable for the final predictions
data_test_categorical = remove_items_based_on_participant_id_list(
    participant_id_list=aggregate_participant_ids_with_null,
    df=data_test_categorical,
)
display_shape_and_null_data(data_test_categorical, title="TEST_CATEGORICAL.xlsx")

In [None]:
# Filter the test quantitative metadata, then report what is left
# NOTE(review): this drops test participants that have missing values, so no
# features remain for them - confirm that is acceptable for the final predictions
data_test_quantitative = remove_items_based_on_participant_id_list(
    participant_id_list=aggregate_participant_ids_with_null,
    df=data_test_quantitative,
)
display_shape_and_null_data(data_test_quantitative, title="TEST_QUANTITATIVE_METADATA.xlsx")

### **Aggregate training data and unseen data**
- Aggregate training data into data, and aggregate test data into unseen data
- Define label data

In [None]:
# Training features: quantitative columns, with the categorical columns
# attached by participant_id (left join on the index)
data_combined = data_train_quantitative.merge(
    data_train_categorical, how="left", left_index=True, right_index=True
)

# Labels are kept in their own frame (copied so later edits do not touch the source)
labels = data_training_solutions.copy()

# Unseen (test) features, combined the same way
unseen_data_combined = data_test_quantitative.merge(
    data_test_categorical, how="left", left_index=True, right_index=True
)

In [None]:
# Shape (rows, columns) of the combined training features; for the full
# overview call display_data(data_combined, "DATA COMBINED") instead
data_combined.shape

In [None]:
# Shape (rows, columns) of the combined unseen features; for the full
# overview call display_data(unseen_data_combined, "UNSEEN DATA COMBINED") instead
unseen_data_combined.shape

In [None]:
# Shape (rows, columns) of the label frame
labels.shape

## **Data Exploration**

### **Numerical and Categorical Columns**

In [None]:
# Split the column names of the combined features by dtype
numerical_cols = (
    data_combined
    .select_dtypes(include=['int64', 'float64'])
    .columns
)
categorical_cols = (
    data_combined
    .select_dtypes(include=['object'])
    .columns
)

print("Numerical Columns: ", numerical_cols)
print("Categorical Columns: ", categorical_cols)

### **Correlation Analysis**
For numerical features, it’s important to explore how they correlate with each other and with the target variables. This will help identify potentially strong predictors and check for multicollinearity.

In [None]:
# Pairwise correlations between the numerical features, drawn as an annotated heatmap
correlation_matrix = data_combined[numerical_cols].corr()

plt.figure(figsize=(12, 8))
sns.heatmap(
    correlation_matrix,
    annot=True,   # write the coefficient into every cell
    fmt='.2f',
    cmap='coolwarm',
)
plt.title("Correlation Matrix of Numerical Features")
plt.show()

### **Visualizing the Data**
Visualizing the relationship between features and target variables.

In [None]:
# One histogram per numerical feature
data_combined[numerical_cols].hist(figsize=(12, 8), bins=20)
plt.suptitle("Histograms of Numerical Features")
plt.show()

# Class balance of the ADHD_Outcome label
sns.countplot(data=labels, x='ADHD_Outcome')
plt.title('Distribution of ADHD Outcome')
plt.show()

# Class balance of the Sex_F label
sns.countplot(data=labels, x='Sex_F')
plt.title('Distribution of Sex (Female = 1, Male = 0)')
plt.show()

## **Split data into training and testing**
- Split the data into 70% training and 30% testing, and stratify based on male or female ('Sex_F') from the labels variable. 
- Set a random_state, so that it will generate the same random output each time. This controls the seed for the random number generator.

In [None]:
# Features (X) and labels (Y).
# train_test_split pairs the rows of X and Y by position, not by index, so the
# labels are first put into the row order of the features by participant_id.
# .loc raises a KeyError if a participant in X has no label, instead of the
# rows being silently mismatched.
X = data_combined
Y = labels.loc[X.index]

# 70% / 30% split, stratified on Sex_F; random_state makes the split reproducible
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, random_state=1, stratify=Y['Sex_F']
)

# Split labels into "ADHD_Outcome" and "Sex_F"
y_train_adhd = y_train["ADHD_Outcome"]
y_train_sex = y_train["Sex_F"]
y_test_adhd = y_test["ADHD_Outcome"]
y_test_sex = y_test["Sex_F"]

In [None]:
# Shape (rows, columns) of the training features
x_train.shape

In [None]:
# Shape (rows, columns) of the held-out features
x_test.shape

In [None]:
# Shape (rows, columns) of the training labels
y_train.shape

In [None]:
# Shape (rows, columns) of the held-out labels
y_test.shape