### Imports

In [7]:
# imports

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [8]:
# import, preprocess, and look over raw data

df = pd.read_csv("data/data_11_26_2024.csv")
df = df.dropna(subset=['DatasetID'])
df.replace("n/a", "", inplace=True)
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)
df = df.applymap(lambda x: x.lower() if isinstance(x, str) else x)
df.rename(columns={'iEEG modality': 'iEEG Modality'}, inplace=True)
# df.set_index('DatasetID', inplace=True)

print(f'df shape: {df.shape}')
df.tail()

df shape: (303, 70)



DataFrame.applymap has been deprecated. Use DataFrame.map instead.


DataFrame.applymap has been deprecated. Use DataFrame.map instead.



Unnamed: 0,intern,DatasetID,Warning Message,Error Message,has_visualization,No papers linked - hard to find other data,"Multiple Papers linked, none match NEMAR",HED anotation,Participants,Age range,...,Published date,Uploaded by,Date last update,Dataset DOI,References and links,License,On Brainlife.org,Funding,Ethics Approvals,IRB
301,liz,ds003483,,,False,,,no,21,,...,2021-01-24 10:37:19,luis fernando antón toro,,10.18112/openneuro.ds003483.v1.0.2,,cc0,True,,,
302,liz,ds003352,please use the checkmaxfilter option before pr...,,False,,,no,18,17-26,...,2020-11-03 21:07:28,conway lab,,10.18112/openneuro.ds003352.v1.0.0,"appelhoff, s., sanderson, m., brooks, t., vlie...",cc0,True,,,
303,liz,ds002791,the dataset indicates it contains data epochs ...,,False,,,no,23,19-40,...,2020-07-17 20:53:39,ahmad mheich,,10.18112/openneuro.ds002791.v1.0.0,,cc0,True,,,
304,peter,ds005107,,,,,,no,21,20-27,...,2024-06-26 6:05:45,wei xu,,doi:10.18112/openneuro.ds005107.v1.0.3,,cc0,False,,,
305,peter,ds005274,,,,,,no,22,20-26,...,2024-07-07 9:38:41,yukako ito,,doi:10.18112/openneuro.ds005274.v1.0.0,,cc0,False,,,


### Figure 1

Modality x Modality of experiment x type of experiment 

Network chart start w main modality (EEG, MEG, IEGG) and then continue to break up (like figure 4 in other paper)

In [3]:
import networkx as nx

df_fig1 = df[['EEG Modality', 'MEG Modality', 'iEEG modality', 'FOR FIGURE - modality of exp', 'FOR FIGURE - type of exp']]
primary_modalities = ['EEG', 'MEG', 'iEEG']

modalities_fig8 = df['FOR FIGURE - modality of exp'].unique()

# Create a directed graph for all primary modalities
G_all = nx.DiGraph()

# Add nodes for primary modalities
for modality in primary_modalities:
    G_all.add_node(modality)

# Add nodes and edges for additional modalities
for additional in modalities_fig8:
    G_all.add_node(additional)
    for modality in primary_modalities:
        count = df_fig1[df_fig1['FOR FIGURE - modality of exp'].str.contains(str(additional), case=False, na=False)].shape[0]
        if count > 0:
            G_all.add_edge(modality, additional, weight=count)

# Define colors for the nodes and edges
node_colors = []
edge_colors = []

for node in G_all.nodes():
    if node in primary_modalities:
        if node == 'EEG':
            node_colors.append('red')
        elif node == 'MEG':
            node_colors.append('blue')
        elif node == 'iEEG':
            node_colors.append('green')
    else:
        node_colors.append('#cccccc')

for edge in G_all.edges():
    if edge[0] == 'EEG':
        edge_colors.append('red')
    elif edge[0] == 'MEG':
        edge_colors.append('blue')
    elif edge[0] == 'iEEG':
        edge_colors.append('green')
    else:
        edge_colors.append('gray')

# Draw the combined graph with primary modalities in different locations
pos_all = nx.spring_layout(G_all, seed=42)
pos_all['EEG'] = [-0.5, 0]
pos_all['MEG'] = [0.5, 0]
pos_all['iEEG'] = [0, 0.5]

plt.figure(figsize=(20, 15))
nx.draw(G_all, 
        pos_all, 
        with_labels=True, 
        node_size=3000, 
        node_color=node_colors, 
        font_size=20, 
        font_weight="bold", 
        edge_color=edge_colors,
        alpha=0.5)
labels_all = nx.get_edge_attributes(G_all, 'weight')
nx.draw_networkx_edge_labels(G_all, pos_all, edge_labels=labels_all, font_size='20')
plt.title("Network Diagram of EEG, MEG, iEEG and Additional Modalities")
plt.show()


KeyError: "['iEEG modality'] not in index"

In [None]:
# Create lists for the nodes and links
nodes = []
node_indices = {}
links = []
index = 0

# Add primary modalities to nodes
for modality in primaryModalities_fig1:
    nodes.append(modality)
    node_indices[modality] = index
    index += 1

# Add additional modalities to nodes
for additional in modalities_fig1:
    if pd.notna(additional):
        if additional in ['motor', 'other']:
            additional = f"modality_{additional}"
        nodes.append(additional)
        node_indices[additional] = index
        index += 1

# Add types of experiments to nodes
for exp_type in typeExp_fig1:
    if pd.notna(exp_type):
        if exp_type in ['motor', 'other']:
            exp_type = f"exp_{exp_type}"
        nodes.append(exp_type)
        node_indices[exp_type] = index
        index += 1

# Create links between primary modalities and additional modalities
for modality in primaryModalities_fig1:
    for additional in modalities_fig1:
        if pd.notna(additional):
            additional_key = f"modality_{additional}" if additional in ['motor', 'other'] else additional
            count = df_fig1[(df_fig1[f'{modality} Modality'] == 'x') & 
                            (df_fig1['FOR FIGURE - modality of exp'] == additional)].shape[0]
            if count > 0:
                links.append({
                    'source': node_indices[modality],
                    'target': node_indices[additional_key],
                    'value': count
                })

# Create links between additional modalities and types of experiments
for additional in modalities_fig1:
    if pd.notna(additional):
        additional_key = f"modality_{additional}" if additional in ['motor', 'other'] else additional
        for exp_type in typeExp_fig1:
            if pd.notna(exp_type):
                exp_type_key = f"exp_{exp_type}" if exp_type in ['motor', 'other'] else exp_type
                count = df_fig1[(df_fig1['FOR FIGURE - modality of exp'] == additional) & 
                                (df_fig1['FOR FIGURE - type of exp'] == exp_type)].shape[0]
                if count > 0:
                    links.append({
                        'source': node_indices[additional_key],
                        'target': node_indices[exp_type_key],
                        'value': count
                    })

# Create the Sankey diagram
fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=20,
        thickness=20,
        line=dict(color="black", width=0.5),
        label=nodes
    ),
    link=dict(
        source=[link['source'] for link in links],
        target=[link['target'] for link in links],
        value=[link['value'] for link in links]
    )
)])

fig.update_layout(
    title_text="Sankey Diagram of Primary Modalities, Additional Modalities, and Types of Experiments",
    font=dict(size=12),
    width=800  # Set the width to a smaller value to compress the figure horizontally
)

fig.show()

# fourth column with 

### Figure 2

Figure 2 - Modality x Additional Modality
A network diagram or Sankey diagram displaying the connections between primary modality (e.g., EEG) and additional modalities (e.g., fMRI, behavioral). This can visually represent multi-modality studies and show how often additional methods were used


In [4]:
df_fig1 = df[['EEG Modality', 
              'MEG Modality', 
              'iEEG modality',
              'Has MRI',
              'Has EOG',
              'Has Behavioral',
              'Has fMRi',
              'Has EcOg',
              'Has DBS',
              'Has SMI',
              'Has EMG',
              'Has TMS',
              'Has ECG',
              'Has SEEG',
              'Has fNIRS',
              'Has Blood Tests',
              'Eye tracking',
              'Other (put in name)']]

df_fig1.rename(columns={
    'EEG Modality': 'EEG',
    'MEG Modality': 'MEG',
    'iEEG modality': 'iEEG',
    'Has MRI': 'MRI',
    'Has EOG': 'EOG',
    'Has Behavioral': 'Behavioral',
    'Has fMRi': 'fMRI',
    'Has EcOg': 'EcOg',
    'Has DBS': 'DBS',
    'Has SMI': 'SMI',
    'Has EMG': 'EMG',
    'Has TMS': 'TMS',
    'Has ECG': 'ECG',
    'Has SEEG': 'SEEG',
    'Has fNIRS': 'fNIRS',
    'Has Blood Tests': 'Blood Tests',
    'Eye tracking': 'Eye Tracking',
    'Other (put in name)': 'Other'
}, inplace=True)

# Create dataframe of eeg as primary modality
df_fig1_eeg = df_fig1[df_fig1['EEG'] == 'X']
df_fig1 = df_fig1[df_fig1['EEG'] != 'X']

# Create dataframe of meg as primary modality
df_fig1_meg = df_fig1[df_fig1['MEG'] == 'X']
df_fig1 = df_fig1[df_fig1['MEG'] != 'X']

# Create dataframe of ieeg as primary modality
df_fig1_ieeg = df_fig1[df_fig1['iEEG'] == 'X']
df_fig1 = df_fig1[df_fig1['iEEG'] != 'X']

# Create a list of primary modalities and additional modalities
primary_modalities = ['EEG', 'MEG', 'iEEG']
additional_modalities = ['MRI', 'EOG', 'Behavioral', 'fMRI', 'EcOg', 'DBS', 'SMI', 'EMG', 'TMS', 'ECG', 'SEEG', 'fNIRS', 'Blood Tests', 'Eye Tracking', 'Other']

# Initialize a dictionary to store the connections
connections = {modality: {additional: 0 for additional in additional_modalities} for modality in primary_modalities}

# Count the connections
for modality in primary_modalities:
    for additional in additional_modalities:
        connections[modality][additional] = df_fig1_eeg[additional].notna().sum() if modality == 'EEG' else \
                                            df_fig1_meg[additional].notna().sum() if modality == 'MEG' else \
                                            df_fig1_ieeg[additional].notna().sum()

# Create lists for the nodes and links
nodes = []
node_indices = {}
links = []
index = 0

# Add primary modalities to nodes
for modality in primary_modalities:
    modality_connections = sum(connections[modality].values())
    nodes.append(f"{modality} ({modality_connections})")
    node_indices[modality] = index
    index += 1

# Add additional modalities to nodes
for additional in additional_modalities:
    nodes.append(additional)
    node_indices[additional] = index
    index += 1

# Create links between primary modalities and additional modalities
for modality in primary_modalities:
    for additional in additional_modalities:
        if connections[modality][additional] > 0:
            links.append({
                'source': node_indices[modality],
                'target': node_indices[additional],
                'value': connections[modality][additional]
            })

fig = go.Figure(data=[go.Sankey(
    node=dict(
        pad=20,
        thickness=50,
        line=dict(color="black", width=1),
        label=nodes,
        color="black"  # Set the font color to black
    ),
    link=dict(
        source=[link['source'] for link in links],
        target=[link['target'] for link in links],
        value=[link['value'] for link in links],
        color="rgba(0, 0, 0, 0.1)"  # Increase transparency of the connections
    )
)])

fig.update_layout(
    title_text="Network Diagram of Modalities and Additional Modalities",
    font=dict(size=15, color="black"),  # Set the font color to black
    height=800  # Increase the height of the figure
)
fig.show()

KeyError: "['iEEG modality'] not in index"

### Figure 3

Figure 3 - Author country x # of datasets x authors from multiple institutions

A world map showing the distribution of author countries. Use circles of varying sizes to represent the number of studies from each country, use heat map to show how many of those studies from each country were collaboration with another institution 


In [31]:
# '''
# US has so many studies that it's hard to see the other countries on the map.
# Removing US to see the other countries alone. 
# Creating plot with international countries excluding US. 
# '''

# # create dataframe for number of studies from each country
# countries = df['Author country']
# countries = countries.astype(str)
# countries = countries.str.strip().str.lower().str.replace('.', '')
# replacements = {
#     "usa": "united states",
#     "uk": "united kingdom",
#     "romani": "romania",
#     'the netherlands': 'netherlands'
# }
# countries.replace(replacements, inplace=True)

# # calculate df without US
# countries_noUS = countries[countries != 'united states']
# unique_countries_noUS, counts_noUS = np.unique(countries_noUS, return_counts=True)
# country_counts_df = pd.DataFrame({
#     'Country': unique_countries_noUS,
#     'Count': counts_noUS
# })

# Create a plotly scatter_geo plot
fig3_noUS = px.scatter_geo(country_counts_df, locations="Country", locationmode='country names',
                     size="Count", projection="natural earth",
                     title="World Map with Circles Representing Number of Studies",
                     color_continuous_scale="Viridis")
fig3_noUS.show()

'''
Now, create a plot with only the US to see the distribution of studies within the US.
'''

usa = df[['Author country', 'First author state']]
usa = usa.map(lambda x: x.strip().lower().replace('.', ''))
usa.replace({"usa": "united states"}, inplace=True)
usa = usa[usa['Author country'] == 'united states']

city_replacements = {
    'chicago': 'illinois',
    'pittsburgh': 'pennsylvania',
}
usa.replace(city_replacements, inplace=True)

states_unique, states_counts = np.unique(usa['First author state'], return_counts=True)
states_counts = states_counts.astype(int)

usa_counts_df = pd.DataFrame({
    'Country': states_unique,
    'Count': states_counts
})
# Dictionary to convert state names to abbreviations
state_abbrev = {
    'alabama': 'AL', 'alaska': 'AK', 'arizona': 'AZ', 'arkansas': 'AR', 'california': 'CA',
    'colorado': 'CO', 'connecticut': 'CT', 'delaware': 'DE', 'florida': 'FL', 'georgia': 'GA',
    'hawaii': 'HI', 'idaho': 'ID', 'illinois': 'IL', 'indiana': 'IN', 'iowa': 'IA',
    'kansas': 'KS', 'kentucky': 'KY', 'louisiana': 'LA', 'maine': 'ME', 'maryland': 'MD',
    'massachusetts': 'MA', 'michigan': 'MI', 'minnesota': 'MN', 'mississippi': 'MS', 'missouri': 'MO',
    'montana': 'MT', 'nebraska': 'NE', 'nevada': 'NV', 'new hampshire': 'NH', 'new jersey': 'NJ',
    'new mexico': 'NM', 'new york': 'NY', 'north carolina': 'NC', 'north dakota': 'ND', 'ohio': 'OH',
    'oklahoma': 'OK', 'oregon': 'OR', 'pennsylvania': 'PA', 'rhode island': 'RI', 'south carolina': 'SC',
    'south dakota': 'SD', 'tennessee': 'TN', 'texas': 'TX', 'utah': 'UT', 'vermont': 'VT',
    'virginia': 'VA', 'washington': 'WA', 'west virginia': 'WV', 'wisconsin': 'WI', 'wyoming': 'WY'
}
# Convert state names to abbreviations
usa_counts_df['Country'] = usa_counts_df['Country'].map(state_abbrev).fillna(usa_counts_df['Country'])


fig3_us = px.scatter_geo(usa_counts_df, locations="Country", locationmode='USA-states',
                     size="Count", projection="albers usa",
                     title="US Map with Circles Representing Number of Studies",
                     color_continuous_scale="Viridis")
fig3_us.show()

### Figure 4

Figure 4 - Published date x number of datasets x modality

A line graph showing the number of publications over time, segmented by modality

In [None]:
df_fig4 = df[['EEG Modality', 
              'MEG Modality', 
              'iEEG modality',
              'Has MRI',
              'Has EOG',
              'Has Behavioral',
              'Has fMRi',
              'Has EcOg',
              'Has DBS',
              'Has SMI',
              'Has EMG',
              'Has TMS',
              'Has ECG',
              'Has SEEG',
              'Has fNIRS',
              'Has Blood Tests',
              'Eye tracking',
              'Other (put in name)',
              'Published date']]

df_fig4.rename(columns={
    'EEG Modality': 'EEG',
    'MEG Modality': 'MEG',
    'iEEG modality': 'iEEG',
    'Has MRI': 'MRI',
    'Has EOG': 'EOG',
    'Has Behavioral': 'Behavioral',
    'Has fMRi': 'fMRI',
    'Has EcOg': 'EcOg',
    'Has DBS': 'DBS',
    'Has SMI': 'SMI',
    'Has EMG': 'EMG',
    'Has TMS': 'TMS',
    'Has ECG': 'ECG',
    'Has SEEG': 'SEEG',
    'Has fNIRS': 'fNIRS',
    'Has Blood Tests': 'Blood Tests',
    'Eye tracking': 'Eye Tracking',
    'Other (put in name)': 'Other'
}, inplace=True)

# Initialize the dictionary with empty dataframes
dfs_by_modalities = {col: pd.DataFrame(columns=df_fig4.columns) for col in df_fig4.columns if col != 'Published date'}

# Populate the dataframes
for index, row in df_fig4.iterrows():
    for col in dfs_by_modalities.keys():
        if pd.notna(row[col]):
            dfs_by_modalities[col].loc[len(dfs_by_modalities[col])] = row

dfs_by_modalities['EEG'].head()

# Convert 'Published date' to datetime for sorting
for modality, modality_df in dfs_by_modalities.items():
    modality_df['Published date'] = pd.to_datetime(modality_df['Published date'])
    dfs_by_modalities[modality] = modality_df.sort_values(by='Published date')

    # Create a cumulative count of publications over time for each modality
    dfs_by_modalities[modality]['Cumulative Publications'] = range(1, len(modality_df) + 1)

# Create a plotly line graph for all modalities
fig4 = go.Figure()

for modality, modality_df in dfs_by_modalities.items():
    fig4.add_trace(go.Scatter(
        x=modality_df['Published date'],
        y=modality_df['Cumulative Publications'],
        mode='lines',
        name=modality
    ))

fig4.update_layout(
    title='Cumulative Publications Over Time for All Modalities',
    xaxis_title='Published Date',
    yaxis_title='Cumulative Number of Publications'
)

fig4.show()

# denote that all of the data is combination - like mri is eeg-mri
# denote this is over evolution of nemar/openneuro, thats how old database is
# take top 3 eeg/meg/ieeg on one plot, and then the other modalities on another plot



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. To ensure parsing is consistent and as-expected, please specify a format.


Could not infer format, so each element will be parsed individually, falling back to `dateutil`. T

### Figure 5

add histogram of additioanl main modalities over itme

Figure 5 - EEG channel histogram (see which are most common)

In [None]:
df_eeg_channels = df['EEG channels']
df_eeg_channels.replace("n/a", "", inplace=True)
df_eeg_channels.dropna()

# Convert the EEG channels to numeric, forcing errors to NaN
df_eeg_channels = pd.to_numeric(df_eeg_channels, errors='coerce')

# Drop NaN values
df_eeg_channels.dropna(inplace=True)

# Create a plotly histogram
fig = px.histogram(df_eeg_channels, nbins=260, title='Histogram of EEG Channels')
fig.update_layout(
    xaxis_title='Number of EEG Channels',
    yaxis_title='Frequency',
    xaxis=dict(categoryorder='category ascending', tickangle=45, dtick=5)
)
fig.show()

# make it log
# collapse the areas with no data
# color code part of the bar based on proportion of EOG data


### Figure 7

Figure 7: # of participants x age range x type of subjects 
Bubble chart or grouped bar chart (X-axis Age range, Y-axis: Number of subjects, Grouped Bars: Type of subjects (e.g., clinical vs. healthy))



In [58]:
df_fig7 = df[['Age range',
             'Type of subjects',
             'Participants']]

df_fig7.head()

Unnamed: 0,Age range,Type of subjects,Participants
3,19 - 29,Healthy,32
4,,"Healthy, right-handed, normal or corrected vis...",1
5,23-31,Healthy,16
6,21-35,,5
7,,Treatment Resistant Depression w DBS,14


age histogram of all the ages
only consider teh age ranges once for each df number

### Figure 8

Figure 8: Publish date x # subjects x modality of experiment (visual, audio, etc)
Like figure 7 in other paper


In [18]:
df_fig8 = df[['Published date',
              'Participants',
              'Modality of experiment']]

modalities_fig8 = ['visual', 
                   'audio',
                   'motor',
                   'drug',
                   ]

# Initialize the dictionary with empty dataframes
dfs_by_modalities_fig8 = {modality: pd.DataFrame(columns=df_fig8.columns) for modality in modalities_fig8}

# Populate the dataframes
for index, row in df_fig8.iterrows():
    for modality in modalities_fig8:
        if modality in str(row['Modality of experiment']).lower():
            dfs_by_modalities_fig8[modality].loc[len(dfs_by_modalities_fig8[modality])] = row

# Combine all dataframes in dfs_by_modalities_fig8 into a single dataframe
combined_df = pd.concat(dfs_by_modalities_fig8.values(), keys=dfs_by_modalities_fig8.keys()).reset_index(level=0).rename(columns={'level_0': 'Modality'})

# Convert Participants to numeric for sorting
combined_df['Participants'] = pd.to_numeric(combined_df['Participants'], errors='coerce')

# Create the strip plot with sorted x-axis
fig_strip = px.strip(combined_df.sort_values(by='Participants'), x='Participants', y='Modality', title='Participants by Modality', stripmode='overlay')

# Show the plot
fig_strip.show()

# Filter out the outlier
filtered_combined_df = combined_df[combined_df['Participants'] != 2951]

# Create the second strip plot without the outlier
fig_strip_no_outlier = px.strip(filtered_combined_df.sort_values(by='Participants'), 
                                x='Participants', y='Modality', 
                                title='Participants by Modality (Without Outlier)', 
                                stripmode='overlay')

# Show the plot
fig_strip_no_outlier.show()