In [None]:
import plotly
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import json
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import seaborn.objects as so
import os
import warnings
import matplotlib.patches as mpatches
import matplotlib.colors as mplc

from plotly.offline import plot

In [None]:
# Hover text for the Sankey links, one entry per link and in the same order as
# the `source`/`target`/`value` lists below (9 links -> 9 entries).
screening_info = ['Records obtained from Embase',
                  'Records obtained from MEDLINE',
                  'Records obtained from Google Scholar',
                  'Records selected for screening',
                  'Duplicate references removed',
                  """
                    Exclusion criteria:<br>
                    - 
                  """,
                  'Studies retrieved for evaluation',
                  """
                    Exclusion criteria:<br>
                    - Review articles (n=31);<br>
                    - Ambiguous CT or MRI patient counts (n=14);<br>
                    - Case Reports (n=99);<br>
                    - Animal studies (n=14);<br>
                    - Correspondence (n=1);<br>
                    - Public dataset (n=84);<br>
                    - Conference paper (n=16);<br>
                    - Multisite dataset (n=776);<br>
                    - Post-mortem brain (n=14);<br>
                    - Registered report (n=2);<br>
                    - Wrong study design (n=8);<br>
                    - Unspecified location (n=37);<br>
                    - Wrong study population (n=2039);<br>
                    - Wrong patient population (n=13);<br>
                    - Duplicate dataset (QNTS) (n=9);<br>
                    - Duplicate dataset (TRIAD) (n=22);<br>
                    - Duplicate dataset (CIMA-Q) (n=7);<br>
                    - Duplicate dataset (ICBM-152) (n=20);<br>
                    - Duplicate dataset (PREVENT-AD) (n=16);<br>
                    - Duplicate dataset (Project ice storm) (n=2);<br>
                    - Duplicate dataset (SYS) (n=22);<br>
                    - Duplicate dataset (QCPR) (n=2);<br>
                    - Other duplicate dataset (n=4);
                  """,
                  'Studies selected for analysis']

# Study-selection flow diagram. Nodes are referenced by their position in
# `label` (the trailing #n comments); each link i goes from source[i] to
# target[i] carrying value[i] records. Inflow and outflow balance at every
# intermediate node (e.g. 6429 + 3816 + 100 = 7280 + 3065).
# NOTE(review): the first link's exclusion-criteria text is still an empty
# placeholder ("- ").
# NOTE(review): node x/y are documented as normalized positions; the first y
# value (2) lies outside 0-1 -- confirm this is intentional.
fig1 = go.Figure(data=[go.Sankey(
    arrangement = "freeform",
    node = dict(
      pad = 80,
      thickness = 10,
      line = dict(color = "black", width = 0.5),
      label = ["Records obtained from Embase",#0
               "Records obtained from MEDLINE",#1
               "Records obtained from Google Scholar",#2
               "Main records identified",#3
               "Studies screened",#4
               "References removed",#5
               "Studies excluded from retrieval", #6
               "Studies assessed for eligibility",#7
               "Studies excluded from review",#8
               "Studies included in review"],#9
      x = [0, 0, 0, 0.3, 0.5, 0.5, 0.7, 0.7, 0.9, 0.9],
      y = [2, 0, 0, 0.5, 0.3, 0.9, 0.8, 0.4, 0.6, 0.2],
      hovertemplate = "%{label}<extra>%{value}</extra>",
      # blue = identification, green = retained, red = removed/excluded
      color = ["darkblue","darkblue","darkblue","darkblue","darkgreen","darkred","darkred","darkgreen","darkred","darkgreen"]
    ),
    link = dict(
      source = [0, 1, 2, 3, 3, 4, 4, 7, 7],
      target = [3, 3, 3, 4, 5, 6, 7, 8, 9],
      value = [6429, 3816, 100, 7280, 3065, 2476, 4804, 3252, 1552],
      customdata = screening_info,
      hovertemplate = "%{customdata}",
  ))])

fig1.update_layout(title = dict(text="Study identification methodology"),
                   width=800,
                   height=500,
                   font_size=11,
                   margin=dict(l=0))

fig1.show()

In [None]:
# laod data
path_data = os.path.join(os.getcwd(), "data/")
fname_data = os.path.join(path_data, f"fcdata.xlsx")
df = pd.read_excel(fname_data)

# load full dataset
# will need this for treemap with DOIs
fname2 = os.path.join(path_data, f"Cleaned Full data.xlsx")
df_full = pd.read_excel(fname2)

# merge superfluous ethnicity columns
df['Black'] = df.loc[:,['Black','African-American']].sum(axis=1)
df['White'] = df.loc[:,['Caucasian','Caucasian-Hispanic']].sum(axis=1)
df['Asian'] = df.loc[:,['Asian','Asian American']].sum(axis=1)
df = df.drop(columns=['African-American','Caucasian','Caucasian-Hispanic','Asian American'])
df.fillna({'Other': 0, 'Not specified' : 0, 'Middle Eastern' : 0, 'Caribbean' : 0, 'Jewish' : 0}, inplace=True)

###############
# create total participant column on the rule that if PET=MRI then all participants were scanned in both, and if not then total = PET+MRI
df['Total participants'] = df.loc[:,['PET participants','MRI participants']].sum(axis=1)
df.loc[df['PET participants']==df['MRI participants'], 'Total participants'] = df['MRI participants']

# hard coding execptions to the above rule
df.loc[118, 'Total participants'] = 25
df.loc[268, 'Total participants'] = 61
df.loc[356, 'Total participants'] = 15
df.loc[477, 'Total participants'] = 29
df.loc[572, 'Total participants'] = 24
df.loc[626, 'Total participants'] = 20
df.loc[832, 'Total participants'] = 60
df.loc[840, 'Total participants'] = 21
df.loc[1113, 'Total participants'] = 34
df.loc[1128, 'Total participants'] = 35
df.loc[1143, 'Total participants'] = 14
df.loc[1178, 'Total participants'] = 9
df.loc[1215, 'Total participants'] = 40
df.loc[1293, 'Total participants'] = 262
df.loc[1325, 'Total participants'] = 31
df.loc[1369, 'Total participants'] = 44
df.loc[1446, 'Total participants'] = 54
df.loc[1488, 'Total participants'] = 39
###############

# Create unreported column based on total participants
df['Unreported Ethnicity'] = df.loc[:,'Total participants'] - df.loc[:,['Black','Asian','Other','Not specified','Middle Eastern','Caribbean','Jewish','White']].sum(axis=1)
i = list(df.columns)
a, b = i.index('Unreported Ethnicity'), i.index('Total participants')
i[b], i[a] = i[a], i[b]
df = df[i]
df

# this is our initial dataframe which will be manipulated individually for each analysis to account for needing to separate different studies based on other criteria
# e.g. needing to remove multiple region studies for region analysis, but leaving those in for sex/ethnicity/age analysis

In [None]:
# Categorize geographically by number of studies.
# Drop studies with multiple locations (comma-separated entries) so each
# remaining study maps to a single location.
# `regex=False` matches the comma literally; `na=False` treats a missing
# location as "no comma" so the boolean mask never contains NaN (which would
# make the `~` negation fail).
filtered = df[~df['Geographical location'].str.contains(',', regex=False, na=False)]
# removed 10 studies with multiple locations listed

In [None]:
# Counting studies by region, overall and split by imaging modality.
has_pet = filtered['PET participants'].notna()
has_mri = filtered['MRI participants'].notna()

# all
s1 = filtered['Geographical location'].value_counts()
# PET only
s2 = filtered[has_pet & ~has_mri]['Geographical location'].value_counts()
# MRI only
s3 = filtered[~has_pet & has_mri]['Geographical location'].value_counts()
# PET+MRI studies
s4 = filtered[has_pet & has_mri]['Geographical location'].value_counts()

# Quebec administrative regions (accent-free spellings). The order fixes the
# row order of the zero-count padding in every table below.
QUEBEC_REGIONS = [
    'Montreal', 'Estrie', 'Capitale-Nationale', 'Mauricie',
    'Saguenay - Lac-Saint-Jean', 'Monteregie',
    'Gaspesie - Iles-de-la-Madeleine', 'Abitibi-Temiscamingue', 'Outaouais',
    'Nord-du-Quebec', 'Laurentides', 'Lanaudiere', 'Chaudiere-Appalaches',
    'Cote-Nord', 'Bas-Saint-Laurent', 'Centre-du-Quebec', 'Laval',
]


def _build_region_table(location_counts, observed_regions):
    """Return a ['Region', 'count'] table covering every Quebec region.

    Parameters
    ----------
    location_counts : pandas.Series
        Output of ``value_counts()`` on the location column.
    observed_regions : list of str
        Region name for each entry of ``location_counts``, in the same order.
        The names are applied positionally and replace the original index.

    Returns
    -------
    pandas.DataFrame
        The observed regions with their counts, followed by one zero-count
        row for every region of ``QUEBEC_REGIONS`` not in ``observed_regions``.

    Raises
    ------
    ValueError
        If the number of names differs from the number of counted locations.
    """
    if len(observed_regions) != len(location_counts):
        raise ValueError(
            f"expected {len(observed_regions)} distinct locations, "
            f"found {len(location_counts)}: {list(location_counts.index)}"
        )
    rows = list(zip(observed_regions, location_counts.values))
    rows += [(region, 0) for region in QUEBEC_REGIONS if region not in observed_regions]
    return pd.DataFrame(rows, columns=['Region', 'count'])


# NOTE(review): the region names below are matched to the counts purely by
# position. value_counts() sorts by descending count, so if the data change
# (or ties are ordered differently) a count can end up under the wrong region.
# Verify these lists against s1..s4 whenever the spreadsheet is updated.

# all
regs_all = _build_region_table(
    s1, ['Montreal', 'Estrie', 'Capitale-Nationale', 'Mauricie', 'Saguenay - Lac-Saint-Jean', 'Monteregie'])

# PET only
regs_pet = _build_region_table(s2, ['Montreal', 'Capitale-Nationale', 'Estrie'])

# MRI only
regs_mri = _build_region_table(
    s3, ['Montreal', 'Estrie', 'Capitale-Nationale', 'Mauricie', 'Saguenay - Lac-Saint-Jean', 'Monteregie'])

# Studies with MRI and PET imaging
regs_both = _build_region_table(s4, ['Montreal', 'Estrie', 'Capitale-Nationale'])

In [None]:
# Load the Quebec administrative region boundaries.
# from: https://github.com/codeforgermany/click_that_hood/blob/main/public/data/quebec.geojson
# The region labels were edited to remove accents so they could be read.
geoname_data = os.path.join(path_data, "quebec.geojson")

# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open(geoname_data, encoding='utf-8') as f:
    var_geojson = json.load(f)

In [None]:
colorscale = [[0.0, "#FFFFC0"], [0.5, "#CBC7C0"], [1.0, "#FFFFF0"]]

# Filled grey map of administrative regions: every region gets z=0.5, the
# grey midpoint of the colorscale above, so the base layer is uniform.
trace1 = go.Choropleth(geojson=var_geojson,
                      showscale=False,
                      colorscale=colorscale,
                      zmin=0, zmax=1,
                      z=[0.5]*len(var_geojson['features']),
                      locations=regs_all['Region'],
                      featureidkey='properties.name',
                      hoverinfo='skip')


def _study_count_markers(region_counts, size_scale, visible=None):
    """Build the marker layer showing the number of studies per region.

    Parameters
    ----------
    region_counts : pandas.DataFrame
        Table with a 'Region' column (matching ``properties.name`` in the
        geojson) and a 'count' column.
    size_scale : float
        Multiplier applied to sqrt(count) to obtain the marker size.
    visible : bool or None
        Initial trace visibility; None leaves plotly's default (visible).

    Returns
    -------
    plotly.graph_objects.Scattergeo
        One marker per region, sized and coloured by study count, with the
        colour range spanning that table's min and max count.
    """
    counts = region_counts['count']
    return go.Scattergeo(mode='markers',
                         marker=dict(size=np.power(counts, 0.5)*size_scale, sizemode='area',
                                     opacity=0.8,
                                     cmin=counts.min(),
                                     cmax=counts.max(),
                                     color=counts,
                                     colorbar_title='Number of studies'),
                         geojson=var_geojson,
                         locations=region_counts['Region'],
                         featureidkey='properties.name',
                         customdata=region_counts[['Region', 'count']],
                         visible=visible,
                         hovertemplate=
                         "<b>%{customdata[0]}</b><br>" +
                         "<b>Studies:</b> %{customdata[1]}<extra></extra>")


# Sized markers for number of studies by region, one trace per dropdown view.
# Only the "all studies" layer is shown initially. The size multipliers differ
# per view (presumably hand-tuned so the smaller subsets stay legible).
trace2 = _study_count_markers(regs_all, 10)
trace3 = _study_count_markers(regs_pet, 30, visible=False)
trace4 = _study_count_markers(regs_mri, 10, visible=False)
trace5 = _study_count_markers(regs_both, 15, visible=False)

# (button label, figure title) for each view, in the same order as trace2..trace5.
map_views = [
    ("All studies", "Distribution of all PET and MRI studies in Quebec"),
    ("PET studies", "Distribution of PET studies in Quebec"),
    ("MRI studies", "Distribution of MRI studies in Quebec"),
    ("PET+MRI studies", "Distribution of studies in Quebec scanning participants in both PET and MRI"),
]

# Remove all default layout elements; only Quebec is of interest.
layout = go.Layout(
    geo=dict(showland=False,
             showcountries=False,
             showocean=False,
             showrivers=False,
             showlakes=False,
             showcoastlines=False),
    title=map_views[0][1])

fig = go.Figure(data = [trace1, trace2, trace3, trace4, trace5], layout=layout)
# Conic conformal projection recommended by Statistics Canada https://www150.statcan.gc.ca/n1/pub/92-195-x/2011001/other-autre/mapproj-projcarte/m-c-eng.htm
fig.update_geos(fitbounds="locations", projection_type='conic conformal')

# Each button keeps the grey base map (first trace) visible and shows exactly
# one of the four marker layers, updating the title to match.
view_buttons = [
    dict(label=button_label,
         method="update",
         args=[{"visible": [True] + [layer == selected for layer in range(len(map_views))]},
               {"title": view_title}])
    for selected, (button_label, view_title) in enumerate(map_views)
]
fig.update_layout(updatemenus=[dict(buttons=view_buttons)])

fig.update_layout(width=1000, height=600)
fig.show()


# Important to note: this is by region, but even outside of Montreal these are mostly in cities.
# The 46 in Capitale-Nationale are all in Quebec City. Of the 53 in Estrie, 52 are specified as Sherbrooke.
# Mauricie = Trois-Rivieres, Monteregie = Longueuil

# Oddity in the Cote-Nord region: the north-east border seems to claim the old disputed Labrador territory.
# I seriously doubt this geojson file was created pre-1949.

In [None]:
# Age analysis: histogram of the average participant age per study.
# Rename the age column for simplicity (later cells refer to it as 'Age').
df = df.rename(columns={'Average Age (Years)': 'Age'})

# Studies with no age reported at all.
noagedf = df[df['Age'].isna()]

# Keep numeric ages only: entries with ">" or other symbols are coerced to
# NaN and dropped together with the missing ones.
# NOTE(review): such non-numeric entries are therefore neither binned nor
# counted as 'Unreported', and ages of 90 or more fall outside every bin, so
# the bars need not add up to the n shown in the title.
Agedf = pd.to_numeric(df['Age'], errors='coerce').dropna()

# Five-year bins [x, x+5) starting at 0, 5, ..., 85.
bins = np.arange(0,90,5)
labels = [f'{x}-{x+5}' for x in bins]
agecounts = [Agedf[(Agedf >= x) & (Agedf < (x+5))].count() for x in bins]

ages = pd.DataFrame({'Range':labels, 'count':agecounts})
ages.loc[len(ages.index)] = ['Unreported', len(noagedf.index)]

# One colour per age bin, plus grey for the trailing 'Unreported' bar.
colors = ["royalblue"]*len(bins) + ["dimgrey"]

fig, ax1 = plt.subplots(figsize=(10,6))

ax1.bar(x=ages['Range'], height=ages['count'], color=colors)
plt.xticks(rotation=-60)
ax1.set_xlabel('Average age')
ax1.set_ylabel('Number of studies')
ax1.set_title(f'Studies by average age of participants (n = {len(df.index)})')
ax1.set(axisbelow=True)

# The twin axis draws the same bars as percentages but invisibly; they exist
# only to give the right-hand axis a matching percent scale and grid.
ax2 = ax1.twinx()
percent_of_studies = ages['count']/(ages['count'].sum())*100
ax2.bar(x=ages['Range'], height=percent_of_studies, color=colors, visible=False)
ax2.set_ylabel('Percent of total studies')
ax2.set(axisbelow=True)
ax2.grid(axis='y')
plt.show()

In [None]:
# Sex analysis: share of all participants that is male, female, or unreported.

# Male and female totals are reported in most studies.
male = df['Male'].sum()
female = df['Female'].sum()

# Per study, whatever part of the total is not accounted for by Male + Female
# is unreported. Most of it comes from studies that report no sex at all, but
# some comes from studies that only partially report sex.
reported_per_study = df[['Male', 'Female']].sum(axis=1)
unreported = (df['Total participants'] - reported_per_study).sum()

labels = ['Male', 'Female', 'Unreported']
sizes = [male, female, unreported]

fig, ax = plt.subplots()
ax.pie(sizes, labels=labels, autopct='%1.1f%%')
ax.set_title(f'Breakdown of total participants by sex (n = {int(male + female + unreported)})')
plt.show()

In [None]:
# Age pyramid: male/female participant totals per 5-year band of study average age.

# Keep only studies with a usable numeric age: drop rows with no age, then
# coerce non-numeric entries (e.g. age listed as >0.69) to NaN and drop those too.
Agedf2 = (
    df[df['Age'].notna()]
    .assign(Age=lambda frame: pd.to_numeric(frame['Age'], errors='coerce'))
    .dropna(subset=['Age'])
)

# Keep studies that report at least one of the male/female counts.
Agedf2 = Agedf2[Agedf2['Female'].notna() | Agedf2['Male'].notna()]
# Here we are down to 1085 studies from 1553

# Walk the 5-year bands [x, x+5) from oldest to youngest so the oldest band is
# listed first in the pyramid.
bins = np.arange(0,90,5)
agecounts_male = []
agecounts_female = []
labels = []
for lower in bins[::-1]:
    in_band = (Agedf2['Age'] >= lower) & (Agedf2['Age'] < (lower+5))
    agecounts_male.append(Agedf2.loc[in_band, 'Male'].sum())
    agecounts_female.append(Agedf2.loc[in_band, 'Female'].sum())
    labels.append(f'{lower}-{lower+5}')

pyramid_df = pd.DataFrame(data={'age range':labels, 'male':agecounts_male, 'female':agecounts_female})
# Male counts are negated so their bars extend to the left of zero.
pyramid_df['male'] = -pyramid_df['male']

plt.grid()
ax1 = sns.barplot(x='male', y='age range', data=pyramid_df, palette="Reds", hue=labels)
ax2 = sns.barplot(x='female', y='age range', data=pyramid_df, palette="Blues", hue=labels)
plt.title("Population pyramid for PET and MRI study participants in Quebec")
plt.xlabel("             Male    Female")
# Show absolute values on the axis even though the male side is negative.
plt.xticks(ticks=[-4000, -2000, 0, 2000, 4000], labels=[4000, 2000, 0, 2000, 4000])

plt.show()

# NOTE: probably just ditch this for the final notebook, since it loses information about the distribution within each study.

In [None]:
# Relational age/sex plot with percentages.

# Studies with fewer participants than this are left out of the second plot.
# The value is arbitrary; it only serves to eliminate small sample sizes.
MIN_PARTICIPANTS = 30

# Make the percentage column. Missing sex counts are treated as 0, so a study
# reporting neither sex yields 0/0 = NaN and is filtered out below.
df['Male'] = df['Male'].fillna(0)
df['Female'] = df['Female'].fillna(0)
ser = df['Female'] / df.loc[:,['Female','Male']].sum(axis=1) *100
# Insert only once so the cell can be re-run safely. The position matters:
# later cells select the ethnicity columns by position (df.columns[8:17]).
if "Percentage female" not in df.columns:
    df.insert(7, "Percentage female", ser)

# Joint distribution of average age of participants and sex as a percentage of participants.
# Keep studies with a defined female percentage and a numeric average age.
df_rel = df[~(df['Percentage female'].isnull())].copy()
df_rel['Age'] = pd.to_numeric(df_rel['Age'], errors='coerce')
df_rel = df_rel.dropna(subset=['Age'])

df_rel2 = df_rel[df_rel['Total participants']>=MIN_PARTICIPANTS].copy()

# Shared colour normalisation for both plots; totals outside 30-300 are
# clipped to the ends of the colour range.
norm = mplc.Normalize(vmin=30, vmax=300, clip=True)

g = sns.relplot(
    data=df_rel,
    x = "Age", y="Percentage female",
    hue = "Total participants",
    hue_norm = norm,
    legend='auto',
)

plt.grid()
g.set(xlabel='Average age (years)', ylabel='Percentage of female participants', title="Joint distribution of average age of participants and proportion of sex")
plt.xticks([0, 10, 20, 30, 40, 50, 60, 70, 80])

g2 = sns.relplot(
    data=df_rel2,
    x = "Age", y="Percentage female",
    hue = "Total participants",
    hue_norm = norm,
)
plt.grid()

g2.set(xlabel='Average age (years)', ylabel='Percentage of female participants',title="Joint distribution of average age of participants and proportion of sex for studies with 30 or more participants")
plt.show()

# The second plot demonstrates that most of the studies with a poor female/male ratio (either very high or low but not 100%) have a low sample size.
# Note that 100% male or female studies are likely to simply be studies focused on one sex group.


# TO DO
# update color scale to display more information
# make correlation line and stats

In [None]:
# Ethnicity: one pie chart per nested subset of the ethnicity columns, each
# showing the column sums over all studies.
# The columns are selected by position, so each slice depends on the current
# column order of `df`.
# NOTE(review): presumably 8:17 ends with the unreported column and 8:16 ends
# with the white column -- confirm against df.columns.
pie_specs = [
    (slice(8, 17), 'Total ethnicity counts'),
    (slice(8, 16), 'Counts of all reported ethnicities'),
    (slice(8, 15), 'Counts of all reported non-white ethnicities'),
]

fig,ax = plt.subplots(3,1, figsize=(20,16))
for panel, (column_slice, heading) in zip(ax, pie_specs):
    ethnicity_cols = df.columns[column_slice]
    ethnicity_totals = df[ethnicity_cols].sum()
    # labeldistance=None keeps the labels off the wedges; they appear in the legend instead.
    panel.pie(ethnicity_totals, labels=ethnicity_cols, labeldistance=None, autopct='%1.1f%%')
    panel.set_title(f'{heading} (n = {int(ethnicity_totals.sum())})')
    panel.legend(bbox_to_anchor=(1.6, 1), loc='upper right')

fig.suptitle('Ethnicity reporting breakdown for PET/MRI studies in Quebec')
plt.show()


# Perhaps worth noting that 2 of the 3 studies reporting ethnicity as "other" do so in opposition to white, and the third reports white, Asian and other.
# Quebec is (as of 2011) 87.2% white; of studies that report ethnicity, 94.2% of participants are white.
# Test significance?
# Also worth noting that not a single indigenous participant was included.

# Note: here "not specified" means the study reported the ethnicity of the participants as unspecified, which is a valid report.
# Many studies report the ethnicity of some but not all participants, which may happen for many reasons, e.g. participants not providing ethnicity information even when studies ask.
# This is not synonymous with "unreported ethnicity", which indicates no mention of ethnicity at all and is the undesired way of handling things.

In [None]:
# Treemap of ethnicity reporting, expressed as percentages of all participants.
# The nine ethnicity columns are selected by position, followed by three
# aggregate nodes; `parents` lists the parent of each label in the same order
# ('' marks the root).
ethnicity_cols = df.columns[8:17]
labels = ethnicity_cols.to_list() + ['Non-white', 'Reported', 'Total']
parents = ['Non-white','Non-white','Non-white','Reported','Non-white','Non-white','Non-white','Reported','Total','Reported','Total','']

# Column sums, and their grand total, which is the 100% reference. It must be
# taken before the aggregate entries are added to the series.
ethdata = df[ethnicity_cols].sum()
tot = ethdata.sum()
ethdata['Non-white'] = df[df.columns[8:15]].sum().sum()
ethdata['Reported'] = df[df.columns[8:16]].sum().sum()
ethdata = ethdata/tot*100
# The root carries no value of its own.
ethdata['Total'] = 0

# Values are passed to plotly as strings of the percentages rounded to 3 decimals.
# NOTE(review): the aggregate nodes carry their own totals in addition to
# their children -- confirm the displayed values are as intended for the
# Treemap's branchvalues mode.
values = [str(round(percentage, 3)) for percentage in ethdata[labels].to_list()]

fig=go.Figure(go.Treemap(
    labels=labels,
    parents=parents,
    values = values,
    hoverinfo='skip',
    textinfo='label+value'
))

fig.update_layout(margin = dict(t=50, l=25, r=25, b=25))
fig.show()

In [None]:
# Participant sex by year of publication: sum male and female participants
# per year, then express them as percentages of that year's reported total.
sex_cols = ['Male', 'Female']
df_grouped = df_full[['Year of Publication'] + sex_cols].copy().fillna(0)
for sex_col in sex_cols:
    # Non-numeric entries are coerced to NaN and then counted as 0.
    df_grouped[sex_col] = pd.to_numeric(df_grouped[sex_col], errors='coerce').fillna(0)

df_ageyear = df_grouped.groupby('Year of Publication').sum()
# Keep only the first 32 year groups.
df_ageyear = df_ageyear.iloc[0:32]

# The 'Male' bar is drawn at total/total*100 (full height wherever the total is
# non-zero) and the 'Female' bar is drawn over it, giving a stacked appearance.
yearly_total = df_ageyear.loc[:,['Female','Male']].sum(axis=1)
s1 = yearly_total / yearly_total *100
s2 = df_ageyear['Female'] / yearly_total *100
df_ageyear['Male'] = s1
df_ageyear['Female'] = s2

plt.figure(figsize=(12,5))
sns.barplot(x='Year of Publication', y='Male', data=df_ageyear, color='darkblue')
sns.barplot(x='Year of Publication', y='Female', data=df_ageyear, color='lightblue')
male_patch = mpatches.Patch(color='darkblue', label='Male')
female_patch = mpatches.Patch(color='lightblue', label='Female')
plt.legend(handles=[male_patch, female_patch])
plt.xticks(rotation=-60)
plt.yticks(list(range(0, 101, 10)))
plt.ylim([0,100])
plt.ylabel("Percent")
plt.title("Participant sex in imaging studies by year of publication")
plt.show()

In [None]:
# Raincloud-style plot (box + half violin + jittered points) of the average
# study age for the three locations named below.
AGE_COL = 'Average Age (Years)'

# Keep single-location studies with a numeric average age. `filtered` still
# carries the original age column name.
filtage = filtered[['Geographical location', AGE_COL]].copy().dropna()
filtage[AGE_COL] = pd.to_numeric(filtage[AGE_COL], errors='coerce')
filtage = filtage.dropna()

fig,ax = plt.subplots(figsize=(10,6))
bp_colors = ['olive','turquoise','orchid']
bp_data = [filtage.loc[filtage['Geographical location'] == 'Montreal', AGE_COL],
           filtage.loc[filtage['Geographical location'] == 'Sherbrooke', AGE_COL],
           filtage.loc[filtage['Geographical location'].str.contains('Quebec'), AGE_COL]]

flierprops = dict(marker='.', markerfacecolor='black', markersize=4,
                  markeredgecolor='none')

bp = ax.boxplot(bp_data, patch_artist=True, vert=False, flierprops=flierprops, medianprops=dict(color='grey'))

for patch, color in zip(bp['boxes'], bp_colors):
    patch.set_facecolor(color)
    patch.set_alpha(.1)

vp = ax.violinplot(bp_data, points=500, showmeans=False,
                   showextrema=False, vert=False)

# Keep only the upper half of each violin by clipping its outline to the band
# between the group's baseline (i+1) and the next one (i+2).
for i, b in enumerate(vp['bodies']):
    outline = b.get_paths()[0].vertices
    outline[:,1] = np.clip(outline[:,1], i+1, i+2)
    b.set_color(bp_colors[i])

# Individual studies as jittered points just below each box. The generator is
# seeded so the figure is identical on every run.
rng = np.random.default_rng(0)
for i, feature in enumerate(bp_data):
    y = (i + .8) + rng.uniform(low=-.05, high=.05, size=len(feature))
    ax.scatter(feature, y, s=.3, c=bp_colors[i])

ax.set_title('Distribution of average age of studies according to region')
ax.set_yticklabels(['Montreal','Sherbrooke','Quebec City'])
ax.set_xlabel('Average age of study (Years)')
plt.show()