In [1]:
import pandas as pd
import numpy as np
import altair as alt
import seaborn as sns
from vega_datasets import data

please note that much of the cleaning code is hardcoded and lacks robustness to changes in the data (also, in converting this notebook i caught an inconsistency in how i sorted the dataframes so the plots are slightly different now so i'm glad we're doing this)

In [2]:
queries={'AB': 'Astrobiology OR exobiology OR cosmobiology OR bioastronomy', 
         'BA': 'bioastronautics OR "space medicine" OR "medicine in space"', 
         'SBE': '“space bioprocess engineering” OR ECLSS OR “Environmental Control and Life Support Systems” OR “life support in space” OR “bioregenerative life-support systems” OR “space systems bioengineering” OR “space biomanufacturing”'
        } #in case I want to programatically add query data to the sets

AB_geo = pd.read_csv('AB_geo.txt', delimiter='\t')
BA_geo = pd.read_csv('BA_geo.txt', delimiter='\t')
SBE_geo = pd.read_csv('SBE_geo.txt', delimiter='\t')
AB_geo['Type']='Astrobiology'
BA_geo['Type']='Bioastronautics'
SBE_geo['Type']='Space Bioprocess Engineering'
AB_geo['Relative Contribution (%)']=AB_geo['Record Count']/AB_geo['Record Count'].sum()*100
SBE_geo['Relative Contribution (%)']=SBE_geo['Record Count']/SBE_geo['Record Count'].sum()*100
BA_geo['Relative Contribution (%)']=BA_geo['Record Count']/BA_geo['Record Count'].sum()*100
geo_df = pd.concat([SBE_geo, BA_geo, AB_geo])


AB_year = pd.read_csv('AB_year.txt', delimiter='\t')
BA_year = pd.read_csv('BA_year.txt', delimiter='\t')
SBE_year = pd.read_csv('SBE_year.txt', delimiter='\t')
AB_year['Type']='Astrobiology'
BA_year['Type']='Bioastronautics'
SBE_year['Type']='Space Bioprocess Engineering'
year_df = pd.concat([SBE_year, BA_year, AB_year])
year_df.drop(['% of 1,109', '% of 299', '% of 13,566'], axis=1, inplace=True) #this line is not robust to changes in the query
year_df = year_df.groupby(['Type', 'Publication Years']).sum() \
  .groupby(level=0).cumsum().reset_index() #group year dataframe for linechart formatting

AB_research = pd.read_csv('AB_Research_Area.txt', delimiter='\t')
BA_research = pd.read_csv('BA_Research_Area.txt', delimiter='\t')
SBE_research = pd.read_csv('SBE_Research_Area.txt', delimiter='\t')
AB_research['Type']='Astrobiology'
BA_research['Type']='Bioastronautics'
SBE_research['Type']='Space Bioprocess Engineering'
AB_research['Relative Contribution (%)']=AB_research['Record Count']/AB_research['Record Count'].sum()*100
BA_research['Relative Contribution (%)']=BA_research['Record Count']/BA_research['Record Count'].sum()*100
SBE_research['Relative Contribution (%)']=SBE_research['Record Count']/SBE_research['Record Count'].sum()*100
research_df = pd.concat([SBE_research, BA_research, AB_research])

AB_affiliation = pd.read_csv('AB_Affiliation.txt', delimiter='\t')
BA_affiliation = pd.read_csv('BA_Affiliation.txt', delimiter='\t')
SBE_affiliation = pd.read_csv('SBE_Affiliation.txt', delimiter='\t')
AB_affiliation['Type']='Astrobiology'
BA_affiliation['Type']='Bioastronautics'
SBE_affiliation['Type']='Space Bioprocess Engineering'
AB_affiliation['Relative Contribution (%)']=AB_affiliation['Record Count']/AB_affiliation['Record Count'].sum()*100
BA_affiliation['Relative Contribution (%)']=BA_affiliation['Record Count']/BA_affiliation['Record Count'].sum()*100
SBE_affiliation['Relative Contribution (%)']=SBE_affiliation['Record Count']/SBE_affiliation['Record Count'].sum()*100
affiliation_df = pd.concat([SBE_affiliation, BA_affiliation, AB_affiliation])


#providing colormap hexcodes generated using Canva triadic:
color_dict={'Astrobiology': '#2FD03D', 'Bioastronautics': '#3D2FD0', 'Space Bioprocess Engineering': '#D03D2F'}
color_list = list([x for x in color_dict.values()])

#sorting barplot data to include top 10 by sum of relative contribution
sorted_geo_df = geo_df.loc[geo_df['Countries/Regions'].isin(geo_df.groupby(['Countries/Regions']).sum().sort_values(by='Relative Contribution (%)', ascending=False).head(10).index)]
sorted_research_df = research_df.loc[research_df['Research Areas'].isin(research_df.groupby(['Research Areas']).sum().sort_values(by='Relative Contribution (%)', ascending=False).head(10).index)]
sorted_affiliation_df = affiliation_df.loc[affiliation_df['Affiliations'].isin(affiliation_df.groupby(['Affiliations']).sum().sort_values(by='Relative Contribution (%)', ascending=False).head(10).index)]

sorted_geo_df.head()

  sorted_geo_df = geo_df.loc[geo_df['Countries/Regions'].isin(geo_df.groupby(['Countries/Regions']).sum().sort_values(by='Relative Contribution (%)', ascending=False).head(10).index)]
  sorted_research_df = research_df.loc[research_df['Research Areas'].isin(research_df.groupby(['Research Areas']).sum().sort_values(by='Relative Contribution (%)', ascending=False).head(10).index)]
  sorted_affiliation_df = affiliation_df.loc[affiliation_df['Affiliations'].isin(affiliation_df.groupby(['Affiliations']).sum().sort_values(by='Relative Contribution (%)', ascending=False).head(10).index)]


Unnamed: 0,Countries/Regions,Record Count,% of 299,Type,Relative Contribution (%),"% of 1,109","% of 13,566"
0,USA,120.0,40.134,Space Bioprocess Engineering,32.432432,,
1,PEOPLES R CHINA,47.0,15.719,Space Bioprocess Engineering,12.702703,,
2,GERMANY,44.0,14.716,Space Bioprocess Engineering,11.891892,,
3,ITALY,34.0,11.371,Space Bioprocess Engineering,9.189189,,
4,FRANCE,20.0,6.689,Space Bioprocess Engineering,5.405405,,


# Line Chart Plotting Code

In [3]:
lines = alt.Chart(year_df).mark_line(opacity=0.75).encode(
    x='Publication Years:N',
    y=alt.Y('Record Count:Q', scale=alt.Scale(type='log', domain=[1, 20000], clamp=True, nice=False)),
    color=alt.Color('Type', legend=None),
    strokeWidth=alt.value(4),
).properties(
    width=350, 
    height=350,
    title='History of the Space Bioscience Literature',
).configure_axis(
    grid=False,
    tickCount=10,
    labelFontSize=12,
    titleFontSize=18,
    titleFont='roboto',
    labelFont='roboto',
    labelOverlap=True,
    tickWidth=1.5,
    tickSize=7
).configure_legend(
    strokeColor='gray',
    labelFontSize=14,
    titleFontSize=16,
    orient='top-left',
    padding=10,
    cornerRadius=10,
    fillColor='#f1f1f1',
    labelLimit=0
).configure_title(
    fontSize=20,
    font='roboto'
).configure_range(
    category=color_list
)
lines

  for col_name, dtype in df.dtypes.iteritems():


# Barplot Code

In [4]:
def plot_bars(data, facet_height=350, width=350, font='roboto', x_param="Relative Contribution (%):Q", 
              y_param='Type:N', color_param='Type:N', row_param='Affiliations:N', text_param='Record Count:Q',
              domain=[0, 30], dx=2):
    bars = alt.Chart().mark_bar().encode(
    x=alt.X(x_param, axis=alt.Axis(tickSize=12), scale=alt.Scale(domain=domain)),
    y=alt.Y(y_param, axis=alt.Axis(labels=False, ticks=False, title=None)),
    color=alt.Color(color_param, scale=alt.Scale(range=color_list), legend=None)
).properties(
    width=width, 
    height=facet_height
)
    
    text = bars.mark_text(
        align='left',
        baseline='middle',
        dx=dx,  # Nudges text to right so it doesn't appear on top of the bar
        fontSize=10,
        font=font
    ).encode(
        text='Record Count:Q'
    )

    chart = alt.layer(bars+text).facet(
        data=data, facet=alt.Facet(row_param, sort=list(data.groupby(row_param[:-2]).sum().sort_values(by='Relative Contribution (%)', ascending=False).index)),
        columns=1
    ).configure_axis(
        grid=False,
        tickCount=10,
    #     labelFontSize=0,
    #     titleFontSize=0,
        labelOverlap=False,
        labelFont=font,
        tickWidth=1.5,
        titleFont=font,
    ).configure_legend(
        strokeColor='gray',
        labelFontSize=14,
        labelFont=font,
        titleFont=font,
        titleFontSize=12,
        orient='top-left',
        padding=10,
        cornerRadius=10,
        fillColor='#f1f1f1',
        labelLimit=0
    ).configure_title(
        fontSize=20,
        font=font
    ).configure_facet(
        spacing=2
    ).configure_header(
        titleFontSize=10,
        labelFontSize=10,
        titleFont=font,
        labelFont=font,
        labelAngle=0,
        labelAlign='right',
        labelOrient='right'
    )

    return chart

In [5]:
#use plotting function to create JS plot objects
geo_plot = plot_bars(sorted_geo_df, facet_height=35, width=264, row_param='Countries/Regions:N', domain=[0, 50], dx=0.5)
research_plot = plot_bars(sorted_research_df, facet_height=35, width=264, row_param='Research Areas:N', domain=[0, 30])
affiliation_plot = plot_bars(sorted_affiliation_df, facet_height=35, width=264, domain=[0, 6.5])

  data=data, facet=alt.Facet(row_param, sort=list(data.groupby(row_param[:-2]).sum().sort_values(by='Relative Contribution (%)', ascending=False).index)),
  data=data, facet=alt.Facet(row_param, sort=list(data.groupby(row_param[:-2]).sum().sort_values(by='Relative Contribution (%)', ascending=False).index)),
  data=data, facet=alt.Facet(row_param, sort=list(data.groupby(row_param[:-2]).sum().sort_values(by='Relative Contribution (%)', ascending=False).index)),


# Save High-Res Plots

note that the following cell requires the use of the altair_saver package, which itself requires a working installation of js+react

In [6]:
research_plot.save('research_bars_final.png', scale_factor=4)
affiliation_plot.save('affiliation_bars_final.png', scale_factor=4)
geo_plot.save('geo_bars_final.png', scale_factor=4)
lines.save('space_bioscience_time_series.png', scale_factor=4)

  for col_name, dtype in df.dtypes.iteritems():
  for col_name, dtype in df.dtypes.iteritems():
