# Charts & Distributions of Microbiome dataset in PubMed

## Load pre-calculated data

In [None]:
import joblib

In [None]:
# Pre-computed country mention counters: `country_counter` feeds the "Text" views and
# `country_counter_affil` the "Affiliation" views below. Their keys are later unpacked
# as (iso_3, name, continent code) triples when the chart data is prepared.
# NOTE(review): joblib.load unpickles its input, so only load files from a trusted source.
country_counter = joblib.load('./data/country_counter.pkl')
country_counter_affil = joblib.load('./data/country_counter_affil.pkl')

## Metadata

1. Abbreviations

In [None]:
# Two-letter continent codes -> display names used as chart labels.
continents_abbr = dict(
    AS='Asia',
    AF='Africa',
    EU='Europe',
    OC='Oceania',
    NA='North America',
    SA='South America',
    AN='Antarctica',
)
# Income-group codes -> display labels used as chart labels.
income_level_abbr = dict(
    HIC='High income',
    INX='Not classified',
    LIC='Low income',
    LMC='Lower middle income',
    LMY='Low & middle income',
    MIC='Middle income',
    UMC='Upper middle income',
)

2. Colors

In [None]:
# Display name -> hex colour. Insertion order matters: the scatter chart below
# enumerates these mappings to rank its legend entries.
continent_color_mapper = {
    continents_abbr[code]: hex_color
    for code, hex_color in (
        ("AS", "#1b9e77"),
        ("AF", "#d95f02"),
        ("EU", "#7570b3"),
        ("OC", "#e7298a"),
        ("NA", "#66a61e"),
        ("SA", "#e6ab02"),
        ("AN", "#a6761d"),
    )
}
# Only the five income groups listed here get a colour ("LMY" and "MIC" have none).
income_color_mapper = {
    income_level_abbr[code]: hex_color
    for code, hex_color in (
        ("HIC", "#006400"),
        ("UMC", "#9400d3"),
        ("LMC", "#ff7f00"),
        ("LIC", "#dc143c"),
        ("INX", "#7a7a7a"),
    )
}

3. Shapes

In [None]:
# Plotly marker symbol names, see:
# https://plotly.com/python/marker-style/#custom-marker-symbols
# Display name -> marker symbol, one distinct symbol per continent.
continent_shape_mapper = {
    continents_abbr[code]: symbol
    for code, symbol in {
        "AS": "circle",
        "AF": "square",
        "EU": "diamond",
        "OC": "triangle-left",
        "NA": "triangle-up",
        "SA": "triangle-down",
        "AN": "triangle-right",
    }.items()
}
# Display name -> marker symbol for the five income groups that also have a colour.
income_shape_mapper = {
    income_level_abbr[code]: symbol
    for code, symbol in {
        "HIC": "circle",
        "UMC": "triangle-up",
        "LMC": "triangle-down",
        "LIC": "square",
        "INX": "pentagon",
    }.items()
}

4. Country Income info

In [None]:
import json

# Lookup used below to map an ISO-3 key to its income-group code; parsed directly
# from the open file handle.
with open('./data/country_income.json', 'r', encoding='utf-8') as f:
    inc = json.load(f)

## Prepare data for charts

In [None]:
from collections import defaultdict
from itertools import chain

import numpy as np
import pandas as pd

In [None]:
# Build one row per country plus running totals per continent and per income level.
# Every total is a length-2 vector: [mentions in text, mentions in affiliation].
country_data = []
continent_data, income_data = defaultdict(lambda: np.zeros(2)), defaultdict(lambda: np.zeros(2))
# Union of the keys of both counters. Iterating it in sorted order keeps the row order
# of the resulting table stable between kernel runs (plain set iteration over string
# tuples depends on hash randomisation). `str()` keeps the sort safe for odd key parts.
all_keys = set(chain(country_counter.keys(), country_counter_affil.keys()))
for key in sorted(all_keys, key=lambda k: tuple(str(part) for part in k)):
    iso_3, name, cont = key
    # A country present in only one counter counts as zero mentions in the other;
    # `.get` makes that explicit and also works when the counters are plain dicts.
    data = np.array([country_counter.get(key, 0), country_counter_affil.get(key, 0)])
    continent_data[continents_abbr[cont]] += data
    # Countries without an entry in the income lookup are labelled "Not classified".
    income_level = income_level_abbr[inc.get(iso_3, 'INX')]
    income_data[income_level] += data
    country_data.append([iso_3, name, continents_abbr[cont], income_level, *data])

In [None]:
def _hover_label(row):
    """Render one table row as `<b>column:</b> value` pairs separated by `<br>`."""
    return "<br>".join(f"<b>{k}:</b> {v}" for k, v in row.to_dict().items())


# Per-country table; the column order mirrors the lists appended to `country_data`.
cdf_columns = ['iso3', 'name', 'continent', 'income_level', 'mention_text', 'mention_affil']
cdf = pd.DataFrame(country_data, columns=cdf_columns)
# Hover text and marker colour/symbol arrays, each aligned with the rows of `cdf`.
texts = cdf.apply(_hover_label, axis=1).values
income_colors = cdf['income_level'].map(income_color_mapper.__getitem__).values
continent_colors = cdf['continent'].map(continent_color_mapper.__getitem__).values
income_shapes = cdf['income_level'].map(income_shape_mapper.__getitem__).values
continent_shapes = cdf['continent'].map(continent_shape_mapper.__getitem__).values

## Charts

In [None]:
import plotly.offline as py
import plotly.graph_objects as go
from ipywidgets import Dropdown, interact, interact_manual

# Initialise plotly's offline notebook mode once, before the `py.iplot` calls below.
py.init_notebook_mode()

### Geo Distribution

In [None]:
@interact(by=Dropdown(options=["Text", "Affiliation"], value="Text", description="Mentions in:"),
          scale=Dropdown(options=['Log', 'Linear'], value="Log", description="Scale: "))
def plot_world_map(by, scale=np.log):
    """Draw a world choropleth of country mention counts.

    Parameters
    ----------
    by : str
        "Text" plots `country_counter`, "Affiliation" plots `country_counter_affil`.
        Any other value prints a message and returns without plotting.
    scale : str or callable
        "Log" colours countries by the natural log of their count; any other string
        (the dropdown offers "Linear") uses the raw count. The signature default
        `np.log` is accepted as a synonym for "Log", so calling the function without
        `scale` matches the dropdown's default.
    """
    if by == 'Text':
        counter = country_counter
    elif by == 'Affiliation':
        counter = country_counter_affil
    else:
        print("`Mentions in` must be in {'Text', 'Affiliation'}")
        return
    # Identity check first: comparing the ufunc itself against a string is never true.
    use_log = scale is np.log or scale == "Log"
    # Counter keys are (iso_3, name, continent) triples; the map is keyed by name.
    countries = [name for _, name, _ in counter.keys()]
    counts = np.array(list(counter.values()))
    if use_log:
        # log(0) is undefined: compute quietly and mark such countries as missing
        # (NaN) instead of emitting a RuntimeWarning and passing -inf to plotly.
        with np.errstate(divide='ignore', invalid='ignore'):
            z = np.log(counts.astype(float))
        z[~np.isfinite(z)] = np.nan
    else:
        z = counts
    # Source: https://stackoverflow.com/questions/62566708/plotly-highlight-identify-certain-countries-in-choropleth
    trace = go.Choropleth(
        locations=countries,
        locationmode='country names',
        colorscale='Reds',  # See: https://plotly.com/python/reference/choropleth/#choropleth-colorscale
        z=z,
        # Tell the reader whether the colour bar shows raw or log-transformed counts.
        colorbar=dict(title=dict(text='ln(mentions)' if use_log else 'Mentions')),
    )
    wmap = go.Figure(data=[trace])
    py.iplot(wmap)

### Country mentions: Text vs. Affiliation

In [None]:
@interact(color_by=Dropdown(options=["Continent", "Income level"], value="Income level", description="Color By:"),
          scale=Dropdown(options=['Log', 'Linear'], value="Log", description="Scale: "))
def mentions(color_by, scale=np.log):
    """Scatter each country's mentions in text (x) against mentions in affiliation (y).

    Parameters
    ----------
    color_by : str
        "Continent" or "Income level"; selects the marker colours and the legend.
        Any other value prints a message and returns without plotting.
    scale : str or callable
        "Log" plots the natural log of both counts; any other string (the dropdown
        offers "Linear") plots raw counts. The signature default `np.log` is accepted
        as a synonym for "Log".

    Notes
    -----
    In log mode a country with zero mentions on either axis has no finite coordinate
    and is therefore not drawn.
    """
    if color_by == "Continent":
        color_mapper = continent_color_mapper
        colors = continent_colors
    elif color_by == "Income level":
        color_mapper = income_color_mapper
        colors = income_colors
    else:
        print("`Color By` should be in {'Continent', 'Income level'}")
        return
    # Identity check first: comparing the ufunc itself against a string is never true.
    use_log = scale is np.log or scale == 'Log'
    # Legend entries: one empty trace per category, ranked in mapper order.
    traces = [go.Scatter(x=[None], y=[None], name=k, mode="markers",
                         marker=dict(symbol='square', color=v, size=10),
                         legendrank=idx)
              for idx, (k, v) in enumerate(color_mapper.items())]
    countries = cdf['name'].values
    x = cdf['mention_text'].values
    y = cdf['mention_affil'].values
    if use_log:
        # The table holds the union of both counters, so zero counts are expected.
        # Take the log quietly and turn the resulting -inf into NaN (not drawn).
        with np.errstate(divide='ignore', invalid='ignore'):
            x = np.log(x.astype(float))
            y = np.log(y.astype(float))
        x[~np.isfinite(x)] = np.nan
        y[~np.isfinite(y)] = np.nan
    traces.append(go.Scatter(x=x, y=y, mode='markers', text=countries, textposition='top right',
                             marker=dict(color=colors),
                             showlegend=False)
                 )
    # https://plotly.com/python/line-and-scatter/
    fig = go.Figure(data=traces)
    fig.update_layout(legend_traceorder='normal')
    # Make the transformation visible on the axes so logged values are not read as counts.
    suffix = ' (natural log)' if use_log else ''
    fig.update_xaxes(title_text='Mentions in <b>Text</b>' + suffix)
    fig.update_yaxes(title_text='Mentions in <b>Affiliation</b>' + suffix)
    py.iplot(fig)

### Distribution based on `Continent` & `Income level`

In [None]:
@interact(by=Dropdown(options=["Continent", "Income level"], value="Continent", description="By:"))
def plot_bars(by):
    """Show grouped bars of text vs. affiliation mention totals for the chosen grouping.

    `by` selects the per-continent or per-income-level totals; any other value prints
    a message and returns without plotting.
    """
    totals_by_choice = {"Continent": continent_data, "Income level": income_data}
    if by not in totals_by_choice:
        print("`By` should be in {'Continent', 'Income level'}")
        return
    totals = totals_by_choice[by]
    labels = list(totals.keys())
    # One row per category; column 0 holds text mentions, column 1 affiliation mentions.
    matrix = np.array(list(totals.values()))
    bars = [
        go.Bar(name=series_name, x=labels, y=matrix[:, column])
        for column, series_name in enumerate(('In Text', 'In Affiliation'))
    ]
    fig_cc = go.Figure(data=bars)
    fig_cc.update_layout(barmode='group')
    # Order categories by their combined bar height, largest first.
    # Source: https://community.plotly.com/t/plotly-express-histogram-any-way-to-sort-bar-by-value/23905/10
    # https://plotly.com/python/reference/layout/xaxis/#layout-xaxis-categoryorder
    fig_cc.update_xaxes(categoryorder='total descending')

    py.iplot(fig_cc)

### Dimension Reduction

In [None]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from umap import UMAP

In [None]:
from ipywidgets import FloatRangeSlider

# Random jitter added to the 2-D embedding, as a fraction of each axis' value range.
# A relative amount keeps overlapping points separable without swamping embeddings
# whose coordinates are small (PCA on 0..1 features) as a fixed offset would.
_JITTER_FRACTION = 0.05


def _normalize_mentions(counts, normalization):
    """Rescale one column of mention counts; return None for an unknown scheme."""
    if normalization not in ('minmax', 'gaussian', 'fraction'):
        return None
    counts = np.asarray(counts, dtype=float)
    if counts.size == 0:
        return counts
    if normalization == 'minmax':
        offset, spread = counts.min(), counts.max() - counts.min()
    elif normalization == 'gaussian':
        # ddof=1 is the sample standard deviation, i.e. what pandas' `.std()` returns.
        offset, spread = counts.mean(), (counts.std(ddof=1) if counts.size > 1 else 0.0)
    else:  # 'fraction'
        offset, spread = 0.0, counts.max()
    if not np.isfinite(spread) or spread == 0:
        # Constant column: avoid 0/0, which would feed NaN into the reducers.
        return np.zeros_like(counts)
    return (counts - offset) / spread


def _embed_2d(vecs, method):
    """Project `vecs` to two jittered coordinates; return None for an unknown method."""
    reducers = {'tsne': TSNE, 'pca': PCA, 'umap': UMAP}
    if method not in reducers:
        return None
    fitted = np.asarray(reducers[method]().fit_transform(vecs), dtype=float)
    axis_span = np.ptp(fitted, axis=0)
    fitted = fitted + np.random.rand(*fitted.shape) * _JITTER_FRACTION * axis_span
    return fitted[:, 0], fitted[:, 1]


def _marker_sizes(raw, size_range):
    """Map non-negative raw values to marker sizes in `size_range` on a log scale."""
    raw = np.asarray(raw, dtype=float)
    low, high = size_range
    if raw.size == 0:
        return raw
    with np.errstate(divide='ignore', invalid='ignore'):
        logged = np.log(raw)
    finite = np.isfinite(logged)
    # Zero (or undefined) values get the smallest marker: 0.0 when every finite log is
    # non-negative (counts), otherwise the most negative finite log (ratios below 1).
    floor = min(0.0, logged[finite].min()) if finite.any() else 0.0
    logged = np.where(finite, logged, floor)
    spread = logged.max() - logged.min()
    if spread == 0:
        # All values equal: min-max scaling would divide by zero.
        return np.full(logged.shape, float(low))
    return (logged - logged.min()) / spread * (high - low) + low


@interact_manual(method=Dropdown(options=['tsne', 'pca', 'umap'], value="tsne",
                                 description="Method"),
                 normalization=Dropdown(options=["minmax", "gaussian", "fraction"], value="minmax",
                                        description="Normalization"),
                 size_by=Dropdown(options=["text", "affiliation", "frac_per_text", "frac_per_affil"],
                                  value="text", description="Size by"),
                 size_range=FloatRangeSlider(min=1.0, max=50, value=[3., 20], readout_format='.1f',
                                             description="Size Range:"),
                 continent_as_shape=Dropdown(options=[('Income as Color & Continent as Shape', True),
                                                      ('Continent as Color & Income as Shape', False)],
                                             value=True, description="Color Vs Shape:")
                )
def dimension_reduction(method='tsne', normalization='norm', size_by='text', size_range=[3, 20],
                        continent_as_shape=True):
    """Embed the countries in 2-D and draw them as a scatter plot.

    Each country is described by its two normalised mention counts plus one-hot
    encodings of its income level and continent; that vector is reduced to two
    dimensions and slightly jittered so overlapping countries stay visible.

    Parameters
    ----------
    method : str
        'tsne', 'pca' or 'umap'.
    normalization : str
        'minmax', 'gaussian' or 'fraction'. The signature default 'norm' is treated
        as 'minmax', which is the widget's default.
    size_by : str
        'text' or 'affiliation' (raw counts), 'frac_per_text' (affiliation / text)
        or 'frac_per_affil' (text / affiliation). Sizes are log-scaled.
    size_range : sequence of two numbers
        Smallest and largest marker size.
    continent_as_shape : bool
        True: income level as colour and continent as symbol; False: the reverse.

    An unrecognised `method`, `normalization` or `size_by` prints a message and
    returns without plotting. The global `cdf` table is only read, never modified.
    The jitter (and t-SNE/UMAP themselves) are not seeded, so layouts differ per run.
    """
    if normalization == 'norm':
        normalization = 'minmax'
    mt = cdf['mention_text'].values.astype(float)
    ma = cdf['mention_affil'].values.astype(float)

    text_norm = _normalize_mentions(mt, normalization)
    affil_norm = _normalize_mentions(ma, normalization)
    if text_norm is None or affil_norm is None:
        print(f'Normalization `{normalization}` is not recognized')
        return

    # Validate the size option before running the (potentially slow) embedding.
    with np.errstate(divide='ignore', invalid='ignore'):
        size_sources = {
            'text': mt,
            'affiliation': ma,
            'frac_per_text': ma / mt,
            'frac_per_affil': mt / ma,
        }
    if size_by not in size_sources:
        print(f'Size by `{size_by}` is not recognized')
        return
    # Undefined ratios (0/0 or x/0) count as zero.
    sz = np.nan_to_num(size_sources[size_by], nan=0.0, posinf=0.0)

    inc_one_hot = pd.get_dummies(cdf['income_level']).values.astype(float)
    cont_one_hot = pd.get_dummies(cdf['continent']).values.astype(float)
    vecs = np.column_stack((text_norm, affil_norm, inc_one_hot, cont_one_hot))

    embedded = _embed_2d(vecs, method)
    if embedded is None:
        print(f'Method `{method}` is not recognized')
        return
    x, y = embedded

    # Shapes & colors
    if continent_as_shape:
        shapes = continent_shapes
        colors = income_colors
        shape_mapper = continent_shape_mapper
        color_mapper = income_color_mapper
    else:
        shapes = income_shapes
        colors = continent_colors
        shape_mapper = income_shape_mapper
        color_mapper = continent_color_mapper
    sizes = _marker_sizes(sz, size_range)
    # https://plotly.com/python/line-and-scatter/
    fig = go.Figure(data=[go.Scatter(x=x, y=y, mode='markers', text=texts,
                                     marker_symbol=shapes,
                                     marker=dict(
                                         color=colors,
                                         size=sizes,
                                         opacity=0.7
                                     ),
                                     showlegend=False,
                                     hoverinfo='text',
                                    ),

                          # Add legends: https://community.plotly.com/t/plotly-express-how-to-separate-symbol-and-color-in-legend/38950/2
                          # Colors
                          *[go.Scatter(x=[None], y=[None], name=k, mode="markers", legendgroup="color",
                                       legendgrouptitle_text="Colors:",
                                       marker=dict(symbol='square', color=v, size=10))
                            for k, v in color_mapper.items()],
                          # Shapes
                          *[go.Scatter(x=[None], y=[None], name=k, mode="markers", legendgroup="shape",
                                       legendgrouptitle_text="Shapes:",
                                       marker=dict(symbol=v, color='#000000', size=10))
                            for k, v in shape_mapper.items()]
                         ],
                   # Opaque white background; the rgba alpha channel ranges from 0 to 1.
                   layout=go.Layout(paper_bgcolor='rgba(255, 255, 255, 1)',
                                    plot_bgcolor='rgba(255, 255, 255, 1)')
                   )
    # Remove axes: embedding coordinates carry no interpretable units.
    fig.update_xaxes(visible=False)
    fig.update_yaxes(visible=False)
    py.iplot(fig)