In [None]:
!pip install sparklines > /dev/null

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
#import networkx as nx
#from networkx.readwrite import json_graph
import csv
import json
from pandas.api.types import is_string_dtype, CategoricalDtype
import warnings
import sys
import os
#from sklearn.preprocessing import MinMaxScaler
import datetime
from pandas.tseries.offsets import MonthBegin
from operator import attrgetter
import sparklines
import base64
import copy
from itertools import combinations
from io import BytesIO
from IPython.display import display, HTML, Image
import matplotlib.pyplot as plt
from plotly.subplots import make_subplots
sys.path.append('/kaggle/input/custom-survey-scripts-2021/')
import kaggle_survey_2021 as show

# Notebook-wide display settings: show every column, up to 5000 rows, and
# never truncate cell text. The exact lower-case option names are used because
# pandas only resolves other spellings through a case-insensitive regex
# fallback, which its documentation warns may break when new options are added.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_colwidth', None)

# path_2021 = r'../input/kaggle-survey-2021'
# path_2020 = r'../input/kaggle-survey-2020'
# path_2019 = r'../input/kagglesurvey2019'
# path_2018 = r'../input/kaggle-survey-2018'

In [None]:
# Load the survey responses through the helper module imported above as `show`
# (kaggle_survey_2021). Presumably a single cleaned frame covering every survey
# year, with a 'Survey' column identifying the year -- TODO confirm in the helper.
cleaned_mcr = show.load_cleaned_data()

In [None]:
def data_for_sparkline(df, grp_var, max_time='2021',
                       min_time='2018',
                       agg_var=None,
                       is_pct=False,
                       is_univariate=False,
                       is_bivariate=False,
                       period=3,
                       cust_sparkline=True):
    """
    Build a wide trend table: one row per category, one column per time
    period, plus a ``trend`` sparkline column and a ``growth`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format data containing every column named in ``grp_var``
        (and ``agg_var`` when given).
    grp_var : list of str
        Grouping columns. The LAST entry is the time column (callers in this
        notebook pass ``'Survey'``); the preceding entries become the row index.
    max_time, min_time : str
        Labels of the time columns (after casting to ``str``) used as the
        numerator and denominator of the growth ratio.
    agg_var : str, optional
        Column to aggregate. When omitted, rows are counted per group.
        When given, the value is the group mean, or - with ``is_pct`` - the
        group sum converted to a share.
    is_pct : bool
        Convert values to shares. The denominator is the total over the first
        index level (univariate) or the first two index levels (bivariate),
        i.e. over all time periods of that category.
    is_univariate, is_bivariate : bool
        Select the mode; univariate wins when both are set.
    period : int
        Number of periods used as the root in the compound growth formula.
    cust_sparkline : bool
        ``True`` renders the trend with ``custom_sparkline`` (HTML image tag);
        ``False`` uses the text sparkline from the ``sparklines`` package.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``grp_var[:-1]`` with one column per time value, then
        ``trend`` and ``growth``.

    Raises
    ------
    ValueError
        If neither ``is_univariate`` nor ``is_bivariate`` is set.
    KeyError
        If ``max_time`` or ``min_time`` is not among the time values.
    """
    if not (is_univariate or is_bivariate):
        raise ValueError('Either is_univariate or is_bivariate must be True')

    time_col = grp_var[-1]
    # Index levels that define "the whole" when computing shares.
    share_levels = [0] if is_univariate else [0, 1]

    grouped = df.groupby(grp_var)
    if not agg_var:
        g = grouped.size()
    elif is_pct:
        g = grouped.agg({agg_var: 'sum'})
    else:
        g = grouped.agg({agg_var: 'mean'})

    if is_pct:
        # transform() keeps the original index, unlike groupby().apply(),
        # which prepends the group keys on pandas >= 2.0 and then makes
        # reset_index() fail on the duplicated level name.
        g = g / g.groupby(level=share_levels).transform('sum')
    g = g.reset_index()

    # String labels so the time columns can be addressed as '2018', '2021', ...
    g[time_col] = g[time_col].astype(str)
    g = g.pivot_table(index=grp_var[:-1], columns=time_col, fill_value=0)
    g.columns = g.columns.droplevel()  # drop the value-name level, keep the time labels
    g = g.rename_axis(None, axis=1)    # remove the leftover columns-axis name

    # The trend must be computed before 'growth' is added so that only the
    # time columns feed the sparkline.
    if cust_sparkline:
        g['trend'] = g.apply(custom_sparkline, axis=1)
    else:
        g['trend'] = g.apply(lambda x: sparklines.sparklines(x)[0], axis=1)

    # Compound growth per period; a zero start (inf) or 0/0 (NaN) is shown as 0.
    g['growth'] = np.round((g[max_time] / g[min_time]) ** (1/period) - 1, 2)
    g['growth'] = g['growth'].replace(np.inf, 0).replace(np.nan, 0)
    return g

In [None]:
def highlight_table(row, threshold=.05):
    """
    Return one CSS style string per cell of ``row`` for use with a pandas
    Styler: light red for values ``<= threshold``, green otherwise.

    Parameters
    ----------
    row : sequence or pandas.Series
        The cell values of one row (or column).
    threshold : float
        Cells less than or equal to this value get the red style.

    Returns
    -------
    list of str or None
        ``None`` when the first cell is a string (the row is treated as
        non-numeric and left unstyled), ``[]`` for an empty row, otherwise
        one style string per cell.
    """
    # Materialise the values so the first cell is read by POSITION; row[0] on
    # a Series is a label lookup and fails for string-labelled columns.
    cells = list(row)
    if not cells:
        return []
    if isinstance(cells[0], str):
        return None
    return [
        'background-color: #FF7F7F; color: white' if cell <= threshold
        else 'background-color: green; color: white'
        for cell in cells
    ]

In [None]:
def custom_sparkline(data, figsize=(3, 0.25), **kwags):
    """
    Render a sequence of numbers as a tiny axis-free line chart and return it
    as an inline HTML image.

    Adapted from
    https://github.com/iiSeymour/sparkline-nb/blob/master/sparkline-nb.ipynb

    Parameters
    ----------
    data : iterable of numbers
        Values to plot, in order; materialised with ``list``.
    figsize : tuple
        Figure size in inches, passed to ``plt.subplots``.
    **kwags
        Extra keyword arguments forwarded to ``plt.subplots``.

    Returns
    -------
    str
        An ``<img>`` tag whose source is a base64-encoded PNG data URI.
        Display it with ``HTML(df.to_html(escape=False))`` or a Styler.

    Raises
    ------
    IndexError
        If ``data`` is empty, because there is no last point to mark.
    """
    data = list(data)
    fig, ax = plt.subplots(1, 1, figsize=figsize, **kwags)
    try:
        ax.plot(data)
        for spine in ax.spines.values():
            spine.set_visible(False)

        ax.set_xticks([])
        ax.set_yticks([])

        # Mark the most recent value with a red dot.
        ax.plot(len(data) - 1, data[-1], 'r.', linewidth=2)

        img = BytesIO()
        # Pin the format: the data URI below declares image/png, so the
        # output must not depend on the user's savefig.format setting.
        fig.savefig(img, format='png', transparent=True, dpi=100, bbox_inches='tight')
    finally:
        # Always release the figure; this function is applied once per table
        # row and leaked figures accumulate in the kernel.
        plt.close(fig)

    return f'<img src="data:image/png;base64,{base64.b64encode(img.getvalue()).decode()}"/>'

In [None]:
def subset_sparkline_data(df, obs_var, time='Survey',
                          agg_var=None,
                          is_univariate=False,
                          is_bivariate=False,
                          is_pct=False,
                          max_time='2021',
                          min_time='2018',
                          period=3
                         ):
    """
    Build the trend table for one dimension (univariate) or a list of
    dimensions (bivariate) and report which columns hold the per-period values.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format source data.
    obs_var : str or list of str
        A single column name in univariate mode, a list of column names in
        bivariate mode.
    time : str
        Name of the time column; appended as the last grouping key passed to
        ``data_for_sparkline``.
    agg_var, is_pct, max_time, min_time, period
        Forwarded unchanged to ``data_for_sparkline``.
    is_univariate, is_bivariate : bool
        Select the mode; univariate wins when both are set.

    Returns
    -------
    (pandas.DataFrame, list)
        The table returned by ``data_for_sparkline`` (always built with the
        text sparkline, ``cust_sparkline=False``) and the list of its columns
        other than ``'trend'`` and ``'growth'``.

    Raises
    ------
    ValueError
        If neither ``is_univariate`` nor ``is_bivariate`` is set.
    """
    if not (is_univariate or is_bivariate):
        raise ValueError('Either is_univariate or is_bivariate must be True')

    # Only the construction of the grouping keys differs between the modes.
    if is_univariate:
        grp_var = [obs_var] + [time]
    else:
        grp_var = list(obs_var) + [time]

    subset_df = data_for_sparkline(df, grp_var,
                                   max_time=max_time,
                                   min_time=min_time,
                                   agg_var=agg_var,
                                   is_pct=is_pct,
                                   is_univariate=is_univariate,
                                   is_bivariate=is_bivariate,
                                   period=period,
                                   cust_sparkline=False)
    # Columns that hold per-period values, i.e. the ones to colour as a gradient.
    grad_cols = subset_df.columns.drop(['trend', 'growth']).tolist()
    return subset_df, grad_cols

In [None]:
def _style_trend_table(table, grad_cols, title, value_format=None):
    """Apply the shared caption, number formats, padding and colour gradients to a trend table."""
    styler = table.style.format('{:.1%}', subset=['growth'])
    if value_format is not None:
        styler = styler.format(value_format, subset=grad_cols)
    return (
        styler
        .set_table_styles([{
            'selector': 'caption',
            'props': [
                ('font-size', '16px')
            ]
        }])
        .set_caption(title)
        .set_properties(padding='10px', border='2px solid white')
        # Row-wise gradient compares periods within a category; the growth
        # gradient compares categories with each other.
        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)
        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)
    )


def plot_pandas_table(df, obs_var,
                      is_pct=False,
                      is_univariate=False,
                      is_bivariate=False,
                      sort_field='growth',
                      agg_var=None,
                      filter_dim=None,
                      filter_list=[],
                      top_n=5,
                      asc=False,
                      min_time='2018',
                      period=3,
                      is_profession=False,
                      title=None):
    """
    Build the trend table for a dimension via ``subset_sparkline_data`` and
    return it as a styled pandas table.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format source data.
    obs_var : str or list of str
        Column name (univariate) or list of column names (bivariate).
    is_pct : bool
        With ``agg_var`` in univariate mode: show shares (``'{:.1%}'``)
        instead of dollar amounts (``'${:,.0f}'``).
    is_univariate, is_bivariate : bool
        Select the mode; univariate is checked first.
    sort_field : str
        Column to sort by (descending within each first-level group in
        bivariate mode).
    agg_var : str, optional
        Column to aggregate; when omitted, responses are counted.
    filter_dim
        Unused; kept for call compatibility.
    filter_list : list
        Univariate dollar tables: values of ``obs_var`` to EXCLUDE.
        Bivariate tables: values of ``obs_var[0]`` to KEEP, or - in the
        ``is_profession`` variants - values masked out of the whole frame.
        Never mutated.
    top_n : int
        Number of rows to keep (per first-level group in bivariate mode).
        ``0`` keeps every row.
    asc : bool
        Sort direction for ``sort_field`` in univariate mode. The univariate
        count table with ``top_n == 0`` always sorts descending.
    min_time, period
        Forwarded to ``subset_sparkline_data``.
    is_profession : bool
        Enables the bivariate variants that mask ``filter_list`` values
        instead of filtering rows on ``obs_var[0]``.
    title : str, optional
        Table caption.

    Returns
    -------
    pandas Styler or None
        ``None`` when the flag combination matches no supported table, e.g.
        bivariate counts with ``top_n > 0`` and ``is_profession`` unset.
    """
    if is_univariate:
        if agg_var:
            source = df
            if not is_pct and len(filter_list) > 0:
                source = df[~df[obs_var].isin(filter_list)]
            trend_df, grad_cols = subset_sparkline_data(source, obs_var,
                                                        agg_var=agg_var,
                                                        is_pct=is_pct,
                                                        is_univariate=is_univariate,
                                                        min_time=min_time,
                                                        period=period)
            value_format = '{:.1%}' if is_pct else '${:,.0f}'
            ascending = [asc, True]
        elif top_n >= 0:
            trend_df, grad_cols = subset_sparkline_data(df, obs_var,
                                                        is_univariate=is_univariate,
                                                        min_time=min_time,
                                                        period=period)
            value_format = None
            # The show-everything count table has always been sorted descending.
            ascending = [False, True] if top_n == 0 else [asc, True]
        else:
            return None

        ordered = trend_df.sort_values([sort_field, obs_var], ascending=ascending, kind='mergesort')
        # top_n == 0 means "all rows"; slicing with [:0] would return an empty table.
        if top_n != 0:
            ordered = ordered[:top_n]
        return _style_trend_table(ordered, grad_cols, title, value_format)

    if is_bivariate:
        if (top_n == 0 and not agg_var) or (top_n > 0 and agg_var):
            # `@filter_list` is resolved by DataFrame.query from this function's locals.
            source = df.query(f'{obs_var[0]} == @filter_list').copy()
        elif is_profession and top_n >= 0:
            # Masks matching cells to NaN anywhere in the frame; groupby then drops them.
            source = df[~df.isin(filter_list)]
        else:
            return None

        # NOTE(review): agg_var selects the branch above but is not forwarded,
        # so bivariate tables are always response counts - confirm intended.
        trend_df, grad_cols = subset_sparkline_data(source, obs_var,
                                                    is_bivariate=is_bivariate,
                                                    min_time=min_time,
                                                    period=period)
        ordered = trend_df.sort_values(obs_var[:1] + [sort_field], ascending=[True, False], kind='mergesort')
        # Only truncate per group when a limit is requested; a GroupBy object
        # has no `.style`, so the unlimited table is styled directly.
        if top_n > 0:
            ordered = ordered.groupby(obs_var[0]).head(top_n)
        return _style_trend_table(ordered, grad_cols, title)

    return None

In [None]:
# Display the report banner; the file is read from an attached Kaggle input dataset.
Image(filename="/kaggle/input/banner-survey/Image_20211024_065457.png")

## Kaggle trends

This is the 5th instalment of the annual Kaggle Survey. The survey had 369 questions and 25973 responses - a 3% yearly growth rate in the last 4 years.


The demographic analysis covers all participants to understand how responses have varied over the last 4 years, including 2020 and 2021, which have been extremely difficult for everyone. After the analysis of individual questions, students and professionals are compared to understand how the two groups differ in their answers to the remaining survey questions.

Styled Pandas dataframes are used to show the number or percentage of responses in each category for the years 2018-2021; this is augmented by a compound growth rate for each category over that period. The data is mostly sorted in descending order of growth, or of frequency where there is data for 2021 only. Each cell represents either a number of responses, a share of responses or a dollar amount.

## Demographic analysis

### Fastest growing Age groups

In [None]:
# Age breakdown by response count. top_n=0 selects the plot_pandas_table
# branch that keeps every category instead of truncating to a top-N list.
obs_var = 'Age'
plot_pandas_table(cleaned_mcr, obs_var, 
                  is_univariate=True, 
                  is_bivariate=False,
                  sort_field='growth', 
                  agg_var=None,
                  filter_list=[], 
                  top_n=0, 
                  title='Fastest growing Age categories - 2018-2021')

**What do we observe**

The 70+ category has been clubbed into the 70-79 category as the maximum age group is 80+ years.

One plausible explanation is that the pandemic has put a lot of stress on the economy and its people; I suspect
a lot of young folks were affected by academic disruptions and job losses, which has had a profound effect on their mental well-being. Despite the upheaval, there has been a healthy rise in participation in 2021.

1) A side effect of clubbing the 70+ category into the 70-79 category is that this group shows the highest
growth in the last 4 years. However, even if this category is ignored, there is an increase in Kagglers in the 
other older age categories, especially 50-54, 55-59, and 60-69 years.

2) On the other hand, the younger groups, barring 18-21 years, such as 22-24, 25-29 and 30-34 years, have 
seen a slight drop in growth over the last 4 years.

## Fastest growing Countries

In [None]:
# Country breakdown by response count: sort by growth descending (asc=False)
# and keep only the first 12 rows.
obs_var = 'Country'
plot_pandas_table(cleaned_mcr, obs_var, 
                  is_univariate=True, 
                  is_bivariate=False,
                  sort_field='growth', 
                  agg_var=None,
                  filter_list=[], 
                  top_n=12, 
                  asc=False,
                  title='Fastest growing Countries - 2018-2021')

In [None]:
def plot_side_by_side_tables(df,
                            obs_var, 
                            is_univariate=True, 
                            is_bivariate=False,
                            sort_field='growth', 
                            is_pct=False, 
                            agg_var=None,
                            filter_list=[],
                            filter_level=None,
                            filter_list2=[],
                            filter_list3=[], 
                            n=5, 
                            m=0, 
                            asc=False,
                            title1=None,
                            title2=None):
    """
    Display styled tables side by side
    # https://stackoverflow.com/questions/38783027/jupyter-notebook-display-two-pandas-tables-side-by-side
    """
    a, b = None, None
    if is_univariate and len(filter_list) == 0 and not agg_var and len(filter_list2) == 0:
        trend, grad_cols = subset_sparkline_data(df, obs_var, 
                                                    is_univariate=True)
        a = trend.sort_values(['growth', obs_var], ascending=[False, True], kind='mergesort')[:n].style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title1)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()
        b = trend.sort_values(['growth', obs_var], ascending=[False, True], kind='mergesort')[-n:].style\
                                                .format('{:.1%}', subset=['growth'])\
                                                .set_table_styles([{
                                                    'selector': 'caption',
                                                    'props': [
                                                        ('font-size', '16px')
                                                    ]
                                                }])\
                                                .set_table_attributes("style='display:inline'")\
                                                .set_caption(title2)\
                                                .set_properties(padding='10px', border='2px solid white')\
                                                .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                ._repr_html_() 
    if is_univariate and len(filter_list) > 0 and not agg_var and len(filter_list2) == 0:
        subset_df = df[~df[obs_var].isin(filter_list)].copy()
        trend, grad_cols = subset_sparkline_data(subset_df, obs_var, 
                                                    is_univariate=True)
        a = trend.sort_values(['growth', obs_var], ascending=[False, True], kind='mergesort')[:n].style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title1)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()
        b = trend.sort_values(['growth', obs_var], ascending=[False, True], kind='mergesort')[-n:].style\
                                                .format('{:.1%}', subset=['growth'])\
                                                .set_table_styles([{
                                                    'selector': 'caption',
                                                    'props': [
                                                        ('font-size', '16px')
                                                    ]
                                                }])\
                                                .set_table_attributes("style='display:inline'")\
                                                .set_caption(title2)\
                                                .set_properties(padding='10px', border='2px solid white')\
                                                .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                ._repr_html_() 
    elif is_univariate and agg_var and len(filter_list2) > 0 and is_pct:
        trend, grad_cols = subset_sparkline_data(df, obs_var, 
                                               agg_var=agg_var, 
                                               is_pct=is_pct,         
                                               is_univariate=is_univariate)
        a = trend.sort_values(['growth', obs_var], ascending=[False, True], kind='mergesort')[:n].style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .format('{:.1%}', subset=grad_cols)\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title1)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()

        subset_df = df[~df.isin(filter_list2)].copy()
        trend, grad_cols = subset_sparkline_data(subset_df, obs_var, 
                                                       agg_var=agg_var, 
                                                       is_pct=is_pct,         
                                                       is_univariate=is_univariate)
        b = trend.sort_values(['growth', obs_var], ascending=[False, True], kind='mergesort')[-n:].style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .format('{:.1%}', subset=grad_cols)\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title2)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()
    elif is_univariate and agg_var and len(filter_list) > 0 and not is_pct:
        subset_df = df.query(f'{obs_var} == @filter_list').copy()
        # subset_df = df[~df[obs_var].isin(filter_list)].copy()
        trend, grad_cols = subset_sparkline_data(subset_df, obs_var, 
                                               agg_var=agg_var, 
                                               is_pct=is_pct,         
                                               is_univariate=is_univariate)
        a = trend.sort_values(['growth', obs_var], ascending=[False, True], kind='mergesort')[:n].style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .format('${:,.0f}', subset=grad_cols)\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title1)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()

        b = trend.sort_values(['growth', obs_var], ascending=[False, True], kind='mergesort')[-n:].style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .format('${:,.0f}', subset=grad_cols)\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title2)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()
    elif is_univariate and agg_var and len(filter_list2) > 0 and not is_pct:
        trend, grad_cols = subset_sparkline_data(df, obs_var, 
                                               agg_var=agg_var, 
                                               is_pct=is_pct,         
                                               is_univariate=is_univariate)
        a = trend.sort_values(['growth', obs_var], ascending=[False, True], kind='mergesort')[:n].style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .format('${:,.0f}', subset=grad_cols)\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title1)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()

        subset_df = df[~df.isin(filter_list2)].copy()
        trend, grad_cols = subset_sparkline_data(subset_df, obs_var, 
                                                       agg_var=agg_var, 
                                                       is_pct=is_pct,         
                                                       is_univariate=is_univariate)
        b = trend.sort_values(['growth', obs_var], ascending=[False, True], kind='mergesort')[-n:].style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .format('${:,.0f}', subset=grad_cols)\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title2)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()
    elif is_univariate and not agg_var and len(filter_list2) > 0:
        trend, grad_cols = subset_sparkline_data(df, obs_var,         
                                               is_univariate=is_univariate)
        a = trend.sort_values(['growth', obs_var], ascending=[False, True], kind='mergesort')[:n].style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title1)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()

        subset_df = df[~df.isin(filter_list2)].copy()
        trend, grad_cols = subset_sparkline_data(subset_df, obs_var,         
                                                       is_univariate=is_univariate)
        b = trend.sort_values(['growth', obs_var], ascending=[False, True], kind='mergesort')[-m:].style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title2)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()
    elif is_bivariate and len(filter_list) > 0 and filter_level is None:
        subset_df = df.query(f'{obs_var[0]} == @filter_list').copy()
        trend, grad_cols = subset_sparkline_data(subset_df, obs_var, 
                                                       is_bivariate=True)
        a = trend.sort_values(obs_var[:1] + ['growth'], ascending=[True, False], kind='mergesort').groupby(obs_var[0])\
                                                        .head(n)\
                                                        .style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title1)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()

        b = trend.sort_values(obs_var[:1] + ['growth'], ascending=[True, False], kind='mergesort').groupby(obs_var[0])\
                                                        .tail(n)\
                                                        .style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title2)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()
    elif is_bivariate and filter_level and len(filter_list2) > 0 and len(filter_list3) > 0:
        subset_df = df.query(f'{obs_var[0]} == @filter_list').copy()
        trend, grad_cols = subset_sparkline_data(subset_df, obs_var, 
                                                       is_bivariate=True)
        a = trend.sort_values(obs_var[:1] + ['growth'], ascending=[True, False], kind='mergesort').groupby(obs_var[0])\
                                                        .head(n)\
                                                        .style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title1)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()

        subset_df = df.query(f'{obs_var[0]} == @filter_list2').copy()
        subset_df = subset_df[~subset_df.isin(filter_list3)].copy()
        trend, grad_cols = subset_sparkline_data(subset_df, obs_var, 
                                                       is_bivariate=True)
        b = trend.sort_values(obs_var[:1] + ['growth'], ascending=[True, False], kind='mergesort').groupby(obs_var[0])\
                                                        .tail(n)\
                                                        .style\
                                                        .format('{:.1%}', subset=['growth'])\
                                                        .set_table_styles([{
                                                            'selector': 'caption',
                                                            'props': [
                                                                ('font-size', '16px')
                                                            ]
                                                        }])\
                                                        .set_table_attributes("style='display:inline'")\
                                                        .set_caption(title2)\
                                                        .set_properties(padding='10px', border='2px solid white')\
                                                        .background_gradient(cmap='RdYlGn', subset=grad_cols, axis=1)\
                                                        .background_gradient(cmap='RdYlGn', subset=['growth'], axis=0)\
                                                        ._repr_html_()
        
    output = a + "\xa0\xa0\xa0\xa0" + b    
    display(HTML(output))

In [None]:
# Univariate growth by Country: the 12 fastest- and 12 slowest-growing, side by side.
obs_var = 'Country'
plot_side_by_side_tables(
    cleaned_mcr, obs_var,
    agg_var=None, filter_list=[],
    is_univariate=True, is_bivariate=False,
    sort_field='growth', asc=False, n=12,
    title1='Top 12 fastest growing Countries - 2018-2021',
    title2='Bottom 12 slowest growing Countries - 2018-2021',
)

**What do we observe**

There is a growing trend among African and Asian participants in the last 4 years, and a declining one among the American and European countries.

1) The top 2 fastest growing countries in terms of participation rate are Egypt and Nigeria, followed by Pakistan, Bangladesh and Kenya. India is in 11th place.

2) Among the 12 slowest growing countries, the USA shares 1st place with the Netherlands.

## Fastest growing Gender

In [None]:
# Univariate table for Gender, sorted on the 'growth' column.
obs_var = 'Gender'
plot_pandas_table(
    cleaned_mcr, obs_var,
    agg_var=None, filter_list=[],
    is_univariate=True, is_bivariate=False,
    sort_field='growth', top_n=0,
    title='Fastest growing Gender - 2018-2021',
)

**What do we observe**

1) For a change, the pandemic has seen an upward trend in Female participation, which grew at 7% in the last 4 years.


## Fastest and slowest growing Countries with respect to Gender

In [None]:
# Bivariate view (Gender x Country): top/bottom 3 countries per gender,
# with gender_list passed as the filter on the first grouping variable.
obs_var = ['Gender', 'Country']
gender_list = ['Female', 'Male']
plot_side_by_side_tables(
    cleaned_mcr, obs_var,
    agg_var=None, filter_list=gender_list,
    is_univariate=False, is_bivariate=True,
    sort_field='growth', asc=False, n=3,
    title1='Top 3 fastest growing countries based on Gender growth - 2018-2021',
    title2='Bottom 3 slowest growing countries based on Gender growth - 2018-2021',
)

**What do we observe**

Needless to say, the countries with the highest absolute numbers aren't the ones witnessing highest growth rates in the last 4 years.

1) The country with the highest overall participation rate is also the country with the highest female participation rate - Egypt. Similarly, Nigeria and Bangladesh have also witnessed an increase in women's participation in the last 4 years.

## Fastest and slowest aging Countries

In [None]:
# Bivariate view (Age x Country) with a two-level filter:
# age_list is passed as filter_list, age_list2 as filter_list2 and
# country_list as filter_list3.
obs_var = ['Age', 'Country']
age_list = ['18-21', '22-24', '25-29', '30-34', '35-39']
age_list2 = ['40-44', '45-49', '50-54', '55-59', '60-69']
country_list = [
    'Finland', 'Hungary', 'New Zealand', 'Nigeria', 'Undisclosed',
    'Hong Kong', 'China', 'Belarus', 'Denmark', 'Norway',
    'Malaysia', 'Kenya', 'Tunisia', 'Greece', 'Canada',
    'Switzerland', 'Czech Republic', 'Ukraine', 'Viet Nam', 'South Africa',
    'South Korea', 'Taiwan', 'Thailand', 'Uganda', 'UAE',
    'Singapore', 'Spain', 'Turkey', 'Sri Lanka', 'Saudi Arabia',
    'Romania', 'Portugal', 'Sweden', 'Nepal', 'Kazakhstan',
    'Morocco', 'Philippines', 'Ireland', 'Iraq', 'Egypt',
    'Bangladesh', 'Peru', 'Ethiopia', 'Ghana', 'Algeria',
    'Indonesia', 'Iran', 'Belgium', 'Ecuador', 'Colombia',
]
plot_side_by_side_tables(
    cleaned_mcr, obs_var,
    agg_var=None,
    is_univariate=False, is_bivariate=True,
    filter_level=2,
    filter_list=age_list, filter_list2=age_list2, filter_list3=country_list,
    sort_field='growth', asc=False, n=3,
    title1='Top 3 youngest growing countries Age wise - 2018-2021',
    title2='Bottom 3 oldest growing countries Age wise - 2018-2021',
)

**What do we observe**

1) The youngest growing countries are from the Middle East, Asia and Africa, whereas the oldest growing countries are from North America, Europe and Australia.

## Fastest growing Education categories

In [None]:
# Univariate table for Education, sorted on the 'growth' column.
obs_var = 'Education'
plot_pandas_table(
    cleaned_mcr, obs_var,
    agg_var=None, filter_list=[],
    is_univariate=True, is_bivariate=False,
    sort_field='growth', top_n=0,
    title='Fastest growing Education categories - 2018-2021',
)

**What do we observe**

Professional Doctorate is a new category introduced this year. This has been clubbed with Doctoral degree for ease of analysis. Professional degree
was removed from this year's survey.

1) There is a growing trend among participants who didn't want to disclose their level of education.

2) Those with no degree or those with some schooling up to high school are on the rise. While the more formal degrees are either seeing very small growth or declining growth in the last 4 years.

## Mean yearly Salary 

In [None]:

# Line chart of the rounded mean of Salary_Cleaned per Survey.
fig = px.line(
    cleaned_mcr.groupby('Survey')['Salary_Cleaned'].mean().round(),
    y='Salary_Cleaned',
    title='Mean salary - 2018-2021',
)
fig.update_xaxes(dtick="M1")
fig.show()

**How is the data prepared**

1) The midpoint of each salary range is taken, and the yearly mean is computed.

2) Undisclosed and null values are ignored.

**What do we observe**

1) The mean salary has seen a decline since 2020, which is expected as people lost jobs, and others may have had to take a pay cut.

## Fastest growing Salary by Country

In [None]:
# Salary_Cleaned aggregated by Country (not as a percentage): top/bottom 10,
# with country_list passed as the second-level filter (filter_list2).
obs_var = 'Country'
country_list = [
    'Finland', 'Hungary', 'New Zealand', 'Undisclosed',
    'Uganda', 'UAE', 'Taiwan', 'Sri Lanka',
    'Saudi Arabia', 'Nepal', 'Kazakhstan', 'Iraq',
    'Ghana', 'Ethiopia', 'Ecuador', 'Algeria',
]
plot_side_by_side_tables(
    cleaned_mcr, obs_var,
    agg_var='Salary_Cleaned', is_pct=False,
    is_univariate=True, is_bivariate=False,
    filter_level=2, filter_list2=country_list,
    sort_field='growth', asc=False, n=10,
    title1='Top 10 high rate payers - 2018-2021',
    title2='Bottom 10 low rate payers - 2018-2021',
)

**What do we observe**

1) It is mostly the European countries that have seen a positive growth in mean Salary in the last 4 years. A couple of outliers in the top 10 list are South Africa and Singapore, with growth rates of 10% and 8% respectively.

## Fastest growing Age groups based on their Salaries

In [None]:
# Salary_Cleaned aggregated by Age group, sorted on the 'growth' column.
obs_var = 'Age'
plot_pandas_table(
    cleaned_mcr, obs_var,
    agg_var='Salary_Cleaned', is_pct=False,
    is_univariate=True, is_bivariate=False,
    filter_list=[],
    sort_field='growth', top_n=10,
    title='Fastest Income growers Age wise - 2018-2021',
)

**What do we observe**

1) The mean income, in the last 4 years, has declined across all age groups. The pandemic may have impacted the earning power of the younger groups in the last 2 years

## Popular Role titles

In [None]:
# Univariate growth by Role title, shown as two side-by-side tables.
# role_list is passed as the second-level filter (filter_list2).
# n=10 sizes the first table; m=4 sizes the second one, so its caption
# reads "Bottom 4" to match the number of rows actually displayed.
obs_var = 'Role title'
role_list = [
    'Chief Officer', 'Consultant', 'Data Journalist',
    'Developer Advocate', 'Manager', 'Marketing Analyst',
    'Principal Investigator', 'Salesperson',
]
plot_side_by_side_tables(
    cleaned_mcr, obs_var,
    agg_var=None,
    is_univariate=True, is_bivariate=False,
    filter_level=2, filter_list2=role_list,
    sort_field='growth', asc=False,
    n=10, m=4,
    title1='Top 10 fastest growing Role titles growth - 2018-2021',
    title2='Bottom 4 slowest growing Role titles growth - 2018-2021',
)

**What do we observe**

1) Move over Data Scientists! Kaggle has been attracting people from other walks of life in the last 4 years; this could bring a fresh perspective to competitions and conversations in general.

In [None]:
# Salary_Cleaned aggregated by Role title; role_list is passed as filter_list.
obs_var = 'Role title'
role_list = ['Developer Relations/Advocacy', 'Machine Learning Engineer']
plot_pandas_table(
    cleaned_mcr, obs_var,
    agg_var='Salary_Cleaned', is_pct=False,
    is_univariate=True, is_bivariate=False,
    filter_list=role_list,
    sort_field='growth', top_n=10,
    title='Top 10 popular Role titles based on mean Salary - 2018-2021',
)

**What do we observe**

Developer Advocates and Machine Learning Engineers were filtered as these options were available only in the last 2 years.

1) Everyone has witnessed a decline in their salaries due to the pandemic. However, Data Scientists, Software Engineers and Business Analysts have seen a relatively sharper decline than Product/Project Managers, DBAs and Research Scientists.

## Top Industries

In [None]:
# Univariate table for Industry, sorted on the '2021' column (min_time='2021').
obs_var = 'Industry'
plot_pandas_table(
    cleaned_mcr, obs_var,
    agg_var=None, is_pct=False,
    is_univariate=True, is_bivariate=False,
    filter_list=[],
    sort_field='2021', top_n=10,
    min_time='2021',
    title='Top 10 Industries - 2021',
)

**What do we observe**

This is a new question asked in the 2021 survey

1) No surprises. Computers/IT dominate the pack, followed by Education. It is good to note that Kagglers from the Accounting domain are also active here.

## Most popular Company sizes

In [None]:
# Univariate table for Company size over 2019-2021 (min_time='2019', period=2).
obs_var = 'Company size'
plot_pandas_table(
    cleaned_mcr, obs_var,
    agg_var=None, is_pct=False,
    is_univariate=True, is_bivariate=False,
    filter_list=[],
    sort_field='growth', top_n=0,
    min_time='2019', period=2,
    title='Popular Company sizes - 2019-2021',
)

**What do we observe**

1) Small sized companies seem to be growing the fastest of the lot.

## How many Data Science folks does it take to change a light bulb?

In [None]:
# Univariate table for 'Num of DS folks' over 2019-2021 (min_time='2019', period=2).
obs_var = 'Num of DS folks'
plot_pandas_table(
    cleaned_mcr, obs_var,
    agg_var=None, is_pct=False,
    is_univariate=True, is_bivariate=False,
    filter_list=[],
    sort_field='growth', top_n=0,
    min_time='2019', period=2,
    title='Popular Data Science workloads - 2019-2021',
)

**What do we observe**

1) Have Data Science roles been shrinking? It is hard to say, but based on the responses, it appears DS workloads with 0 individuals have been on the rise for the last 3 years.


## What are some of the popular Machine Learning use cases?

In [None]:
# Univariate table for 'Is ML used at work', sorted on the 'growth' column.
obs_var = 'Is ML used at work'
plot_pandas_table(
    cleaned_mcr, obs_var,
    agg_var=None, is_pct=False,
    is_univariate=True, is_bivariate=False,
    filter_list=[],
    sort_field='growth', top_n=0,
    title='Popular Machine Learning use cases - 2018-2021',
)

**What do we observe**

1) The responses indicate that companies are still not mature enough to deploy models into production. Respondents are either not aware of ML use cases or they are sure their companies aren't using them or they are just using them to support decision making.


## How much do you spend on Machine Learning services?

In [None]:
# Univariate table for 'Expenditure on ML services' over 2019-2021
# (min_time='2019', period=2).
obs_var = 'Expenditure on ML services'
plot_pandas_table(
    cleaned_mcr, obs_var,
    agg_var=None, is_pct=False,
    is_univariate=True, is_bivariate=False,
    filter_list=[],
    sort_field='growth', top_n=0,
    min_time='2019', period=2,
    title='Expenditure classification on Machine Learning services - 2019-2021',
)

**What do we observe**

1) This seems to suggest that Machine Learning is an expensive affair. Only those organisations that know what they are doing or have extra cash to spare can afford to spend time on building ML services.

## Coding experience, languages learned and company analysis

In this section let us separate the students from professionals, and analyze their responses to various questions

* Professional - Anybody who isn't in the 'Other' or 'Not employed' category; this category makes up 62% of the responses.

* Student - Someone who identifies themselves as a student; this category has a share of 24%; the rest form the 'Other' and 'Not employed' categories.

## How many years of Coding experience do you have?

In [None]:
# Bivariate table (Profession x Coding experience); 'Other' and 'Not employed'
# are passed as filter_list and the call is flagged with is_profession=True.
obs_var = ['Profession', 'Coding experience']
plot_pandas_table(
    cleaned_mcr, obs_var,
    agg_var=None, is_pct=False,
    is_univariate=False, is_bivariate=True,
    filter_list=['Other', 'Not employed'],
    is_profession=True,
    sort_field='growth', top_n=3,
    title='Top 3 fastest growing Coding experience categories by Profession - 2018-2021',
)

**What do we observe**

It is interesting to note that experienced programmers also compete on Kaggle; this is immensely useful for the community as they can share their experiences with those less experienced, who are generally greater in number.

1) Participants in the 20-30 years category have witnessed the highest growth in the last 4 years followed by those in the 10-20 years group.

## What programming language would you recommend?

In [None]:
# Bivariate table (Profession x Recommended language); 'Other' and
# 'Not employed' are passed as filter_list, with is_profession=True.
obs_var = ['Profession', 'Recommended language']
plot_pandas_table(
    cleaned_mcr, obs_var,
    agg_var=None, is_pct=False,
    is_univariate=False, is_bivariate=True,
    filter_list=['Other', 'Not employed'],
    is_profession=True,
    sort_field='growth', top_n=5,
    title='Top 5 fastest growing Recommended languages by Profession - 2018-2021',
)

**What do we observe**

Javascript and SQL seem to be growing in popularity, much faster than Python, with both Students and Professionals. Although Professionals make up 62% of the responses compared with Students who make up 24% of the population, Python's popularity, based on growth rate, has been slower among Professionals than among Students.

In [None]:
def melt_feats(df, label, rename=None):
    """
    Unstack a multiple-choice question into a long-format dataframe.

    Every column of ``df`` whose name contains ``label`` is melted into a
    single answer column, with 'Profession' and 'Survey' kept as identifiers.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding 'Profession', 'Survey' and the option columns.
    label : str
        Substring that identifies the option columns of the question.
    rename : str, optional
        Name given to the melted answer column. Defaults to ``label``.

    Returns
    -------
    pandas.DataFrame
        Columns 'Profession', 'Survey' and ``rename``. Rows whose answer is
        the literal string 'None' or 'Other' are dropped; missing values are
        left in place and the melted index is not reset.
    """
    id_cols = ['Profession', 'Survey']
    if rename is None:
        # Without a name the answer column would end up labelled ``None``.
        rename = label
    # Select the option columns from the frame that was passed in (not from
    # the notebook-level ``cleaned_mcr``), and never melt an identifier column.
    q_s = [col for col in df.columns if label in col and col not in id_cols]
    sub = (
        pd.melt(df, id_vars=id_cols, value_vars=q_s)
          .drop(columns='variable')
          .rename(columns={'value': rename})
    )
    return sub[~sub[rename].isin(['None', 'Other'])]

## Which are the fastest growing regularly used programming languages?

In [None]:
# Melt every column whose name contains 'Language' into one answer column,
# then tabulate it against Profession, ordered by the 'growth' field.
feat = 'Language'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 5 fastest growing regularly used languages by Profession - 2018-2021',
    sort_field='growth', top_n=5,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

In 2018, C/C++ were clubbed together.

1) While Python might still be the most regularly used programming language in 2021, its growth tells another story. Among Professionals, it grew at only 4% yearly, and is closely followed by SQL at 3% yearly. On the other hand, among Students, SQL has pipped Python by 1 percentage point.

## How do Machine Learning experiences compare across Professions?

In [None]:
# Machine-learning experience levels per Profession, ordered by the 'growth'
# field and capped at 5 entries.
obs_var = ['Profession', 'Machine learning experience']
plot_pandas_table(
    cleaned_mcr, obs_var,
    title='Top 5 fastest growing Machine Learning experiences by Profession - 2018-2021',
    sort_field='growth', top_n=5,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

1) Respondents with no ML experience are growing at rates of 16% and 45% among the Professional and Student categories, respectively.


## What are your favourite IDEs to use?

In [None]:
# Long-format IDE answers (all columns containing 'IDE') tabulated against
# Profession; ordering uses the 'growth' field.
feat = 'IDE'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 5 fastest growing IDEs by Profession - 2018-2021',
    sort_field='growth', top_n=5,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

In 2018 and 2021, Visual Studio and Visual Studio Code were asked about as separate IDE choices. However, in 2019 and 2020, they were clubbed together. So, for ease of analysis, they have been clubbed here as well; this may affect the ranking based on growth.

1) After cleaning up the data, especially the responses relating to the use of Visual Studio / Visual Studio Code, it appears to be the favourite among both the Professionals and Students alike, followed by Jupyter Notebook.

## What is your choice of tool for Analysis?

In [None]:
# Primary analysis tool per Profession: melt the 'Primary_Tool' column(s)
# and show the 5 entries ranked by the 'growth' field.
feat = 'Primary_Tool'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 5 popular primary tools for analysis - 2018-2021',
    sort_field='growth', top_n=5,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

1) Spreadsheets are the go-to tool for data analysis among both students and professionals.

## Fastest growing open source service for sharing data analysis projects

In [None]:
# Sharing platforms per Profession, built from the columns containing 'Share'.
# min_time='2020' with period=1 -- presumably a single 2020 -> 2021 step, as
# the title suggests; confirm against plot_pandas_table.
feat = 'Share'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 5 popular places to share data analysis projects - 2020-2021',
    sort_field='growth', top_n=5,
    min_time='2020', period=1,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

It is not clear why Students were not part of this question.

1) Streamlit appears to be the most popular tool for sharing work publicly.

## Fastest growing Hosted Notebook?

In [None]:
# Hosted-notebook products per Profession from the 'Hosted_NB' columns,
# ordered by the 'growth' field over the window starting at min_time='2019'.
feat = 'Hosted_NB'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 5 fastest growing Hosted Notebooks - 2019-2021',
    sort_field='growth', top_n=5,
    min_time='2019', period=2,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

Variations in notebook names were replaced to align with the names used in the most recent survey; this is especially true for Colab, Kaggle and Azure related notebooks. For example, Google Colab was replaced with Colab Notebooks.

1) Based on growth in the last 3 years, the top 3 preferences for Students are: Code Ocean, Paperspace and Colab, in that order. On the other hand, Professionals prefer Code Ocean, Colab, and Kaggle Notebooks, in that order.

## What are your favourite Visualization libraries?

In [None]:
# Visualization libraries per Profession: melt the 'Visualization' columns
# and rank by the 'growth' field, starting from the 2018 survey.
feat = 'Visualization'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 5 favourite Visualization libraries - 2018-2021',
    sort_field='growth', top_n=5,
    min_time='2018', period=3,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**


1) Professionals have a preference for Geoplotlib, Altair, and Seaborn in that order. Whereas Students prefer Seaborn, Matplotlib and Geoplotlib in that order. Plotly has fallen out of favour among Professionals but has grown at 11% yearly among the Students.

## What is your choice of Computing platform?

In [None]:
# Computing platform per Profession, read straight from cleaned_mcr (no
# melting is done in this cell); up to 10 rows, ordered by the 'growth' field.
feat = 'Computing platform'
obs_var = ['Profession', feat]
plot_pandas_table(
    cleaned_mcr, obs_var,
    title='Popular Computing platforms - 2020-2021',
    sort_field='growth', top_n=10,
    min_time='2020', period=1,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

The options 'A laptop' and 'A personal computer/Desktop' appeared as two separate choices in the 2021 survey, but appeared as 1 choice in 2020. So, the responses in 2021 were merged into 1 option to align with the response in 2020 for ease of analysis.

1) A PC/Laptop is the most popular computing platform among both groups. The choice 'None' makes no sense here.

## What is your choice of Hardware?

In [None]:
# Specialized hardware per Profession from the columns containing 'Hardware',
# ordered by the 'growth' field with the window starting at 2019.
feat = 'Hardware'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Popular Specialized Hardware - 2019-2021',
    sort_field='growth', top_n=5,
    min_time='2019', period=2,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

'GPUs' in 2019 were replaced with NVIDIA GPUs

'TPUs' in 2019 were replaced with Google Cloud TPUs

1) Surprisingly, there has been tremendous growth in the usage of specialized hardware - TPUs - in the last 3 years. NVIDIA GPUs are more common, but they have been growing relatively slowly in the last 3 years.

## How frequently have you used TPUs?

In [None]:
# TPU usage frequency per Profession. Unlike the growth tables, this one is
# ordered by the '2021' field and its window starts at min_time='2021'.
feat = 'TPU Usage'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Frequency of TPU usage - 2021',
    sort_field='2021', top_n=5,
    min_time='2021', period=1,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

1) Most participants have never used a TPU despite its rising popularity as shown in the table above this one.

## Fastest growing Machine Learning frameworks

In [None]:
# ML frameworks per Profession from the 'ML_framework' columns; 3 entries
# ordered by the 'growth' field, window starting at the 2018 survey.
feat = 'ML_framework'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 3 fastest growing Machine Learning frameworks - 2018-2021',
    sort_field='growth', top_n=3,
    min_time='2018', period=3,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

## Machine Learning Algorithms

In [None]:
# ML algorithms per Profession from the 'ML_algorithm' columns; 3 entries
# ordered by the 'growth' field, window starting at the 2019 survey.
feat = 'ML_algorithm'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 3 fastest growing Machine Learning algorithms - 2019-2021',
    sort_field='growth', top_n=3,
    min_time='2019', period=2,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

It appears that students and professionals work on different problems, which explains the variation in the ML algorithms used. Professionals use deep learning algorithms, whereas students use tree-based and regression algorithms, implying that the source is tabular data.

## Computer vision methods

In [None]:
# Computer-vision methods per Profession from the 'Computer_Vision' columns;
# 3 entries ordered by the 'growth' field, window starting at 2019.
feat = 'Computer_Vision'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 3 popular Computer Vision methods - 2019-2021',
    sort_field='growth', top_n=3,
    min_time='2019', period=2,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

## Natural Language Processing methods used

In [None]:
# NLP methods per Profession from the columns containing 'NLP'; 3 entries
# ordered by the 'growth' field, window starting at 2019.
feat = 'NLP'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 3 popular Natural Language Processing methods - 2019-2021',
    sort_field='growth', top_n=3,
    min_time='2019', period=2,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

## Most common Activities performed

In [None]:
# Work activities per Profession from the 'Activities' columns; 5 entries
# ordered by the 'growth' field over the window starting at 2020.
feat = 'Activities'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 5 popular Activities performed - 2020-2021',
    sort_field='growth', top_n=5,
    min_time='2020', period=1,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

This question only applies to professionals.

1) It appears not much has changed in the last few years. Most companies are still in the analysis and decision support phase.

## Cloud related services

## Popular Cloud platforms

In [None]:
# Cloud platforms per Profession from the 'Cloud_platform' columns; up to 10
# entries ordered by the 'growth' field, window starting at 2019.
feat = 'Cloud_platform'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 10 popular Cloud related services - 2019-2021',
    sort_field='growth', top_n=10,
    min_time='2019', period=2,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

This question hasn't been shown to students since 2019. Don't schools/colleges use cloud providers?

1) There are a few up-and-coming risers such as SAP Cloud, Oracle Cloud, Alibaba Cloud and Salesforce Cloud that have been chipping away at the market share
of heavyweights such as Microsoft Azure, Google Cloud Platform and Amazon Web Services during the last 3 years.

## Popular Cloud computing services

In [None]:
# Cloud computing products per Profession from the 'Cloud_computing' columns;
# 5 entries ordered by the 'growth' field, window starting at 2019.
feat = 'Cloud_computing'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 5 popular Cloud computing services - 2019-2021',
    sort_field='growth', top_n=5,
    min_time='2019', period=2,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

Azure Cloud Services, one of the responses in 2019, was clubbed with Microsoft Azure Virtual Machines, as I thought that was the closest match. In any case, it shouldn't affect the growth rate computation significantly.

1) Microsoft VMs have witnessed a positive growth rate despite not retaining the top position in platform growth during the same period.

## Fastest growing Big Data products/services

In [None]:
# Big-data products per Profession from the 'Big_data' columns; 5 entries
# ordered by the 'growth' field, window starting at 2019.
feat = 'Big_data'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 5 fastest growing Big Data products/services - 2019-2021',
    sort_field='growth', top_n=5,
    min_time='2019', period=2,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

As this question was phrased differently in 2018 than in 2019, 2020 and 2021, I have used only the data from the last 3 years for analysis.
It would have been nice to compare proprietary databases separately from open source databases.

1) Amazon DynamoDB is a NoSQL database and a proprietary DB, unlike MySQL and PostgreSQL, which are open source.

## Popular Data storage solutions

In [None]:
# Data storage products per Profession from the 'Data_storage' columns.
# This table is ordered by the '2021' field rather than by growth.
feat = 'Data_storage'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 5 most popular Data storage solutions - 2021',
    sort_field='2021', top_n=5,
    min_time='2021', period=1,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

## Learning and Media

## Favourite Online learning platforms

In [None]:
# Online learning platforms per Profession from the columns containing
# 'Online'; 5 entries ordered by the 'growth' field, window starting at 2019.
feat = 'Online'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 5 most popular Online learning platforms - 2019-2021',
    sort_field='growth', top_n=5,
    min_time='2019', period=2,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

1) LinkedIn Learning appears to be a rising star in the last 3 years.

## Your favourite media sources for news on Data Science topics

In [None]:
# Media sources per Profession from the columns containing 'Media'; 5 entries
# ordered by the 'growth' field, window starting at the 2018 survey.
feat = 'Media'
obs_var = ['Profession', feat]
subset_mcr = melt_feats(cleaned_mcr, feat, rename=feat)
plot_pandas_table(
    subset_mcr, obs_var,
    title='Top 5 popular Media for Data Science news - 2018-2021',
    sort_field='growth', top_n=5,
    min_time='2018', period=3,
    filter_list=['Other', 'Not employed'],
    is_univariate=False, is_bivariate=True, is_pct=False,
    agg_var=None, is_profession=True,
)

**What do we observe**

1) YouTube is #1 among Professionals whereas Course Forums are #1 with Students.