In [None]:
import numpy as np 
import pandas as pd 
import re 

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

import ipywidgets as widgets
from IPython.display import display, clear_output
from ipywidgets import Output, Button, interact

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True) 
!jupyter nbextension enable --py --sys-prefix widgetsnbextension

## Welcome to my Kaggle survey notebook

### IMPORTANT - I did not know that ipywidgets don't work in the Kaggle environment once the notebook has been committed. It looks like Plotly's own dropdown possibilities will not cut it to achieve what I want.
### Please click on "copy & edit" to get the notebook; you will then be able to run it and use the ipywidgets Dropdown menu

If you like what you see, please give it an upvote =)  
If you don't, feel free to comment with any criticism you may have about it  
This notebook is interactive and you need to download / copy it to be able to run its functions                        
All the graphs are produced with Plotly which means that you can interact with them (click on the items in the legends if you want to hide them, double click if you want to isolate them, hover over the graph to get infos, zoom in/out, export the graph)

In [None]:
# Load the raw responses of the four survey editions, one frame per year.
# low_memory=False makes pandas read each file in a single pass.
all_data_2018, all_data_2019, all_data_2020, all_data_2021 = (
    pd.read_csv(_csv_path, low_memory=False)
    for _csv_path in (
        "../input/kaggle-survey-2018/multipleChoiceResponses.csv",
        '../input/kaggle-survey-2019/multiple_choice_responses.csv',
        '../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv',
        '../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv',
    )
)


def df_cleaner(_data, _max_duration=14400):
    """Tidy a raw Kaggle survey export and drop the slowest respondents.

    The raw files keep the question text in their first data row. That row is
    merged into the column names ("<code> <question text>"), removed from the
    data, and the verbose names are then shortened.

    Parameters
    ----------
    _data : pandas.DataFrame
        Raw survey export. It is left untouched; the work happens on a new frame.
    _max_duration : int, default 14400
        Respondents whose completion time (in seconds) is strictly greater
        than this value are treated as outliers and removed.

    Returns
    -------
    pandas.DataFrame
        Cleaned frame. Row labels are the 0-based positions assigned before
        the outliers were removed, so they may contain gaps.

    Raises
    ------
    KeyError
        If no 'Duration (seconds)' column exists after the renaming step.
    ValueError
        If a duration value cannot be converted to an integer.
    """
    # Row 0 is the question text: glue it onto the short column code.
    updated_cols = [str(_code) + " " + str(_question)
                    for _code, _question in zip(_data.columns, _data.iloc[0])]

    # reset_index() hands back a new frame, so renaming its columns does not
    # touch the caller's object.
    _cleaned = _data.iloc[1:].reset_index(drop=True)
    _cleaned.columns = updated_cols

    # Columns are very wordy, let's clean some of that.
    # NOTE: these patterns are interpreted as regular expressions. The second
    # one therefore leaves an empty "()" behind and the third one removes every
    # " -". This is kept on purpose: create_insights_for_question splits the
    # column names on that "()" marker.
    _names = _cleaned.columns.str.replace(' - Selected Choice', '', regex=True)
    _names = _names.str.replace('(Select all that apply)', '', regex=True)
    _names = _names.str.replace('() -', '', regex=True)
    _cleaned.columns = [re.sub(' +', ' ', _name) for _name in _names]

    _cleaned = _cleaned.rename(columns={
        "Time from Start to Finish (seconds) Duration (in seconds)" : "Duration (seconds)",
        "Q1 What is your age (# years)?" : "Age",
        "Q2 What is your gender?" : "Gender",
        "Q3 In which country do you currently reside?" : "Country",
        'Q4 What is the highest level of formal education that you have attained or plan to attain within the next 2 years?' : "Degree",
        'Q5 Select the title most similar to your current role (or most recent title if retired):' : "Title",
    })
    _cleaned.columns = _cleaned.columns.str.strip()

    # Durations are read as text because of the question row; compare them as integers.
    _durations = _cleaned['Duration (seconds)'].astype(int)
    _outliers_index = _durations.index[_durations > _max_duration]

    print(f"{len(_outliers_index)} lines will be removed from the dataset out of {_cleaned.shape[0]}")

    return _cleaned.drop(index=_outliers_index)

## Dataset inspection, tuning and cleaning



### Data relevance

We have a total of 25973 survey participants, Kaggle however has over [5 million registered users](https://www.kaggle.com/general/164795). Less than 0.6 % of the registered users completed the survey, this is a common problem with surveys and we will have to work with that, keeping in mind that it might not be a very good representation of the 99% + who did not take part in the survey.

The data quality of the .csv is quite good. I'll just take the first line of the dataset, where the questions are and concatenate it to the column names

### The first column.. 

..is about the time it took for a given participant to complete the survey, let's print some infos about it

In [None]:
# Completion times of the 2021 survey as integers; row 0 is skipped because it
# holds the question text rather than a measurement.
_durations = all_data_2021["Time from Start to Finish (seconds)"][1:].astype('int')
_values = [_durations]

# The standard deviation is taken with numpy over the wrapped values.
print(f"""
Duration (seconds) infos :

Mean : {int(_durations.mean())}     
Median :  {int(_durations.median())}
Min value :  {int(_durations.min())}
Max value :  {int(_durations.max())}
Standard deviation : {round(np.std(_values), ndigits=2)}
""")



OK, there is something going on there. The average participant took just under 11 minutes to finish the survey, and the slowest took over 28 days. There could be a lot of reasons for that, and Kaggle specifies that the data has been cleaned before being made available to us, but I'll choose to get rid of any line that took more than 4 hours to complete. 

In [None]:
# Clean a copy so that the raw 2021 frame stays available for the
# before/after comparison in the next cell.
df_2021 = df_cleaner(all_data_2021.copy())

What does it look like now?

In [None]:
# Two box plots side by side: completion times before (left) and after
# (right) the outlier removal.
_raw_durations = all_data_2021["Time from Start to Finish (seconds)"][1:].astype("int")
_kept_durations = df_2021['Duration (seconds)'].astype("int")

fig = make_subplots(rows=1, cols=2, specs=[[{"type": "box"}, {"type": "box"}]])

_panels = [
    (_raw_durations, "1. Original dataset"),
    (_kept_durations, "2. Some outliers removed"),
]
for _panel_col, (_panel_values, _panel_caption) in enumerate(_panels, start=1):
    fig.add_trace(go.Box(y=_panel_values, name=""), row=1, col=_panel_col)
    fig.update_xaxes(title_text=_panel_caption, row=1, col=_panel_col)

# Only the left panel gets the empty hover template.
fig.update_traces(hovertemplate="<extra></extra>", row=1, col=1)

fig.update_layout(
    height=600,
    width=1000,
    title_text="Boxplots of 'Duration (in seconds)' column",
    paper_bgcolor="darkgray",
    title_x=0.5,
    showlegend=False,
)

fig.show()

We can work with that, and always get back to it if we wanna modify the threshold 

## Throwback

We have at our disposal right here on Kaggle the datasets of the previous years' surveys, i'll include the ones from 2018, 2019 and 2020 and compare a few things

Let's start by applying the thing we did to the first dataset, which is go and see how many out of the participants took over 4 hours to complete and could be considered outliers

In [None]:
# Apply the same cleaning to the three earlier surveys (copies keep the raw frames intact).
df_2018, df_2019, df_2020 = (
    df_cleaner(_raw_frame.copy())
    for _raw_frame in (all_data_2018, all_data_2019, all_data_2020)
)

# The 2018 questionnaire numbers these questions differently, so the generic
# renaming in df_cleaner does not catch them.
_columns_2018 = {
    "Q1 What is your gender?" : "Gender",
    "Q2 What is your age (# years)?" : "Age",
    "Q6 Select the title most similar to your current role (or most recent title if retired):" : "Title",
}
df_2018.rename(columns=_columns_2018, inplace=True)

# Align the gender labels of 2018 and 2019 on the "Man" / "Woman" wording.
for _frame in (df_2018, df_2019):
    _frame["Gender"] = _frame["Gender"].replace({'Male': "Man", "Female" : "Woman"})

In [None]:
# One stacked bar per survey year: respondents kept after cleaning (bottom)
# plus respondents flagged as outliers (top).
fig = go.Figure()

_first_year = 2018
_cleaned_frames = [df_2018, df_2019, df_2020, df_2021]
_raw_frames = [all_data_2018, all_data_2019, all_data_2020, all_data_2021]

for _year, (_df, _dff) in enumerate(zip(_cleaned_frames, _raw_frames), start=_first_year):

    # The first column of every raw file is the completion time; row 0 holds
    # the question text and must not be counted as a participant.
    _outliers = [_value for _value in _dff[_dff.columns[0]][1:] if int(_value) > 14400]
    _total_participants = len(_dff) - 1

    fig.add_trace(go.Bar(
                        x=[_year],
                        y=[len(_df)],
                        name=str(_year),
                        customdata=[_total_participants],
                        hovertemplate="<br>".join([
                            "No outliers : %{y}",
                            "Total : %{customdata}",
                        ]),
                        texttemplate="%{y}",
                        textposition="auto",
                        marker_color='rgb(55, 83, 109)',
                        showlegend=False,
                        ))
    fig.add_trace(go.Bar(
                        x=[_year],
                        y=[len(_outliers)],
                        name=str(_year),
                        customdata=[len(_outliers)],
                        hovertemplate="<br>".join([
                            "Outliers : %{customdata}",
                        ]),
                        # closing tag fixed: only the number is meant to be bold
                        texttemplate="<b>%{customdata}</b> outliers",
                        textposition="inside",
                        showlegend=False
                        ))

fig.update_layout(
    paper_bgcolor="darkgray",
    title="<b>2018 - 2021 total survey participants</b><br><sup>Outlier : participant who took over 4 hours to complete the survey</sup>",
    title_x=0.5,
    height=700,
    barmode='stack',
    xaxis=dict(
        tickmode='linear',
        tick0=_first_year,    # anchor the ticks on the first plotted year
        dtick=1,
        title="Year",
        categoryorder='category ascending',
    ),
    yaxis=dict(
        title="Participants",
        tickformat="000",
    ),
)

fig.show()

2021 is the biggest year in terms of participants, nice!


Now I want to see how the general population has evolved over the past few years; the function in the collapsed cell below does just that.

In [None]:
def historic_graphs(_grouper):
    """Plot how the answer shares of one demographic column evolved from 2018 to 2021.

    Reads the module-level frames df_2018, df_2019, df_2020 and df_2021.

    Parameters
    ----------
    _grouper : str
        Column present in all four frames (the notebook uses "Age", "Degree",
        "Gender" and "Title").

    Returns
    -------
    tuple
        (fig, base_df): the plotly figure with one line per category, and the
        underlying table holding one row per category and one percentage
        column per year.
    """
    _first_year = 2018

    # A list, not a set: sets are unordered and newer pandas versions reject
    # them as column labels.
    base_df = pd.DataFrame(columns=[_grouper])

    for _year, _df in enumerate([df_2018, df_2019, df_2020, df_2021], start=_first_year):
        info = round(_df[_grouper].value_counts(normalize=True)*100, ndigits=2)
        # Name the index explicitly so the label column is called `_grouper`
        # regardless of how the installed pandas names value_counts() output.
        info = (info.rename_axis(_grouper)
                    .rename('Percent')
                    .reset_index()
                    .sort_values(by=_grouper)
                    .reset_index(drop=True))
        temp_df = pd.DataFrame({_grouper : info[_grouper].to_list(), _year : info["Percent"].to_list()})
        # Right merge: after every step the table holds exactly the categories
        # of the most recently merged year.
        base_df = base_df.merge(temp_df, how="right")

    fig = go.Figure()

    # Every column after the label column is one survey year.
    _year_columns = list(base_df.columns[1:])

    for _, _row in base_df.iterrows():
        fig.add_trace(go.Scatter(
                        x=_year_columns,
                        y=_row[_year_columns].tolist(),
                        mode='lines+markers',
                        name=_row[_grouper],
                        hovertemplate="<br>".join([
                                        "Year : %{x}",
                                        "%{y}%"
                                    ]),
                        line=dict(width=3)
                        ))

    fig.update_layout(
        title=f"<b>{_grouper} distribution 2018 - 2021</b>",
        title_x=0.5,
        paper_bgcolor="darkgray",
        height=600,
        width=1000,
        showlegend=True,
        xaxis=dict(
            tickmode='linear',
            tick0=_first_year,
            dtick=1,
            title="Year"
        ),
        yaxis=dict(
            title="Percentage"
        )
    )

    return fig, base_df

### Age, Degree, Gender, and Title are possible options

In [None]:
# Build the four evolution charts (only the figure of each result is kept) and show them in order.
a, b, c, d = (
    historic_graphs(_topic)[0]
    for _topic in ("Title", "Age", "Degree", "Gender")
)
display(a, b, c, d)

### How did the average Kaggle survey respondent evolve over the years?

We can see that the population is getting a little younger, and therefore more students and people with bachelor's degree are present. It still is very much dominated by men unfortunately

## 2021 Overall view

Back to the 2021 dataset


Let's create a few graphs about the general participant population shall we?

In [None]:
df = df_2021


def _overall_counts(_frame, _column, _sort_by="Count", _ascending=False):
    """Count the answers of `_column` and add each answer's share in percent.

    Returns a frame with the columns [_column, "Count", "Percent"], sorted by
    `_sort_by`. The index is named explicitly before reset_index() so the
    result has the same column names on every pandas version.
    """
    _counts = _frame[_column].value_counts().rename_axis(_column).reset_index(name="Count")
    _total = _counts["Count"].sum()
    _counts["Percent"] = _counts["Count"].apply(
        lambda x : round(x*100 / _total, ndigits=2) if _total != 0 else 0)
    return _counts.sort_values(by=_sort_by, ascending=_ascending)


# Age is shown in age-group order, the other tables by decreasing count.
overall_age = _overall_counts(df, "Age", _sort_by="Age", _ascending=True)
overall_country = _overall_counts(df, "Country")
overall_Title = _overall_counts(df, "Title")
overall_Gender = _overall_counts(df, "Gender")
overall_Degree = _overall_counts(df, "Degree")

# Share of each degree within every job title.
title_degree = round(df.groupby("Title")["Degree"].value_counts(normalize=True)*100, ndigits=2)
title_degree = title_degree.rename('Percent').reset_index().sort_values(by="Title").reset_index(drop=True)

fig = make_subplots(rows=4, cols=2,
                    specs=[
                           [{'type':'bar', "colspan": 2}, None],    # full-width bar chart
                           [{'type':'domain'}, {'type':'domain'}],
                           [{'type':'domain'}, {'type':'domain'}],
                           [{'type':'bar', "colspan": 2}, None],
                           ],
                    subplot_titles=("Top 10 countries", "Age",
                                    "Gender", "Degree",
                                    "Title", "Degree distribution per Job title")
                    )

# Row 1: the ten countries with the most participants.
_top_countries = overall_country[:10]
fig.add_trace(go.Bar(name='All', x=_top_countries['Country'], y=_top_countries['Percent'],
                    customdata=_top_countries['Count'],
                    hovertemplate="<br>".join([
                                    "%{x}",
                                    "%{customdata} Kagglers",
                                    "%{y}%"
                                ]),
                    texttemplate="%{customdata}",
                    textposition="inside",
                    marker_color='rgb(55, 83, 109)',
                    showlegend=True,
                    legendgroup='1',
                    ),
            row=1, col=1)

# Rows 2-3: one donut per demographic column, as (table, label column, legend group, row, col).
_donuts = [
    (overall_age, "Age", '2', 2, 1),
    (overall_Gender, "Gender", '2', 2, 2),
    (overall_Degree, "Degree", '3', 3, 1),
    (overall_Title, "Title", '3', 3, 2),
]
for _table, _label_column, _legend_group, _row, _col in _donuts:
    fig.add_trace(go.Pie(
           labels=_table[_label_column], values=_table["Count"],
           hole=.5,
           legendgroup=_legend_group,
           name=""
           ),
           row=_row, col=_col)

# Row 4: degree distribution inside every job title.
fig.add_trace(go.Bar(name='', x=title_degree['Title'], y=title_degree['Percent'],
                    customdata=title_degree['Degree'],
                    hovertemplate="<br>".join([
                                    "%{x}",
                                    "%{customdata}",
                                    "%{y}%"
                                ]),
                    texttemplate="%{customdata}",
                    textposition="inside",
                    marker_color='rgb(55, 83, 109)',
                    showlegend=False,
                    legendgroup='4',
                    ),
            row=4, col=1)

fig.update_yaxes(title_text="Percentage of total", row=1, col=1)

fig.update_layout(
            paper_bgcolor="darkgray",
            title="<b>Overall view of the Kaggle 2021 survey</b>",
            title_x=0.5,
            height=2000,
            legend_tracegroupgap = 350,
            showlegend=True,
            autosize=True,
            barmode='group',
                         )

fig.show()

### What can we see at a glance?

* India and the U.S.A are at the top of the chart when it comes to participant number, together they represent over 37% of the total population
* The age groups 18 - 21, 22 - 24 and 25 - 29 are about the same proportions (just under 19%)
* This is a man's world... almost 80% of dudes and bros
* A similar proportion of Bachelor's and Masters degrees, over 10% of Doctors as well, Kagglers got brains!
* Over a quarter of students, which is understandable, but that's a variable we may wanna get rid of if we want insights about the **professional** data science world
* Data Analysts, Business Analysts, Software engineers, Data engineers have about the same proportions of Bachelors and Masters, research scientists have over 56% of PhDs


In [None]:
def generate_country_df(_country : str):
    """Return the rows of the module-level frame `df` whose "Country" equals
    `_country`, re-indexed from 0."""
    _is_selected = df["Country"] == _country
    _selection = df[_is_selected]
    return _selection.reset_index(drop=True)

def _share_table(_series, _label):
    """Return the value shares of `_series` in percent as a frame [_label, "percent"], sorted by label."""
    _shares = round(_series.value_counts(normalize=True)*100, ndigits=2)
    # Naming the index first keeps the column names stable across pandas versions.
    return _shares.rename_axis(_label).rename("percent").reset_index().sort_values(_label, ascending=True)


def _gender_share_by(_frame, _key):
    """Return the gender shares in percent inside every group of `_key`, sorted by `_key`."""
    _shares = round(_frame.groupby(_key)["Gender"].value_counts(normalize=True)*100, ndigits=2)
    return _shares.rename('Percent').reset_index().sort_values(by=_key).reset_index(drop=True)


def generate_country_dashboard(_country : str):
    """Build a dashboard comparing one country's respondents with all respondents.

    Reads the module-level frame `df` and calls generate_country_df. The
    figure shows the age distribution, the degree distribution, and the gender
    split per age group and per job title, each for the country and for the
    whole frame. A short text summary of the typical respondent is printed.

    Parameters
    ----------
    _country : str
        Country name as found in the "Country" column. A name starting with a
        lowercase letter is passed through str.capitalize() first.

    Returns
    -------
    plotly.graph_objects.Figure

    Raises
    ------
    ValueError
        If no respondent of `df` lives in `_country`.
    """
    if _country[0].islower() == True:
        _country = _country.capitalize()

    _country_df = generate_country_df(f"{_country}")

    # Fail early with a clear message instead of crashing later on an empty frame.
    if _country_df.empty:
        raise ValueError(f"No survey participants found for country {_country!r}")

    # (legend name, data, bar colour) for the two compared populations.
    _scopes = [
        (f'{_country}', _country_df, 'rgb(55, 83, 109)'),
        ('World', df, 'rgb(26, 118, 255)'),
    ]

    fig = make_subplots(rows=4, cols=2,
                        specs=[
                            [{'type':'bar', "colspan": 2}, None],    # full-width bar chart
                            [{'type':'domain'}, {'type':'domain'}],
                            [{'type':'bar', "colspan": 2}, None],
                            [{'type':'bar', "colspan": 2}, None]
                            ],
                        subplot_titles=("Age Distribution", f"{_country} degree distribution ",
                                        "World degree distribution", "Gender distribution per age group",
                                        "Gender distribution per job title")
                        )

    # Row 1: age distribution, country next to world.
    for _name, _frame, _color in _scopes:
        _ages = _share_table(_frame["Age"], "Age")
        fig.add_trace(go.Bar(name=_name, x=_ages["Age"], y=_ages["percent"],
                             hovertemplate="<br>".join([
                                            "Age : %{x}",
                                            "%{y}%",
                                        ]),
                             marker_color=_color,
                             legendgroup='1'),
                      row=1, col=1)

    # Row 2: degree donuts, country on the left and world on the right.
    for _col, (_name, _frame, _color) in enumerate(_scopes, start=1):
        _degrees = _frame["Degree"].value_counts()
        fig.add_trace(go.Pie(labels=_degrees.index, values=_degrees, hole=.5,
                             name=_name, legendgroup='2'),
                      row=2, col=_col)

    # Rows 3 and 4: gender split per age group, then per job title.
    _gender_rows = [
        (3, "Age", "Age : %{x}", '3'),
        (4, "Title", "%{x}", '4'),
    ]
    for _row, _key, _hover_head, _legend_group in _gender_rows:
        for _name, _frame, _color in _scopes:
            _shares = _gender_share_by(_frame, _key)
            fig.add_trace(go.Bar(name=_name, x=_shares[_key], y=_shares['Percent'],
                                 customdata=_shares['Gender'],
                                 hovertemplate="<br>".join([
                                                _hover_head,
                                                "%{customdata}",
                                                "%{y}%"
                                            ]),
                                 texttemplate="%{customdata}",
                                 textposition="inside",
                                 marker_color=_color,
                                 legendgroup=_legend_group),
                          row=_row, col=1)

    # Share of the same cleaned frame the country rows were taken from, so
    # numerator and denominator describe the same population.
    _share_of_total = round(len(_country_df)*100/len(df), ndigits=2)

    fig.update_layout(
            paper_bgcolor="darkgray",
            title=f"Overall view of {_country} <br><sup>{len(_country_df)} Kagglers - {_share_of_total}% of total</sup>",
            title_x=0.5,
            height=1700,
            legend_tracegroupgap = 355,
            showlegend=True,
            barmode='group',
            bargap=0.15,
            bargroupgap=0.1,
    )

    # Cast explicitly before taking the median: df_cleaner never converts the
    # duration column, so it may still hold text.
    _median_duration = int(_country_df["Duration (seconds)"].astype(int).median())

    print(f"""
    The average participant from {_country} took {_median_duration} seconds to fill the survey,     
    is in the age group {_country_df["Age"].mode()[0]}, identifies as a {_country_df["Gender"].mode()[0]}, 
    the Kaggler's education is : {_country_df["Degree"].mode()[0]}    
    and current job title : {_country_df["Title"].mode()[0]}
    """)

    return fig

## Country infos


The first information I'd like to get is how countries compare to the overall trend on a few elements:
* Age
* Education
* Gender distribution per age group
* Gender distribution per job title

That way we can get a few key pieces of information about how the Kagglers from a selected country compare to the world

Copy the notebook and select the country you want information about from the dropdown menu below; a small graph will be generated

In [None]:
output = Output()
start = Button(description="Generate graph")

# Country names come from the raw 2021 answers; row 0 is the question text.
_country_options = sorted(list(all_data_2021['Q3'][1:].unique()))

country_widget = widgets.Dropdown(
    options=_country_options,
    value=_country_options[0],
    description='Country :',
    disabled=False,
)

def _render_selected_country():
    """Redraw the dashboard of the currently selected country inside `output`."""
    with output:
        clear_output()
        display(generate_country_dashboard(country_widget.value))

def click_start(b):
    """Button callback: `b` is the clicked button (unused)."""
    _render_selected_country()

def on_change(change):
    """Dropdown callback: `change` describes the trait change (unused)."""
    _render_selected_country()

start.on_click(click_start)

# Listen to the `value` trait only. Without `names`, the callback fires for
# every trait that changes during a selection and the dashboard is rebuilt
# several times.
country_widget.observe(on_change, names="value")

display(country_widget, start, output)

### Here's what it looks like for France ( no bias whatsoever :)

In [None]:
# Static example that does not need the widgets: the returned figure is the
# cell's last expression, so the notebook displays it.
generate_country_dashboard("France")

### What are we learning ?

From that figure we can get a few informations:

* French kagglers are older than the world's average
* Over 70% have a master's degree, that's way more than the world's average
* Gender distribution per age group is quite similar to the world's one
* Women are a bit more present in a few jobs : Statistician, Data Analyst, Business Analyst

## Understanding the questions 

There are several types of questions in the survey.

* Single answer questions about yourself (Age, gender, location, education, job title)
* Single answer questions about your job / coding habits (where "None" or "Other" are options)
* Multiple answer questions where you get to choose more than one option if you so wish
* Multiple answer questions with sub-categories (A and B) those questions are usually about your use of a particular technology (part A) and your eagerness to learn more about it (part B), you may answer one part and not answer the second one and vice versa

## My approach to the survey

A lot of insights can be found in those answers.    
It would be interesting to be able to group the answers, i will do that and use the answers of the first 5 questions as possible groupers (**Age, Gender, Country, Degree, Title**).     
I will create functions that generate graphs for all questions; depending on the type of question (amongst the types listed above), 2 or 4 graphs will be generated. The first half of them will be global graphs, the second half will be graphs grouped by the grouper of your choice.

In [None]:
########################################################
# Little function to generate a df for each question to make data manipulation easier

def generate_df_for_question(_data , _question_number : int):
    """Return the first six columns of `_data` followed by every column of one question.

    A column belongs to the question when its name contains "Q<number> " or
    "Q<number>_". Column order is kept as in `_data`.
    """
    _markers = (f"Q{str(_question_number)} ", f"Q{str(_question_number)}_")

    _question_cols = [
        _name for _name in _data.columns
        if any(_marker in str(_name) for _marker in _markers)
    ]
    _leading_cols = list(_data.columns[:6])

    return _data[_leading_cols + _question_cols]


########################################################
# Manipulation of the data for each question according to the grouper selected

def create_insights_for_question(_data , _question : int, _grouper : str):
    """Aggregate the answers to one question, overall and per `_grouper` value.

    The shape of the result depends on the columns found for the question:

    * fewer than 8 columns in total (the six leading columns plus at most one
      question column): returns ``(overall, grouped)`` where both frames hold
      percentages of the answers of the last column;
    * several question columns, at least one of them carrying a "B" at
      position 3 or 4 of its name: returns the 6-tuple
      ``(world_A, world_B, focused_A, focused_B, question_1, question_2)``;
    * several question columns otherwise: returns ``(world, grouped)`` where
      the frames hold non-null answer counts per option.

    Callers must therefore be ready for a 2-tuple or a 6-tuple.
    """

    _df = generate_df_for_question(_data, _question)

    # 6 leading columns + at most 1 question column -> single-answer question.
    if len(_df.columns) < 8:

        # NOTE(review): this first assignment is overwritten by the next line
        # and its result is never used.
        _df_grouped_overall = _df.groupby(_grouper).count().reset_index()
        # Share of each answer over all respondents, in percent.
        _df_grouped_overall = round(_df[_df.columns[-1]].value_counts(normalize=True)*100, ndigits=2)
        _df_grouped_overall = _df_grouped_overall.rename("Percent").reset_index()
        _df_grouped_overall.rename(columns={"index" : _df.columns[-1]}, inplace=True)

        # Share of each answer inside every group of `_grouper`, in percent.
        _df_grouped_focused = round(_df.groupby(by=_grouper)[_df.columns[-1]].value_counts(normalize=True)*100, ndigits=2)
        _df_grouped_focused = _df_grouped_focused.rename("Percent").reset_index()

        return _df_grouped_overall, _df_grouped_focused

    else:

        _current_col = [_col for _col in _df.columns if f"Q{_question} " in str(_col) or f"Q{_question}_" in str(_col)]
        _flag = False

        # Look for a "B" at position 3 or 4 of the column name
        # (presumably the part-B marker of A/B questions -- TODO confirm).
        for i in _current_col:
            if i[3] == "B" or i[4] == "B":
                _flag = True

        if _flag == True:

            # Text before the "()" marker of the first and of the last question column.
            _question_1 = _df.columns[6].split("()")[0]
            _question_2 = _df.columns[-1].split("()")[0]
            special_new_cols = _df.columns[:6].to_list()

            # Shorten every question column to its first 5 characters plus the
            # text after the "()" marker (or after the "?").
            for _col in _df.columns[6:]:
                _begin = _col[0:5]
                if "()" in _col:
                    _end = _col.split("()")[-1]
                elif "?" in _col:
                    _end = _col.split("?")[-1]
                # NOTE(review): when a name holds neither "()" nor "?", `_end`
                # keeps the value of the previous iteration, or is undefined
                # on the first one.
                _new = _begin + _end
                special_new_cols.append(_new.strip())

            # NOTE(review): `_single_col` is not used anywhere in this branch.
            _single_col = _df.columns[7].split("()")[0].strip()

            _df.columns = special_new_cols
            # count() gives the number of non-null cells per column for each group.
            _df_grouped = _df.groupby(_grouper).count().reset_index()
            # Drop the five leading columns that were not used as grouper.
            _df_grouped.drop(_df_grouped.columns[1:6].to_list(), axis=1, inplace=True)

            # The option columns are split in two halves by position: first half -> A, second half -> B.
            _len = int((len(_df_grouped.columns)-1)/2)

            _df_focused_A = _df_grouped[_df_grouped.columns[0:_len+1]]
            _df_focused_B = _df_grouped[_df_grouped.columns[_len+1:]]
            # Put the grouper column back in front of the B half.
            _df_focused_B = pd.DataFrame(_df_grouped[_df_grouped.columns[0]]).join(_df_focused_B)

            _dic_A = {}
            _dic_B = {}

            # Totals per option over all groups; zip() stops at the shorter half.
            for _col_A, _col_B in zip(_df_focused_A.columns[1:], _df_focused_B.columns[1:]):
                _dic_A.update({_col_A.strip() : _df_focused_A[_col_A].sum()})
                _dic_B.update({_col_B.strip() : _df_focused_B[_col_B].sum()})

            _df_world_A = pd.DataFrame.from_dict(_dic_A, orient='index').reset_index()
            _df_world_A.rename(columns={"index" : _question_1, 0 :"Count"}, inplace=True)
            _df_world_A["Percent"] = _df_world_A["Count"].apply(lambda x : round(x*100 / _df_world_A["Count"].sum(), ndigits=2))
            _df_world_A.sort_values("Count", ascending=False, inplace=True)
            _df_world_A.reset_index(drop=True, inplace=True)

            _df_world_B = pd.DataFrame.from_dict(_dic_B, orient='index').reset_index()
            _df_world_B.rename(columns={"index" : _question_2, 0 :"Count"}, inplace=True)
            _df_world_B["Percent"] = _df_world_B["Count"].apply(lambda x : round(x*100 / _df_world_B["Count"].sum(), ndigits=2))
            _df_world_B.sort_values("Count", ascending=False, inplace=True)
            _df_world_B.reset_index(drop=True, inplace=True)

            return _df_world_A, _df_world_B, _df_focused_A, _df_focused_B, _question_1, _question_2

        else:

            if "()" in _df.columns[6]:
                # Option label = text after the "()" marker; question = text before it.
                _new_cols = [i.split("()")[-1].strip() for i in _df.columns[6:]]
                _single_col = _df.columns[-1].split("()")[0].strip()
            else:
                # NOTE(review): `_new_cols` is rebuilt for all columns on every
                # iteration, so only the test on the last column decides which
                # side of the "?" is kept.
                for i in _df.columns[6:]:
                    if len(i.split("?")[-1].strip()) > 0:
                        _new_cols = [i.split("?")[-1].strip() for i in _df.columns[6:]]
                    else:
                        _new_cols = [i.split("?")[0].strip() for i in _df.columns[6:]]

                _single_col = _df.columns[-1].split("?")[0].strip()

            # Put the six leading column names back in front, in their original order.
            for i in range(6):
                _new_cols.insert(i, _df.columns[i])

            _df.columns = _new_cols

            # Non-null answers per option for each group, keeping only the grouper column.
            _df_grouped = _df.groupby(_grouper).count().reset_index()
            _df_grouped.drop(_df_grouped.columns[1:6].to_list(), axis=1, inplace=True)

            _dic = {}

            # Totals per option over all groups.
            for _col in _df_grouped.columns[1:]:
                _dic.update({_col : _df_grouped[_col].sum()})

            _df_world = pd.DataFrame.from_dict(_dic, orient='index').reset_index()
            _df_world.rename(columns={"index" : _single_col, 0 :"Count"}, inplace=True)
            _df_world["Percent"] = _df_world["Count"].apply(lambda x : round(x*100 / _df_world["Count"].sum(), ndigits=2))
            _df_world.sort_values("Count", ascending=False, inplace=True)
            _df_world.reset_index(drop=True, inplace=True)

            return _df_world, _df_grouped

########################################################
# Generation of the graphs based on the transformed data
def _style_percentage_figure(_fig, _title : str, _hovertemplate : str, _hide_legend_title : bool = False):
    """Apply the layout shared by every graph of this notebook and return the figure.

    _fig               : Plotly figure to style (modified in place)
    _title             : title text, may contain the HTML-like tags Plotly supports
    _hovertemplate     : hover template applied to every trace
    _hide_legend_title : drop the legend title when it would only repeat the graph title
    """
    _layout = dict(paper_bgcolor="darkgray", title=_title, title_x=0.5)

    if _hide_legend_title:
        _layout["legend"] = dict(title=None)

    _fig.update_layout(**_layout)
    _fig.update_xaxes(title="Percentage")
    _fig.update_yaxes(title="", autorange="reversed")
    _fig.update_traces(hovertemplate=_hovertemplate)

    return _fig


def _answer_shares_by_group(_df_counts, _grouper : str):
    """Turn a wide table of counts (one row per group) into a long table of percentages.

    _df_counts must hold `_grouper` as its FIRST column followed by one count column per answer :
    the first row of each transposed slice is dropped on that assumption.

    Returns a DataFrame with the columns "Answer", "Percent" and `_grouper`, where the percentages
    of one group add up to 100 (or are all 0 when the group has no answer at all).
    """
    _frames = []

    for _element in _df_counts[_grouper]:

        _mini_df = _df_counts[_df_counts[_grouper] == _element].T
        _mini_df = _mini_df[1:].reset_index()          # [1:] removes the row holding the grouper value itself
        _mini_df[_grouper] = _element
        _mini_df = _mini_df.rename(columns={"index" : "Answer", _mini_df.columns[1] : "Percent"})

        # Computed once per group instead of once per answer, and guarded so that a group
        # without any answer yields 0% everywhere instead of a division by zero
        _total = _mini_df["Percent"].sum()

        if _total != 0:
            _mini_df["Percent"] = _mini_df["Percent"].apply(lambda x : round(x*100 / _total, ndigits=2))
        else:
            _mini_df["Percent"] = 0

        _frames.append(_mini_df)

    if not _frames:
        return pd.DataFrame(columns=["Answer", "Percent", _grouper])

    # Single concat : avoids the quadratic "concat inside the loop" and keeps "Percent" numeric
    # (no empty object-typed frame takes part in the dtype resolution)
    return pd.concat(_frames, ignore_index=True)


########################################################
# Generation of the graphs based on the transformed data
def generate_graphs_for_questions(_data , _question : int, _grouper : str):
    """Build the Plotly graphs of one survey question.

    _data     : survey DataFrame, passed untouched to create_insights_for_question
    _question : number of the question to plot
    _grouper  : name of the column used to split the answers (ex : "Age")

    Returns
        (global_fig, grouped_fig) when create_insights_for_question returns 2 items
        (global_fig_A, global_fig_B, focus_A_fig, focus_B_fig) when it returns 6 items (question made of two parts)

    Raises
        ValueError if create_insights_for_question returns anything else.
        Errors raised by create_insights_for_question itself are no longer swallowed.
    """

    # Called only once : the shape of the result tells how many graphs are needed
    _insights = create_insights_for_question(_data, _question, _grouper)

    if len(_insights) == 2:

        _df_global, _df_focused = _insights
        _title = _df_global.columns[0]

        global_fig = _style_percentage_figure(
                        px.histogram(_df_global, x="Percent", y=_title),
                        f"{_title}<br><sup>All answers</sup>",
                        "%{x}%"
                        )

        if "Percent" in _df_focused.columns:
            # Long format : the percentages are already computed, column 1 holds the answers
            _focused_title = _df_focused.columns[1]
            grouped_fig = px.histogram(_df_focused, x="Percent", y=_grouper, color=_focused_title)
        else:
            # Wide format (one count column per answer) : it has to be reshaped first
            _focused_title = _title
            _df_shares = _answer_shares_by_group(_df_focused, _grouper)
            grouped_fig = px.histogram(_df_shares, x="Percent", y=_grouper, color="Answer")

        _style_percentage_figure(
                        grouped_fig,
                        f"{_focused_title}<br><sup>Grouped by {_grouper}</sup>",
                        "%{y} : %{x}%",
                        _hide_legend_title=True # the legend title would repeat the title of the graph
                        )

        return global_fig, grouped_fig

    elif len(_insights) == 6:

        _df_world_A, _df_world_B, _df_focused_A, _df_focused_B, _question_1, _question_2 = _insights

        _figures_world = []
        _figures_focus = []

        for _df_world, _df_counts, _part_title in [(_df_world_A, _df_focused_A, _question_1),
                                                   (_df_world_B, _df_focused_B, _question_2)]:

            _figures_world.append(_style_percentage_figure(
                        px.histogram(_df_world, x="Percent", y=_df_world.columns[0]),
                        f"{_part_title}<br><sup>All answers</sup>",
                        "%{x}%"
                        ))

            _df_shares = _answer_shares_by_group(_df_counts, _grouper)

            _figures_focus.append(_style_percentage_figure(
                        px.histogram(_df_shares, x="Percent", y=_grouper, color="Answer"),
                        f"{_part_title}<br><sup>Grouped by {_grouper}</sup>",
                        "%{x}%",
                        _hide_legend_title=True
                        ))

        global_fig_A, global_fig_B = _figures_world
        focus_A_fig, focus_B_fig = _figures_focus

        return global_fig_A, global_fig_B, focus_A_fig, focus_B_fig

    raise ValueError(f"create_insights_for_question returned {len(_insights)} items, 2 or 6 were expected")

In [None]:
# Interactive explorer : pick a question and a grouping column, then click on the button to draw the graphs
output2 = Output()
start2 = Button(description="Create graphs")

_groupers = ["Age", "Gender", "Country", "Degree", "Title"]

# Questions 6 to 38 can be selected
question_widget = widgets.Dropdown(
    options=list(range(6, 39)),
    value=6,
    description='Question :',
    disabled=False,
)

grouper_widget = widgets.Dropdown(
    options=_groupers,
    value=_groupers[0],
    description='Group by :',
    disabled=False,
)

def click_start(b):
    """Callback of the "Create graphs" button : draws the graphs of the selected question inside output2.

    b : the clicked button (passed by ipywidgets, not used)
    """
    with output2:
        clear_output()
        # The function returns either 2 or 4 figures : generate them once and display whatever came back,
        # instead of guessing the number with a try/except that ran the whole generation twice
        _figures = generate_graphs_for_questions(df, question_widget.value, grouper_widget.value)
        display(*_figures)

start2.on_click(click_start)

display(question_widget, grouper_widget, start2, output2)

### Here is what the function generates for Question 32 grouped by Age

In [None]:
# Static example for readers who cannot run the widget : question 32 grouped by "Age".
# The result is unpacked into four figures, i.e. the two-part form returned by generate_graphs_for_questions.
# NOTE(review): `df` is assumed to be the survey DataFrame prepared in an earlier cell - confirm
a, b, c, d = generate_graphs_for_questions(df, 32, "Age")

display(a, b, c, d)

### MySQL remains the most used and sought-after big data storage solution. It is interesting to see that over 10% of respondents do not use any database. The age of the respondent does not seem to influence their database choice.

### And for Question 35 grouped by Title

In [None]:
# Static example : question 35 grouped by "Title".
# The result is unpacked into two figures (all answers, then answers split by the grouping column).
# NOTE(review): `df` is assumed to be the survey DataFrame prepared in an earlier cell - confirm
a, b = generate_graphs_for_questions(df, 35, "Title")

display(a, b)

### Power BI sure is loved amongst Kagglers

### Please do copy the notebook and have a play with the widget to see what other insights you can find =)