In [None]:
# Setting the notebook
import numpy as np
import pandas as pd
import os
from IPython.display import display, HTML, Javascript
import plotly.graph_objects as go
import pandas as pd
import requests
import json

# Never truncate the columns or the rows of a rendered dataframe
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Load the raw survey export (question wording and answers share the same file)
survey_export = pd.read_csv("/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv")

# The first row stores the questions: keep it aside, then keep only the
# remaining rows as answers, renumbered from 0
questions = survey_export.iloc[0]
answers = survey_export.iloc[1:].reset_index(drop=True)

In [None]:
"""
This cell contains some python wrappers for javascript based modules such as ChartJs, D3.js.
It mainly contains adapted source code from different sources referenced at the end of my notebook.
"""


# Cool way to display numbers
def countUp(html_ids, values, titles):
    """
    Displays numerical values within cards with countUp effects style.

    Args:
        html_ids (list of str): ids given to the HTML elements holding the numbers
        values (list of int): values displayed on the cards (same order as html_ids)
        titles (list of str): titles displayed on the cards (same order as html_ids)

    Returns:
        None. The cards and the script animating them are displayed in the cell output.
    """
    cards_html = ""
    animations_js = ""
    for i, html_id in enumerate(html_ids):
        # The id is quoted so that any valid HTML id can be used
        cards_html += """
        <div style="box-shadow: 0 4px 8px 0 rgba(0, 0, 0, 0.2); width: 300px; margin: auto; text-align: center;">
            <h4 style="padding-top: 20px">%(title)s</h4>
            <br>
            <h1 id="%(html_id)s" style="padding-bottom: 20px"> </h1>
        </div>
        """%{"html_id": html_id, "title": titles[i]}

        # The id is passed to CountUp as a string (looked up by the library)
        # instead of a bare identifier, which only resolves when the browser
        # exposes the element as a global variable of the same name
        animations_js += """
        var numAnim = new countUp.CountUp("%(html_id)s", %(value)s);
        numAnim.start();
        """%{"html_id": html_id, "value": values[i]}

    # countUp.js is loaded through require; the animations start once it is available
    js_string = """
    require(['https://cdnjs.cloudflare.com/ajax/libs/countup.js/2.0.7/countUp.umd.js'], function(countUp){
        %(count_ups)s
    })
    """%{"count_ups": animations_js}

    display(Javascript(js_string))
    display(
        HTML(
            """
            <div style="justify-content: space-around; display:flex; flex-wrap: wrap; align-content:space-between">
                %(html_strings)s
            </div>
            """%{"html_strings": cards_html}
        )
    )
    

    
# Use the chartjs library to plot graphs
def chartjs(chartType, chartId, data, options={}, width="700px", height="400px", displayGraph=True):
    """
    Method allowing to use chartjs modules for plotting.
    Parameters can be found at: https://www.chartjs.org/
    A simple example for the parameters structure are shown at: https://www.chartjs.org/docs/latest/#creating-a-chart

    Args:
        chartType: one of the supported chart type options (line, bar, radar, polarArea, pie, doughnut)
        chartId: id of the <canvas> element the chart is drawn on
        data: a python dictionary containing the data and the labels
              (a string is inserted as-is, as ready-made JavaScript)
        options: a python dictionary containing graph options (never modified here)
        width: default 700px
        height: default 400px
        displayGraph: when True the chart is displayed and nothing is returned,
                      when False nothing is displayed

    Returns:
        None when displayGraph is True, otherwise the tuple (html_string, js_string)
    """

    def _plain(obj):
        # numpy scalars/arrays and pandas objects expose tolist(), which yields plain python values
        if hasattr(obj, "tolist"):
            return obj.tolist()
        raise TypeError("Object of type %s cannot be sent to chartjs" % type(obj).__name__)

    def _to_js(value):
        # Strings are considered to be JavaScript already
        if isinstance(value, str):
            return value
        # JSON is valid JavaScript, unlike the python repr (True/False/None, np.float64(...))
        return json.dumps(value, default=_plain)

    js_string = """
        require(['https://cdnjs.cloudflare.com/ajax/libs/Chart.js/2.9.3/Chart.min.js', 'https://cdn.jsdelivr.net/gh/emn178/chartjs-plugin-labels/src/chartjs-plugin-labels.js'], function(chartjs, legendPlugin){
            var chartType="%(chartType)s";
            var data=%(data)s;
            var options=%(options)s;
            var chartId="%(chartId)s";

            var ctx = document.getElementById(chartId).getContext('2d');
            var myChart = new Chart(ctx, {
                type: chartType,
                data: data,
                options: options
                });
                })
            """ %{'chartType': chartType, 'data': _to_js(data), 'options': _to_js(options), 'chartId': chartId}

    html_string = """
    <canvas id="%(chartId)s" width=%(width)s height=%(height)s> </canvas>
    """%{"width": width, "height":height, "chartId": chartId}

    if displayGraph:
        display(Javascript(js_string))
        display(HTML(html_string))
    else:
        return html_string, js_string

def loadJs(jsStrings):
    """Displays, one after the other, each JavaScript source string of jsStrings."""
    for script_source in jsStrings:
        script = Javascript(script_source)
        display(script)


def pandasToChartJs(df, chartType, xAxis, yAxis, yLabels=None):
    """
    Converts dataframe columns into the "data" dictionary given to chartjs().

    Args:
        df: the dataframe holding the values to plot
        chartType: "doughnut" builds a single dataset with one colour per row,
                   any other value builds one single-coloured dataset per yAxis column
        xAxis: name of the column holding the labels
        yAxis: list of the column names holding the values
               (only the first one is used for a doughnut)
        yLabels: list of the dataset labels, defaults to the position of each yAxis column

    Returns:
        A dictionary with the keys 'labels' and 'datasets'
    """
    labels = df[xAxis].tolist()
    data = df[yAxis]
    n_rows, n_series = data.shape

    def colour(colours, position):
        # Start again from the first colour once the palette is exhausted
        return colours[position % len(colours)]

    if chartType == "doughnut":
        datasets = [{
            'label': labels,
            'backgroundColor': [colour(palette, row) for row in range(n_rows)],
            'borderColor': [colour(palette_plain, row) for row in range(n_rows)],
            'hoverBorderWidth': 2,
            'data': data.iloc[:,0].tolist()
        }]
    else:
        if yLabels is None:
            yLabels = list(range(n_series))
        # A colour list is built for each dataset so that callers may recolour single bars
        datasets = [{
            'label': yLabels[i],
            'backgroundColor': [colour(palette, i) for _ in range(n_rows)],
            'borderColor': [colour(palette_plain, i) for _ in range(n_rows)],
            'hoverBorderWidth': 2,
            'data': data.iloc[:,i].tolist()
        }  for i in range(n_series)]
    return {'labels': labels, 'datasets': datasets}


def doughnut(df, columnName, chartId):
    """
    Builds a doughnut chart showing how often each value of a column appears.

    Args:
        df: the dataframe holding the column
        columnName: name of the column whose values are counted
        chartId: id of the <canvas> element the chart is drawn on

    Returns:
        The tuple (html_string, js_string) returned by chartjs(); nothing is displayed
    """
    # Occurrences of each value. The column names are set explicitly ("index" for the
    # values, columnName for the counts) because the names produced by
    # value_counts().reset_index() depend on the pandas version.
    doughnut_data = df[columnName].value_counts().rename_axis("index").reset_index(name=columnName)
    doughnut_data = pandasToChartJs(doughnut_data, chartType="doughnut", xAxis="index", yAxis=[columnName])

    # Chart options
    options= {
        'legend': {'position': 'top'},
    }

    return chartjs(chartType="doughnut", chartId=chartId, data=doughnut_data, height='180px', options=options, displayGraph=False)


# Palette https://colorbrewer2.org/#type=qualitative&scheme=Set3&n=11
# (red, green, blue) components of the 14 colours, shared by both variants below
_PALETTE_RGB = [
    (141, 211, 199), (255, 255, 179), (190, 186, 218), (251, 128, 114), (128, 177, 211),
    (253, 180, 98), (179, 222, 105), (252, 205, 229), (217, 217, 217), (188, 128, 189),
    (204, 235, 197), (111, 211, 199), (255, 235, 159), (140, 196, 208),
]

# Translucent variant (alpha 0.7)
palette = ['rgba(%d,%d,%d,0.7)' % rgb for rgb in _PALETTE_RGB]

# Opaque variant (alpha 1) of the very same colours
palette_plain = ['rgba(%d,%d,%d,1)' % rgb for rgb in _PALETTE_RGB]

<h2> Kagglers' Profiles </h2>

Working with data involves mastering a large array of skills. Kagglers all work with data, but they don't specialize in the same things! That is why companies hire some of them as **Research Scientists** and others as **Business Analysts** or **Data Engineers**. The distinction is not only a matter of job title: it also shows in the **tools**, **programming languages** and **development frameworks** each profession requires.

If you played a few role-playing games in your childhood, you will remember that you often begin as a **Novice** and have to choose a **character class** conditioning the skills you're going to develop. For instance, if you choose to become a **Swordsman**, you will develop your **strength** and **stamina** but lack the **wisdom** and **dexterity** of your **Mage** fellow.

By analogy, you can now replace **Novice** with **Student**, **character class** with **job title**, **swordsman** with **software engineer** and **strength** with **programming skill**, and you get a sentence which still makes sense.

While studying, a student has to choose courses that raise certain skills; while working, a worker has to apply those skills and develop new ones. The whole notebook revolves around one assumption: skills define who you are.


Instead of working directly on the job titles (answers of Q5 in the survey), I will specify 3 types of profiles:
* **The Mathematician**: she/he is interested in understanding the maths behind ML algorithms. Her/His strong taste for statistics makes her/him a good candidate for a Research Scientist or Statistician position. She/He knows enough about coding to plot graphs that support her/his point.
* **The Practitioner**: she/he is interested in being practical and is the one who will be the most exposed to the client. She/He gives as much importance to the form as to the content and will rely on tools allowing her/him to tell good stories about the data she/he has. She/He does not want to get too involved in low-level frameworks (except for data visualization maybe) and relies on others to provide what she/he lacks.
* **The Developer**: she/he is the one who has a better knowledge of programming than the others, the one who will get hives if you don't follow the PEP8 conventions. She/He is good at structuring things, which makes her/him a great candidate for a software engineer position.

The profiles will simply be a mapping from the job titles (Q5) and will be organized this way ...

In [None]:
# Assign the profiles with their corresponding titles
profiles = {
    'Data Engineer': 'Developer',
    'Software Engineer': 'Developer',
    'DBA/Database Engineer': 'Developer',

    'Data Analyst': 'Practitioner',
    'Business Analyst': 'Practitioner',
    'Product/Project Manager': 'Practitioner',

    'Machine Learning Engineer': 'Mathematician',
    'Research Scientist': 'Mathematician',
    'Statistician': 'Mathematician',

    'Data Scientist': 'Data Scientist',
    'Student': 'Student'
}

# Titles absent from the mapping (and missing answers) fall into a catch-all profile.
# The column is assigned in a single step: calling fillna(inplace=True) on
# answers["Profile"] is a chained in-place update that pandas may apply to a
# temporary object, leaving the dataframe itself unchanged.
answers["Profile"] = answers["Q5"].map(profiles).fillna("Unemployed - Other")
answers[["Q5", "Profile"]].head(4)

*A "**Profile**" column is added to the original input data*

Now that the **"Profile"** is in our dataframe, I will try to confirm the assumptions I made for the profiles and analyze in more depth what are the characteristics of each profile. But beforehand ...

<a id="s1">
    <h2> ... What are 2020 kagglers made of? </h2>
</a>

In [None]:
### Display global indicators
# Answers represented with some global indicators
number_of_questions = 35
number_of_respondents = answers.shape[0]
# Thousands separated by a space (20036 -> "20 036") so that the heading below
# always agrees with the counters instead of repeating hard-coded figures
respondents_label = "{:,}".format(number_of_respondents).replace(",", " ")
display(HTML("<hr></hr>"))
display(HTML("""<h2 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Kagglers answered to the Kaggle 2020 survey where results ended with ... </h2>"""))
display(HTML("""<h3 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(150, 150, 150, 1)"> %s kagglers answering the %s questions of the survey </h3>""" % (respondents_label, number_of_questions)))
countUp(["nbQuestions", "nbRespondents"], [number_of_questions, number_of_respondents], titles=["Number of questions", "Number of respondents"])
display(HTML("<hr></hr>"))

### Display map graph
display(HTML("""<h2 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Kagglers are coming from all five continents ... </h2>"""))
display(HTML("""<h3 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(150, 150, 150, 1)"> India and the USA are taking the lead with respectively 29.2% and 11.16% of the respondents</h3>"""))

# Number of respondents per country. The column names are set explicitly ("index" for
# the country, "Q3" for the count) because the names produced by
# value_counts().reset_index() depend on the pandas version.
geographical_distribution = answers["Q3"].value_counts().rename_axis("index").reset_index(name="Q3")

# names.json maps a country code to a country name: it is inverted to find the code of a name.
# A timeout and a status check make a failing service visible instead of hanging or
# failing later on an unexpected payload.
url = "http://country.io/names.json"
names_response = requests.get(url=url, timeout=30)
names_response.raise_for_status()
countries_dict = {v: k for k, v in names_response.json().items()}
geographical_distribution["code"] = geographical_distribution["index"].map(countries_dict)

# Set the ISO 3 codes: iso3.json maps the codes found above to their ISO 3 counterparts
url = 'http://country.io/iso3.json'
iso3_response = requests.get(url=url, timeout=30)
iso3_response.raise_for_status()
resp = iso3_response.json()

# The undetected countries for ISO 3 mapping
countries_dict_comp = {
    "United States of America": "USA",
    "Other": "ATA",
    "United Kingdom of Great Britain and Northern Ireland": "GBR",
    "Iran, Islamic Republic of...": "IRN",
    "Viet Nam": "VNM",
    "Republic of Korea": "KOR"
}
geographical_distribution["code"] = geographical_distribution["code"].replace(resp)
# Countries without a code fall back on the manual mapping above
geographical_distribution["code"] = np.where(geographical_distribution["code"].isna(), geographical_distribution["index"].replace(countries_dict_comp), geographical_distribution["code"])
# Counts are turned into percentages of the respondents
geographical_distribution["Q3"] = np.round(geographical_distribution["Q3"]/geographical_distribution["Q3"].sum()*100, 2)

# Configuration
config = dict(
    {'scrollZoom': False,
     'modeBarButtonsToRemove': ['toImage', 'pan2d', 'select2d', 'lasso2d']
    }
)

# Kagglers location map
fig = go.Figure(data=go.Choropleth(
    locations = geographical_distribution['code'],
    z = geographical_distribution['Q3'],
    text = geographical_distribution['index'],
    colorscale = [[i/5.0, palette[i]] for i in range(0, 6)],
    autocolorscale=False,
    reversescale=False,
    marker_line_color='darkgray',
    marker_line_width=0.5,
    colorbar_ticksuffix = '%',
    colorbar_title = '% of Kagglers',
    hovertemplate = "<b>%{text}</b><br><br>" + "<b>Percentage of Kagglers: %{z}%<b> <extra></extra>"
))

fig.update_layout(
    geo=dict(
        showframe=False,
        showcoastlines=False,
        projection_type='equirectangular'
    ),
    margin=dict(t=10, b=10)
)

fig.show(config=config)

display(HTML("""<div style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-style:italic; color:rgba(50, 50, 50, 1)"> Note that people coming from "Other" countries actually live in Antartica ! Joke aside, 6.93% of the kagglers living in other countries can't be neglected</div>"""))

display(HTML("<hr></hr>"))
display(HTML("""<h2 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Kagglers are fairly young ... </h2>"""))
display(HTML("""<h3 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(150, 150, 150, 1)"> Youngsters are very welcome here since the age groups' Top 3 is (in order) 25-29, 22-24 and 18-21</h3>"""))
# Age BarChart
# The column names are set explicitly ("index" for the age group, "Q1" for the count)
# because the names produced by value_counts().reset_index() depend on the pandas version.
age_distribution = answers["Q1"].value_counts().rename_axis("index").reset_index(name="Q1")
# The age groups are sorted from the youngest to the oldest rather than by count
age_distribution["index"] = pd.Categorical(age_distribution["index"]).reorder_categories(['18-21', '22-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54', '55-59', '60-69', '70+'])
age_distribution = age_distribution.sort_values("index")
age_distribution_data = pandasToChartJs(age_distribution, chartType="bar", xAxis="index", yAxis=["Q1"], yLabels=["# of Kagglers"])
# Highlight the first three bars (the three youngest age groups once sorted)
for i in range(0, 3):
    age_distribution_data['datasets'][0]["backgroundColor"][i] = palette[1]
    age_distribution_data['datasets'][0]["borderColor"][i] = palette_plain[1]
# Chart options
options= {'legend': {'display': 0},
          'scales': {'yAxes':[{'scaleLabel':{'display': 1, 'labelString':'Number of Respondents'}}],
                     'xAxes':[{'scaleLabel':{'display': 1, 'labelString':'Age Ranges'}}]},
          'plugins':{'labels': {'render':'value'}}
         }
# Show Age distibution
bar_html, bar_js = chartjs(chartType="bar", chartId="age_distribution", data=age_distribution_data, options=options, height='auto', width='auto', displayGraph=False)
display(Javascript(bar_js))
display(HTML(bar_html))

# Gender
display(HTML("<hr></hr>"))
display(HTML("""<h2 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Kagglers are mainly men ...</h2>"""))
display(HTML("""<h3 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(150, 150, 150, 1)"> A convergence to parity seems to be distinguishable with the coming generation </h3>"""))
# Only the respondents who answered "Man" or "Woman" are compared
gender = answers[(answers["Q2"] == "Man") | (answers["Q2"] == "Woman")]
# Number of respondents for each (age group, gender) pair
gender_counts = gender.groupby(["Q1", "Q2"]).size()
# Share of each gender within its age group. transform() keeps the (Q1, Q2) index as it is,
# whereas groupby(...).apply() may add the group key as an extra index level depending on
# the pandas version, which makes reset_index() fail.
gender = (100 * gender_counts / gender_counts.groupby(level="Q1").transform("sum")).reset_index(name="percentage")

men = gender[gender["Q2"] == 'Man']
women = gender[gender["Q2"] == 'Woman']
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=men["percentage"],
    y=men["Q1"],
    name='Percentage of men',
    marker=dict(
        color=palette[0],
        line_color=palette_plain[0],
    )
))
fig.add_trace(go.Scatter(
    x=women["percentage"],
    y=women["Q1"],
    name='Percent of Women',
    marker=dict(
        color=palette[2],
        line_color=palette_plain[2]
    )
))

fig.update_traces(mode='markers', marker=dict(line_width=1, symbol='circle', size=16))

fig.update_layout(
    yaxis=dict(showspikes=True, title="Kaggler's Age"),
    xaxis=dict(
        showgrid=False,
        showline=True,
        linecolor='rgb(102, 102, 102)',
        tickfont_color='rgb(102, 102, 102)',
        showticklabels=True,
        dtick=10,
        ticks='outside',
        tickcolor='rgb(102, 102, 102)',
        showspikes=True,
    ),
    margin=dict(t=0),
    legend=dict(
        font_size=10,
        yanchor='middle',
        xanchor='right',
    ),
    width=800,
    height=600,
    paper_bgcolor='white',
    plot_bgcolor='white',
    hovermode='closest',
)
# Parity line
fig.add_vline(x=50, line_width=2, line_dash="dash", line_color="rgba(204, 204, 204, 0.95)")
fig.show(config={'displayModeBar': False})

That said, I invite you to have a look at the following notebooks, which offer the univariate and bivariate analyses you might be looking for:
* https://www.kaggle.com/subinium/kaggle-2020-visualization-analysis
* https://www.kaggle.com/frankmollard/survey-2020-analysis

<a id="s2"> <h2>The Profiles</h2> </a>

Is it necessary to conduct the study through "Profiles"? The short answer is no, but profiles simplify the analysis. Instead of describing what each job consists of, I chose to tell you what the job relies on. If you ask me what a Research Scientist does, I will answer that she/he does Maths. If you ask me what a Statistician does, I will still answer that she/he does Maths. That is because Maths is the raw material of both jobs and what they have in common.
Making an EDA with 3 groups (the number of Profiles) instead of one group per job title makes the comparisons easier and allows me to focus on the "core skills" that define a range of jobs.

The drawback of grouping job titles into "Profiles" might be oversimplification! Creating bigger groups necessarily leads to a loss of information. While we surely lost information down the road, you might be interested in the reason [why the grouping we chose limits the loss compared to other groupings](#groups).

In [None]:
def composition_doughnut(df, chartId):
    """
    Builds a half-doughnut chart showing the share of each job title (Q5) within df.

    Args:
        df: the answers of the respondents to describe (must hold the "Q5" column)
        chartId: id of the <canvas> element the chart is drawn on

    Returns:
        The tuple (html_string, js_string) returned by chartjs(); nothing is displayed
    """
    # Number of respondents for each job title
    titles_composition = df.groupby("Q5").size().reset_index(name="count")
    titles_composition_chartjs = pandasToChartJs(titles_composition, chartType="doughnut", xAxis="Q5", yAxis=["count"])
    titles_composition_options = {
        'legend': {'position': 'bottom'},
        # Both set to pi: only half of the circle is drawn.
        # np.pi replaces np.math.pi, the np.math alias being absent from recent numpy versions.
        'circumference': np.pi,
        'rotation': np.pi,
        'cutoutPercentage': 70,
        'plugins':{
            'labels': {
                'render': 'percentage',
                'fontColor': ['green', '#FFC270', 'purple'],
                'precision': 2,
                'arc': 1,
                'fontSize': 8
            }
        }
    }
    return chartjs("doughnut", chartId, titles_composition_chartjs, displayGraph=False, options=titles_composition_options, height=250, width='auto')

def skills_radar(df, chartId, color=0, return_chart_data=False, label=''):
    """
    Builds a radar chart of the average number of tools selected by the respondents of df.

    Args:
        df: the answers of the respondents to describe
        chartId: id of the <canvas> element the chart is drawn on
                 (unused when return_chart_data is True)
        color: position, in palette and palette_plain, of the colour of the dataset
        return_chart_data: when True, only the chart data dictionary is returned
        label: label of the dataset

    Returns:
        The chart data dictionary ('labels' and 'datasets') when return_chart_data is True,
        otherwise the tuple (html_string, js_string) returned by chartjs(); nothing is displayed
    """
    def answers_count(df, question_column, question_parts_number):
        """
        Average number of answers given by a respondent to a multiple choice question.
        An answer is counted each time a part of the question is not NA.
        Ex: If Q7 is chosen, counts the values which are not NA for Q7_Part_1 to Q7_Part_12 (including Q7_OTHER) if question_parts_number=13
        """
        # question_parts_number is exclusive: the parts 1 to question_parts_number - 1 are read
        part_columns = ["{}_Part_{}".format(question_column, i) for i in range(1, question_parts_number)]
        sub_df = df[part_columns + ["{}_OTHER".format(question_column)]]
        nb_answers = sub_df.notna().sum(axis=1)
        # A plain python float is returned: the repr of a numpy scalar is not always a
        # bare number, which matters when the value is embedded in generated JavaScript
        return float(np.round(nb_answers.mean(), 2))

    programming_languages = answers_count(df, "Q7", 13)
    programming_environments = answers_count(df, "Q9", 12)
    plotting_libraries = answers_count(df, "Q14", 12)
    ml_libraries = answers_count(df, "Q16", 16)
    cloud_computing_platforms = answers_count(df, "Q26_A", 12)
    databases = answers_count(df, "Q29_A", 18)
    bi_tools = answers_count(df, "Q31_A", 15)
    # One label for each value of the dataset, in the same order
    radar_labels = ["#Programming Languages", "#IDEs", "#Plotting Libraries", "#ML Libraries", "#Cloud Computing Platforms", "#Big Data Products", "#BI Tools"]
    radar_dataset = [{
        'data': [programming_languages, programming_environments, plotting_libraries, ml_libraries, cloud_computing_platforms, databases, bi_tools],
        'backgroundColor': palette[color],
        'borderColor': palette_plain[color],
        'hoverBackgroundColor': palette[color],
        'label': label
    }]
    options= {'legend': {'display': 0}, 'scale':{'ticks': {'suggestedMin': 0, 'suggestedMax': 3.5}}}
    data = {
        'labels': radar_labels,
        'datasets': radar_dataset
    }
    if return_chart_data:
        return data
    else:
        return chartjs('radar', chartId, data=data, height=250, displayGraph=False, options=options, width='auto')
    
    
# Mathematicians stats
mathematicians = answers[answers["Profile"] == "Mathematician"]
# Number of mathematicians in the dataset
number_of_mathematicians = mathematicians.shape[0]
# Proportion of mathematicians (percentage of all the respondents)
prop_of_mathematicians = np.round((number_of_mathematicians/answers.shape[0])*100, 2)
# Doughnut charts
mathematicians_doughnut_html, mathematicians_doughnut_js = composition_doughnut(mathematicians, "mathematiciansDoughnut")
# Skill radar
mathematicians_radar_html, mathematicians_radar_js =  skills_radar(mathematicians, "mathematicianRadar", color=0, label="Mathematician")

# HTML output: heading, picture + description, then the two charts side by side.
# The description no longer ends with a duplicated "his point." fragment.
html_string = """
<table>
    <tr>
        <h2 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> The Mathematician </h2>
        <br></br>
        <h3 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(150, 150, 150, 1)"> Number of Mathematicians: %(number_of_mathematicians)s among %(number_of_observations)s observations (%(prop_of_mathematicians)s%%) </h3>
    </tr>
    <tr>
        <td><img src="https://image.flaticon.com/icons/png/512/827/827371.png" width="600px"></td>
        <td>
            <div align='justify' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif;">
                <h3 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Description </h3>
                <br></br>
                The Mathematician is interested in the comprehension of the maths behind ML algorithms.
                Her/His strong taste for statistics makes him a good candidate for Research scientist or Statistician.
                She/He knows enough about coding to plot graphs aimed at showing her/his point.
            </div>
        </td>
    </tr>
    <tr>
        <td>
            <div>
                <h3 align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)">  Mathematicians are composed of ... </h3>
                %(mathematicians_doughnut_html)s
            </div>
            </td>
        <td>
            <div>
                <h3 align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Skills </h3>
                <p align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:italic; color:rgba(150, 150, 150, 1)"> Each point corresponds to a computed average </p>
                %(radar_html)s
            </div>
        </td>
    </tr>
</table>


"""%{"radar_html": mathematicians_radar_html,
     "number_of_mathematicians": number_of_mathematicians,
     "number_of_observations": answers.shape[0],
     "prop_of_mathematicians": prop_of_mathematicians,
     "mathematicians_doughnut_html": mathematicians_doughnut_html
    }

# The scripts drawing the charts are displayed along with the canvases they target
display(Javascript(mathematicians_radar_js))
display(Javascript(mathematicians_doughnut_js))
display(HTML(html_string))

In [None]:
# Practitioner stats
practitioners = answers[answers["Profile"] == "Practitioner"]
# Number of practitioners in the dataset
number_of_practitioners = practitioners.shape[0]
# Proportion of practitioners (percentage of all the respondents)
prop_of_practitioners = np.round((number_of_practitioners/answers.shape[0])*100, 2)
# Doughnut charts
practitioners_doughnut_html, practitioners_doughnut_js = composition_doughnut(practitioners, "practitionerDoughnut")
# Skill radar
practitioners_radar_html, practitioners_radar_js =  skills_radar(practitioners, "practitionerRadar", color=1, label='Practitioner')

# HTML output: heading, picture + description, then the two charts side by side
html_string = """
<table>
    <tr>
        <h2 align="left" style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> The Practitioner </h2>
        <br></br>
        <h3 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(150, 150, 150, 1)"> Number of Practitioners: %(number_of_practitioners)s among %(number_of_observations)s observations (%(prop_of_practitioners)s%%)</h3>
    </tr>
    
    <tr>
        <td><img src="https://image.flaticon.com/icons/png/512/1754/1754024.png" width="600px"></td>
        <td>
            <div align='justify' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif;">
            <h3 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Description </h3>
            <br></br>
            The Practitioner is interested in being practical and is the one who will be the more exposed to the client.
            She/He gives as much importance for the form as the content and will rely on tools allowing her/him to tell good stories about the data she/he has.
            She/He does not want to get too involved in low-level frameworks (except for data visualization maybe) and rely on others to provide him what he lacks.
            </div>
        </td>
    </tr>
    
    <tr>
        <td>
            <div>
                <h3 align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Practitioners are composed of ... </h3>
                %(practitioners_doughnut_html)s 
            </div>
        </td>
        <td>
            <div>
                <h3 align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Skills </h3>
                <p align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:italic; color:rgba(150, 150, 150, 1)"> Each point corresponds to a computed average </p>
                %(radar_html)s
            </div>
        </td>
    </tr>
    
</table>

"""%{"radar_html": practitioners_radar_html,
     "number_of_practitioners": number_of_practitioners,
     "number_of_observations": answers.shape[0],
     "prop_of_practitioners": prop_of_practitioners,
     "practitioners_doughnut_html": practitioners_doughnut_html,
    }

# The scripts drawing the charts are displayed along with the canvases they target
display(Javascript(practitioners_radar_js))
display(Javascript(practitioners_doughnut_js))
display(HTML(html_string))

In [None]:
# Developer stats
developers = answers[answers["Profile"] == "Developer"]
# Number of developers in the dataset
number_of_developers = developers.shape[0]
# Proportion of developers (percentage of all the respondents)
prop_of_developers = np.round((number_of_developers/answers.shape[0])*100, 2)
# Doughnut charts
developers_doughnut_html, developers_doughnut_js = composition_doughnut(developers, "developersDoughnut")
# Skill radar
developers_radar_html, developers_radar_js = skills_radar(developers, "developerRadar", color=2, label='Developer')

# HTML output: heading, picture + description, then the two charts side by side
html_string = """
<table>
    <tr>
        <h2 align="left" style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> The Developer </h2>
        <br></br>
        <h3 align="left" style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(150, 150, 150, 1)"> Number of Developers: %(number_of_developers)s among %(number_of_observations)s observations (%(prop_of_developers)s%%) </h3>
    </tr>
    <tr>
        <td><img src="https://image.flaticon.com/icons/png/512/1688/1688400.png" width="600px"></td>
        <td>
            <div align='justify' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif;">
            <h3 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Description </h3>
            <br></br>
            The Developer is the one who has a better knowledge of programming than the others.
            The one who will get hives if you don't follow the PEP8 conventions.
            She/He is good at structuring things making her/him to be a great contender for a software engineer position.
            </div>
        </td>
    </tr>
    
    <tr>
        <td>
            <div>
                <h3 align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Developers are composed of ... </h3>
                %(developers_doughnut_html)s 
            </div>
        </td>
        <td>
            <div>
                <h3 align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Skills </h3>
                <p align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:italic; color:rgba(150, 150, 150, 1)"> Each point corresponds to a computed average </p>
                %(radar_html)s
            </div>
        </td>
    </tr>
</table>


"""%{"radar_html": developers_radar_html,
     "number_of_developers": number_of_developers,
     "number_of_observations": answers.shape[0],
     "prop_of_developers": prop_of_developers,
     "developers_doughnut_html": developers_doughnut_html,
    }

# The scripts drawing the charts are displayed along with the canvases they target
display(Javascript(developers_radar_js))
display(Javascript(developers_doughnut_js))
display(HTML(html_string))

The 3 types of profiles I defined cover 40.14% of the dataset (12.63% are Developers, 14.8% are Practitioners and 12.71% are Mathematicians). Each profile is represented by a radar chart where each spoke represents the average number of tools used for its respective label, e.g. the Developer uses 3.06 programming languages on average on a regular basis. To understand how the values are computed and where they come from, you can have a look at the [appendices of the notebook](#radar) (or follow the source code!)

Each profile performs best in different areas and just by overlaying the graphs we may see which components are standing out.

In [None]:
# Chart data of each profile (the chart id is not used when only the data is requested)
m_data = skills_radar(mathematicians, "mathematicianRadar", color=0, return_chart_data=True, label='Mathematician')
p_data = skills_radar(practitioners, "practitionerRadar", color=1, return_chart_data=True, label='Practitioner')
d_data = skills_radar(developers, "developerRadar", color=2, return_chart_data=True, label='Developer')

# The mathematician dataset, then the developer one, are stacked onto the practitioner chart data
p_data["datasets"].extend(extra["datasets"][0] for extra in (m_data, d_data))

overlay_heading = """
<h3 align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Skills overlay</h3>
<p align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(150, 150, 150, 1)"> Each point corresponds to a computed average </p>
"""
display(HTML(overlay_heading))
chartjs(chartType='radar', chartId="stackRadar", data=p_data, height=250, width='auto')

* **The Practitioners** are slightly more familiar with BI tools than the other profiles and make less use of the tools in all the other areas. A possible explanation is that practitioners spend part of their working time on tasks unrelated to data (team management, communication-related tasks ...)

* **The Developers** use a wide range of programming languages to complete their tasks and use them in the IDEs suited to them. They also lead in the daily use of Big Data and database-related products.

* **The Mathematicians** are distinguishing themselves by their more extensive use of Machine Learning Libraries and the knowledge of around 2 plotting libraries to present their works.

<a id="s1">
    <h2> What about the other kagglers? </h2>
</a>

So far, I only covered a specific part of the respondents. I purposely left aside the other kinds of kagglers and I will now get back to two of them.


The first one I want to get back to is the "Data Scientist". One of the main reasons why I didn't categorize the data scientist is that the term itself is harder to define. The role of a data scientist is somewhat hybrid, and a data scientist has to possess a wide range of skills to be defined as such.

<img src="https://images.squarespace-cdn.com/content/v1/5150aec6e4b0e340ec52710a/1364352051365-HZAS3CLBF7ABLE3F5OBY/ke17ZwdGBToddI8pDm48kB2M2-8_3EzuSSXvzQBRsa1Zw-zPPgdn4jUwVcJE1ZvWQUxwkmyExglNqGp0IvTJZUJFbgE-7XRK3dMEBRBhUpxPe_8B-x4gq2tfVez1FwLYYZXud0o-3jV-FAs7tmkMHY-a7GzQZKbHRGZboWC-fOc/Data_Science_VD.png?format=1500w"/>

>*The Data Scientist is theoretically at the crossroads of the Venn diagram. The representation is pretty popular and widely used to explain what it takes to be a data scientist.*

The graph also gives you the reason why I inferred the 3 types of profiles! If everything goes well, we should see that Kaggle's Data Scientists are in reality mathematicians, developers and practitioners all at the same time.

In [None]:
# --- Data Scientist card: headline figures, description and skills radar ---
# Respondents whose derived "Profile" is "Data Scientist"
data_scientists = answers[answers["Profile"] == "Data Scientist"]
# Head count and share (in %, 2 decimals) of Data Scientists among all respondents
number_of_data_scientists = data_scientists.shape[0]
prop_of_data_scientists = np.round((number_of_data_scientists/answers.shape[0])*100, 2)
# skills_radar hands back the HTML holder and the JS that draws the radar into it
data_scientists_radar_html, data_scientists_radar_js =  skills_radar(data_scientists, "scientistRadar", color=3, label='Data Scientist')

# HTML output ("%%" is a literal percent sign in the %-formatted template).
# The "Skills" subtitle uses font-style:italic: "italic" is not a valid
# font-weight value and was ignored by browsers.
html_string = """
<table>
    <tr>
        <h2 align="left" style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> The Data Scientist </h2>
        <br></br>
        <h3 align="left" style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(150, 150, 150, 1)"> Number of Data Scientists: %(number_of_data_scientists)s among %(number_of_observations)s observations (%(prop_of_data_scientists)s%%) </h3>
    </tr>
    <tr>
        <td><img src="https://cdn3.iconfinder.com/data/icons/casino-and-gambling-icons/505/Jack-2-512.png" width="400px" height="100px"></td>
        <td>
            <div align='justify' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif;">
            <h3 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Description </h3>
            <br></br>
            A Jack of all trades who is proficient in statistical analysis, who has great insights on the business impact and who is creative enough to develop the solutions on his own.
            </div>
        </td>
    </tr>
    
    <tr>
        <td colspan="2">
            <div>
                <h3 align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Skills </h3>
                <p align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-style:italic; color:rgba(150, 150, 150, 1)"> Each point corresponds to a computed average </p>
                %(radar_html)s
            </div>
        </td>
    </tr>
</table>


"""%{"radar_html": data_scientists_radar_html,
     "number_of_data_scientists": number_of_data_scientists,
     "number_of_observations": answers.shape[0],
     "prop_of_data_scientists": prop_of_data_scientists,
    }

# Emit the chart script, then the card containing the radar's HTML holder
display(Javascript(data_scientists_radar_js))
display(HTML(html_string))

Again, just by overlaying the Data Scientist radar chart over the other profiles' we obtain:

In [None]:
# Overlay the Data Scientist radar on top of the profiles already stacked in p_data.
ds_data = skills_radar(data_scientists, "scientistRadar", color=3, return_chart_data=True, label='Data Scientist')
# Build a fresh payload rather than appending to p_data in place: mutating the
# shared dict made this cell non-idempotent (every re-run stacked one more
# "Data Scientist" dataset onto the chart).
stack_data = {**p_data, "datasets": list(p_data["datasets"]) + [ds_data["datasets"][0]]}
chartjs('radar', "stackRadar2", data=stack_data, height=250, width='auto')

**The Data Scientists** are manipulating a lot of tools in many different contexts and usually more than any of the other profiles. This is particularly true for both ML libraries and Plotting libraries where the average number of ML and Plotting libraries used on a daily basis are respectively 3.26 and 2.64.

Leaving aside the kagglers with job titles labelled as "Currently not employed", "Other", and "Nan" due to the inherent heterogeneity within these groups, the remaining job title is the one labelled as **Student**.

In [None]:
# --- Student card: headline figures, description and skills radar ---
# Respondents whose derived "Profile" is "Student"
students = answers[answers["Profile"] == "Student"]
# Head count and share (in %, 2 decimals) of Students among all respondents
number_of_students = students.shape[0]
prop_of_students = np.round((number_of_students/answers.shape[0])*100, 2)
# skills_radar hands back the HTML holder and the JS that draws the radar into it
students_radar_html, students_radar_js =  skills_radar(students, "studentRadar", color=4, label='Student')

# HTML output ("%%" is a literal percent sign in the %-formatted template).
# The template now contains the "Skills" row with the %(radar_html)s placeholder:
# radar_html was passed to the formatting dict and the radar script was displayed,
# but without the placeholder the chart had no element to be drawn into.
html_string = """
<table>
    <tr>
        <h2 align="left" style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> The Student </h2>
        <br></br>
        <h3 align="left" style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(150, 150, 150, 1)"> Number of Students: %(number_of_students)s among %(number_of_observations)s observations (%(prop_of_students)s%%) </h3>
    </tr>
    <tr>
        <td><img src="https://image.flaticon.com/icons/png/512/3750/3750011.png" width="400px" height="100px"></td>
        <td>
            <div align='justify' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif;">
            <h3 style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Description </h3>
            <br></br>
            The student is the starting point of the data journey.
            She/he will learn through courses different aspects of data and will follow the path of the Mathematician, Practitioner or Developer.
            As a learner, every possibilities are offered to her/him !
            </div>
        </td>
    </tr>

    <tr>
        <td colspan="2">
            <div>
                <h3 align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-weight:normal; color:rgba(50, 50, 50, 1)"> Skills </h3>
                <p align='left' style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-style:italic; color:rgba(150, 150, 150, 1)"> Each point corresponds to a computed average </p>
                %(radar_html)s
            </div>
        </td>
    </tr>
</table>


"""%{"radar_html": students_radar_html,
     "number_of_students": number_of_students,
     "number_of_observations": answers.shape[0],
     "prop_of_students": prop_of_students,
    }

# Emit the chart script, then the card containing the radar's HTML holder
display(Javascript(students_radar_js))
display(HTML(html_string))

The students are diverging from their working counterparts by:
* The fact that they are not using any BI tool, Big Data Infrastructures nor Cloud Computing Platforms ([following the survey's logic](https://www.kaggle.com/c/kaggle-survey-2020/data))

* Having the highest rate of kagglers with a Bachelor's degree and people in some college/university without earning a bachelor's degree probably due to the fact that the students are composed of freshmen and students who are still studying to obtain a master degree or a doctoral degree 

In [None]:
# Stacked horizontal bar: share (in %) of each education level (Q4) within every
# profile. The "Unemployed - Other" bucket is left out of the comparison.
stackbar_degrees = answers[answers["Profile"] != 'Unemployed - Other']
# Number of respondents per (Q4, Profile) pair
degree_counts = stackbar_degrees.groupby(["Q4", "Profile"]).size()
# transform("sum") broadcasts each profile's total back onto the (Q4, Profile) index.
# The previous groupby(level=...).apply(...) was a transform-like apply: pandas >= 2.0
# prepends the group key to its result, and reset_index then fails on the duplicated
# "Profile" level.
profile_totals = degree_counts.groupby(level="Profile").transform("sum")
stackbar_degrees = (100 * degree_counts / profile_totals).reset_index(name="percentage")

# Cartesian product of profiles and Q4, so that every profile has one value per
# degree (absent combinations become 0 after the join below)
profiles = stackbar_degrees["Profile"].unique()
q4 = ['Bachelor’s degree', 'Master’s degree', 'Doctoral degree', 'I prefer not to answer', 
      'No formal education past high school', 'Some college/university study without earning a bachelor’s degree',
      'Professional degree']
index = pd.MultiIndex.from_product([q4, profiles], names = ["Q4", "Profile"])
cartesian_product = pd.DataFrame(index = index).reset_index()

# Left join: keeps the (degree, profile) ordering of the cartesian product
stackbar_degrees = cartesian_product.merge(stackbar_degrees, how='left', on=["Q4", "Profile"]).fillna(0)
stackbar_degrees["percentage"] = np.round(stackbar_degrees["percentage"], 2)

# One bar per profile, one stacked dataset per degree
labels = stackbar_degrees["Profile"].unique().tolist()
datasets = []

for i, degree in enumerate(stackbar_degrees["Q4"].unique()):
    degree_df = stackbar_degrees[stackbar_degrees["Q4"] == degree]
    datasets.append({
        'label': degree,
        'data': degree_df["percentage"].to_list(),
        'backgroundColor': palette[i],
        'hoverBackgroundColor': palette[i],
        'hoverBorderWidth': 1
    })
    
# Chart options: both axes stacked, x axis pinned to 0-100 %
options = {
    'scales': {
        'xAxes': [{ 
            'stacked': 1,
            'ticks': {
                'min': 0,
                'max': 100,
            },
            'scaleLabel': {
                'display': 1,
                'labelString': 'Degrees Obtained (%)',
            },
            }],
        'yAxes': [{ 
            'stacked': 1, 
            'gridLines': { 'display': 0 },
        }],
    },
    'plugins':{
            'labels': {
                'render': 'percentage',
                'precision': 2,
                'fontSize': 8,
            }
        },
}
# Intermediates are no longer needed once labels/datasets are built
del stackbar_degrees, degree_counts, profile_totals
chartjs(chartType="horizontalBar", chartId="hbarDegree", data={'labels': labels, 'datasets': datasets}, options=options, height=300)

* Being the youngest among all the other groups of kagglers

In [None]:
# Stacked horizontal bar: share (in %) of each age range (Q1) within every
# profile. The "Unemployed - Other" bucket is left out of the comparison.
stackbar_ages = answers[answers["Profile"] != 'Unemployed - Other']
# Number of respondents per (Q1, Profile) pair
age_counts = stackbar_ages.groupby(["Q1", "Profile"]).size()
# transform("sum") broadcasts each profile's total back onto the (Q1, Profile) index.
# The previous groupby(level=...).apply(...) was a transform-like apply: pandas >= 2.0
# prepends the group key to its result, and reset_index then fails on the duplicated
# "Profile" level.
profile_totals = age_counts.groupby(level="Profile").transform("sum")
stackbar_ages = (100 * age_counts / profile_totals).reset_index(name="percentage")

# Cartesian product of profiles and Q1, so that every profile has one value per
# age range (absent combinations become 0 after the join below)
profiles = stackbar_ages["Profile"].unique()
q1 = stackbar_ages["Q1"].unique()
index = pd.MultiIndex.from_product([q1, profiles], names = ["Q1", "Profile"])
cartesian_product = pd.DataFrame(index = index).reset_index()

# Left join: keeps the (age range, profile) ordering of the cartesian product
stackbar_ages = cartesian_product.merge(stackbar_ages, how='left', on=["Q1", "Profile"]).fillna(0)
stackbar_ages["percentage"] = np.round(stackbar_ages["percentage"], 2)

# One bar per profile, one stacked dataset per age range
labels = stackbar_ages["Profile"].unique().tolist()
datasets = []

for i, age in enumerate(stackbar_ages["Q1"].unique()):
    age_df = stackbar_ages[stackbar_ages["Q1"] == age]
    datasets.append({
        'label': age,
        'data': age_df["percentage"].to_list(),
        'backgroundColor': palette[i],
        'hoverBackgroundColor': palette[i],
        'hoverBorderWidth': 1
    })
    
# Chart options: both axes stacked, x axis pinned to 0-100 %
options = {
    'scales': {
        'xAxes': [{ 
            'stacked': 1,
            'ticks': {
                'min': 0,
                'max': 100,
            },
            'scaleLabel': {
                'display': 1,
                'labelString': 'Age Range (%)',
            },
            }],
        'yAxes': [{ 
            'stacked': 1, 
            'gridLines': { 'display': 0 },
        }],
    },
    'plugins':{
            'labels': {
                'render': 'percentage',
                'precision': 2,
                'fontSize': 8,
            }
        },
}
# Intermediates are no longer needed once labels/datasets are built
del stackbar_ages, age_counts, profile_totals
chartjs(chartType="horizontalBar", chartId="hbarAge", data={'labels': labels, 'datasets': datasets}, options=options, height=300)
display(HTML("""<div style="font-family: Helvetica Neue, Helvetica, 'Arial', sans-serif; font-style:italic; color:rgba(50, 50, 50, 1)"> Also note that the data scientists are the eldest (and thus, overall more exeperienced) which concurs to their mastering of a lot of tools !</div>"""))


<a id="s2"> <h2>Sources</h2> </a>
1. Source of the Data Science Venn Diagram: http://drewconway.com/zia/2013/3/26/the-data-science-venn-diagram
2. Some love for ChartJS and its documentation: https://www.chartjs.org/docs/latest/
3. Python wrapping of ChartJs from: https://gist.github.com/fabriziopandini/7e8efdd7063a518a2d2d

<a> <h2>Appendices</h2> </a>

<a id="radar"> <h4>What are the values in the radar charts ? </h4> </a>
Skills levels are coming from the answers of multiple choice questions and are computed the following way:
1. Select the data from a specific multiple choice question (i.e all the columns beginning with *Q7_Part_** for Q7. What programming languages do you use on a regular basis?)
2. Count the number of non NaN answers for each observation (i.e if Python and R are used on regular basis for an observation, the count will be 2)
3. Average the counts over all the observations (i.e the average number of programming languages used is represented)

Skills level are computed for each profiles and each spoke refers to a question of the survey:
- **Programming Languages**: the average number of programming languages used on a daily basis refers to **Q7**
- **IDEs**: the average number of integrated development environments (IDE's) used on a regular basis refers to **Q9**
- **Plotting Libraries**: the average number of data visualization libraries or tools used on a regular basis refers to **Q14**
- **ML Libraries**: the average number of machine learning frameworks used on a regular basis refers to **Q16**
- **Cloud Computing Platforms**: the average number of cloud computing platforms used on a daily basis refers to **Q26-A**
- **Big Data Products**: the average number of big data products used on a daily basis refers to **Q29-A**
- **BI Tools**: the average number of business intelligence tools used on a daily basis refers to **Q31-A**

<a id="groups"> <h4> Why does the grouping we chose minimize the loss compared to other groupings?</h4> </a>
A little note on why I split the data the way I did. Each profile is an aggregation of job titles. The groups I made can be considered fairly good if they are distinct from each other. To verify that, I will use a method which is far from being the most academic one and does not constitute an accurate proof, but will do the job to show my point.

So here is the strategy:
1. I will create new groups using the titles. These new groups will be compared to the Mathematician/Practitioner/Developer group I chose for the notebook.
2. I will use an ML algorithm with a k-fold cross-validation and see which grouping gets the best accuracy

The model with the best accuracies should be the one where the features have more discriminative power

In [None]:
import lightgbm as lgbm
import pprint
from sklearn import preprocessing
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

def random_groups():
    """
    Randomly split the nine job titles into three groups of three.

    Returns:
        dict: maps every job title to "Group 1", "Group 2" or "Group 3";
        the keys are inserted in the shuffled order.
    """
    titles = ['Data Engineer', 'Software Engineer', 'DBA/Database Engineer', 
              'Data Analyst', 'Business Analyst', 'Product/Project Manager',
              'Machine Learning Engineer', 'Research Scientist','Statistician']
    # In-place shuffle driven by numpy's global random state
    np.random.shuffle(titles)
    # Positions 0-2 land in group 1, 3-5 in group 2, 6-8 in group 3
    return {
        title: "Group {}".format(position // 3 + 1)
        for position, title in enumerate(titles)
    }

In [None]:
# Reload the raw survey responses and discard row 0 (the question texts);
# this replaces the `answers` frame used by the cells above.
answers = pd.read_csv("/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv")
answers = answers.drop(0).reset_index(drop=True)

def model(groups):
    """
    Prepare the survey answers for an LGBMClassifier (minimal preprocessing).

    Args:
        groups (list of dict): only the first element is used; it maps a job
            title (column Q5) to the name of its group.

    Returns:
        tuple: (features, label) where features is a DataFrame of label-encoded
        columns scaled to [-1, 1] (the "path" column removed) and label is the
        label-encoded group of each kept respondent. Both use a fresh 0..n-1 index.
    """
    encoded = answers.copy()
    # Attach the group of each respondent; titles absent from the mapping are dropped
    encoded["path"] = encoded["Q5"].map(groups[0])
    encoded = encoded[encoded["path"].notna()].fillna(0)

    # Every column (target included) is cast to str and turned into integer codes
    for name in encoded.columns:
        encoder = preprocessing.LabelEncoder()
        encoded[name] = encoder.fit_transform(encoded[name].astype(str))

    # Keep the integer-coded target aside before the frame is rescaled
    target = encoded["path"]
    column_names = encoded.columns
    scaled = preprocessing.MinMaxScaler(feature_range=(-1,1)).fit_transform(encoded)
    features = pd.DataFrame(scaled, columns=column_names).drop("path", axis=1)
    return features.reset_index(drop=True), target.reset_index(drop=True)

def _report_fold_accuracies(features, labels, splitter):
    """Fit a fresh LGBMClassifier on every fold of `splitter` and print its test accuracy."""
    for train_index, test_index in splitter.split(features):
        X_train, X_test = features.iloc[train_index], features.iloc[test_index]
        y_train, y_test = labels.iloc[train_index], labels.iloc[test_index]
        clf = LGBMClassifier(objective='multiclass')
        clf.fit(X_train, y_train)
        preds = clf.predict(X_test)
        print(accuracy_score(y_test, preds))


def kfolding(random_groupings=None):
    """
    Compare a random grouping of job titles with the notebook's grouping.

    For each grouping, the data is prepared with `model`, the job title column
    (Q5) is removed from the features, and the accuracy of an LGBMClassifier is
    printed for each of the 5 shuffled folds.

    Args:
        random_groupings (list of dict, optional): random grouping(s) as expected
            by `model` (only the first element is used). Defaults to the
            module-level `groups` list, as before.

    Returns:
        None. The accuracies are printed.
    """
    if random_groupings is None:
        # Backward-compatible fallback on the global built by the driver cell
        random_groupings = groups

    # The same splitter object is used for both groupings (no fixed random_state)
    kf = KFold(n_splits=5, shuffle=True)

    print("K-fold with the Random grouping")
    X, y = model(random_groupings)
    # Q5 is the job title the groups are derived from: keep it out of the features
    _report_fold_accuracies(X.drop("Q5", axis=1), y, kf)

    print("\nK-fold with the Notebook's grouping")
    X, y = model([{
        'Data Engineer': 'Developer',
        'Software Engineer': 'Developer',
        'DBA/Database Engineer': 'Developer',

        'Data Analyst': 'Practitioner',
        'Business Analyst': 'Practitioner',
        'Product/Project Manager': 'Practitioner',

        'Machine Learning Engineer': 'Mathematician',
        'Research Scientist': 'Mathematician',
        'Statistician': 'Mathematician',
    }])
    _report_fold_accuracies(X.drop("Q5", axis=1), y, kf)

In [None]:
# Draw the random grouping(s) compared against the notebook's grouping
# (a single draw; kfolding reads this module-level list)
groups = [random_groups() for _ in range(1)]

print("Random Grouping: ")
pprint.pprint(groups)

print("\n")
kfolding()

By rerunning the cell above, you will probably come to the conclusion that the Notebook's grouping is better