In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import pandas as pd
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
import os
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)

#for dirname, _, filenames in os.walk('/kaggle/input'):
#    for filename in filenames:
#        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# If you earn more than 80% of the people...

Data science and machine learning jobs are among the most in-demand careers in the world today ([1](https://knowledge.wharton.upenn.edu/article/whats-driving-demand-data-scientist/)). However, some of the people working in this field earn much more than others. Of the 25,973 valid\* participants in the 2021 Kaggle survey, 15,391 answered the question **What is your current yearly compensation (approximate $USD)?**

\*"To ensure response quality, we excluded respondents that were flagged by our survey system as
“Spam” or “Duplicate.” We also dropped responses from respondents that spent less than 2
minutes completing the survey, as well as responses from respondents that selected fewer than
15 answer choices in total." -- Kaggle ML & DS Survey 2021 Methodology [(link)](https://www.kaggle.com/c/kaggle-survey-2021/data)

In [None]:
# Load the raw survey responses.
survey = pd.read_csv('/kaggle/input/kaggle-survey-2021/kaggle_survey_2021_responses.csv', low_memory=False)

# The first data row holds the question text: merge it into the header so that
# every column is named "<question id>|<question text>", then drop that row.
survey.columns = survey.columns + '|' + survey.iloc[0, :]
# .copy() makes the result an independent frame; without it, the column
# assignments below (and in later cells) write to a slice of the original
# frame and can trigger pandas' SettingWithCopyWarning.
survey = survey.iloc[1:, :].copy()

# Give every respondent a unique, 1-based id.
survey['respondent_id'] = range(1, len(survey) + 1)

# Number of valid participants (25973 according to the original author's note).
nr_participants = survey['respondent_id'].nunique()
print(f"The number of valid survey participants: {nr_participants}.")

In [None]:
# How many participants answered the yearly-compensation question, and how are
# they distributed over the compensation ranges?
compensation_col = 'Q25|What is your current yearly compensation (approximate $USD)?'

# Compensation ranges in ascending order, spelled exactly as in the survey data.
compensation_ranges = ["$0-999", "1,000-1,999", "2,000-2,999", "3,000-3,999", "4,000-4,999", "5,000-7,499", "7,500-9,999",
                       "10,000-14,999", "15,000-19,999", "20,000-24,999", "25,000-29,999", "30,000-39,999", "40,000-49,999", "50,000-59,999",
                       "60,000-69,999", "70,000-79,999", "80,000-89,999", "90,000-99,999", "100,000-124,999", "125,000-149,999",
                       "150,000-199,999", "200,000-249,999", "250,000-299,999", "300,000-499,999", "$500,000-999,999", ">$1,000,000"]
# Every range that starts at 70,000 USD or more.
high_compensation_ranges = compensation_ranges[compensation_ranges.index("70,000-79,999"):]

# Mark unanswered rows as 'Unknown' and keep only respondents who answered.
# .copy() so that adding 'compensation_segment' below does not write to a slice
# of `survey` (SettingWithCopyWarning).
survey[compensation_col] = survey[compensation_col].replace(np.nan, 'Unknown')
survey_compensation = survey.loc[survey[compensation_col] != 'Unknown'].copy()

# Number of respondents in each compensation range. Naming the axis and the
# values explicitly avoids relying on the column names produced by
# value_counts().reset_index(), which changed in pandas 2.0.
survey_compensation_count = (
    survey_compensation[compensation_col]
    .value_counts()
    .rename_axis('yearly_compensation_range')
    .reset_index(name='number_of_respondents')
)

# Order the ranges by amount (highest range first, as before).
survey_compensation_count['yearly_compensation_range'] = pd.Categorical(
    survey_compensation_count['yearly_compensation_range'],
    categories=compensation_ranges,
    ordered=True,
)
survey_compensation_count = survey_compensation_count.sort_values(by='yearly_compensation_range', ascending=False)

# If you earn more than 70,000 USD a year, you are above ~80% of the participants.
# The cumulative count must therefore be accumulated from the LOWEST range
# upwards: cum_perc_respondents is the share of respondents earning at most
# that range. (Accumulating from the top, as the rows are sorted, would label
# the highest earners 'Less than 70K'.) Assignment aligns on the index, so the
# row order of the frame is unaffected.
survey_compensation_count['cum_sum_number_of_respondents'] = (
    survey_compensation_count['number_of_respondents'].iloc[::-1].cumsum()
)
survey_compensation_count['cum_perc_respondents'] = round(
    100 * survey_compensation_count['cum_sum_number_of_respondents'] / survey_compensation_count['number_of_respondents'].sum(), 1)
survey_compensation_count['share'] = round(
    100 * survey_compensation_count['number_of_respondents'] / survey_compensation.shape[0], 1)

# A range belongs to the bottom ~80% ("Less than 70K") while the cumulative
# share stays at or below 80%, otherwise to the top ~20% ("More than 70K").
survey_compensation_count['20-80'] = np.where(
    survey_compensation_count['cum_perc_respondents'].astype(float) <= 80, 'Less than 70K', 'More than 70K')
# Respondents and share per segment. Only the additive numeric columns are
# summed: summing the categorical range column raises on recent pandas.
compensation_segment_summary = survey_compensation_count.groupby('20-80')[['number_of_respondents', 'share']].sum()

# Add a column to the survey data telling whether a respondent has high/low compensation.
survey_compensation['compensation_segment'] = np.where(
    survey_compensation[compensation_col].isin(high_compensation_ranges), 'More than 70K', 'Less than 70K')

# Number of participants who answered
# Q25|What is your current yearly compensation (approximate $USD)?
nr_compensation = survey_compensation['respondent_id'].nunique()

In [None]:
# Share (and absolute number) of participants who answered the compensation question.
print(f"{round(100 * nr_compensation / nr_participants)}% of participants({nr_compensation}) answered the question - What is your current yearly compensation (approximate $USD)?")

Among those who reported their compensation, if you earn more than **70k USD** a year, you earn more than about 80% of your peers in the industry worldwide.

In [None]:
# Yearly compensation overview: two stacked horizontal bars.
#   row 1: share of participants who did / did not answer the compensation question
#   row 2: of those who answered, the share earning less / more than 70k USD,
#          scaled and offset so that it lines up with the "answered" part of row 1.
# The percentages are derived from the data computed in the earlier cells
# instead of being typed in by hand, so the figure cannot drift from the data.
pct_answered = int(round(100 * nr_compensation / nr_participants))
pct_not_answered = 100 - pct_answered
pct_more_70k = int(round(100 * (survey_compensation['compensation_segment'] == 'More than 70K').mean()))
pct_less_70k = 100 - pct_more_70k


def _bar_segment(row_label, value, color, name, label=None):
    """Build one segment of a horizontal stacked bar.

    `value` is the segment length on the x axis, `color` is used for both fill
    and outline, and `label` (if given) is drawn inside the segment.
    """
    text_style = {}
    if label is not None:
        text_style = dict(texttemplate=label,
                          textposition="inside",
                          textfont_size=20,
                          textangle=0,
                          textfont_color="black")
    return go.Bar(y=[row_label],
                  x=[value],
                  width=0.5,
                  name=name,
                  orientation='h',
                  hoverinfo='skip',
                  marker=dict(color=color, line=dict(color=color, width=1)),
                  **text_style)


answered_row = '% of participants answered the question'
range_row = '% of participants lie in compensation range'
transparent = 'rgba(0,0,0,0)'

fig = make_subplots(rows=2, cols=1)

# add_trace(..., row=, col=) is the supported replacement for the deprecated append_trace.
fig.add_trace(_bar_segment(answered_row, pct_not_answered, '#f3f3f3',
                           'Not answered the yearly compensation question',
                           label=f"{pct_not_answered}%"), row=1, col=1)
fig.add_trace(_bar_segment(answered_row, pct_answered, '#cccccc',
                           'Answered the yearly compensation question',
                           label=f"{pct_answered}%"), row=1, col=1)

# Invisible spacer so the second bar starts where the "answered" segment starts.
fig.add_trace(_bar_segment(range_row, pct_not_answered, transparent,
                           'Spacer (did not answer)'), row=2, col=1)
fig.add_trace(_bar_segment(range_row, pct_less_70k * pct_answered / 100, '#9fc5e8',
                           'Earn less than 70k USD',
                           label=f"{pct_less_70k}%"), row=2, col=1)
fig.add_trace(_bar_segment(range_row, pct_more_70k * pct_answered / 100, '#0b5394',
                           'Earn more than 70k USD',
                           label=f"{pct_more_70k}%"), row=2, col=1)

fig.update_layout(height=400,
                  width=680,
                  margin={'t': 100, 'b': 100, 'l': 50, 'r': 50},
                  barmode='stack',
                  title_text=(f"Yearly compensation of survey participants<br>"
                              f"<sup>{pct_answered}% of {nr_participants:,} survey participants answered the Question 25 - <br>"
                              f"What is your current yearly compensation (approximate $USD)?</sup>"),
                  showlegend=False,
                  paper_bgcolor=transparent,
                  plot_bgcolor=transparent)
fig.update_xaxes(showgrid=False, visible=False)
fig.update_yaxes(showgrid=False, visible=False)

fig.add_annotation(text=f"{pct_more_70k}% earn MORE <br>than 70k USD a year",
                   xref="paper", yref="paper",
                   x=1, y=0.46, showarrow=False)

fig.add_annotation(text=f"{pct_less_70k}% earn LESS <br>than 70k USD a year",
                   xref="paper", yref="paper",
                   x=0.5, y=0.00, showarrow=False)

fig.show()

In [None]:
# Derive one "<question id> - <question text>" label per survey question from
# the column headers. Multi-part questions have one column per choice
# ("Qn_Part_k|..." / "Qn_OTHER|..."); stripping the suffix collapses them onto
# a single label.
unique_questions = set()
for q in survey_compensation.columns:
    if 'Q' not in q:
        # not a question column (e.g. helper columns added by this notebook)
        continue
    if '_Part' in q:
        question_id = q.split("_Part")[0]
    elif '_OTHER' in q:
        question_id = q.split("_OTHER")[0]
    else:
        question_id = q.split("|")[0]
    # question text = header part after "|", without the "- Selected Choice ..." tail
    question_text = q.split("|")[1].split("- Selected Choice")[0]
    unique_questions.add(question_id + " - " + question_text)

In [None]:
# Curated, ordered list of the questions analysed in this notebook. It replaces
# the set derived from the column headers in the previous cell.
# Each entry is "<question id> - <question text>". create_dataset() looks up the
# matching survey columns by searching for the text after " - " inside the
# column names, so the text (including trailing spaces and double spaces) must
# match the survey headers exactly.
# Commented-out entries are left out of the analysis: Q25 (presumably because it
# is the compensation question used to build the segments) and the "_B"
# follow-up questions about the next 2 years.
unique_questions = [
    "Q1 - What is your age (# years)?",
    "Q2 - What is your gender? ",
    "Q3 - In which country do you currently reside?",
    "Q4 - What is the highest level of formal education that you have attained or plan to attain within the next 2 years?",
    "Q5 - Select the title most similar to your current role (or most recent title if retired): ",
    'Q6 - For how many years have you been writing code and/or programming?',
    'Q7 - What programming languages do you use on a regular basis? (Select all that apply) ',
    'Q8 - What programming language would you recommend an aspiring data scientist to learn first? ',
    "Q9 - Which of the following integrated development environments (IDE's) do you use on a regular basis?  (Select all that apply) ",
    'Q10 - Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply) ',
    'Q11 - What type of computing platform do you use most often for your data science projects? ',
    'Q12 - Which types of specialized hardware do you use on a regular basis?  (Select all that apply) ',
    'Q13 - Approximately how many times have you used a TPU (tensor processing unit)?',
    'Q14 - What data visualization libraries or tools do you use on a regular basis?  (Select all that apply) ',
    'Q15 - For how many years have you used machine learning methods?',
    'Q16 - Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply) ',
    'Q17 - Which of the following ML algorithms do you use on a regular basis? (Select all that apply): ',
    'Q18 - Which categories of computer vision methods do you use on a regular basis?  (Select all that apply) ',
    'Q19 - Which of the following natural language processing (NLP) methods do you use on a regular basis?  (Select all that apply) ',
    'Q20 - In what industry is your current employer/contract (or your most recent employer if retired)? ',
    'Q21 - What is the size of the company where you are employed?',
    'Q22 - Approximately how many individuals are responsible for data science workloads at your place of business?',
    'Q23 - Does your current employer incorporate machine learning methods into their business?',
    'Q24 - Select any activities that make up an important part of your role at work: (Select all that apply) ',
    #'Q25 - What is your current yearly compensation (approximate $USD)?',
    'Q26 - Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?',
    'Q27_A - Which of the following cloud computing platforms do you use on a regular basis? (Select all that apply) ',
    #'Q27_B - Which of the following cloud computing platforms do you hope to become more familiar with in the next 2 years? ',
    'Q28 - Of the cloud platforms that you are familiar with, which has the best developer experience (most enjoyable to use)? ',
    'Q29_A - Do you use any of the following cloud computing products on a regular basis? (Select all that apply) ',
    #'Q29_B - In the next 2 years, do you hope to become more familiar with any of these specific cloud computing products? (Select all that apply) ',
    'Q30_A - Do you use any of the following data storage products on a regular basis? (Select all that apply) ',
    #'Q30_B - In the next 2 years, do you hope to become more familiar with any of these specific data storage products? (Select all that apply) ',
    'Q31_A - Do you use any of the following managed machine learning products on a regular basis? (Select all that apply) ',
    #'Q31_B - In the next 2 years, do you hope to become more familiar with any of these managed machine learning products? (Select all that apply) ',
    'Q32_A - Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you use on a regular basis? (Select all that apply) ',
    #'Q32_B - Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you hope to become more familiar with in the next 2 years? (Select all that apply) ',
    'Q33 - Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often? ',
    'Q34_A - Which of the following business intelligence tools do you use on a regular basis? (Select all that apply) ',
    #'Q34_B - Which of the following business intelligence tools do you hope to become more familiar with in the next 2 years? (Select all that apply) ',
    'Q35 - Which of the following business intelligence tools do you use most often? ',
    'Q36_A - Do you use any automated machine learning tools (or partial AutoML tools) on a regular basis?  (Select all that apply) ',
    #'Q36_B - Which categories of automated machine learning tools (or partial AutoML tools) do you hope to become more familiar with in the next 2 years?  (Select all that apply) ',
    'Q37_A - Which of the following automated machine learning tools (or partial AutoML tools) do you use on a regular basis?  (Select all that apply) ',
    #'Q37_B - Which specific automated machine learning tools (or partial AutoML tools) do you hope to become more familiar with in the next 2 years?  (Select all that apply) ',
    'Q38_A - Do you use any tools to help manage machine learning experiments? (Select all that apply) ',
    #'Q38_B - In the next 2 years, do you hope to become more familiar with any of these tools for managing ML experiments? (Select all that apply) ',
    'Q39 - Where do you publicly share your data analysis or machine learning applications? (Select all that apply) ',
    'Q40 - On which platforms have you begun or completed data science courses? (Select all that apply) ',
    'Q41 - What is the primary tool that you use at work or school to analyze data? (Include text response) ',
    'Q42 - Who/what are your favorite media sources that report on data science topics? (Select all that apply) '
]

In [None]:
# Questions whose answer choices have a natural order (age bands, years of
# experience, company size, ...).
ordered_questions = [
    "Q1 - What is your age (# years)?",
    "Q4 - What is the highest level of formal education that you have attained or plan to attain within the next 2 years?",
    'Q6 - For how many years have you been writing code and/or programming?',
    'Q13 - Approximately how many times have you used a TPU (tensor processing unit)?',
    'Q15 - For how many years have you used machine learning methods?',
    'Q21 - What is the size of the company where you are employed?',
    'Q22 - Approximately how many individuals are responsible for data science workloads at your place of business?',
    'Q23 - Does your current employer incorporate machine learning methods into their business?',
    'Q26 - Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?',
]

# Every remaining analysed question, in the order of `unique_questions`.
non_ordered_questions = list(filter(lambda label: label not in ordered_questions, unique_questions))

In [None]:
def create_dataset(question):
    """Summarise how each compensation segment answered one survey question.

    Parameters
    ----------
    question : str
        Label of the form "<question id> - <question text>". Every column of
        the module-level ``survey_compensation`` frame whose name contains the
        question text belongs to the question. Labels containing
        "(Select all that apply)" are treated as multiple-choice questions
        (one column per choice); all other labels as single-choice questions
        (the first matching column holds the answer).

    Returns
    -------
    pandas.DataFrame
        One row per answer choice with the columns ``choice``,
        ``Less than 70K``, ``More than 70K`` (percentages, rounded to one
        decimal, as floats) and ``question``. For single-choice questions a
        choice that nobody in a segment selected is NaN in that segment.

    Raises
    ------
    ValueError
        If no column of ``survey_compensation`` matches the question text.
    """
    segments = ['Less than 70K', 'More than 70K']
    question_text = question.split(" - ")[1]
    answer_cols = [col for col in survey_compensation.columns if question_text in col]
    if not answer_cols:
        raise ValueError("No survey column matches question: " + repr(question))

    if "(Select all that apply)" in question:
        # Multiple choice: count() counts non-null cells, i.e. how many
        # respondents of each segment ticked each choice.
        picks = survey_compensation.groupby('compensation_segment')[answer_cols].count()
        # NOTE(review): the denominator is the total number of ticks in the
        # segment, not the number of respondents, so the shares of one
        # question add up to 100% per segment - confirm this is intended.
        shares = picks.mul(100).div(picks.sum(axis=1), axis=0).round(1)
        # Column names look like "<id>|<text> - Selected Choice - <choice>".
        # maxsplit=2 keeps choice labels that themselves contain " - " intact.
        shares.columns = [col.split(' - ', 2)[2] for col in answer_cols]
        # Select the segments by name (not by position) and put choices on the rows.
        shares = shares.reindex(segments).T
    else:
        # Single choice: respondents per (choice, segment) ...
        answers = survey_compensation.groupby(
            [answer_cols[0], 'compensation_segment'])['respondent_id'].count()
        per_choice = answers.unstack('compensation_segment')
        # ... as a share of all respondents of the segment who answered.
        shares = per_choice.mul(100).div(per_choice.sum(axis=0), axis=1).round(1)
        shares = shares.reindex(columns=segments)

    dataset_vis = shares.reset_index()
    dataset_vis.columns = ['choice'] + segments
    dataset_vis['question'] = question
    return dataset_vis

In [None]:
# Create one dataset containing all questions with their aggregated answers.

# Natural order of the answer choices for the ordinal questions. Questions that
# are not listed here are sorted by the gap between the two segments instead.
choice_order = {
    'Q1 - What is your age (# years)?': [
        '18-21',
        '22-24',
        '25-29',
        '30-34',
        '35-39',
        '40-44',
        '45-49',
        '50-54',
        '55-59',
        '60-69',
        '70+'],
    'Q4 - What is the highest level of formal education that you have attained or plan to attain within the next 2 years?': [
        'I prefer not to answer',
        'No formal education past high school',
        'Some college/university study without earning a bachelor’s degree',
        'Bachelor’s degree',
        'Master’s degree',
        'Professional doctorate',
        'Doctoral degree'],
    'Q6 - For how many years have you been writing code and/or programming?': [
        'I have never written code',
        '< 1 years',
        '1-3 years',
        '3-5 years',
        '5-10 years',
        '10-20 years',
        '20+ years'],
    'Q13 - Approximately how many times have you used a TPU (tensor processing unit)?': [
        'Never',
        'Once',
        '2-5 times',
        '6-25 times',
        'More than 25 times'],
    'Q15 - For how many years have you used machine learning methods?': [
        'I do not use machine learning methods',
        # The comma after 'Under 1 year' was missing, which silently merged it
        # with '1-2 years' into a single bogus category, so both real answers
        # were turned into NaN by pd.Categorical.
        'Under 1 year',
        '1-2 years',
        '2-3 years',
        '3-4 years',
        '4-5 years',
        '5-10 years',
        '10-20 years',
        '20 or more years'],
    'Q21 - What is the size of the company where you are employed?': [
        '0-49 employees',
        '50-249 employees',
        '250-999 employees',
        '1000-9,999 employees',
        '10,000 or more employees'],
    'Q22 - Approximately how many individuals are responsible for data science workloads at your place of business?': [
        '0',
        '1-2',
        '3-4',
        '5-9',
        '10-14',
        '15-19',
        '20+'],
    'Q23 - Does your current employer incorporate machine learning methods into their business?': [
        'I do not know',
        'No (we do not use ML methods)',
        'We are exploring ML methods (and may one day put a model into production)',
        'We use ML methods for generating insights (but do not put working models into production)',
        'We recently started using ML methods (i.e., models in production for less than 2 years)',
        'We have well established ML methods (i.e., models in production for more than 2 years)'],
    'Q26 - Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?': [
        '$0 ($USD)',
        '$1-$99',
        '$100-$999',
        '$1000-$9,999',
        '$10,000-$99,999',
        '$100,000 or more ($USD)'],
}

dfs = []
for question in unique_questions:
    # A choice nobody in a segment selected counts as 0%.
    df = create_dataset(question).fillna(0)
    if question in choice_order:
        # Ordinal question: sort the choices in their natural order.
        df['choice'] = pd.Categorical(df['choice'], categories=choice_order[question], ordered=True)
        df = df.sort_values('choice')
    else:
        # Nominal question: sort by how much more often high earners chose the answer.
        df['diff'] = df['More than 70K'] - df['Less than 70K']
        df = df.sort_values('diff')
    dfs.append(df)

result = pd.concat(dfs)


Feel free to play with this interactive chart, which shows how the answers of high and low earners are distributed for each question.

You can **select** a question from the dropdown menu.

In [None]:
#### data visualization
# Interactive dumbbell chart: for every answer choice, one marker per
# compensation segment connected by a grey line. A dropdown switches between
# questions by toggling trace visibility and swapping the connector lines.
questions = list(unique_questions)
default_state = "Q1 - What is your age (# years)?"

fig = go.Figure()
question_plot_names = []  # question owning each trace, in trace order
shapes = {}               # question -> list of connector-line shapes

# (column in `result`, word used in the hover text, marker colour)
segment_styles = (("Less than 70K", "LESS", "#DEBAE6"),
                  ("More than 70K", "MORE", "#C54DFD"))

for question_name in questions:
    dataset = result.loc[result['question'] == question_name]
    is_default = question_name == default_state

    for segment, word, color in segment_styles:
        fig.add_trace(go.Scatter(x=dataset[segment],
                                 y=dataset["choice"],
                                 hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + word + '</b> than 70K chose <b>%{y}</b>',
                                 mode='markers',
                                 name="",
                                 marker=dict(size=10, color=color),
                                 visible=is_default))
        question_plot_names.append(question_name)

    # One horizontal connector per answer choice. The shapes are built once and
    # reused both for the initial layout and for the dropdown buttons, instead
    # of additionally adding every shape to the figure with fig.add_shape.
    shapes[question_name] = [
        dict(type='line',
             line=dict(color="#cccccc"),
             x0=less, y0=choice,
             x1=more, y1=choice)
        for less, more, choice in zip(dataset['Less than 70K'],
                                      dataset['More than 70K'],
                                      dataset['choice'])
    ]

# One dropdown entry per question: show only that question's two traces and
# replace the layout shapes with its connector lines.
buttons = [
    dict(method='update',
         label="          " * 2 + question_name,
         args=[{'visible': [question_name == owner for owner in question_plot_names]},
               {"shapes": shapes[question_name]}])
    for question_name in questions
]

# Add the dropdown menu to the figure; start with the default question's connectors.
fig.update_layout(showlegend=False,
                  margin=dict(l=100, r=100, t=20, b=20),
                  shapes=shapes[default_state],
                  updatemenus=[{"buttons": buttons,
                                "direction": "down",
                                "active": questions.index(default_state),
                                "showactive": True,
                                "borderwidth": 0.9,
                                "x": 0.75,
                                "y": 1.1,
                                "xanchor": "center"}],
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)

fig.show()

# Why do some people earn more than others?
Let's take a deep dive into the profile of the top 20% of earners. We divide the questions into four sections, compare the top 20% with the bottom 80% on each of them, and offer some tips to those who want to earn more.

The sections are
* Demographic
1. Where do they live/work?
2. Gender
3. Age
4. Education

* Work
1. Industry
2. Company size
3. Role
4. ML status of the business
5. Size of the DS team in the company

* Skill
1. Years of coding / ML experience
2. Where do they learn / share knowledge?
3. What field/model/framework do they usually use?

* Daily tools and what they want to become familiar with in the next two years
1. Environment
2. Product
3. Tool

# Demographics
This section covers four questions:

Q1|What is your age (# years)?

Q2|What is your gender? 

Q3|In which country do you currently reside?

Q4|What is the highest level of formal education that you have attained or plan to attain within the next 2 years?

In [None]:
# demographics dataset
# Keep the respondent id and the compensation segment, followed by every
# survey column whose question prefix (the text before "|") is Q1-Q4.
demo_questions = ['respondent_id', 'compensation_segment'] + [
    col
    for col in survey_compensation.columns
    if col.split("|")[0] in ('Q1', 'Q2', 'Q3', 'Q4')
]

demographics_data = survey_compensation[demo_questions]

In [None]:
# country
# Share (in %) of each compensation segment's respondents per country of residence.
country_question = 'Q3|In which country do you currently reside?'

# Respondent count per (segment, country).
survey_demographics_country = demographics_data.groupby(
    ['compensation_segment', country_question]
).count().reset_index()[['compensation_segment', country_question, 'respondent_id']]

# Segment totals are broadcast back onto every row so the share is a plain division.
survey_demographics_country['nr_respondents_segment'] = (
    survey_demographics_country
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
survey_demographics_country['share_of_total'] = (
    100
    * survey_demographics_country['respondent_id']
    / survey_demographics_country['nr_respondents_segment']
).round(1)

# One row per country, one column per segment; a country missing from a segment gets 0.
survey_demographics_country_pivot = (
    survey_demographics_country
    .pivot(index=country_question, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
)
survey_demographics_country_pivot['diff'] = (
    survey_demographics_country_pivot['More than 70K']
    - survey_demographics_country_pivot['Less than 70K']
)
survey_demographics_country_pivot = survey_demographics_country_pivot.sort_values(by='diff')

# Shorten the longest labels so the y-axis stays readable.
for long_label, short_label in (
    ('United States of America', 'U.S.'),
    ('United Kingdom of Great Britain and Northern Ireland', 'U.K.'),
    ('Iran, Islamic Republic of...', 'Iran'),
    ('I do not wish to disclose my location', 'Not disclosed'),
):
    survey_demographics_country_pivot = survey_demographics_country_pivot.replace(long_label, short_label)


# visualization - country
fig = go.Figure()

# Grey connector between the two segment shares of each country.
for share_less, share_more, country_label in zip(
    survey_demographics_country_pivot['Less than 70K'],
    survey_demographics_country_pivot['More than 70K'],
    survey_demographics_country_pivot[country_question],
):
    fig.add_shape(
        type='line',
        x0=share_less,
        y0=country_label,
        x1=share_more,
        y1=country_label,
        line_color="#cccccc",
    )

# One marker trace per compensation segment.
n_country_rows = survey_demographics_country_pivot.shape[0]
for seg_name, seg_color, seg_hover in (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K are residing in <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K are residing in <b>%{y}</b>'),
):
    fig.add_trace(
        go.Scatter(
            x=survey_demographics_country_pivot[seg_name],
            y=survey_demographics_country_pivot[country_question],
            mode='markers',
            name=seg_name,
            marker=dict(size=[10] * n_country_rows, color=[seg_color] * n_country_rows),
            hovertemplate=seg_hover,
        )
    )

fig.update_layout(
    title_text="In which country do you currently reside?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    height=1280,
    width=1280,
    barmode='stack',
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_yaxes(title=None)

fig.add_annotation(
    text="If you are living in the U.S., <br>you are most likely to earn over 70K USD than people live in the rest of the world.</br>",
    xref="paper",
    yref="paper",
    x=0.95,
    y=0.935,
    showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# gender
# Share (in %) of each compensation segment's respondents per selected gender.
gender_question = 'Q2|What is your gender? - Selected Choice'

# Respondent count per (segment, gender).
survey_demographics_gender = demographics_data.groupby(
    ['compensation_segment', gender_question]
).count().reset_index()[['compensation_segment', gender_question, 'respondent_id']]

# Segment totals are broadcast back onto every row so the share is a plain division.
survey_demographics_gender['nr_respondents_segment'] = (
    survey_demographics_gender
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
survey_demographics_gender['share_of_total'] = round(
    100 * survey_demographics_gender['respondent_id'] / survey_demographics_gender['nr_respondents_segment'],
    1,
)

# One row per gender, one column per segment; a gender missing from a segment gets 0.
survey_demographics_gender_pivot = survey_demographics_gender.pivot(
    index=gender_question,
    columns='compensation_segment',
    values='share_of_total',
).reset_index()
survey_demographics_gender_pivot = survey_demographics_gender_pivot.fillna(0)

# Both operands must come from the gender table. Subtracting a column of any
# other pivot would be aligned on the row index and yield differences (and
# therefore a sort order) that have nothing to do with gender.
survey_demographics_gender_pivot['diff'] = (
    survey_demographics_gender_pivot['More than 70K']
    - survey_demographics_gender_pivot['Less than 70K']
)
survey_demographics_gender_pivot = survey_demographics_gender_pivot.sort_values(by='diff')

# visualization - gender
fig = go.Figure()

# Grey connector between the two segment shares of each gender.
for i in range(survey_demographics_gender_pivot.shape[0]):
    fig.add_shape(
        type='line',
        x0=survey_demographics_gender_pivot['Less than 70K'].iloc[i],
        y0=survey_demographics_gender_pivot[gender_question].iloc[i],
        x1=survey_demographics_gender_pivot['More than 70K'].iloc[i],
        y1=survey_demographics_gender_pivot[gender_question].iloc[i],
        line_color="#cccccc",
    )

# Markers for the lower-earning segment.
fig.add_trace(
    go.Scatter(
        x=survey_demographics_gender_pivot['Less than 70K'],
        y=survey_demographics_gender_pivot[gender_question],
        mode='markers',
        name='Less than 70K',
        marker=dict(size=[10] * survey_demographics_gender_pivot.shape[0],
                    color=["#DEBAE6"] * survey_demographics_gender_pivot.shape[0]),
        hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>',
    )
)

# Markers for the higher-earning segment.
fig.add_trace(
    go.Scatter(
        x=survey_demographics_gender_pivot['More than 70K'],
        y=survey_demographics_gender_pivot[gender_question],
        mode='markers',
        name='More than 70K',
        marker=dict(size=[10] * survey_demographics_gender_pivot.shape[0],
                    color=["#C54DFD"] * survey_demographics_gender_pivot.shape[0]),
        hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>',
    )
)

fig.update_layout(height=480,
                  width=1200,
                  title_text="What is your gender?",
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None)

fig.add_annotation(text="Men dominates the industry in absolute shares<br>And they are more likely to earn more than 70K USD than others</br>",
                   xref="paper", yref="paper",
                   x=0.95, y=0.935, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Age
# Share (in %) of each compensation segment's respondents per age bracket.
age_question = 'Q1|What is your age (# years)?'

# Respondent count per (segment, age bracket).
survey_demographics_age = demographics_data.groupby(
    ['compensation_segment', age_question]
).count().reset_index()[['compensation_segment', age_question, 'respondent_id']]

# Segment totals are broadcast back onto every row so the share is a plain division.
survey_demographics_age['nr_respondents_segment'] = (
    survey_demographics_age
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
survey_demographics_age['share_of_total'] = (
    100
    * survey_demographics_age['respondent_id']
    / survey_demographics_age['nr_respondents_segment']
).round(1)

# One row per age bracket, one column per segment. The rows are not re-sorted
# here: they stay in the order produced by the pivot.
survey_demographics_age_pivot = (
    survey_demographics_age
    .pivot(index=age_question, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
)
survey_demographics_age_pivot['diff'] = (
    survey_demographics_age_pivot['More than 70K']
    - survey_demographics_age_pivot['Less than 70K']
)

# visualization - age
fig = go.Figure()

# Grey connector between the two segment shares of each age bracket.
for share_less, share_more, age_label in zip(
    survey_demographics_age_pivot['Less than 70K'],
    survey_demographics_age_pivot['More than 70K'],
    survey_demographics_age_pivot[age_question],
):
    fig.add_shape(
        type='line',
        x0=share_less,
        y0=age_label,
        x1=share_more,
        y1=age_label,
        line_color="#cccccc",
    )

# One marker trace per compensation segment.
n_age_rows = survey_demographics_age_pivot.shape[0]
for seg_name, seg_color, seg_hover in (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
):
    fig.add_trace(
        go.Scatter(
            x=survey_demographics_age_pivot[seg_name],
            y=survey_demographics_age_pivot[age_question],
            mode='markers',
            name=seg_name,
            marker=dict(size=[10] * n_age_rows, color=[seg_color] * n_age_rows),
            hovertemplate=seg_hover,
        )
    )

fig.update_layout(
    title_text="What is your age (# years)?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    height=480,
    width=1200,
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_yaxes(title=None)

fig.add_annotation(
    text="Over 30, chances of earning over 70K<br> starts to be higher than<br>earning less than 70K.</br>",
    xref="paper",
    yref="paper",
    x=0.95,
    y=0.38,
    showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Education
# Share (in %) of each compensation segment's respondents per education level.
edu_question = 'Q4|What is the highest level of formal education that you have attained or plan to attain within the next 2 years?'

# Merge the two answers below a bachelor's degree into a single bucket.
for below_bachelor_answer in (
    'No formal education past high school',
    'Some college/university study without earning a bachelor’s degree',
):
    demographics_data = demographics_data.replace(below_bachelor_answer, "No Bachelor's degree")


# Respondent count per (segment, education level).
survey_demographics_edu = demographics_data.groupby(
    ['compensation_segment', edu_question]
).count().reset_index()[['compensation_segment', edu_question, 'respondent_id']]

# Segment totals are broadcast back onto every row so the share is a plain division.
survey_demographics_edu['nr_respondents_segment'] = (
    survey_demographics_edu
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
survey_demographics_edu['share_of_total'] = (
    100
    * survey_demographics_edu['respondent_id']
    / survey_demographics_edu['nr_respondents_segment']
).round(1)

# One row per education level, one column per segment; missing combinations get 0.
survey_demographics_edu_pivot = (
    survey_demographics_edu
    .pivot(index=edu_question, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
)

# An ordered categorical makes sort_values follow the listed order of
# education levels rather than alphabetical order.
education_levels = [
    "I prefer not to answer",
    "No Bachelor's degree",
    "Bachelor’s degree",
    "Master’s degree",
    "Professional doctorate",
    "Doctoral degree",
]
survey_demographics_edu_pivot[edu_question] = pd.Categorical(
    survey_demographics_edu_pivot[edu_question],
    categories=education_levels,
    ordered=True,
)

survey_demographics_edu_pivot = survey_demographics_edu_pivot.sort_values(by=edu_question)

# visualization - education
fig = go.Figure()

# Grey connector between the two segment shares of each education level.
for share_less, share_more, edu_label in zip(
    survey_demographics_edu_pivot['Less than 70K'],
    survey_demographics_edu_pivot['More than 70K'],
    survey_demographics_edu_pivot[edu_question],
):
    fig.add_shape(
        type='line',
        x0=share_less,
        y0=edu_label,
        x1=share_more,
        y1=edu_label,
        line_color="#cccccc",
    )

# One marker trace per compensation segment.
n_edu_rows = survey_demographics_edu_pivot.shape[0]
for seg_name, seg_color, seg_hover in (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
):
    fig.add_trace(
        go.Scatter(
            x=survey_demographics_edu_pivot[seg_name],
            y=survey_demographics_edu_pivot[edu_question],
            mode='markers',
            name=seg_name,
            marker=dict(size=[10] * n_edu_rows, color=[seg_color] * n_edu_rows),
            hovertemplate=seg_hover,
        )
    )

fig.update_layout(
    title_text="Q4|What is the highest level of formal education that you have attained or plan to attain within the next 2 years?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    height=480,
    width=1200,
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_yaxes(title=None)

fig.add_annotation(
    text="Higher academic degrees help you get higher pay with higher chance<br> But the difference between a Master and a Doctor isn't that much.</br>",
    xref="paper",
    yref="paper",
    x=0.45,
    y=0.58,
    showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

* Work
1. Industry
2. Company size
3. Role
4. ML status of the business
5. Size of the DS team in the company
6. Activities at work

In [None]:
# work dataset
# Keep the respondent id and the compensation segment, followed by every
# survey column whose question prefix (the text before "|") contains one of
# the work-related question codes. A column is appended once per matching code.
work_questions = ['respondent_id', 'compensation_segment'] + [
    col
    for col in survey_compensation.columns
    for q in ('Q5', 'Q20', 'Q21', 'Q22', 'Q23', 'Q24')
    if q in col.split("|")[0]
]

work_data = survey_compensation[work_questions]

In [None]:
# title/role
# Share (in %) of each compensation segment's respondents per job title.
title_question = 'Q5|Select the title most similar to your current role (or most recent title if retired): - Selected Choice'

# Respondent count per (segment, title).
survey_work_title = work_data.groupby(
    ['compensation_segment', title_question]
).count().reset_index()[['compensation_segment', title_question, 'respondent_id']]

# Segment totals are broadcast back onto every row so the share is a plain division.
survey_work_title['nr_respondents_segment'] = (
    survey_work_title
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
survey_work_title['share_of_total'] = (
    100
    * survey_work_title['respondent_id']
    / survey_work_title['nr_respondents_segment']
).round(1)

# One row per title, one column per segment; missing combinations get 0.
survey_work_title_pivot = (
    survey_work_title
    .pivot(index=title_question, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
)
survey_work_title_pivot['diff'] = (
    survey_work_title_pivot['More than 70K']
    - survey_work_title_pivot['Less than 70K']
)
survey_work_title_pivot = survey_work_title_pivot.sort_values(by='diff')

# visualization - title
fig = go.Figure()

# Grey connector between the two segment shares of each title.
for share_less, share_more, title_label in zip(
    survey_work_title_pivot['Less than 70K'],
    survey_work_title_pivot['More than 70K'],
    survey_work_title_pivot[title_question],
):
    fig.add_shape(
        type='line',
        x0=share_less,
        y0=title_label,
        x1=share_more,
        y1=title_label,
        line_color="#cccccc",
    )

# One marker trace per compensation segment.
n_title_rows = survey_work_title_pivot.shape[0]
for seg_name, seg_color, seg_hover in (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
):
    fig.add_trace(
        go.Scatter(
            x=survey_work_title_pivot[seg_name],
            y=survey_work_title_pivot[title_question],
            mode='markers',
            name=seg_name,
            marker=dict(size=[10] * n_title_rows, color=[seg_color] * n_title_rows),
            hovertemplate=seg_hover,
        )
    )

fig.update_layout(
    title_text="Q5|Select the title most similar to your current role",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    height=480,
    width=1200,
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_yaxes(title=None)

fig.add_annotation(
    text="Data scientists takes up most of  the participants<br>And they are most likely to earn more than 70K USD than others</br>",
    xref="paper",
    yref="paper",
    x=0.95,
    y=1.1,
    showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# activities - nr of activities
# Column selection: segment and respondent id first, then every Q24 answer column.
q24_answer_cols = [col for col in work_data.columns if 'Q24' in col]
Q24 = ['compensation_segment', 'respondent_id'] + q24_answer_cols

# Take an explicit copy: adding a column to a bare slice of survey_compensation
# triggers pandas' SettingWithCopyWarning and leaves it ambiguous which frame
# is being modified.
Q24_compensation = survey_compensation[Q24].copy()

# Number of activities = number of non-null Q24 answer columns per respondent.
# Counting only the answer columns avoids subtracting a magic "2" for the
# id/segment columns (which would be off if either of them were ever null).
Q24_compensation['nr_activities'] = Q24_compensation[q24_answer_cols].count(axis=1)

# nr of acts
# Respondent count per (segment, number of activities).
survey_work_nrAct = Q24_compensation.groupby(
    ['compensation_segment', 'nr_activities']
).count().reset_index()[['compensation_segment', 'nr_activities', 'respondent_id']]

# Segment totals are broadcast back onto every row so the share is a plain division.
survey_work_nrAct['nr_respondents_segment'] = (
    survey_work_nrAct
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
survey_work_nrAct['share_of_total'] = round(
    100 * survey_work_nrAct['respondent_id'] / survey_work_nrAct['nr_respondents_segment'],
    1,
)

# One row per activity count, one column per segment. A count that does not
# occur in one segment is a 0% share, not a missing value (same convention as
# the other segment pivots in this notebook).
survey_work_nrAct_pivot = survey_work_nrAct.pivot(
    index='nr_activities',
    columns='compensation_segment',
    values='share_of_total',
).reset_index()
survey_work_nrAct_pivot = survey_work_nrAct_pivot.fillna(0)
survey_work_nrAct_pivot = survey_work_nrAct_pivot.sort_values(by='nr_activities')

In [None]:
# visualization - nr of activities
fig = go.Figure()

# Grey connector between the two segment shares of each activity count.
for share_less, share_more, activity_count in zip(
    survey_work_nrAct_pivot['Less than 70K'],
    survey_work_nrAct_pivot['More than 70K'],
    survey_work_nrAct_pivot['nr_activities'],
):
    fig.add_shape(
        type='line',
        x0=share_less,
        y0=activity_count,
        x1=share_more,
        y1=activity_count,
        line_color="#cccccc",
    )

# One marker trace per compensation segment.
n_nr_act_rows = survey_work_nrAct_pivot.shape[0]
for seg_name, seg_color, seg_hover in (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
):
    fig.add_trace(
        go.Scatter(
            x=survey_work_nrAct_pivot[seg_name],
            y=survey_work_nrAct_pivot['nr_activities'],
            mode='markers',
            name=seg_name,
            marker=dict(size=[10] * n_nr_act_rows, color=[seg_color] * n_nr_act_rows),
            hovertemplate=seg_hover,
        )
    )

fig.update_layout(
    title_text="How many activities do you do in your work",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    height=480,
    width=1200,
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_yaxes(title=None)

fig.add_annotation(
    text="If you are responsible for three activities or more, <br> you are more likely to be one of the high earners.</br>",
    xref="paper",
    yref="paper",
    x=0.65,
    y=0.3,
    showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# What single activity help you earn more?
# Non-null counts of every column per compensation segment (one row per segment).
Q24_compensation_count = Q24_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q24_compensation_count.columns

# For each Q24 answer column add a percentage column named after the third
# " - "-separated part of the column name: the share of the segment's
# respondents with a non-null value in that column.
new_cols = []
for answer_col in [name for name in cols if 'Q24' in name]:
    activity_name = answer_col.split(' - ')[2]
    new_cols.append(activity_name)
    Q24_compensation_count[activity_name] = (
        100 * Q24_compensation_count[answer_col] / Q24_compensation_count['respondent_id']
    ).round(1)
new_cols.append('compensation_segment')

# Transpose so that each activity becomes a row and each segment a column.
# The positional column labels 0 and 1 are renamed to the segment names; this
# presumes the grouped rows come out as "Less than 70K" then "More than 70K"
# - TODO confirm against the data.
Q24_compensation_count_share = Q24_compensation_count[new_cols].T.reset_index()
is_activity_row = Q24_compensation_count_share['index'].ne('compensation_segment')
Q24_compensation_count_share_pivot = (
    Q24_compensation_count_share
    .loc[is_activity_row]
    .rename(columns={'index': 'Activity at work', 0: 'Less than 70K', 1: 'More than 70K'})
    .fillna(0)
)

Q24_compensation_count_share_pivot['diff'] = (
    Q24_compensation_count_share_pivot['More than 70K']
    - Q24_compensation_count_share_pivot['Less than 70K']
)
Q24_compensation_count_share_pivot = Q24_compensation_count_share_pivot.sort_values(by='diff')

# visualization - one activity
fig = go.Figure()

# Grey connector between the two segment shares of each activity.
for share_less, share_more, activity_label in zip(
    Q24_compensation_count_share_pivot['Less than 70K'],
    Q24_compensation_count_share_pivot['More than 70K'],
    Q24_compensation_count_share_pivot['Activity at work'],
):
    fig.add_shape(
        type='line',
        x0=share_less,
        y0=activity_label,
        x1=share_more,
        y1=activity_label,
        line_color="#cccccc",
    )

# One marker trace per compensation segment.
n_activity_rows = Q24_compensation_count_share_pivot.shape[0]
for seg_name, seg_color, seg_hover in (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
):
    fig.add_trace(
        go.Scatter(
            x=Q24_compensation_count_share_pivot[seg_name],
            y=Q24_compensation_count_share_pivot['Activity at work'],
            mode='markers',
            name=seg_name,
            marker=dict(size=[10] * n_activity_rows, color=[seg_color] * n_activity_rows),
            hovertemplate=seg_hover,
        )
    )

fig.update_layout(
    title_text="Select any activities that make up an important part of your role at work",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    height=480,
    width=1200,
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_yaxes(title=None)

fig.add_annotation(
    text="If your work is related<br> with machine learning, <br> you are more likely to be one of the high earners.</br>",
    xref="paper",
    yref="paper",
    x=1.2,
    y=0.9,
    showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

What sets data scientists apart from others is that they apply machine learning knowledge and skills in their work.
That helps them earn more money than others.

In [None]:
# Q23|Does your current employer incorporate machine learning methods into their business?
q23_col = 'Q23|Does your current employer incorporate machine learning methods into their business?'

# Respondent count for every (compensation segment, Q23 answer) pair.
survey_work_ml = (
    work_data
    .groupby(['compensation_segment', q23_col])
    .count()
    .reset_index()
    [['compensation_segment', q23_col, 'respondent_id']]
)
# Share of each answer inside its own compensation segment, in percent (1 decimal).
survey_work_ml['nr_respondents_segment'] = (
    survey_work_ml
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
survey_work_ml['share_of_total'] = (
    100 * survey_work_ml['respondent_id'] / survey_work_ml['nr_respondents_segment']
).round(1)

# One row per answer, one column per segment; combinations absent from the data become 0.
# Rows are ordered by the gap between the two segments.
survey_work_ml_pivot = (
    survey_work_ml
    .pivot(index=q23_col, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
    .assign(diff=lambda shares: shares['More than 70K'] - shares['Less than 70K'])
    .sort_values(by='diff')
)

# visualization - ML in the company
fig = go.Figure()

# Light-grey connector between the two segment shares of each answer.
for less_share, more_share, answer in zip(survey_work_ml_pivot['Less than 70K'],
                                          survey_work_ml_pivot['More than 70K'],
                                          survey_work_ml_pivot[q23_col]):
    fig.add_shape(type='line',
                  x0=less_share, y0=answer,
                  x1=more_share, y1=answer,
                  line_color="#cccccc")

# One marker trace per compensation segment: (column / legend name, colour, hover text).
q23_rows = survey_work_ml_pivot.shape[0]
for segment, marker_colour, hover_text in (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
):
    fig.add_trace(
        go.Scatter(
            x=survey_work_ml_pivot[segment],
            y=survey_work_ml_pivot[q23_col],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * q23_rows, color=[marker_colour] * q23_rows),
            hovertemplate=hover_text,
        )
    )

# The chart title is the question text itself.
fig.update_layout(
    title_text=q23_col,
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
    width=1200,
    height=480,
)

# Take-away note, positioned in paper coordinates (x > 1 lies right of the plot area).
fig.add_annotation(
    text="More mature your employer applies<br> machine learning into the business model,<br> more likely you earn more than 70K USD than others</br>",
    showarrow=False,
    xref="paper", yref="paper",
    x=1.2, y=0.85,
)

# Spike lines on both axes; the y axis carries no title.
fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# Q22|Approximately how many individuals are responsible for data science workloads at your place of business?
q22_col = 'Q22|Approximately how many individuals are responsible for data science workloads at your place of business?'

# Respondent count for every (compensation segment, Q22 answer) pair.
survey_work_ds = (
    work_data
    .groupby(['compensation_segment', q22_col])
    .count()
    .reset_index()
    [['compensation_segment', q22_col, 'respondent_id']]
)
# Share of each answer inside its own compensation segment, in percent (1 decimal).
survey_work_ds['nr_respondents_segment'] = (
    survey_work_ds
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
survey_work_ds['share_of_total'] = (
    100 * survey_work_ds['respondent_id'] / survey_work_ds['nr_respondents_segment']
).round(1)

# One row per answer, one column per segment; combinations absent from the data become 0.
survey_work_ds_pivot = (
    survey_work_ds
    .pivot(index=q22_col, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
)

# Ordered categorical so that sorting follows team size rather than string order.
survey_work_ds_pivot[q22_col] = pd.Categorical(
    survey_work_ds_pivot[q22_col],
    categories=["0", "1-2", "3-4", "5-9", "10-14", "15-19", "20+"],
    ordered=True,
)
survey_work_ds_pivot = survey_work_ds_pivot.sort_values(by=q22_col)

In [None]:
# visualization - DS team in the company
q22_axis_col = 'Q22|Approximately how many individuals are responsible for data science workloads at your place of business?'
fig = go.Figure()

# Light-grey connector between the two segment shares of each answer.
for less_share, more_share, answer in zip(survey_work_ds_pivot['Less than 70K'],
                                          survey_work_ds_pivot['More than 70K'],
                                          survey_work_ds_pivot[q22_axis_col]):
    fig.add_shape(type='line',
                  x0=less_share, y0=answer,
                  x1=more_share, y1=answer,
                  line_color="#cccccc")

# One marker trace per compensation segment: (column / legend name, colour, hover text).
q22_rows = survey_work_ds_pivot.shape[0]
for segment, marker_colour, hover_text in (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
):
    fig.add_trace(
        go.Scatter(
            x=survey_work_ds_pivot[segment],
            y=survey_work_ds_pivot[q22_axis_col],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * q22_rows, color=[marker_colour] * q22_rows),
            hovertemplate=hover_text,
        )
    )

# The chart title is the question text itself.
fig.update_layout(
    title_text=q22_axis_col,
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
    width=1200,
    height=480,
)

# Take-away note, positioned in paper coordinates.
fig.add_annotation(
    text="Join a company that has as many people working in the data science as possible",
    showarrow=False,
    xref="paper", yref="paper",
    x=0.95, y=0.85,
)

# Spike lines on both axes; the y axis carries no title.
fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# 'Q21|What is the size of the company where you are employed?'
q21_col = 'Q21|What is the size of the company where you are employed?'

# Respondent count for every (compensation segment, Q21 answer) pair.
survey_work_size = (
    work_data
    .groupby(['compensation_segment', q21_col])
    .count()
    .reset_index()
    [['compensation_segment', q21_col, 'respondent_id']]
)
# Share of each answer inside its own compensation segment, in percent (1 decimal).
survey_work_size['nr_respondents_segment'] = (
    survey_work_size
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
survey_work_size['share_of_total'] = (
    100 * survey_work_size['respondent_id'] / survey_work_size['nr_respondents_segment']
).round(1)

# One row per answer, one column per segment; combinations absent from the data become 0.
survey_work_size_pivot = (
    survey_work_size
    .pivot(index=q21_col, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
)

# Ordered categorical so that sorting follows company size rather than string order.
survey_work_size_pivot[q21_col] = pd.Categorical(
    survey_work_size_pivot[q21_col],
    categories=[
        "0-49 employees",
        "50-249 employees",
        "250-999 employees",
        "1000-9,999 employees",
        "10,000 or more employees",
    ],
    ordered=True,
)
survey_work_size_pivot = survey_work_size_pivot.sort_values(by=q21_col)

In [None]:
# visualization - size
q21_axis_col = 'Q21|What is the size of the company where you are employed?'
fig = go.Figure()

# Light-grey connector between the two segment shares of each answer.
for less_share, more_share, answer in zip(survey_work_size_pivot['Less than 70K'],
                                          survey_work_size_pivot['More than 70K'],
                                          survey_work_size_pivot[q21_axis_col]):
    fig.add_shape(type='line',
                  x0=less_share, y0=answer,
                  x1=more_share, y1=answer,
                  line_color="#cccccc")

# One marker trace per compensation segment: (column / legend name, colour, hover text).
q21_rows = survey_work_size_pivot.shape[0]
for segment, marker_colour, hover_text in (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
):
    fig.add_trace(
        go.Scatter(
            x=survey_work_size_pivot[segment],
            y=survey_work_size_pivot[q21_axis_col],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * q21_rows, color=[marker_colour] * q21_rows),
            hovertemplate=hover_text,
        )
    )

# The chart title is the question text itself.
fig.update_layout(
    title_text=q21_axis_col,
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
    width=1200,
    height=480,
)

# Take-away note, positioned in paper coordinates.
fig.add_annotation(
    text="The bigger the company, the more money you would be paid.",
    showarrow=False,
    xref="paper", yref="paper",
    x=0.5, y=0.85,
)

# Spike lines on both axes; the y axis carries no title.
fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# 'Q20|In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice'
q20_col = 'Q20|In what industry is your current employer/contract (or your most recent employer if retired)? - Selected Choice'

# Respondent count for every (compensation segment, Q20 answer) pair.
survey_work_industry = (
    work_data
    .groupby(['compensation_segment', q20_col])
    .count()
    .reset_index()
    [['compensation_segment', q20_col, 'respondent_id']]
)
# Share of each industry inside its own compensation segment, in percent (1 decimal).
survey_work_industry['nr_respondents_segment'] = (
    survey_work_industry
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
survey_work_industry['share_of_total'] = (
    100 * survey_work_industry['respondent_id'] / survey_work_industry['nr_respondents_segment']
).round(1)

# One row per industry, one column per segment; combinations absent from the data become 0.
# Rows are ordered by the gap between the two segments.
survey_work_industry_pivot = (
    survey_work_industry
    .pivot(index=q20_col, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
    .assign(diff=lambda shares: shares['More than 70K'] - shares['Less than 70K'])
    .sort_values(by='diff')
)

# visualization - industry
fig = go.Figure()

# Light-grey connector between the two segment shares of each industry.
for less_share, more_share, industry in zip(survey_work_industry_pivot['Less than 70K'],
                                            survey_work_industry_pivot['More than 70K'],
                                            survey_work_industry_pivot[q20_col]):
    fig.add_shape(type='line',
                  x0=less_share, y0=industry,
                  x1=more_share, y1=industry,
                  line_color="#cccccc")

# One marker trace per compensation segment: (column / legend name, colour, hover text).
q20_rows = survey_work_industry_pivot.shape[0]
for segment, marker_colour, hover_text in (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
):
    fig.add_trace(
        go.Scatter(
            x=survey_work_industry_pivot[segment],
            y=survey_work_industry_pivot[q20_col],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * q20_rows, color=[marker_colour] * q20_rows),
            hovertemplate=hover_text,
        )
    )

# The chart title is the question text itself.
fig.update_layout(
    title_text=q20_col,
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
    width=1200,
    height=480,
)

# Take-away notes, all right-aligned to the plot area (paper coordinates).
fig.add_annotation(text="Medical/Pharmaceutical and Accounting/Finance are high paid industry.",
                   xref="paper", yref="paper",
                   x=1, y=0.95, showarrow=False)

fig.add_annotation(text="Computers/Technology take up the most amount of participants <br> and it has low/high paid on either end.",
                   xref="paper", yref="paper",
                   x=1, y=0.45, showarrow=False)

# Spelling of the category corrected (the annotation previously read "Acadimics/Education").
fig.add_annotation(text="Academics/Education has the highest amount people who earned less than 70K.",
                   xref="paper", yref="paper",
                   x=1, y=0.1, showarrow=False)

# Spike lines on both axes; the y axis carries no title.
fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

* Skill
1. Years of coding / ML experience
2. Where do they learn / share knowledge?
3. What field/model/framework do they usually use?

In [None]:
# Q6: For how many years have you been writing code and/or programming?
# Q7: What programming languages do you use on a regular basis? (Select all that apply)
# Q8: What programming language would you recommend an aspiring data scientist to learn first?
# Q15: For how many years have you used machine learning methods?
# Q16: Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply)
# Q17: Which of the following ML algorithms do you use on a regular basis? (Select all that apply):
# Q18: Which categories of computer vision methods do you use on a regular basis? (Select all that apply)
# Q19: Which of the following natural language processing (NLP) methods do you use on a regular basis? (Select all that apply)
# Q39: Where do you publicly share or deploy your data analysis or machine learning applications? (Select all that apply)
# Q40: On which platforms have you begun or completed data science courses? (Select all that apply)
# Q42: Who/what are your favorite media sources that report on data science topics? (Select all that apply)

In [None]:
# skill/knowledge dataset
# Question ids whose columns make up the skill / knowledge subset.
skill_question_ids = ('Q6','Q7','Q8','Q15','Q16','Q17','Q18','Q19','Q39','Q40','Q42')

# Keep the two identifier columns, then every column whose name part before the
# first "|" contains one of the ids (substring match, in original column order).
skill_knowledge_questions = ['respondent_id', 'compensation_segment'] + [
    col
    for col in survey_compensation.columns
    for q in skill_question_ids
    if q in col.split("|")[0]
]

skill_knowledge_data = survey_compensation[skill_knowledge_questions]

In [None]:
# Q6: For how many years have you been writing code and/or programming?
q6_col = 'Q6|For how many years have you been writing code and/or programming?'

# Respondent count for every (compensation segment, Q6 answer) pair.
survey_skill_knowledge_pro_year = (
    skill_knowledge_data
    .groupby(['compensation_segment', q6_col])
    .count()
    .reset_index()
    [['compensation_segment', q6_col, 'respondent_id']]
)
# Share of each answer inside its own compensation segment, in percent (1 decimal).
survey_skill_knowledge_pro_year['nr_respondents_segment'] = (
    survey_skill_knowledge_pro_year
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
survey_skill_knowledge_pro_year['share_of_total'] = (
    100 * survey_skill_knowledge_pro_year['respondent_id']
    / survey_skill_knowledge_pro_year['nr_respondents_segment']
).round(1)

# One row per answer, one column per segment; combinations absent from the data become 0.
survey_skill_knowledge_pro_year_pivot = (
    survey_skill_knowledge_pro_year
    .pivot(index=q6_col, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
    .assign(diff=lambda shares: shares['More than 70K'] - shares['Less than 70K'])
)

# Ordered categorical so that sorting follows experience rather than string order.
survey_skill_knowledge_pro_year_pivot[q6_col] = pd.Categorical(
    survey_skill_knowledge_pro_year_pivot[q6_col],
    categories=[
        "I have never written code",
        "< 1 years",
        "1-3 years",
        "3-5 years",
        "5-10 years",
        "10-20 years",
        "20+ years",
    ],
    ordered=True,
)
survey_skill_knowledge_pro_year_pivot = survey_skill_knowledge_pro_year_pivot.sort_values(by=q6_col)

# visualization - year of programming
fig = go.Figure()

# Light-grey connector between the two segment shares of each answer.
for less_share, more_share, answer in zip(survey_skill_knowledge_pro_year_pivot['Less than 70K'],
                                          survey_skill_knowledge_pro_year_pivot['More than 70K'],
                                          survey_skill_knowledge_pro_year_pivot[q6_col]):
    fig.add_shape(type='line',
                  x0=less_share, y0=answer,
                  x1=more_share, y1=answer,
                  line_color="#cccccc")

# One marker trace per compensation segment: (column / legend name, colour, hover text).
q6_rows = survey_skill_knowledge_pro_year_pivot.shape[0]
for segment, marker_colour, hover_text in (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
):
    fig.add_trace(
        go.Scatter(
            x=survey_skill_knowledge_pro_year_pivot[segment],
            y=survey_skill_knowledge_pro_year_pivot[q6_col],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * q6_rows, color=[marker_colour] * q6_rows),
            hovertemplate=hover_text,
        )
    )

# The chart title is the question text itself.
fig.update_layout(
    title_text=q6_col,
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
    width=1200,
    height=480,
)

# Take-away note, positioned in paper coordinates.
fig.add_annotation(
    text="The longer years you wrote code <br> the more you likely to be paid high.",
    showarrow=False,
    xref="paper", yref="paper",
    x=0.05, y=0.65,
)

# Spike lines on both axes; the y axis carries no title.
fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# skill - nr of programming languages (Q7)
# Identifier columns first, then every skill/knowledge column whose name contains 'Q7'.
Q7 = ['compensation_segment', 'respondent_id'] + [
    col for col in skill_knowledge_data.columns if 'Q7' in col
]

# .copy() makes Q7_compensation an independent frame, so adding a column below
# does not raise pandas' SettingWithCopyWarning on a slice of survey_compensation.
Q7_compensation = survey_compensation[Q7].copy()

# Non-null cells per row, minus 2 for the two identifier columns
# (assumes both identifiers are always filled -- TODO confirm).
# NOTE(review): if Q7 has a "None" answer column it is counted as a language here -- verify.
Q7_compensation['nr_prog_langs'] = Q7_compensation[Q7].count(axis = 1) - 2

# nr of languages
# Respondent count for every (compensation segment, number of languages) pair.
survey_skill_nrLan = (
    Q7_compensation
    .groupby(['compensation_segment', 'nr_prog_langs'])
    .count()
    .reset_index()
    [['compensation_segment', 'nr_prog_langs', 'respondent_id']]
)
# Share of each language count inside its own compensation segment, in percent (1 decimal).
survey_skill_nrLan['nr_respondents_segment'] = (
    survey_skill_nrLan
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
survey_skill_nrLan['share_of_total'] = round(
    100 * survey_skill_nrLan['respondent_id'] / survey_skill_nrLan['nr_respondents_segment'], 1
)

# One row per language count, one column per segment; combinations absent from the data become 0.
nr_prog_langs_pivot = (
    survey_skill_nrLan
    .pivot(index='nr_prog_langs', columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
    .sort_values(by='nr_prog_langs')
)

In [None]:
# visualization - nr of programming languages
# Dumbbell chart: for every language count (y) the share of each compensation
# segment (x) is drawn as a marker, and the two markers are joined by a grey line.
fig = go.Figure()

# Grey connector per row.
for share_low, share_high, y_label in zip(nr_prog_langs_pivot['Less than 70K'],
                                          nr_prog_langs_pivot['More than 70K'],
                                          nr_prog_langs_pivot['nr_prog_langs']):
    fig.add_shape(type='line',
                  x0=share_low, y0=y_label,
                  x1=share_high, y1=y_label,
                  line_color="#cccccc")

# One marker trace per segment: (column / legend name, marker colour, hover text).
n_points = nr_prog_langs_pivot.shape[0]
trace_specs = [
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
]
for segment_name, marker_colour, hover_text in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=nr_prog_langs_pivot[segment_name],
            y=nr_prog_langs_pivot['nr_prog_langs'],
            mode='markers',
            name=segment_name,
            marker=dict(size=[10] * n_points, color=[marker_colour] * n_points),
            hovertemplate=hover_text,
        )
    )

# Canvas size, title, x-axis caption, transparent plot background.
fig.update_layout(
    height=480,
    width=1200,
    title_text="How many programming languages do you use on a regular basis?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_yaxes(title=None)

# Takeaway text, placed with "paper" references on both axes.
fig.add_annotation(
    text="If you use two programming languages or more, <br> you are more likely to be one of the high earners.</br>",
    xref="paper",
    yref="paper",
    x=0.65,
    y=0.3,
    showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Which single language goes together with higher pay?
# Non-null answers per Q7 option column and compensation segment.
Q7_compensation_count = Q7_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q7_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q7' not in col:
        continue
    # The third ' - ' separated part of the column name is used as the short label.
    new_col = col.split(' - ')[2]
    new_cols.append(new_col)
    # Share (%) of the segment's respondents with a non-null answer in this column.
    Q7_compensation_count[new_col] = (
        100 * Q7_compensation_count[col] / Q7_compensation_count['respondent_id']
    ).round(1)
new_cols.append('compensation_segment')

# Transpose so every option becomes a row. The two segment rows turn into the
# columns 0 and 1; the rename relies on the grouped order being
# 'Less than 70K' first, 'More than 70K' second.
Q7_compensation_count_share = Q7_compensation_count[new_cols].T.reset_index()
is_option_row = Q7_compensation_count_share['index'] != 'compensation_segment'
Q7_compensation_count_share_pivot = (
    Q7_compensation_count_share
    .loc[is_option_row]
    .rename(columns={'index': 'Programming Language',
                     0: 'Less than 70K',
                     1: 'More than 70K'})
    .fillna(0)
)

# Gap between the segments decides the row order of the chart.
Q7_compensation_count_share_pivot['diff'] = (
    Q7_compensation_count_share_pivot['More than 70K']
    - Q7_compensation_count_share_pivot['Less than 70K']
)
Q7_compensation_count_share_pivot = Q7_compensation_count_share_pivot.sort_values(by='diff')

# visualization - programming language used
# Dumbbell chart: one row per language, one marker per compensation segment,
# joined by a grey line.
fig = go.Figure()

# Grey connector per row.
for share_low, share_high, y_label in zip(Q7_compensation_count_share_pivot['Less than 70K'],
                                          Q7_compensation_count_share_pivot['More than 70K'],
                                          Q7_compensation_count_share_pivot['Programming Language']):
    fig.add_shape(type='line',
                  x0=share_low, y0=y_label,
                  x1=share_high, y1=y_label,
                  line_color="#cccccc")

# One marker trace per segment: (column / legend name, marker colour, hover text).
n_points = Q7_compensation_count_share_pivot.shape[0]
trace_specs = [
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
]
for segment_name, marker_colour, hover_text in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=Q7_compensation_count_share_pivot[segment_name],
            y=Q7_compensation_count_share_pivot['Programming Language'],
            mode='markers',
            name=segment_name,
            marker=dict(size=[10] * n_points, color=[marker_colour] * n_points),
            hovertemplate=hover_text,
        )
    )

# Canvas size, title, x-axis caption, transparent plot background.
fig.update_layout(
    height=480,
    width=1200,
    title_text="What programming language do you use on a regular basis?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_yaxes(title=None)

# Takeaway texts, placed with "paper" references on both axes.
fig.add_annotation(
    text="If you use Bash, SQL or R <br> you are more likely to be one of the high earners.</br>",
    xref="paper",
    yref="paper",
    x=0.8,
    y=1,
    showarrow=False,
)
fig.add_annotation(
    text="Python is the most used language.",
    xref="paper",
    yref="paper",
    x=0.9,
    y=0.6,
    showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q8 - language recommended to an aspiring data scientist, per compensation segment.
# Output: survey_skill_language_recommend_pivot, one row per recommended
# language with the share (%) of each segment, sorted by the gap between segments.
q8_col = 'Q8|What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice'

# Respondents per (segment, recommended language).
survey_skill_language_recommend = (
    skill_knowledge_data
    .groupby(['compensation_segment', q8_col])['respondent_id']
    .count()
    .reset_index()
)

# Share of each answer within its own compensation segment, in percent.
survey_skill_language_recommend['nr_respondents_segment'] = (
    survey_skill_language_recommend
    .groupby('compensation_segment', sort=False)['respondent_id']
    .transform('sum')
)
survey_skill_language_recommend['share_of_total'] = round(
    100 * survey_skill_language_recommend['respondent_id']
    / survey_skill_language_recommend['nr_respondents_segment'], 1
)

# Wide layout (one column per segment); a missing combination means 0 %.
survey_skill_language_recommend_pivot = (
    survey_skill_language_recommend
    .pivot(index=q8_col, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
)

# Gap between the two segments for the SAME question. The previous version
# subtracted survey_work_industry_pivot['Less than 70K'] (a frame built for a
# different question), which paired unrelated rows by index label.
# NOTE(review): this changes the row order of the Q8 chart; the annotation
# positions in the plot below were tuned by eye and may need re-placing.
survey_skill_language_recommend_pivot['diff'] = (
    survey_skill_language_recommend_pivot['More than 70K']
    - survey_skill_language_recommend_pivot['Less than 70K']
)
survey_skill_language_recommend_pivot = survey_skill_language_recommend_pivot.sort_values(by='diff')
# visualization - recommend language
# Dumbbell chart: one row per recommended language, one marker per
# compensation segment, joined by a grey line.
fig = go.Figure()

# The answer labels are used for the y position of every shape and trace.
y_values = survey_skill_language_recommend_pivot['Q8|What programming language would you recommend an aspiring data scientist to learn first? - Selected Choice']

# Grey connector per row.
for share_low, share_high, y_label in zip(survey_skill_language_recommend_pivot['Less than 70K'],
                                          survey_skill_language_recommend_pivot['More than 70K'],
                                          y_values):
    fig.add_shape(type='line',
                  x0=share_low, y0=y_label,
                  x1=share_high, y1=y_label,
                  line_color="#cccccc")

# One marker trace per segment: (column / legend name, marker colour, hover text).
n_points = survey_skill_language_recommend_pivot.shape[0]
trace_specs = [
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
]
for segment_name, marker_colour, hover_text in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=survey_skill_language_recommend_pivot[segment_name],
            y=y_values,
            mode='markers',
            name=segment_name,
            marker=dict(size=[10] * n_points, color=[marker_colour] * n_points),
            hovertemplate=hover_text,
        )
    )

# Canvas size, title, x-axis caption, transparent plot background.
fig.update_layout(
    height=480,
    width=1200,
    title_text="Q8|What programming language would you recommend an aspiring data scientist to learn first?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_yaxes(title=None)

# Takeaway texts with their (x, y) positions, all using "paper" references.
takeaways = [
    ("Python is still the most recommended, <br>though fewer high paid people in this group.", 0.85, 0.95),
    ("SQL and R are preffered by more high paid people", 0.2, 0.8),
    ("Only a few amount of people recommend <br>people learn Bash as the first language<br>Is it because its difficulty?", 0.2, 0.10),
]
for note_text, note_x, note_y in takeaways:
    fig.add_annotation(text=note_text,
                       xref="paper", yref="paper",
                       x=note_x, y=note_y, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q15 - years of experience with machine learning methods, per compensation segment.
q15_col = 'Q15|For how many years have you used machine learning methods?'

# Display order of the answer options, shortest experience first.
ml_experience_order = [
    'I do not use machine learning methods',
    'Under 1 year',
    "1-2 years",
    "2-3 years",
    "3-4 years",
    "4-5 years",
    "5-10 years",
    "10-20 years",
    "20 or more years",
]

# Respondents per (segment, answer); only the three relevant columns are kept.
survey_skill_knowledge_ml_year = (
    skill_knowledge_data
    .groupby(['compensation_segment', q15_col])
    .count()
    .reset_index()
    [['compensation_segment', q15_col, 'respondent_id']]
)

# Share of each answer within its own compensation segment, in percent.
survey_skill_knowledge_ml_year['nr_respondents_segment'] = (
    survey_skill_knowledge_ml_year
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
survey_skill_knowledge_ml_year['share_of_total'] = (
    100 * survey_skill_knowledge_ml_year['respondent_id']
    / survey_skill_knowledge_ml_year['nr_respondents_segment']
).round(1)

# Wide layout (one column per segment); a missing combination means 0 %.
survey_skill_knowledge_ml_year_pivot = (
    survey_skill_knowledge_ml_year
    .pivot(index=q15_col, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
)
survey_skill_knowledge_ml_year_pivot['diff'] = (
    survey_skill_knowledge_ml_year_pivot['More than 70K']
    - survey_skill_knowledge_ml_year_pivot['Less than 70K']
)

# An ordered categorical makes the sort follow experience length instead of
# alphabetical order.
survey_skill_knowledge_ml_year_pivot[q15_col] = pd.Categorical(
    survey_skill_knowledge_ml_year_pivot[q15_col],
    categories=ml_experience_order,
    ordered=True,
)
survey_skill_knowledge_ml_year_pivot = survey_skill_knowledge_ml_year_pivot.sort_values(by=q15_col)

# visualization - year of ML
# Dumbbell chart: one row per experience bracket, one marker per compensation
# segment, joined by a grey line.
fig = go.Figure()

# The experience brackets are used for the y position of every shape and trace.
y_values = survey_skill_knowledge_ml_year_pivot['Q15|For how many years have you used machine learning methods?']

# Grey connector per row.
for share_low, share_high, y_label in zip(survey_skill_knowledge_ml_year_pivot['Less than 70K'],
                                          survey_skill_knowledge_ml_year_pivot['More than 70K'],
                                          y_values):
    fig.add_shape(type='line',
                  x0=share_low, y0=y_label,
                  x1=share_high, y1=y_label,
                  line_color="#cccccc")

# One marker trace per segment: (column / legend name, marker colour, hover text).
n_points = survey_skill_knowledge_ml_year_pivot.shape[0]
trace_specs = [
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
]
for segment_name, marker_colour, hover_text in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=survey_skill_knowledge_ml_year_pivot[segment_name],
            y=y_values,
            mode='markers',
            name=segment_name,
            marker=dict(size=[10] * n_points, color=[marker_colour] * n_points),
            hovertemplate=hover_text,
        )
    )

# Canvas size, title, x-axis caption, transparent plot background.
fig.update_layout(
    height=480,
    width=1200,
    title_text="Q15|For how many years have you used machine learning methods?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_yaxes(title=None)

# Takeaway text, placed with "paper" references on both axes.
fig.add_annotation(
    text="The longer years you use machine learning methods, <br> the more you likely to be paid high.<br> At least three years experience is competitive",
    xref="paper",
    yref="paper",
    x=0.55,
    y=0.5,
    showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# 'Q16_Part_1|Which of the following machine learning frameworks do you use on a regular basis? (Select all that apply)'
# Share of each compensation segment per ML framework.
# Keep the segment label, the respondent id and every column whose name contains 'Q16'.
Q16 = ['compensation_segment', 'respondent_id']
Q16.extend(col for col in skill_knowledge_data.columns if 'Q16' in col)

Q16_compensation = survey_compensation[Q16]
# Non-null answers per Q16 option column and compensation segment.
Q16_compensation_count = Q16_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q16_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q16' not in col:
        continue
    # The third ' - ' separated part of the column name is used as the short label.
    new_col = col.split(' - ')[2]
    new_cols.append(new_col)
    # Share (%) of the segment's respondents with a non-null answer in this column.
    Q16_compensation_count[new_col] = (
        100 * Q16_compensation_count[col] / Q16_compensation_count['respondent_id']
    ).round(1)
new_cols.append('compensation_segment')

# Transpose so every option becomes a row. The two segment rows turn into the
# columns 0 and 1; the rename relies on the grouped order being
# 'Less than 70K' first, 'More than 70K' second.
Q16_compensation_count_share = Q16_compensation_count[new_cols].T.reset_index()
is_option_row = Q16_compensation_count_share['index'] != 'compensation_segment'
Q16_compensation_count_share_pivot = (
    Q16_compensation_count_share
    .loc[is_option_row]
    .rename(columns={'index': 'frameworks',
                     0: 'Less than 70K',
                     1: 'More than 70K'})
    .fillna(0)
)

# Gap between the segments decides the row order of the chart.
Q16_compensation_count_share_pivot['diff'] = (
    Q16_compensation_count_share_pivot['More than 70K']
    - Q16_compensation_count_share_pivot['Less than 70K']
)
Q16_compensation_count_share_pivot = Q16_compensation_count_share_pivot.sort_values(by='diff')

# visualization - ML frameworks
# Dumbbell chart: one row per framework, one marker per compensation segment,
# joined by a grey line.
fig = go.Figure()

# Grey connector per row.
for share_low, share_high, y_label in zip(Q16_compensation_count_share_pivot['Less than 70K'],
                                          Q16_compensation_count_share_pivot['More than 70K'],
                                          Q16_compensation_count_share_pivot['frameworks']):
    fig.add_shape(type='line',
                  x0=share_low, y0=y_label,
                  x1=share_high, y1=y_label,
                  line_color="#cccccc")

# One marker trace per segment: (column / legend name, marker colour, hover text).
n_points = Q16_compensation_count_share_pivot.shape[0]
trace_specs = [
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
]
for segment_name, marker_colour, hover_text in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=Q16_compensation_count_share_pivot[segment_name],
            y=Q16_compensation_count_share_pivot['frameworks'],
            mode='markers',
            name=segment_name,
            marker=dict(size=[10] * n_points, color=[marker_colour] * n_points),
            hovertemplate=hover_text,
        )
    )

# Canvas size, title, x-axis caption, transparent plot background.
fig.update_layout(
    height=480,
    width=1200,
    title_text="What machine learning framework do you use on a regular basis?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_yaxes(title=None)

# Takeaway texts, placed with "paper" references on both axes.
fig.add_annotation(
    text="If you use Xgboost, Scikit-learn or LightGBM <br> you are more likely to be one of the high earners.</br>",
    xref="paper",
    yref="paper",
    x=0.9,
    y=1,
    showarrow=False,
)
# This chart is about ML frameworks; the text used to call Scikit-learn a "language".
fig.add_annotation(
    text="Scikit-learn is the most used framework.",
    xref="paper",
    yref="paper",
    x=0.95,
    y=0.85,
    showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# 'Q17_Part_1|Which of the following ML algorithms do you use on a regular basis? (Select all that apply)',
# Share of each compensation segment per ML algorithm.
# Keep the segment label, the respondent id and every column whose name contains 'Q17'.
Q17 = ['compensation_segment', 'respondent_id']
Q17.extend(col for col in skill_knowledge_data.columns if 'Q17' in col)

Q17_compensation = survey_compensation[Q17]
# Non-null answers per Q17 option column and compensation segment.
Q17_compensation_count = Q17_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q17_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q17' not in col:
        continue
    # The third ' - ' separated part of the column name is used as the short label.
    new_col = col.split(' - ')[2]
    new_cols.append(new_col)
    # Share (%) of the segment's respondents with a non-null answer in this column.
    Q17_compensation_count[new_col] = (
        100 * Q17_compensation_count[col] / Q17_compensation_count['respondent_id']
    ).round(1)
new_cols.append('compensation_segment')

# Transpose so every option becomes a row. The two segment rows turn into the
# columns 0 and 1; the rename relies on the grouped order being
# 'Less than 70K' first, 'More than 70K' second.
Q17_compensation_count_share = Q17_compensation_count[new_cols].T.reset_index()
is_option_row = Q17_compensation_count_share['index'] != 'compensation_segment'
Q17_compensation_count_share_pivot = (
    Q17_compensation_count_share
    .loc[is_option_row]
    .rename(columns={'index': 'algorithm',
                     0: 'Less than 70K',
                     1: 'More than 70K'})
    .fillna(0)
)

# Gap between the segments decides the row order of the chart.
Q17_compensation_count_share_pivot['diff'] = (
    Q17_compensation_count_share_pivot['More than 70K']
    - Q17_compensation_count_share_pivot['Less than 70K']
)
Q17_compensation_count_share_pivot = Q17_compensation_count_share_pivot.sort_values(by='diff')

# visualization - Algorithm
# Dumbbell chart: one row per algorithm, one marker per compensation segment,
# joined by a grey line.
fig = go.Figure()

# Grey connector per row.
for share_low, share_high, y_label in zip(Q17_compensation_count_share_pivot['Less than 70K'],
                                          Q17_compensation_count_share_pivot['More than 70K'],
                                          Q17_compensation_count_share_pivot['algorithm']):
    fig.add_shape(type='line',
                  x0=share_low, y0=y_label,
                  x1=share_high, y1=y_label,
                  line_color="#cccccc")

# One marker trace per segment: (column / legend name, marker colour, hover text).
n_points = Q17_compensation_count_share_pivot.shape[0]
trace_specs = [
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
]
for segment_name, marker_colour, hover_text in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=Q17_compensation_count_share_pivot[segment_name],
            y=Q17_compensation_count_share_pivot['algorithm'],
            mode='markers',
            name=segment_name,
            marker=dict(size=[10] * n_points, color=[marker_colour] * n_points),
            hovertemplate=hover_text,
        )
    )

# Canvas size, title, x-axis caption, transparent plot background.
fig.update_layout(
    height=480,
    width=1200,
    title_text="What machine learning algorithm do you use on a regular basis?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_yaxes(title=None)

# Takeaway texts, placed with "paper" references on both axes.
fig.add_annotation(
    text="If you use Gradient Boosting Machine or Decision Trees <br> or Linear or Logistic Regression <br> you are more likely to be one of the high earners.</br>",
    xref="paper",
    yref="paper",
    x=0.1,
    y=1,
    showarrow=False,
)
fig.add_annotation(
    text="Those three are also most used algorithm.",
    xref="paper",
    yref="paper",
    x=0.95,
    y=0.75,
    showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# 'Q18 Which categories of computer vision methods do you use on a regular basis?',
# Share of each compensation segment per computer vision method.
# Keep the segment label, the respondent id and every column whose name contains 'Q18'.
Q18 = ['compensation_segment', 'respondent_id']
Q18.extend(col for col in skill_knowledge_data.columns if 'Q18' in col)

Q18_compensation = survey_compensation[Q18]
# Non-null answers per Q18 option column and compensation segment.
Q18_compensation_count = Q18_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q18_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q18' not in col:
        continue
    # The third ' - ' separated part of the column name is used as the short label.
    new_col = col.split(' - ')[2]
    new_cols.append(new_col)
    # Share (%) of the segment's respondents with a non-null answer in this column.
    Q18_compensation_count[new_col] = (
        100 * Q18_compensation_count[col] / Q18_compensation_count['respondent_id']
    ).round(1)
new_cols.append('compensation_segment')

# Transpose so every option becomes a row. The two segment rows turn into the
# columns 0 and 1; the rename relies on the grouped order being
# 'Less than 70K' first, 'More than 70K' second.
Q18_compensation_count_share = Q18_compensation_count[new_cols].T.reset_index()
is_option_row = Q18_compensation_count_share['index'] != 'compensation_segment'
Q18_compensation_count_share_pivot = (
    Q18_compensation_count_share
    .loc[is_option_row]
    .rename(columns={'index': 'compVision',
                     0: 'Less than 70K',
                     1: 'More than 70K'})
    .fillna(0)
)

# Gap between the segments decides the row order of the chart.
Q18_compensation_count_share_pivot['diff'] = (
    Q18_compensation_count_share_pivot['More than 70K']
    - Q18_compensation_count_share_pivot['Less than 70K']
)
Q18_compensation_count_share_pivot = Q18_compensation_count_share_pivot.sort_values(by='diff')

# visualization - Computer vision
# Dumbbell chart: one row per computer vision method, one marker per
# compensation segment, joined by a grey line.
fig = go.Figure()

# Grey connector per row.
for share_low, share_high, y_label in zip(Q18_compensation_count_share_pivot['Less than 70K'],
                                          Q18_compensation_count_share_pivot['More than 70K'],
                                          Q18_compensation_count_share_pivot['compVision']):
    fig.add_shape(type='line',
                  x0=share_low, y0=y_label,
                  x1=share_high, y1=y_label,
                  line_color="#cccccc")

# One marker trace per segment: (column / legend name, marker colour, hover text).
n_points = Q18_compensation_count_share_pivot.shape[0]
trace_specs = [
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
]
for segment_name, marker_colour, hover_text in trace_specs:
    fig.add_trace(
        go.Scatter(
            x=Q18_compensation_count_share_pivot[segment_name],
            y=Q18_compensation_count_share_pivot['compVision'],
            mode='markers',
            name=segment_name,
            marker=dict(size=[10] * n_points, color=[marker_colour] * n_points),
            hovertemplate=hover_text,
        )
    )

# Canvas size, title, x-axis caption, transparent plot background.
fig.update_layout(
    height=480,
    width=1200,
    title_text="Which categories of computer vision methods do you use on a regular basis?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_yaxes(title=None)

# Takeaway text, placed with "paper" references on both axes.
fig.add_annotation(
    text="High paid people are less <br>likely to work in the computer vision?",
    xref="paper",
    yref="paper",
    x=1.1,
    y=0.95,
    showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# 'Q19: Which of the following natural language processing (NLP) methods do you use on a regular basis?',
# Share of each compensation segment per NLP method.
# Keep the segment label, the respondent id and every column whose name contains 'Q19'.
Q19 = ['compensation_segment', 'respondent_id']
Q19.extend(col for col in skill_knowledge_data.columns if 'Q19' in col)

Q19_compensation = survey_compensation[Q19]
# Non-null answers per Q19 option column and compensation segment.
Q19_compensation_count = Q19_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q19_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q19' not in col:
        continue
    # The third ' - ' separated part of the column name is used as the short label.
    new_col = col.split(' - ')[2]
    new_cols.append(new_col)
    # Share (%) of the segment's respondents with a non-null answer in this column.
    Q19_compensation_count[new_col] = (
        100 * Q19_compensation_count[col] / Q19_compensation_count['respondent_id']
    ).round(1)
new_cols.append('compensation_segment')

# Transpose so every option becomes a row. The two segment rows turn into the
# columns 0 and 1; the rename relies on the grouped order being
# 'Less than 70K' first, 'More than 70K' second.
Q19_compensation_count_share = Q19_compensation_count[new_cols].T.reset_index()
is_option_row = Q19_compensation_count_share['index'] != 'compensation_segment'
Q19_compensation_count_share_pivot = (
    Q19_compensation_count_share
    .loc[is_option_row]
    .rename(columns={'index': 'NLP',
                     0: 'Less than 70K',
                     1: 'More than 70K'})
    .fillna(0)
)

# Gap between the segments decides the row order of the chart.
Q19_compensation_count_share_pivot['diff'] = (
    Q19_compensation_count_share_pivot['More than 70K']
    - Q19_compensation_count_share_pivot['Less than 70K']
)
Q19_compensation_count_share_pivot = Q19_compensation_count_share_pivot.sort_values(by='diff')

# visualization - NLP methods
# Dumbbell chart: one row per NLP method, one marker per compensation segment,
# joined by a grey line.
fig = go.Figure()

# Grey connector per row, from the "Less than 70K" share to the "More than 70K" share.
for i in range(Q19_compensation_count_share_pivot.shape[0]):
    fig.add_shape(
        type='line',
        x0=Q19_compensation_count_share_pivot['Less than 70K'].iloc[i], 
        y0=Q19_compensation_count_share_pivot['NLP'].iloc[i], 
        x1=Q19_compensation_count_share_pivot['More than 70K'].iloc[i], 
        y1=Q19_compensation_count_share_pivot['NLP'].iloc[i],
        line_color="#cccccc"        
    )
# Markers for the "Less than 70K" segment (light purple).
fig.add_trace(
        go.Scatter(
            x=Q19_compensation_count_share_pivot['Less than 70K'],
            y=Q19_compensation_count_share_pivot['NLP'], 
            #fill="toself",
            mode='markers',
            name='Less than 70K',
            marker=dict(size=[10] * Q19_compensation_count_share_pivot.shape[0],
                color=["#DEBAE6"] * Q19_compensation_count_share_pivot.shape[0]),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'
            #opacity=1
        )
    )
    
# Markers for the "More than 70K" segment (dark purple).
fig.add_trace(
        go.Scatter(
            x=Q19_compensation_count_share_pivot['More than 70K'],
            y=Q19_compensation_count_share_pivot['NLP'], 
            #fill="toself",
            mode='markers',
            name='More than 70K', 
            marker=dict(size=[10] * Q19_compensation_count_share_pivot.shape[0],
                color=["#C54DFD"] * Q19_compensation_count_share_pivot.shape[0]),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'
            #opacity=1
        )
    )

# Canvas size, title, x-axis caption, transparent plot background.
fig.update_layout(height=480, 
                  width=1200, 
                  #barmode='stack',
                  title_text="Which categories of NLP methods do you use on a regular basis?",
                    xaxis_title = "% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  #hovermode='x unified',
                  #paper_bgcolor='rgba(0,0,0,0)',
                 plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None
                 #visible=True, 
                 #showticklabels=True
                )


fig.add_annotation(text="Transformer language models and <br>Word embeddings/vectors are paid higher if you use",
                  xref="paper", yref="paper",
                  x=0.2 , y=0.95, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q39: Where do you publicly share your data analysis or machine learning applications? (Select all that apply)
# For each compensation segment, compute the share (%) of respondents with a non-null
# value in every Q39 answer column, then compare the two segments in a dumbbell chart.

# Segment + respondent id (the denominator) + every Q39 answer column.
Q39 = ['compensation_segment', 'respondent_id'] + [
    col for col in skill_knowledge_data.columns if 'Q39' in col
]

Q39_compensation = survey_compensation[Q39]
# count() skips NaN, so each answer column becomes the number of non-null responses per segment
# (presumably a non-null cell means the option was selected -- verify against the raw data).
Q39_compensation_count = Q39_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q39_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q39' in col:
        # maxsplit=2 keeps the full answer label even when the label itself contains ' - '.
        new_col = col.split(' - ', 2)[2]
        new_cols.append(new_col)
        Q39_compensation_count[new_col] = round(
            100 * Q39_compensation_count[col] / Q39_compensation_count['respondent_id'], 1
        )
new_cols.append('compensation_segment')
Q39_compensation_count_share = Q39_compensation_count[new_cols].T.reset_index()

# After the transpose the columns are the positional row labels of the grouped frame.
# Map every position to the segment stored in that row instead of assuming that
# row 0 is 'Less than 70K' and row 1 is 'More than 70K'.
segment_labels = dict(enumerate(Q39_compensation_count['compensation_segment']))
Q39_compensation_count_share_pivot = (
    Q39_compensation_count_share
    .loc[Q39_compensation_count_share['index'] != 'compensation_segment']
    .rename(columns={'index': 'publicShare', **segment_labels})
    .fillna(0)
)

# Sort by the gap between the segments so the chart is ordered by how much
# more (or less) often the higher-paid segment picked each option.
Q39_compensation_count_share_pivot['diff'] = Q39_compensation_count_share_pivot['More than 70K'] - Q39_compensation_count_share_pivot['Less than 70K']
Q39_compensation_count_share_pivot = Q39_compensation_count_share_pivot.sort_values(by='diff')

# visualization - public share
fig = go.Figure()

# Grey connector between the two segment shares of each answer option.
for i in range(Q39_compensation_count_share_pivot.shape[0]):
    fig.add_shape(
        type='line',
        x0=Q39_compensation_count_share_pivot['Less than 70K'].iloc[i],
        y0=Q39_compensation_count_share_pivot['publicShare'].iloc[i],
        x1=Q39_compensation_count_share_pivot['More than 70K'].iloc[i],
        y1=Q39_compensation_count_share_pivot['publicShare'].iloc[i],
        line_color="#cccccc"
    )

# One marker trace per segment; only the colour and the LESS/MORE wording differ.
for segment, marker_color, comparison in (('Less than 70K', "#DEBAE6", 'LESS'),
                                          ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=Q39_compensation_count_share_pivot[segment],
            y=Q39_compensation_count_share_pivot['publicShare'],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * Q39_compensation_count_share_pivot.shape[0],
                        color=[marker_color] * Q39_compensation_count_share_pivot.shape[0]),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + comparison + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(height=480,
                  width=1200,
                  title_text="Where do you publicly share your data analysis or machine learning applications?",
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None)

fig.add_annotation(text="Higher paid people are less likely to share their work",
                   xref="paper", yref="paper",
                   x=0.1, y=0.95, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q40: On which platforms have you begun or completed data science courses? (Select all that apply)
# For each compensation segment, compute the share (%) of respondents with a non-null
# value in every Q40 answer column, then compare the two segments in a dumbbell chart.

# Segment + respondent id (the denominator) + every Q40 answer column.
Q40 = ['compensation_segment', 'respondent_id'] + [
    col for col in skill_knowledge_data.columns if 'Q40' in col
]

Q40_compensation = survey_compensation[Q40]
# count() skips NaN, so each answer column becomes the number of non-null responses per segment
# (presumably a non-null cell means the option was selected -- verify against the raw data).
Q40_compensation_count = Q40_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q40_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q40' in col:
        # maxsplit=2 keeps the full answer label even when the label itself contains ' - '.
        new_col = col.split(' - ', 2)[2]
        new_cols.append(new_col)
        Q40_compensation_count[new_col] = round(
            100 * Q40_compensation_count[col] / Q40_compensation_count['respondent_id'], 1
        )
new_cols.append('compensation_segment')
Q40_compensation_count_share = Q40_compensation_count[new_cols].T.reset_index()

# After the transpose the columns are the positional row labels of the grouped frame.
# Map every position to the segment stored in that row instead of assuming that
# row 0 is 'Less than 70K' and row 1 is 'More than 70K'.
segment_labels = dict(enumerate(Q40_compensation_count['compensation_segment']))
Q40_compensation_count_share_pivot = (
    Q40_compensation_count_share
    .loc[Q40_compensation_count_share['index'] != 'compensation_segment']
    .rename(columns={'index': 'courses', **segment_labels})
    .fillna(0)
)

# Sort by the gap between the segments so the chart is ordered by how much
# more (or less) often the higher-paid segment picked each option.
Q40_compensation_count_share_pivot['diff'] = Q40_compensation_count_share_pivot['More than 70K'] - Q40_compensation_count_share_pivot['Less than 70K']
Q40_compensation_count_share_pivot = Q40_compensation_count_share_pivot.sort_values(by='diff')

# visualization - courses
fig = go.Figure()

# Grey connector between the two segment shares of each answer option.
for i in range(Q40_compensation_count_share_pivot.shape[0]):
    fig.add_shape(
        type='line',
        x0=Q40_compensation_count_share_pivot['Less than 70K'].iloc[i],
        y0=Q40_compensation_count_share_pivot['courses'].iloc[i],
        x1=Q40_compensation_count_share_pivot['More than 70K'].iloc[i],
        y1=Q40_compensation_count_share_pivot['courses'].iloc[i],
        line_color="#cccccc"
    )

# One marker trace per segment; only the colour and the LESS/MORE wording differ.
for segment, marker_color, comparison in (('Less than 70K', "#DEBAE6", 'LESS'),
                                          ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=Q40_compensation_count_share_pivot[segment],
            y=Q40_compensation_count_share_pivot['courses'],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * Q40_compensation_count_share_pivot.shape[0],
                        color=[marker_color] * Q40_compensation_count_share_pivot.shape[0]),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + comparison + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(height=480,
                  width=1200,
                  title_text="On which platforms have you begun or completed data science courses?",
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None)

fig.add_annotation(text="lifelong learning is important for people work in this area.",
                   xref="paper", yref="paper",
                   x=0.1, y=0.95, showarrow=False)

fig.add_annotation(text="University learning is still essential if you want to earn more",
                   xref="paper", yref="paper",
                   x=0.6, y=0.85, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q42: Who/what are your favorite media sources that report on data science topics?
# For each compensation segment, compute the share (%) of respondents with a non-null
# value in every Q42 answer column, then compare the two segments in a dumbbell chart.

# Segment + respondent id (the denominator) + every Q42 answer column.
Q42 = ['compensation_segment', 'respondent_id'] + [
    col for col in skill_knowledge_data.columns if 'Q42' in col
]

Q42_compensation = survey_compensation[Q42]
# count() skips NaN, so each answer column becomes the number of non-null responses per segment
# (presumably a non-null cell means the option was selected -- verify against the raw data).
Q42_compensation_count = Q42_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q42_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q42' in col:
        # maxsplit=2 keeps the full answer label even when the label itself contains ' - '.
        new_col = col.split(' - ', 2)[2]
        new_cols.append(new_col)
        Q42_compensation_count[new_col] = round(
            100 * Q42_compensation_count[col] / Q42_compensation_count['respondent_id'], 1
        )
new_cols.append('compensation_segment')
Q42_compensation_count_share = Q42_compensation_count[new_cols].T.reset_index()

# After the transpose the columns are the positional row labels of the grouped frame.
# Map every position to the segment stored in that row instead of assuming that
# row 0 is 'Less than 70K' and row 1 is 'More than 70K'.
segment_labels = dict(enumerate(Q42_compensation_count['compensation_segment']))
Q42_compensation_count_share_pivot = (
    Q42_compensation_count_share
    .loc[Q42_compensation_count_share['index'] != 'compensation_segment']
    .rename(columns={'index': 'media', **segment_labels})
    .fillna(0)
)

# Sort by the gap between the segments so the chart is ordered by how much
# more (or less) often the higher-paid segment picked each option.
Q42_compensation_count_share_pivot['diff'] = Q42_compensation_count_share_pivot['More than 70K'] - Q42_compensation_count_share_pivot['Less than 70K']
Q42_compensation_count_share_pivot = Q42_compensation_count_share_pivot.sort_values(by='diff')

# visualization - media sources
fig = go.Figure()

# Grey connector between the two segment shares of each answer option.
for i in range(Q42_compensation_count_share_pivot.shape[0]):
    fig.add_shape(
        type='line',
        x0=Q42_compensation_count_share_pivot['Less than 70K'].iloc[i],
        y0=Q42_compensation_count_share_pivot['media'].iloc[i],
        x1=Q42_compensation_count_share_pivot['More than 70K'].iloc[i],
        y1=Q42_compensation_count_share_pivot['media'].iloc[i],
        line_color="#cccccc"
    )

# One marker trace per segment; only the colour and the LESS/MORE wording differ.
for segment, marker_color, comparison in (('Less than 70K', "#DEBAE6", 'LESS'),
                                          ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=Q42_compensation_count_share_pivot[segment],
            y=Q42_compensation_count_share_pivot['media'],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * Q42_compensation_count_share_pivot.shape[0],
                        color=[marker_color] * Q42_compensation_count_share_pivot.shape[0]),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + comparison + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(height=480,
                  width=1200,
                  title_text="Who/what are your favorite media sources that report on data science topics?",
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None)

fig.add_annotation(text="Popularity in reading blogs is aligned with publicly share source",
                   xref="paper", yref="paper",
                   x=0.1, y=0.95, showarrow=False)

fig.add_annotation(text="Reading Journal publications is related with degree.",
                   xref="paper", yref="paper",
                   x=0.6, y=0.85, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

* Daily tools, and what respondents want to become familiar with in the next two years
1. Environment
2. Product
3. Tool

In [None]:
# tool/product dataset
# Keep the respondent id, the compensation segment and every survey column whose
# question id (the part of the column name before '|') contains one of the
# prefixes below.
tool_product_prefixes = ('Q9', 'Q10', 'Q11',
                         'Q12', 'Q13', 'Q14',
                         'Q26', 'Q27_A', 'Q28',
                         'Q29_A', 'Q30_A', 'Q31_A',
                         'Q32_A', 'Q33', 'Q34_A',
                         'Q35', 'Q36_A', 'Q37_A',
                         'Q38_A', 'Q41')

tool_product_questions = ['respondent_id', 'compensation_segment']
for col in survey_compensation.columns:
    question_id = col.split("|")[0]
    # any() appends a column at most once even if several prefixes match it,
    # so the selection below can never contain the same column twice.
    if any(q in question_id for q in tool_product_prefixes):
        tool_product_questions.append(col)

tool_product_data = survey_compensation[tool_product_questions]

In [None]:
# Q26: Approximately how much money have you (or your team) spent on machine learning
# and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?
# Share (%) of each spending bracket within each compensation segment, drawn as a dumbbell chart.

# Full column name of the single-choice Q26 answer; bound once because it is used throughout the cell.
q26_col = 'Q26|Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?'

# Respondents per (segment, spending bracket). Only respondent_id is counted,
# rather than counting every column of the wide frame and discarding the rest.
invest = (
    tool_product_data
    .groupby(['compensation_segment', q26_col])['respondent_id']
    .count()
    .reset_index()
)

# Denominator: respondents of the segment that appear in the grouped counts above.
invest['nr_respondents_segment'] = invest.groupby('compensation_segment', sort=False)["respondent_id"].transform('sum')
invest['share_of_total'] = round(100 * invest['respondent_id'] / invest['nr_respondents_segment'], 1)

# One row per spending bracket, one column per segment; a bracket missing in a segment counts as 0 %.
invest_pivot = invest.pivot(index=q26_col,
                            columns='compensation_segment',
                            values='share_of_total').reset_index()
invest_pivot = invest_pivot.fillna(0)
invest_pivot['diff'] = invest_pivot['More than 70K'] - invest_pivot['Less than 70K']
invest_pivot = invest_pivot.sort_values(by='diff')

# Strip the dollar signs from the bracket labels
# (presumably so plotly does not treat '$...$' as LaTeX -- TODO confirm).
for raw_label, clean_label in {
    '$0 ($USD)': '0',
    '$1-$99': '1-99',
    '$100-$999': '100-999',
    '$1000-$9,999': '1000-9,999',
    '$10,000-$99,999': '10,000-99,999',
    '$100,000 or more ($USD)': '100,000 or more',
}.items():
    invest_pivot = invest_pivot.replace(raw_label, clean_label)

# visualization - invest
fig = go.Figure()

# Grey connector between the two segment shares of each spending bracket.
for i in range(invest_pivot.shape[0]):
    fig.add_shape(
        type='line',
        x0=invest_pivot['Less than 70K'].iloc[i],
        y0=invest_pivot[q26_col].iloc[i],
        x1=invest_pivot['More than 70K'].iloc[i],
        y1=invest_pivot[q26_col].iloc[i],
        line_color="#cccccc"
    )

# One marker trace per segment; only the colour and the LESS/MORE wording differ.
for segment, marker_color, comparison in (('Less than 70K', "#DEBAE6", 'LESS'),
                                          ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=invest_pivot[segment],
            y=invest_pivot[q26_col],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * invest_pivot.shape[0],
                        color=[marker_color] * invest_pivot.shape[0]),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + comparison + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(height=480,
                  width=1200,
                  title_text="Q26|Approximately how much money have you (or your team) spent on machine learning and/or cloud computing services at home (or at work) in the past 5 years (approximate $USD)?",
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None)

fig.add_annotation(text="The more you invest, the higher chance you get a higher pay.",
                   xref="paper", yref="paper",
                   x=0.9, y=0.95, showarrow=False)

fig.add_annotation(text="Most people pay nothing",
                   xref="paper", yref="paper",
                   x=0.6, y=0.15, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q41: What is the primary tool that you use at work or school to analyze data?
# Share (%) of each primary tool within each compensation segment, drawn as a dumbbell chart.

# Full column name of the single-choice Q41 answer; bound once because it is used throughout the cell.
q41_col = 'Q41|What is the primary tool that you use at work or school to analyze data? (Include text response) - Selected Choice'

# Respondents per (segment, primary tool). Only respondent_id is counted,
# rather than counting every column of the wide frame and discarding the rest.
tool_primary = (
    tool_product_data
    .groupby(['compensation_segment', q41_col])['respondent_id']
    .count()
    .reset_index()
)

# Denominator: respondents of the segment that appear in the grouped counts above.
tool_primary['nr_respondents_segment'] = tool_primary.groupby('compensation_segment', sort=False)["respondent_id"].transform('sum')
tool_primary['share_of_total'] = round(100 * tool_primary['respondent_id'] / tool_primary['nr_respondents_segment'], 1)

# One row per tool, one column per segment; a tool missing in a segment counts as 0 %.
tool_primary_pivot = tool_primary.pivot(index=q41_col,
                                        columns='compensation_segment',
                                        values='share_of_total').reset_index()
tool_primary_pivot = tool_primary_pivot.fillna(0)
tool_primary_pivot['diff'] = tool_primary_pivot['More than 70K'] - tool_primary_pivot['Less than 70K']
tool_primary_pivot = tool_primary_pivot.sort_values(by='diff')

# visualization - primary tool
fig = go.Figure()

# Grey connector between the two segment shares of each tool.
for i in range(tool_primary_pivot.shape[0]):
    fig.add_shape(
        type='line',
        x0=tool_primary_pivot['Less than 70K'].iloc[i],
        y0=tool_primary_pivot[q41_col].iloc[i],
        x1=tool_primary_pivot['More than 70K'].iloc[i],
        y1=tool_primary_pivot[q41_col].iloc[i],
        line_color="#cccccc"
    )

# One marker trace per segment; only the colour and the LESS/MORE wording differ.
for segment, marker_color, comparison in (('Less than 70K', "#DEBAE6", 'LESS'),
                                          ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=tool_primary_pivot[segment],
            y=tool_primary_pivot[q41_col],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * tool_primary_pivot.shape[0],
                        color=[marker_color] * tool_primary_pivot.shape[0]),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + comparison + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(height=480,
                  width=1200,
                  title_text="Q41|What is the primary tool that you use at work or school to analyze data?",
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None)

fig.add_annotation(text="Local development software is needed.",
                   xref="paper", yref="paper",
                   x=0.3, y=0.95, showarrow=False)

fig.add_annotation(text="Basic statistical software is popular, but everyone seems know how to use it",
                   xref="paper", yref="paper",
                   x=0.6, y=0.15, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q9: Which of the following integrated development environments (IDE's) do you use on a regular basis?
# For each compensation segment, compute the share (%) of respondents with a non-null
# value in every Q9 answer column, then compare the two segments in a dumbbell chart.

# Segment + respondent id (the denominator) + every Q9 answer column.
Q9 = ['compensation_segment', 'respondent_id'] + [
    col for col in tool_product_data.columns if 'Q9' in col
]

Q9_compensation = survey_compensation[Q9]
# count() skips NaN, so each answer column becomes the number of non-null responses per segment
# (presumably a non-null cell means the option was selected -- verify against the raw data).
Q9_compensation_count = Q9_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q9_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q9' in col:
        # maxsplit=2 keeps the full answer label even when the label itself contains ' - '.
        new_col = col.split(' - ', 2)[2]
        new_cols.append(new_col)
        Q9_compensation_count[new_col] = round(
            100 * Q9_compensation_count[col] / Q9_compensation_count['respondent_id'], 1
        )
new_cols.append('compensation_segment')
Q9_compensation_count_share = Q9_compensation_count[new_cols].T.reset_index()

# After the transpose the columns are the positional row labels of the grouped frame.
# Map every position to the segment stored in that row instead of assuming that
# row 0 is 'Less than 70K' and row 1 is 'More than 70K'.
segment_labels = dict(enumerate(Q9_compensation_count['compensation_segment']))
Q9_compensation_count_share_pivot = (
    Q9_compensation_count_share
    .loc[Q9_compensation_count_share['index'] != 'compensation_segment']
    .rename(columns={'index': 'IDE', **segment_labels})
    .fillna(0)
)

# Sort by the gap between the segments so the chart is ordered by how much
# more (or less) often the higher-paid segment picked each option.
Q9_compensation_count_share_pivot['diff'] = Q9_compensation_count_share_pivot['More than 70K'] - Q9_compensation_count_share_pivot['Less than 70K']
Q9_compensation_count_share_pivot = Q9_compensation_count_share_pivot.sort_values(by='diff')

# visualization - IDE
fig = go.Figure()

# Grey connector between the two segment shares of each answer option.
for i in range(Q9_compensation_count_share_pivot.shape[0]):
    fig.add_shape(
        type='line',
        x0=Q9_compensation_count_share_pivot['Less than 70K'].iloc[i],
        y0=Q9_compensation_count_share_pivot['IDE'].iloc[i],
        x1=Q9_compensation_count_share_pivot['More than 70K'].iloc[i],
        y1=Q9_compensation_count_share_pivot['IDE'].iloc[i],
        line_color="#cccccc"
    )

# One marker trace per segment; only the colour and the LESS/MORE wording differ.
for segment, marker_color, comparison in (('Less than 70K', "#DEBAE6", 'LESS'),
                                          ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=Q9_compensation_count_share_pivot[segment],
            y=Q9_compensation_count_share_pivot['IDE'],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * Q9_compensation_count_share_pivot.shape[0],
                        color=[marker_color] * Q9_compensation_count_share_pivot.shape[0]),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + comparison + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(height=480,
                  width=1200,
                  title_text="Which of the following integrated development environments (IDE's) do you use on a regular basis? ",
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None)

fig.add_annotation(text="Vim/Emacs, Jupyter Notebook and Rstudio",
                   xref="paper", yref="paper",
                   x=0.6, y=0.85, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q10: Which of the following hosted notebook products do you use on a regular basis? (Select all that apply)
# For each compensation segment, compute the share (%) of respondents with a non-null
# value in every Q10 answer column, then compare the two segments in a dumbbell chart.

# Segment + respondent id (the denominator) + every Q10 answer column.
Q10 = ['compensation_segment', 'respondent_id'] + [
    col for col in tool_product_data.columns if 'Q10' in col
]

Q10_compensation = survey_compensation[Q10]
# count() skips NaN, so each answer column becomes the number of non-null responses per segment
# (presumably a non-null cell means the option was selected -- verify against the raw data).
Q10_compensation_count = Q10_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q10_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q10' in col:
        # maxsplit=2 keeps the full answer label even when the label itself contains ' - '.
        new_col = col.split(' - ', 2)[2]
        new_cols.append(new_col)
        Q10_compensation_count[new_col] = round(
            100 * Q10_compensation_count[col] / Q10_compensation_count['respondent_id'], 1
        )
new_cols.append('compensation_segment')
Q10_compensation_count_share = Q10_compensation_count[new_cols].T.reset_index()

# After the transpose the columns are the positional row labels of the grouped frame.
# Map every position to the segment stored in that row instead of assuming that
# row 0 is 'Less than 70K' and row 1 is 'More than 70K'.
segment_labels = dict(enumerate(Q10_compensation_count['compensation_segment']))
Q10_compensation_count_share_pivot = (
    Q10_compensation_count_share
    .loc[Q10_compensation_count_share['index'] != 'compensation_segment']
    .rename(columns={'index': 'hostedNotebook', **segment_labels})
    .fillna(0)
)

# Sort by the gap between the segments so the chart is ordered by how much
# more (or less) often the higher-paid segment picked each option.
Q10_compensation_count_share_pivot['diff'] = Q10_compensation_count_share_pivot['More than 70K'] - Q10_compensation_count_share_pivot['Less than 70K']
Q10_compensation_count_share_pivot = Q10_compensation_count_share_pivot.sort_values(by='diff')

# visualization - hosted notebook
fig = go.Figure()

# Grey connector between the two segment shares of each answer option.
for i in range(Q10_compensation_count_share_pivot.shape[0]):
    fig.add_shape(
        type='line',
        x0=Q10_compensation_count_share_pivot['Less than 70K'].iloc[i],
        y0=Q10_compensation_count_share_pivot['hostedNotebook'].iloc[i],
        x1=Q10_compensation_count_share_pivot['More than 70K'].iloc[i],
        y1=Q10_compensation_count_share_pivot['hostedNotebook'].iloc[i],
        line_color="#cccccc"
    )

# One marker trace per segment; only the colour and the LESS/MORE wording differ.
for segment, marker_color, comparison in (('Less than 70K', "#DEBAE6", 'LESS'),
                                          ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=Q10_compensation_count_share_pivot[segment],
            y=Q10_compensation_count_share_pivot['hostedNotebook'],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * Q10_compensation_count_share_pivot.shape[0],
                        color=[marker_color] * Q10_compensation_count_share_pivot.shape[0]),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + comparison + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(height=480,
                  width=1200,
                  title_text="Q10_Part_1|Which of the following hosted notebook products do you use on a regular basis?  (Select all that apply)",
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None)

fig.add_annotation(text="Why chose none gaining more paid?",
                   xref="paper", yref="paper",
                   x=0.8, y=0.9, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q11|What type of computing platform do you use most often for your data science projects? - Selected Choice
# Percentage of each compensation segment per most-used computing platform,
# drawn as a dumbbell chart (one row per platform, one marker per segment).

q11_column = 'Q11|What type of computing platform do you use most often for your data science projects? - Selected Choice'
q11_group_keys = ['compensation_segment', q11_column]

# Non-null respondent_id count for every (segment, platform) pair.
computingPlatform = (
    tool_product_data
    .groupby(q11_group_keys)
    .count()
    .reset_index()[q11_group_keys + ['respondent_id']]
)

# Segment totals, then each platform's percentage of its own segment (one decimal).
computingPlatform['nr_respondents_segment'] = (
    computingPlatform
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
computingPlatform['share_of_total'] = (
    computingPlatform['respondent_id']
    .mul(100)
    .div(computingPlatform['nr_respondents_segment'])
    .round(1)
)

# Wide layout: one row per platform, one column per segment; a platform missing
# from a segment counts as 0%.
computingPlatform_pivot = (
    computingPlatform
    .pivot(index=q11_column, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
)
computingPlatform_pivot['diff'] = (
    computingPlatform_pivot['More than 70K'] - computingPlatform_pivot['Less than 70K']
)
computingPlatform_pivot = computingPlatform_pivot.sort_values(by='diff')

# visualization - computing platform
fig = go.Figure()

q11_labels = computingPlatform_pivot[q11_column]
q11_less = computingPlatform_pivot['Less than 70K']
q11_more = computingPlatform_pivot['More than 70K']
q11_rows = computingPlatform_pivot.shape[0]

# Grey connector between the two segment shares of every platform.
for row in range(q11_rows):
    fig.add_shape(
        type='line',
        x0=q11_less.iloc[row],
        y0=q11_labels.iloc[row],
        x1=q11_more.iloc[row],
        y1=q11_labels.iloc[row],
        line_color="#cccccc",
    )

# One marker trace per segment: (segment column / legend name, colour, hover text).
q11_traces = (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
)
for segment, colour, hover in q11_traces:
    fig.add_trace(
        go.Scatter(
            x=computingPlatform_pivot[segment],
            y=q11_labels,
            mode='markers',
            name=segment,
            marker=dict(size=[10] * q11_rows, color=[colour] * q11_rows),
            hovertemplate=hover,
        )
    )

fig.update_layout(
    height=480,
    width=1200,
    title_text="Q11|What type of computing platform do you use most often for your data science projects? - Selected Choice",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.add_annotation(
    text="A cloud computing platform is a must",
    xref="paper", yref="paper",
    x=0.9, y=0.95, showarrow=False,
)
fig.add_annotation(
    text="A laptop is popular, but it also <br>takes up a large share of users earn in the bottom 80%.",
    xref="paper", yref="paper",
    x=0.6, y=0.1, showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# Q12_Part_1|Which types of specialized hardware do you use on a regular basis?  (Select all that apply) - Selected Choice -  NVIDIA GPUs 
# For every Q12 option, compute the percentage of each compensation segment with
# a non-null answer, then draw both segments as a dumbbell chart.

# Identifier columns first, followed by every column whose name contains 'Q12'.
Q12 = ['compensation_segment', 'respondent_id'] + [col for col in tool_product_data.columns if 'Q12' in col]

Q12_compensation = survey_compensation[Q12]
# count() tallies the non-null cells of each column within a segment.
# NOTE(review): presumably a non-null Q12 cell means the option was selected and
# respondent_id is never null (so its count is the segment size) - confirm upstream.
Q12_compensation_count = Q12_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q12_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q12' in col:
        # Column names have the form "<question> - Selected Choice - <option>"
        # (see the header comment); the third piece becomes the display label.
        new_col = col.split(' - ')[2]
        new_cols.append(new_col)
        Q12_compensation_count[new_col] = round(100 * Q12_compensation_count[col] / Q12_compensation_count['respondent_id'], 1)

# Index by segment BEFORE transposing: the transposed columns are then labelled
# with the segment names themselves (no positional 0/1 rename that depends on
# group order) and the shares stay numeric because no string row is mixed in.
Q12_compensation_count_share = (
    Q12_compensation_count
    .set_index('compensation_segment')[new_cols]
    .T
    .reset_index()
)
Q12_compensation_count_share_pivot = (
    Q12_compensation_count_share
    .rename(columns={'index': 'hardware'})
    .fillna(0)
)

# Sort rows by the gap between the segments so the chart reads bottom-to-top.
Q12_compensation_count_share_pivot['diff'] = Q12_compensation_count_share_pivot['More than 70K'] - Q12_compensation_count_share_pivot['Less than 70K']
Q12_compensation_count_share_pivot = Q12_compensation_count_share_pivot.sort_values(by='diff')

# visualization - hardware
fig = go.Figure()

q12_rows = Q12_compensation_count_share_pivot.shape[0]

# Grey connector between the two segment shares of every option.
for i in range(q12_rows):
    fig.add_shape(
        type='line',
        x0=Q12_compensation_count_share_pivot['Less than 70K'].iloc[i],
        y0=Q12_compensation_count_share_pivot['hardware'].iloc[i],
        x1=Q12_compensation_count_share_pivot['More than 70K'].iloc[i],
        y1=Q12_compensation_count_share_pivot['hardware'].iloc[i],
        line_color="#cccccc"
    )

# One marker trace per segment: (segment column / legend name, colour, hover text).
q12_traces = (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
)
for segment, colour, hover in q12_traces:
    fig.add_trace(
        go.Scatter(
            x=Q12_compensation_count_share_pivot[segment],
            y=Q12_compensation_count_share_pivot['hardware'],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * q12_rows, color=[colour] * q12_rows),
            hovertemplate=hover
        )
    )

fig.update_layout(height=480,
                  width=1200,
                  title_text="Q12_Part_1|Which types of specialized hardware do you use on a regular basis?  (Select all that apply)",
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None)
fig.add_annotation(text="GPU is more popular or maybe enough for you earn a lot?",
                   xref="paper", yref="paper",
                   x=0.8, y=0.9, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q13|Approximately how many times have you used a TPU (tensor processing unit)?
# Percentage of each compensation segment per TPU-usage bucket, drawn as a
# dumbbell chart with the buckets in their natural (ordinal) order.

q13_column = 'Q13|Approximately how many times have you used a TPU (tensor processing unit)?'

# Non-null respondent_id count for every (segment, usage bucket) pair.
TPUtimes = tool_product_data.groupby(['compensation_segment', q13_column]).count().reset_index()[
    ['compensation_segment', q13_column, 'respondent_id']
]

# Segment totals, then each bucket's percentage of its own segment (one decimal).
TPUtimes['nr_respondents_segment'] = TPUtimes.groupby('compensation_segment', sort=False)["respondent_id"].transform('sum')
TPUtimes['share_of_total'] = round(100 * TPUtimes['respondent_id'] / TPUtimes['nr_respondents_segment'], 1)

# Wide layout: one row per bucket, one column per segment; a bucket missing from
# a segment counts as 0%.
TPUtimes_pivot = TPUtimes.pivot(index=q13_column,
                                columns='compensation_segment',
                                values='share_of_total').reset_index()
TPUtimes_pivot = TPUtimes_pivot.fillna(0)

# Ordered categorical so sorting follows usage frequency instead of the alphabet.
# Answers not listed in `categories` become NaN.
TPUtimes_pivot[q13_column] = pd.Categorical(TPUtimes_pivot[q13_column],
                                            categories=['Never',
                                                        'Once',
                                                        "2-5 times",
                                                        "6-25 times",
                                                        "More than 25 times"],
                                            ordered=True)
TPUtimes_pivot = TPUtimes_pivot.sort_values(by=q13_column)


# visualization - times of using TPU?
fig = go.Figure()

# Every per-point array below is sized from the pivot (one entry per plotted row).
# The second trace previously used the long-format TPUtimes row count, which did
# not match the number of markers.
q13_rows = TPUtimes_pivot.shape[0]

# Grey connector between the two segment shares of every bucket.
for i in range(q13_rows):
    fig.add_shape(
        type='line',
        x0=TPUtimes_pivot['Less than 70K'].iloc[i],
        y0=TPUtimes_pivot[q13_column].iloc[i],
        x1=TPUtimes_pivot['More than 70K'].iloc[i],
        y1=TPUtimes_pivot[q13_column].iloc[i],
        line_color="#cccccc"
    )
fig.add_trace(
        go.Scatter(
            x=TPUtimes_pivot['Less than 70K'],
            y=TPUtimes_pivot[q13_column],
            mode='markers',
            name='Less than 70K',
            marker=dict(size=[10] * q13_rows,
                color=["#DEBAE6"] * q13_rows),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.add_trace(
        go.Scatter(
            x=TPUtimes_pivot['More than 70K'],
            y=TPUtimes_pivot[q13_column],
            mode='markers',
            name='More than 70K',
            marker=dict(size=[10] * q13_rows,
                color=["#C54DFD"] * q13_rows),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(height=480,
                  width=1200,
                  title_text="Q13|Approximately how many times have you used a TPU (tensor processing unit)?",
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None)


fig.add_annotation(text="The more times you used a TPU, more chance you'll get higher pay.",
                   xref="paper", yref="paper",
                   x=0.2, y=0.95, showarrow=False)


fig.add_annotation(text="Most people never used it.",
                   xref="paper", yref="paper",
                   x=0.9, y=0.15, showarrow=False)



fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q14_Part_1|What data visualization libraries or tools do you use on a regular basis?  (Select all that apply)
# For every Q14 option, compute the percentage of each compensation segment with
# a non-null answer, then draw both segments as a dumbbell chart.

# Identifier columns first, followed by every column whose name contains 'Q14'.
Q14 = ['compensation_segment', 'respondent_id'] + [col for col in tool_product_data.columns if 'Q14' in col]

Q14_compensation = survey_compensation[Q14]
# count() tallies the non-null cells of each column within a segment.
# NOTE(review): presumably a non-null Q14 cell means the option was selected and
# respondent_id is never null (so its count is the segment size) - confirm upstream.
Q14_compensation_count = Q14_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q14_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q14' in col:
        # The third ' - '-separated piece of the column name is used as the label.
        # NOTE(review): assumes names look like "<question> - Selected Choice - <option>".
        new_col = col.split(' - ')[2]
        new_cols.append(new_col)
        Q14_compensation_count[new_col] = round(100 * Q14_compensation_count[col] / Q14_compensation_count['respondent_id'], 1)

# Index by segment BEFORE transposing: the transposed columns are then labelled
# with the segment names themselves (no positional 0/1 rename that depends on
# group order) and the shares stay numeric because no string row is mixed in.
Q14_compensation_count_share = (
    Q14_compensation_count
    .set_index('compensation_segment')[new_cols]
    .T
    .reset_index()
)
Q14_compensation_count_share_pivot = (
    Q14_compensation_count_share
    .rename(columns={'index': 'dataVis'})
    .fillna(0)
)

# Sort rows by the gap between the segments so the chart reads bottom-to-top.
Q14_compensation_count_share_pivot['diff'] = Q14_compensation_count_share_pivot['More than 70K'] - Q14_compensation_count_share_pivot['Less than 70K']
Q14_compensation_count_share_pivot = Q14_compensation_count_share_pivot.sort_values(by='diff')

# visualization - data Vis
fig = go.Figure()

q14_rows = Q14_compensation_count_share_pivot.shape[0]

# Grey connector between the two segment shares of every option.
for i in range(q14_rows):
    fig.add_shape(
        type='line',
        x0=Q14_compensation_count_share_pivot['Less than 70K'].iloc[i],
        y0=Q14_compensation_count_share_pivot['dataVis'].iloc[i],
        x1=Q14_compensation_count_share_pivot['More than 70K'].iloc[i],
        y1=Q14_compensation_count_share_pivot['dataVis'].iloc[i],
        line_color="#cccccc"
    )

# One marker trace per segment: (segment column / legend name, colour, hover text).
q14_traces = (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
)
for segment, colour, hover in q14_traces:
    fig.add_trace(
        go.Scatter(
            x=Q14_compensation_count_share_pivot[segment],
            y=Q14_compensation_count_share_pivot['dataVis'],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * q14_rows, color=[colour] * q14_rows),
            hovertemplate=hover
        )
    )

fig.update_layout(height=480,
                  width=1200,
                  title_text="Q14_Part_1|What data visualization libraries or tools do you use on a regular basis? ",
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None)
fig.add_annotation(text="GGplot or Plotly/Plotly Express",
                   xref="paper", yref="paper",
                   x=0.9, y=0.9, showarrow=False)

fig.add_annotation(text="Matplotlib and Seaborn are popular but not grant you higher pay.",
                   xref="paper", yref="paper",
                   x=0.9, y=0.3, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q27_A_Part_1|Which of the following cloud computing platforms do you use on a regular basis? (Select all that apply) 
# For every Q27 option, compute the percentage of each compensation segment with
# a non-null answer, then draw both segments as a dumbbell chart.

# Identifier columns first, followed by every column whose name contains 'Q27'.
# NOTE(review): the 'Q27' substring would also match Q27_B_* columns; if
# tool_product_data contains them, their option labels would collide with the
# Q27_A ones below - confirm which columns are present.
Q27 = ['compensation_segment', 'respondent_id'] + [col for col in tool_product_data.columns if 'Q27' in col]

Q27_compensation = survey_compensation[Q27]
# count() tallies the non-null cells of each column within a segment.
# NOTE(review): presumably a non-null Q27 cell means the option was selected and
# respondent_id is never null (so its count is the segment size) - confirm upstream.
Q27_compensation_count = Q27_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q27_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q27' in col:
        # The third ' - '-separated piece of the column name is used as the label.
        # NOTE(review): assumes names look like "<question> - Selected Choice - <option>".
        new_col = col.split(' - ')[2]
        new_cols.append(new_col)
        Q27_compensation_count[new_col] = round(100 * Q27_compensation_count[col] / Q27_compensation_count['respondent_id'], 1)

# Index by segment BEFORE transposing: the transposed columns are then labelled
# with the segment names themselves (no positional 0/1 rename that depends on
# group order) and the shares stay numeric because no string row is mixed in.
Q27_compensation_count_share = (
    Q27_compensation_count
    .set_index('compensation_segment')[new_cols]
    .T
    .reset_index()
)
Q27_compensation_count_share_pivot = (
    Q27_compensation_count_share
    .rename(columns={'index': 'CloudComputePlatform'})
    .fillna(0)
)

# Sort rows by the gap between the segments so the chart reads bottom-to-top.
Q27_compensation_count_share_pivot['diff'] = Q27_compensation_count_share_pivot['More than 70K'] - Q27_compensation_count_share_pivot['Less than 70K']
Q27_compensation_count_share_pivot = Q27_compensation_count_share_pivot.sort_values(by='diff')

# visualization - cloud computing platform
fig = go.Figure()

q27_rows = Q27_compensation_count_share_pivot.shape[0]

# Grey connector between the two segment shares of every option.
for i in range(q27_rows):
    fig.add_shape(
        type='line',
        x0=Q27_compensation_count_share_pivot['Less than 70K'].iloc[i],
        y0=Q27_compensation_count_share_pivot['CloudComputePlatform'].iloc[i],
        x1=Q27_compensation_count_share_pivot['More than 70K'].iloc[i],
        y1=Q27_compensation_count_share_pivot['CloudComputePlatform'].iloc[i],
        line_color="#cccccc"
    )

# One marker trace per segment: (segment column / legend name, colour, hover text).
q27_traces = (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
)
for segment, colour, hover in q27_traces:
    fig.add_trace(
        go.Scatter(
            x=Q27_compensation_count_share_pivot[segment],
            y=Q27_compensation_count_share_pivot['CloudComputePlatform'],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * q27_rows, color=[colour] * q27_rows),
            hovertemplate=hover
        )
    )

fig.update_layout(height=480,
                  width=1200,
                  title_text="Q27_A_Part_1|Which of the following cloud computing platforms do you use on a regular basis? (Select all that apply)",
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None)
fig.add_annotation(text="AWS / Microsoft Azure / Google Cloud Platform",
                   xref="paper", yref="paper",
                   x=0.9, y=0.9, showarrow=False)


fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q28|Of the cloud platforms that you are familiar with, which has the best developer experience (most enjoyable to use)? - Selected Choice
# Percentage of each compensation segment per preferred cloud platform, drawn as
# a dumbbell chart (one row per platform, one marker per segment).

q28_column = 'Q28|Of the cloud platforms that you are familiar with, which has the best developer experience (most enjoyable to use)? - Selected Choice'
q28_group_keys = ['compensation_segment', q28_column]

# Non-null respondent_id count for every (segment, platform) pair.
cloudPlatformExp = (
    tool_product_data
    .groupby(q28_group_keys)
    .count()
    .reset_index()[q28_group_keys + ['respondent_id']]
)

# Segment totals, then each platform's percentage of its own segment (one decimal).
cloudPlatformExp['nr_respondents_segment'] = (
    cloudPlatformExp
    .groupby('compensation_segment', sort=False)["respondent_id"]
    .transform('sum')
)
cloudPlatformExp['share_of_total'] = (
    cloudPlatformExp['respondent_id']
    .mul(100)
    .div(cloudPlatformExp['nr_respondents_segment'])
    .round(1)
)

# Wide layout: one row per platform, one column per segment; a platform missing
# from a segment counts as 0%.
cloudPlatformExp_pivot = (
    cloudPlatformExp
    .pivot(index=q28_column, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
)
cloudPlatformExp_pivot['diff'] = (
    cloudPlatformExp_pivot['More than 70K'] - cloudPlatformExp_pivot['Less than 70K']
)
cloudPlatformExp_pivot = cloudPlatformExp_pivot.sort_values(by='diff')

# visualization - cloudPlatformExp
fig = go.Figure()

q28_labels = cloudPlatformExp_pivot[q28_column]
q28_less = cloudPlatformExp_pivot['Less than 70K']
q28_more = cloudPlatformExp_pivot['More than 70K']
q28_rows = cloudPlatformExp_pivot.shape[0]

# Grey connector between the two segment shares of every platform.
for row in range(q28_rows):
    fig.add_shape(
        type='line',
        x0=q28_less.iloc[row],
        y0=q28_labels.iloc[row],
        x1=q28_more.iloc[row],
        y1=q28_labels.iloc[row],
        line_color="#cccccc",
    )

# One marker trace per segment: (segment column / legend name, colour, hover text).
q28_traces = (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
)
for segment, colour, hover in q28_traces:
    fig.add_trace(
        go.Scatter(
            x=cloudPlatformExp_pivot[segment],
            y=q28_labels,
            mode='markers',
            name=segment,
            marker=dict(size=[10] * q28_rows, color=[colour] * q28_rows),
            hovertemplate=hover,
        )
    )

fig.update_layout(
    height=480,
    width=1200,
    title_text="Q28|Of the cloud platforms that you are familiar with, which has the best developer experience (most enjoyable to use)? - Selected Choice",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.add_annotation(
    text="High comment on AWS explains its popularity.",
    xref="paper", yref="paper",
    x=0.6, y=0.95, showarrow=False,
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# Q29_A_Part_1|Do you use any of the following cloud computing products on a regular basis? (Select all that apply) - Selected Choice -  Amazon Elastic Compute Cloud (EC2) 
# For every Q29 option, compute the percentage of each compensation segment with
# a non-null answer, then draw both segments as a dumbbell chart.

# Identifier columns first, followed by every column whose name contains 'Q29'.
# NOTE(review): the 'Q29' substring would also match Q29_B_* columns; if
# tool_product_data contains them, their option labels would collide with the
# Q29_A ones below - confirm which columns are present.
Q29 = ['compensation_segment', 'respondent_id'] + [col for col in tool_product_data.columns if 'Q29' in col]

Q29_compensation = survey_compensation[Q29]
# count() tallies the non-null cells of each column within a segment.
# NOTE(review): presumably a non-null Q29 cell means the option was selected and
# respondent_id is never null (so its count is the segment size) - confirm upstream.
Q29_compensation_count = Q29_compensation.groupby(['compensation_segment']).count().reset_index()
cols = Q29_compensation_count.columns
new_cols = []
for col in cols:
    if 'Q29' in col:
        # Column names have the form "<question> - Selected Choice - <option>"
        # (see the header comment); the third piece becomes the display label.
        new_col = col.split(' - ')[2]
        new_cols.append(new_col)
        Q29_compensation_count[new_col] = round(100 * Q29_compensation_count[col] / Q29_compensation_count['respondent_id'], 1)

# Index by segment BEFORE transposing: the transposed columns are then labelled
# with the segment names themselves (no positional 0/1 rename that depends on
# group order) and the shares stay numeric because no string row is mixed in.
Q29_compensation_count_share = (
    Q29_compensation_count
    .set_index('compensation_segment')[new_cols]
    .T
    .reset_index()
)
Q29_compensation_count_share_pivot = (
    Q29_compensation_count_share
    .rename(columns={'index': 'CloudCompProd'})
    .fillna(0)
)

# Sort rows by the gap between the segments so the chart reads bottom-to-top.
Q29_compensation_count_share_pivot['diff'] = Q29_compensation_count_share_pivot['More than 70K'] - Q29_compensation_count_share_pivot['Less than 70K']
Q29_compensation_count_share_pivot = Q29_compensation_count_share_pivot.sort_values(by='diff')

# visualization - CloudCompProd
fig = go.Figure()

q29_rows = Q29_compensation_count_share_pivot.shape[0]

# Grey connector between the two segment shares of every option.
for i in range(q29_rows):
    fig.add_shape(
        type='line',
        x0=Q29_compensation_count_share_pivot['Less than 70K'].iloc[i],
        y0=Q29_compensation_count_share_pivot['CloudCompProd'].iloc[i],
        x1=Q29_compensation_count_share_pivot['More than 70K'].iloc[i],
        y1=Q29_compensation_count_share_pivot['CloudCompProd'].iloc[i],
        line_color="#cccccc"
    )

# One marker trace per segment: (segment column / legend name, colour, hover text).
q29_traces = (
    ('Less than 70K', "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K', "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
)
for segment, colour, hover in q29_traces:
    fig.add_trace(
        go.Scatter(
            x=Q29_compensation_count_share_pivot[segment],
            y=Q29_compensation_count_share_pivot['CloudCompProd'],
            mode='markers',
            name=segment,
            marker=dict(size=[10] * q29_rows, color=[colour] * q29_rows),
            hovertemplate=hover
        )
    )

fig.update_layout(height=480,
                  width=1200,
                  title_text="Q29_A_Part_1|Do you use any of the following cloud computing products on a regular basis? (Select all that apply)",
                  xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
                  plot_bgcolor='rgba(0,0,0,0)')

fig.update_yaxes(title=None)
fig.add_annotation(text="Amazon Elastic Compute Cloud",
                   xref="paper", yref="paper",
                   x=0.9, y=0.9, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()

In [None]:
# Q30_A|Do you use any of the following data storage products on a regular basis? (Select all that apply)
# Multi-select question: every answer option has its own column, named like
# "<question> - Selected Choice - <option>" (e.g. "... - Selected Choice - Microsoft Azure Data Lake Storage").
q30_option_cols = [col for col in tool_product_data.columns if 'Q30' in col]
Q30 = ['compensation_segment', 'respondent_id'] + q30_option_cols

Q30_compensation = survey_compensation[Q30]
# count() only counts non-null cells: `respondent_id` serves as the segment size and each
# option column as the number of respondents in the segment with an answer in that column.
Q30_compensation_count = Q30_compensation.groupby('compensation_segment').count()
# Plain string labels, so the segments become ordinary column names after the transpose below.
Q30_compensation_count.index = Q30_compensation_count.index.astype(str)

# Percentage of each segment per option. The segment stays in the index (instead of being
# transposed as a data row), so the share columns keep a numeric dtype and are addressed by
# their real segment label rather than by position.
Q30_compensation_count_share = (
    (100 * Q30_compensation_count[q30_option_cols])
    .div(Q30_compensation_count['respondent_id'], axis=0)
    .round(1)
    # keep only the option text; strip stray whitespace so the axis labels are clean
    .rename(columns=lambda col: col.split(' - ')[2].strip())
    .T
    .rename_axis('dataStorageProd')
    .reset_index()
)

Q30_compensation_count_share_pivot = Q30_compensation_count_share.fillna(0)
# Positive diff = option is more common among respondents earning more than 70K.
Q30_compensation_count_share_pivot['diff'] = (
    Q30_compensation_count_share_pivot['More than 70K']
    - Q30_compensation_count_share_pivot['Less than 70K']
)
Q30_compensation_count_share_pivot = Q30_compensation_count_share_pivot.sort_values(by='diff')

# visualization - dataStorageProd (one row per product: two markers joined by a grey line)
fig = go.Figure()

q30_products = Q30_compensation_count_share_pivot['dataStorageProd']

# Connector between the two segment shares of every product.
for y_label, x_less, x_more in zip(q30_products,
                                   Q30_compensation_count_share_pivot['Less than 70K'],
                                   Q30_compensation_count_share_pivot['More than 70K']):
    fig.add_shape(type='line', x0=x_less, y0=y_label, x1=x_more, y1=y_label, line_color="#cccccc")

# One marker trace per compensation segment; scalar size/colour apply to every marker.
for seg_name, seg_color, seg_word in (('Less than 70K', "#DEBAE6", 'LESS'),
                                      ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=Q30_compensation_count_share_pivot[seg_name],
            y=q30_products,
            mode='markers',
            name=seg_name,
            marker=dict(size=10, color=seg_color),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + seg_word
                          + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(
    height=480,
    width=1200,
    title_text="Q30_A_Part_1|Do you use any of the following data storage products on a regular basis?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.add_annotation(text="Amazon S3 is popular and earns most.",
                   xref="paper", yref="paper",
                   x=0.9, y=0.9, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# Q31_A|Do you use any of the following managed machine learning products on a regular basis? (Select all that apply)
# Multi-select question: every answer option has its own column, named like
# "<question> - Selected Choice - <option>" (e.g. "... - Selected Choice -  Google Cloud Vertex AI").
q31_option_cols = [col for col in tool_product_data.columns if 'Q31' in col]
Q31 = ['compensation_segment', 'respondent_id'] + q31_option_cols

Q31_compensation = survey_compensation[Q31]
# count() only counts non-null cells: `respondent_id` serves as the segment size and each
# option column as the number of respondents in the segment with an answer in that column.
Q31_compensation_count = Q31_compensation.groupby('compensation_segment').count()
# Plain string labels, so the segments become ordinary column names after the transpose below.
Q31_compensation_count.index = Q31_compensation_count.index.astype(str)

# Percentage of each segment per option. The segment stays in the index (instead of being
# transposed as a data row), so the share columns keep a numeric dtype and are addressed by
# their real segment label rather than by position.
Q31_compensation_count_share = (
    (100 * Q31_compensation_count[q31_option_cols])
    .div(Q31_compensation_count['respondent_id'], axis=0)
    .round(1)
    # keep only the option text; strip stray whitespace so the axis labels are clean
    .rename(columns=lambda col: col.split(' - ')[2].strip())
    .T
    .rename_axis('MLProd')
    .reset_index()
)

Q31_compensation_count_share_pivot = Q31_compensation_count_share.fillna(0)
# Positive diff = option is more common among respondents earning more than 70K.
Q31_compensation_count_share_pivot['diff'] = (
    Q31_compensation_count_share_pivot['More than 70K']
    - Q31_compensation_count_share_pivot['Less than 70K']
)
Q31_compensation_count_share_pivot = Q31_compensation_count_share_pivot.sort_values(by='diff')

# visualization - MLProd (one row per product: two markers joined by a grey line)
fig = go.Figure()

q31_products = Q31_compensation_count_share_pivot['MLProd']

# Connector between the two segment shares of every product.
for y_label, x_less, x_more in zip(q31_products,
                                   Q31_compensation_count_share_pivot['Less than 70K'],
                                   Q31_compensation_count_share_pivot['More than 70K']):
    fig.add_shape(type='line', x0=x_less, y0=y_label, x1=x_more, y1=y_label, line_color="#cccccc")

# One marker trace per compensation segment; scalar size/colour apply to every marker.
for seg_name, seg_color, seg_word in (('Less than 70K', "#DEBAE6", 'LESS'),
                                      ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=Q31_compensation_count_share_pivot[seg_name],
            y=q31_products,
            mode='markers',
            name=seg_name,
            marker=dict(size=10, color=seg_color),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + seg_word
                          + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(
    height=480,
    width=1200,
    title_text="Q31_A_Part_3|Do you use any of the following managed machine learning products on a regular basis?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.add_annotation(text="Not many people use it.",
                   xref="paper", yref="paper",
                   x=1, y=0.9, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# Q32_A|Which of the following big data products (relational databases, data warehouses, data lakes, or similar)
# do you use on a regular basis? (Select all that apply)
# Multi-select question: every answer option has its own column, named like
# "<question> - Selected Choice - <option>".
q32_option_cols = [col for col in tool_product_data.columns if 'Q32' in col]
Q32 = ['compensation_segment', 'respondent_id'] + q32_option_cols

Q32_compensation = survey_compensation[Q32]
# count() only counts non-null cells: `respondent_id` serves as the segment size and each
# option column as the number of respondents in the segment with an answer in that column.
Q32_compensation_count = Q32_compensation.groupby('compensation_segment').count()
# Plain string labels, so the segments become ordinary column names after the transpose below.
Q32_compensation_count.index = Q32_compensation_count.index.astype(str)

# Percentage of each segment per option. The segment stays in the index (instead of being
# transposed as a data row), so the share columns keep a numeric dtype and are addressed by
# their real segment label rather than by position.
Q32_compensation_count_share = (
    (100 * Q32_compensation_count[q32_option_cols])
    .div(Q32_compensation_count['respondent_id'], axis=0)
    .round(1)
    # keep only the option text; strip stray whitespace so the axis labels are clean
    .rename(columns=lambda col: col.split(' - ')[2].strip())
    .T
    .rename_axis('bigDataProd')
    .reset_index()
)

Q32_compensation_count_share_pivot = Q32_compensation_count_share.fillna(0)
# Positive diff = option is more common among respondents earning more than 70K.
Q32_compensation_count_share_pivot['diff'] = (
    Q32_compensation_count_share_pivot['More than 70K']
    - Q32_compensation_count_share_pivot['Less than 70K']
)
Q32_compensation_count_share_pivot = Q32_compensation_count_share_pivot.sort_values(by='diff')

# visualization - bigDataProd (one row per product: two markers joined by a grey line)
fig = go.Figure()

q32_products = Q32_compensation_count_share_pivot['bigDataProd']

# Connector between the two segment shares of every product.
for y_label, x_less, x_more in zip(q32_products,
                                   Q32_compensation_count_share_pivot['Less than 70K'],
                                   Q32_compensation_count_share_pivot['More than 70K']):
    fig.add_shape(type='line', x0=x_less, y0=y_label, x1=x_more, y1=y_label, line_color="#cccccc")

# One marker trace per compensation segment; scalar size/colour apply to every marker.
for seg_name, seg_color, seg_word in (('Less than 70K', "#DEBAE6", 'LESS'),
                                      ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=Q32_compensation_count_share_pivot[seg_name],
            y=q32_products,
            mode='markers',
            name=seg_name,
            marker=dict(size=10, color=seg_color),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + seg_word
                          + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(
    height=600,
    width=1200,
    title_text="Q32_A_Part_1|Which of the following big data products (relational databases, data warehouses, data lakes, or similar) do you use on a regular basis?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)
fig.add_annotation(text="PostgreSQL and Redshift",
                   xref="paper", yref="paper",
                   x=1, y=0.9, showarrow=False)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# Q33 (single choice): which big data product is used most often, compared between compensation segments.
# The answer column name is long and needed several times, so it is named once here.
q33_col = 'Q33|Which of the following big data products (relational database, data warehouse, data lake, or similar) do you use most often? - Selected Choice'

# Non-null respondent_id count per (segment, product). Only `respondent_id` is counted,
# instead of counting every column of the frame and discarding all but one afterwards.
bigDataProdMost = (
    tool_product_data
    .groupby(['compensation_segment', q33_col])['respondent_id']
    .count()
    .reset_index()
)

# Share of each product among the respondents of the same segment that appear in this table.
bigDataProdMost['nr_respondents_segment'] = (
    bigDataProdMost.groupby('compensation_segment', sort=False)['respondent_id'].transform('sum')
)
bigDataProdMost['share_of_total'] = (
    100 * bigDataProdMost['respondent_id'] / bigDataProdMost['nr_respondents_segment']
).round(1)

# One row per product, one column per segment; a product missing in a segment gets a share of 0.
bigDataProdMost_pivot = (
    bigDataProdMost
    .pivot(index=q33_col, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
)
# Positive diff = product is more common among respondents earning more than 70K.
bigDataProdMost_pivot['diff'] = bigDataProdMost_pivot['More than 70K'] - bigDataProdMost_pivot['Less than 70K']
bigDataProdMost_pivot = bigDataProdMost_pivot.sort_values(by='diff')

# visualization - bigDataProdMost (one row per product: two markers joined by a grey line)
fig = go.Figure()

q33_products = bigDataProdMost_pivot[q33_col]

# Connector between the two segment shares of every product.
for y_label, x_less, x_more in zip(q33_products,
                                   bigDataProdMost_pivot['Less than 70K'],
                                   bigDataProdMost_pivot['More than 70K']):
    fig.add_shape(type='line', x0=x_less, y0=y_label, x1=x_more, y1=y_label, line_color="#cccccc")

# One marker trace per compensation segment; scalar size/colour apply to every marker.
for seg_name, seg_color, seg_word in (('Less than 70K', "#DEBAE6", 'LESS'),
                                      ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=bigDataProdMost_pivot[seg_name],
            y=q33_products,
            mode='markers',
            name=seg_name,
            marker=dict(size=10, color=seg_color),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + seg_word
                          + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(
    height=600,
    width=1200,
    title_text=q33_col,  # the chart title is the full question / column name
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# Q34_A|Which of the following business intelligence tools do you use on a regular basis? (Select all that apply)
# Multi-select question: every answer option has its own column, named like
# "<question> - Selected Choice - <option>" (e.g. "... - Selected Choice - Amazon QuickSight").
q34_option_cols = [col for col in tool_product_data.columns if 'Q34' in col]
Q34 = ['compensation_segment', 'respondent_id'] + q34_option_cols

Q34_compensation = survey_compensation[Q34]
# count() only counts non-null cells: `respondent_id` serves as the segment size and each
# option column as the number of respondents in the segment with an answer in that column.
Q34_compensation_count = Q34_compensation.groupby('compensation_segment').count()
# Plain string labels, so the segments become ordinary column names after the transpose below.
Q34_compensation_count.index = Q34_compensation_count.index.astype(str)

# Percentage of each segment per option. The segment stays in the index (instead of being
# transposed as a data row), so the share columns keep a numeric dtype and are addressed by
# their real segment label rather than by position.
Q34_compensation_count_share = (
    (100 * Q34_compensation_count[q34_option_cols])
    .div(Q34_compensation_count['respondent_id'], axis=0)
    .round(1)
    # keep only the option text; strip stray whitespace so the axis labels are clean
    .rename(columns=lambda col: col.split(' - ')[2].strip())
    .T
    .rename_axis('businessIntellegence')
    .reset_index()
)

Q34_compensation_count_share_pivot = Q34_compensation_count_share.fillna(0)
# Positive diff = option is more common among respondents earning more than 70K.
Q34_compensation_count_share_pivot['diff'] = (
    Q34_compensation_count_share_pivot['More than 70K']
    - Q34_compensation_count_share_pivot['Less than 70K']
)
Q34_compensation_count_share_pivot = Q34_compensation_count_share_pivot.sort_values(by='diff')

# visualization - businessIntellegence (one row per tool: two markers joined by a grey line)
fig = go.Figure()

q34_tools = Q34_compensation_count_share_pivot['businessIntellegence']

# Connector between the two segment shares of every tool.
for y_label, x_less, x_more in zip(q34_tools,
                                   Q34_compensation_count_share_pivot['Less than 70K'],
                                   Q34_compensation_count_share_pivot['More than 70K']):
    fig.add_shape(type='line', x0=x_less, y0=y_label, x1=x_more, y1=y_label, line_color="#cccccc")

# One marker trace per compensation segment; scalar size/colour apply to every marker.
for seg_name, seg_color, seg_word in (('Less than 70K', "#DEBAE6", 'LESS'),
                                      ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=Q34_compensation_count_share_pivot[seg_name],
            y=q34_tools,
            mode='markers',
            name=seg_name,
            marker=dict(size=10, color=seg_color),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + seg_word
                          + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(
    height=600,
    width=1200,
    title_text="Q34_A_Part_1|Which of the following business intelligence tools do you use on a regular basis?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# Q35 (single choice): which business intelligence tool is used most often, compared between compensation segments.
# The answer column name is long and needed several times, so it is named once here.
q35_col = 'Q35|Which of the following business intelligence tools do you use most often? - Selected Choice'

# Non-null respondent_id count per (segment, tool). Only `respondent_id` is counted,
# instead of counting every column of the frame and discarding all but one afterwards.
businessIntell = (
    tool_product_data
    .groupby(['compensation_segment', q35_col])['respondent_id']
    .count()
    .reset_index()
)

# Share of each tool among the respondents of the same segment that appear in this table.
businessIntell['nr_respondents_segment'] = (
    businessIntell.groupby('compensation_segment', sort=False)['respondent_id'].transform('sum')
)
businessIntell['share_of_total'] = (
    100 * businessIntell['respondent_id'] / businessIntell['nr_respondents_segment']
).round(1)

# One row per tool, one column per segment; a tool missing in a segment gets a share of 0.
businessIntell_pivot = (
    businessIntell
    .pivot(index=q35_col, columns='compensation_segment', values='share_of_total')
    .reset_index()
    .fillna(0)
)
# Positive diff = tool is more common among respondents earning more than 70K.
businessIntell_pivot['diff'] = businessIntell_pivot['More than 70K'] - businessIntell_pivot['Less than 70K']
businessIntell_pivot = businessIntell_pivot.sort_values(by='diff')

# visualization - businessIntell_pivot (one row per tool: two markers joined by a grey line)
fig = go.Figure()

q35_tools = businessIntell_pivot[q35_col]

# Connector between the two segment shares of every tool.
for y_label, x_less, x_more in zip(q35_tools,
                                   businessIntell_pivot['Less than 70K'],
                                   businessIntell_pivot['More than 70K']):
    fig.add_shape(type='line', x0=x_less, y0=y_label, x1=x_more, y1=y_label, line_color="#cccccc")

# One marker trace per compensation segment; scalar size/colour apply to every marker.
for seg_name, seg_color, seg_word in (('Less than 70K', "#DEBAE6", 'LESS'),
                                      ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=businessIntell_pivot[seg_name],
            y=q35_tools,
            mode='markers',
            name=seg_name,
            marker=dict(size=10, color=seg_color),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + seg_word
                          + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(
    height=600,
    width=1200,
    title_text=q35_col,  # the chart title is the full question / column name
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# Q36_A|Do you use any automated machine learning tools (or partial AutoML tools) on a regular basis?
# Multi-select question: every answer option has its own column, named like
# "<question> - Selected Choice - <option>".
q36_option_cols = [col for col in tool_product_data.columns if 'Q36' in col]
Q36 = ['compensation_segment', 'respondent_id'] + q36_option_cols

Q36_compensation = survey_compensation[Q36]
# count() only counts non-null cells: `respondent_id` serves as the segment size and each
# option column as the number of respondents in the segment with an answer in that column.
Q36_compensation_count = Q36_compensation.groupby('compensation_segment').count()
# Plain string labels, so the segments become ordinary column names after the transpose below.
Q36_compensation_count.index = Q36_compensation_count.index.astype(str)

# Percentage of each segment per option. The segment stays in the index (instead of being
# transposed as a data row), so the share columns keep a numeric dtype and are addressed by
# their real segment label rather than by position.
Q36_compensation_count_share = (
    (100 * Q36_compensation_count[q36_option_cols])
    .div(Q36_compensation_count['respondent_id'], axis=0)
    .round(1)
    # keep only the option text; strip stray whitespace so the axis labels are clean
    .rename(columns=lambda col: col.split(' - ')[2].strip())
    .T
    .rename_axis('autoMLTool')
    .reset_index()
)

Q36_compensation_count_share_pivot = Q36_compensation_count_share.fillna(0)
# Positive diff = option is more common among respondents earning more than 70K.
Q36_compensation_count_share_pivot['diff'] = (
    Q36_compensation_count_share_pivot['More than 70K']
    - Q36_compensation_count_share_pivot['Less than 70K']
)
Q36_compensation_count_share_pivot = Q36_compensation_count_share_pivot.sort_values(by='diff')

# visualization - autoMLTool (one row per tool category: two markers joined by a grey line)
fig = go.Figure()

q36_tools = Q36_compensation_count_share_pivot['autoMLTool']

# Connector between the two segment shares of every tool category.
for y_label, x_less, x_more in zip(q36_tools,
                                   Q36_compensation_count_share_pivot['Less than 70K'],
                                   Q36_compensation_count_share_pivot['More than 70K']):
    fig.add_shape(type='line', x0=x_less, y0=y_label, x1=x_more, y1=y_label, line_color="#cccccc")

# One marker trace per compensation segment; scalar size/colour apply to every marker.
for seg_name, seg_color, seg_word in (('Less than 70K', "#DEBAE6", 'LESS'),
                                      ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=Q36_compensation_count_share_pivot[seg_name],
            y=q36_tools,
            mode='markers',
            name=seg_name,
            marker=dict(size=10, color=seg_color),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + seg_word
                          + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(
    height=600,
    width=1200,
    title_text="Q36_A_Part_1|Do you use any automated machine learning tools (or partial AutoML tools) on a regular basis? ",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# Q37_A|Which of the following automated machine learning tools (or partial AutoML tools) do you use
# on a regular basis? (Select all that apply)
#
# Survey methodology note: Question 37-A (which specific product) was only asked to respondents
# that answered affirmatively to Question 36-A (which of the following categories of products).
# NOTE(review): the shares below are still divided by ALL respondents of a segment, not only by
# those who were shown Q37 - confirm that this is the intended denominator.
#
# Multi-select question: every answer option has its own column, named like
# "<question> - Selected Choice - <option>" (e.g. "... - Selected Choice -  Google Cloud AutoML ").
q37_option_cols = [col for col in tool_product_data.columns if 'Q37' in col]
Q37 = ['compensation_segment', 'respondent_id'] + q37_option_cols

Q37_compensation = survey_compensation[Q37]
# count() only counts non-null cells: `respondent_id` serves as the segment size and each
# option column as the number of respondents in the segment with an answer in that column.
Q37_compensation_count = Q37_compensation.groupby('compensation_segment').count()
# Plain string labels, so the segments become ordinary column names after the transpose below.
Q37_compensation_count.index = Q37_compensation_count.index.astype(str)

# Percentage of each segment per option. The segment stays in the index (instead of being
# transposed as a data row), so the share columns keep a numeric dtype and are addressed by
# their real segment label rather than by position.
Q37_compensation_count_share = (
    (100 * Q37_compensation_count[q37_option_cols])
    .div(Q37_compensation_count['respondent_id'], axis=0)
    .round(1)
    # keep only the option text; strip stray whitespace so the axis labels are clean
    .rename(columns=lambda col: col.split(' - ')[2].strip())
    .T
    .rename_axis('autoMLToolRegular')
    .reset_index()
)

Q37_compensation_count_share_pivot = Q37_compensation_count_share.fillna(0)
# Positive diff = option is more common among respondents earning more than 70K.
Q37_compensation_count_share_pivot['diff'] = (
    Q37_compensation_count_share_pivot['More than 70K']
    - Q37_compensation_count_share_pivot['Less than 70K']
)
Q37_compensation_count_share_pivot = Q37_compensation_count_share_pivot.sort_values(by='diff')

# visualization - autoMLToolRegular (one row per tool: two markers joined by a grey line)
fig = go.Figure()

q37_tools = Q37_compensation_count_share_pivot['autoMLToolRegular']

# Connector between the two segment shares of every tool.
for y_label, x_less, x_more in zip(q37_tools,
                                   Q37_compensation_count_share_pivot['Less than 70K'],
                                   Q37_compensation_count_share_pivot['More than 70K']):
    fig.add_shape(type='line', x0=x_less, y0=y_label, x1=x_more, y1=y_label, line_color="#cccccc")

# One marker trace per compensation segment; scalar size/colour apply to every marker.
for seg_name, seg_color, seg_word in (('Less than 70K', "#DEBAE6", 'LESS'),
                                      ('More than 70K', "#C54DFD", 'MORE')):
    fig.add_trace(
        go.Scatter(
            x=Q37_compensation_count_share_pivot[seg_name],
            y=q37_tools,
            mode='markers',
            name=seg_name,
            marker=dict(size=10, color=seg_color),
            hovertemplate='<b>%{x:.2f}%</b> of participants earning <b>' + seg_word
                          + '</b> than 70K selected <b>%{y}</b>'
        )
    )

fig.update_layout(
    height=600,
    width=1200,
    title_text="Q37_A_Part_1|Which of the following automated machine learning tools (or partial AutoML tools) do you use on a regular basis? ",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',
)

fig.update_xaxes(showspikes=True)
fig.update_yaxes(title=None, showspikes=True)
fig.show()

In [None]:
# Q38_A_Part_1|Do you use any tools to help manage machine learning experiments? (Select all that apply) - Selected Choice

# Column selection: grouping key and respondent id first, then every column of
# tool_product_data whose name contains 'Q38'.
Q38 = ['compensation_segment', 'respondent_id']
Q38.extend(name for name in tool_product_data.columns if 'Q38' in name)

Q38_compensation = survey_compensation[Q38]

# Non-null values per compensation segment; the respondent_id count is the
# denominator of the shares computed below.
Q38_compensation_count = (
    Q38_compensation
    .groupby(['compensation_segment'])
    .count()
    .reset_index()
)

# Fix the list of Q38 columns before the loop, because the loop appends new
# share columns to the very same frame.
cols = Q38_compensation_count.columns
new_cols = []
for answer_col in [name for name in cols if 'Q38' in name]:
    # Third ' - '-separated piece of the column name (presumably the answer
    # option text -- verify against the actual column names).
    share_col = answer_col.split(' - ')[2]
    new_cols.append(share_col)
    # Percentage of the segment's respondent_id count, rounded to one decimal.
    Q38_compensation_count[share_col] = (
        100 * Q38_compensation_count[answer_col] / Q38_compensation_count['respondent_id']
    ).round(1)
new_cols.append('compensation_segment')

# Transpose: every selected column becomes a row, the former column names land in
# the 'index' column, and the original row positions (0, 1) become the column labels.
Q38_compensation_count_share = Q38_compensation_count[new_cols].T.reset_index()

# Drop the transposed 'compensation_segment' row; only the share rows are plotted.
is_share_row = Q38_compensation_count_share['index'] != 'compensation_segment'
# NOTE(review): assumes row 0 of the grouped frame is the lower-paid segment and
# row 1 the higher-paid one -- confirm against the values of compensation_segment.
pivot_column_names = {
    'index': 'MLExp',
    0: 'Less than 70K',
    1: 'More than 70K',
}
Q38_compensation_count_share_pivot = (
    Q38_compensation_count_share
    .loc[is_share_row]
    .rename(columns=pivot_column_names)
    .fillna(0)
)

# Gap between the two segments (in percentage points), used as the row order of the chart.
Q38_compensation_count_share_pivot['diff'] = (
    Q38_compensation_count_share_pivot['More than 70K']
    - Q38_compensation_count_share_pivot['Less than 70K']
)
Q38_compensation_count_share_pivot = Q38_compensation_count_share_pivot.sort_values(by='diff')

# visualization - MLExp
# Dumbbell chart: one row per entry of the 'MLExp' column, with one marker per
# compensation segment and a grey line joining the two markers.
mlexp_pivot = Q38_compensation_count_share_pivot
mlexp_labels = mlexp_pivot['MLExp']
mlexp_row_count = len(mlexp_pivot)

fig = go.Figure()

# Connector lines between the "less" and "more" share of every row.
for less_share, more_share, tool_label in zip(mlexp_pivot['Less than 70K'],
                                              mlexp_pivot['More than 70K'],
                                              mlexp_labels):
    fig.add_shape(
        type='line',
        x0=less_share,
        y0=tool_label,
        x1=more_share,
        y1=tool_label,
        line_color="#cccccc",
    )

# (share column / legend name, marker colour, hover text) for each compensation segment.
mlexp_segment_traces = [
    ('Less than 70K',
     "#DEBAE6",
     '<b>%{x:.2f}%</b> of participants earning <b>LESS</b> than 70K selected <b>%{y}</b>'),
    ('More than 70K',
     "#C54DFD",
     '<b>%{x:.2f}%</b> of participants earning <b>MORE</b> than 70K selected <b>%{y}</b>'),
]
for segment_name, marker_colour, hover_text in mlexp_segment_traces:
    fig.add_trace(
        go.Scatter(
            x=mlexp_pivot[segment_name],
            y=mlexp_labels,
            mode='markers',
            name=segment_name,
            # Size and colour are passed per point, one list entry for each row.
            marker=dict(size=[10] * mlexp_row_count,
                        color=[marker_colour] * mlexp_row_count),
            hovertemplate=hover_text,
        )
    )

fig.update_layout(
    height=600,
    width=1200,
    title_text="Q38_A_Part_1|Do you use any tools to help manage machine learning experiments?",
    xaxis_title="% of respondents earning yearly compensation MORE/LESS than 70K USD",
    plot_bgcolor='rgba(0,0,0,0)',  # fully transparent plot background
)

# The row labels already name each entry, so the y axis carries no title.
fig.update_yaxes(title=None)

# Annotation with empty text, anchored in paper coordinates.
fig.add_annotation(
    text="",
    xref="paper",
    yref="paper",
    x=1,
    y=0.9,
    showarrow=False,
)

# Spike lines on both axes.
fig.update_xaxes(showspikes=True)
fig.update_yaxes(showspikes=True)
fig.show()