In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import geopandas as gpd
from descartes import PolygonPatch
import glob
import matplotlib.cm as cm
import plotly.express as px
warnings.filterwarnings('ignore')

In [None]:
# Long country names in Q3 are shortened to the spellings used by the
# continent lookup and the world-map cells further down.
country_aliases = {
    'Iran, Islamic Republic of...': 'Iran',
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'Republic of Korea': 'South Korea',
}

# 2020 survey. The first data row is skipped (presumably it holds the question
# text -- confirm against the CSV). `.copy()` makes dt_nq an independent frame,
# so later column assignments never write into a slice of `dt`.
dt = pd.read_csv("../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv")
dt_nq = dt.iloc[1:].copy()
dt_nq['Q3'] = dt_nq['Q3'].replace(country_aliases)

# 2019 survey, prepared the same way.
dt_19 = pd.read_csv("../input/kaggle-survey-2019/multiple_choice_responses.csv")
dt_nq_19 = dt_19.iloc[1:].copy()
dt_nq_19['Q3'] = dt_nq_19['Q3'].replace(country_aliases)

In [None]:
# Countries grouped by continent. The keys are used as legend labels and the
# lists are matched against the (shortened) Q3 country names.
conts = {
    'Asia & Australia': [
        'Japan', 'India', 'China', 'Indonesia', 'Singapore', 'Pakistan', 'Nepal', 'South Korea',
        'Saudi Arabia', 'Taiwan',
        'UAE',  # NOTE(review): the survey may spell this 'United Arab Emirates' -- confirm
        'Viet Nam', 'Thailand',
        'Philippines', 'Turkey', 'Iran', 'Republic of Korea',
        'Bangladesh', 'Malaysia', 'Israel', 'Sri Lanka', 'Australia',
    ],
    'Europe': [
        'Germany', 'Switzerland', 'Russia', 'Netherlands', 'Poland', 'Belarus', 'Portugal',
        'Ukraine', 'Belgium', 'Italy', 'Spain', 'Ireland', 'Romania', 'Sweden',
        'United Kingdom', 'France', 'Greece',
    ],
    'America': [
        'Colombia', 'United States of America', 'Argentina', 'Brazil',
        'Canada', 'Mexico', 'Peru', 'Chile',
    ],
    'Africa': ['South Africa', 'Egypt', 'Tunisia', 'Nigeria', 'Morocco', 'Ghana', 'Kenya', 'Algeria'],
}


def getCont(ctr):
    """Return the continent group of a country.

    Parameters
    ----------
    ctr : object
        Country name as it appears in column Q3 (after renaming).

    Returns
    -------
    str
        The first key of `conts` whose list contains `ctr`, or 'UND'
        (undefined) when no list contains it -- this includes NaN.
    """
    # Looping over `conts` keeps this function in sync with the dictionary:
    # a continent added there is picked up without touching this code.
    for continent, countries in conts.items():
        if ctr in countries:
            return continent
    return 'UND'
    
# Tag every respondent of both survey years with the continent group of
# the country given in Q3.
dt_nq['Continent'] = dt_nq['Q3'].map(getCont)
dt_nq_19['Continent'] = dt_nq_19['Q3'].map(getCont)

# Know Your (Potential) Co-Workers

### Gender Representation

First, let's explore the gender composition of data-related employees.

In [None]:
# Collapse the less frequent Q2 answers into a single 'Other' group.
m = {'Prefer not to say':'Other', 'Prefer to self-describe':'Other', 'Nonbinary':'Other',
    'Man': 'Man', 'Woman':'Woman'}
# `m.get(answer, answer)` leaves unknown values (NaN, or 'Other' produced by a
# previous run of this cell) untouched, so the cell can be re-run safely.
dt_nq['Q2'] = dt_nq['Q2'].map(lambda answer: m.get(answer, answer))

# One shade of blue per gender group, darkest for the largest slice.
gender_counts = dt_nq['Q2'].value_counts()
colors = cm.Blues(np.linspace(0.4, 1, len(gender_counts)))[::-1]
VC = gender_counts.plot.pie(subplots=True, figsize=(8, 5), colors=colors,
                            autopct='%1.0f%%', pctdistance=1.1, labeldistance=1.3)
_ = plt.title('Distribution of Gender for Kaggle Survey 2020 Respondents', fontweight ='bold')
plt.ylabel('')
_ = plt.legend(bbox_to_anchor=(1.5, 0.8))

As the plot above shows, most of the respondents are men. Let's see which countries have the highest proportion of women among their respondents.

In [None]:
# Respondent counts with one row per country (Q3) and one column per gender (Q2).
dt_gd_ctry = dt_nq.groupby(['Q3', 'Q2']).size().unstack(level=1, fill_value=0)

# Convert each row to whole-number percentages of that country's respondents
# (the fractional part is truncated, not rounded).
country_totals = dt_gd_ctry.sum(axis=1)
dt_gd_ctry = dt_gd_ctry.div(country_totals, axis=0).mul(100).astype(int)

# Continent group of every country, used to colour the bars later on.
dt_gd_ctry['Continent'] = [getCont(country) for country in dt_gd_ctry.index]

In [None]:
# The 15 countries with the largest percentage of women respondents.
top15_female = dt_gd_ctry.sort_values('Woman', ascending = False).head(15)['Woman']
top15_female_countries = top15_female.index

# Bar positions on the y axis; barWidth is the tick offset (0 = tick centred on the bar).
barWidth = 0
range_top15 = np.arange(len(top15_female))

# One boolean mask per continent group, marking which of the 15 countries belong to it.
masks = [top15_female_countries.isin(conts[t]) for t in conts]

colors = cm.Blues(np.linspace(0.4, 0.9, len(conts)))[::-1]

# Draw one labelled bar group per continent so every continent gets its own colour.
f, ax = plt.subplots(figsize=(8, 6))
for continent, mask, color in zip(conts, masks, colors):
    ax.barh(range_top15[mask], top15_female[mask], color=color, label=continent)

ax.set_ylabel('Countries', fontweight='bold')
ax.set_xlabel('Percentage of Women Respondents (%)', fontweight='bold')
ax.set_yticks(range_top15 + barWidth)
ax.set_yticklabels(top15_female_countries)
ax.set_title('Which Countries Have The Highest Percentage of Women Respondents?', fontweight='bold')

# The legend is taken from the labelled bar groups above. (A bar chart has no
# Line2D artists, so `ax.lines` cannot serve as legend handles.)
ax.legend(bbox_to_anchor=(1.4, 0.8))

# Write the percentage inside the end of each bar.
for y, x in enumerate(top15_female):
    ax.annotate(str(x) + "% ", xy=(x, y), va='center', ha = 'right', fontsize=12, fontweight ='bold', color = 'white')
ax.invert_yaxis()  # highest percentage at the top
plt.show()

Malaysia has the highest percentage of women respondents in 2020, and most of the countries in this list are in Asia. Ireland and Canada have the highest percentage of women respondents in Europe and America, respectively.

### Age

In [None]:
# Respondent counts with one row per age group (Q1) and one column per job title (Q5).
# fill_value=0 stores an absent age/job combination as 0 instead of NaN, so the
# percentages of that job title stay defined.
dt_jt_age = dt_nq.groupby(['Q1', 'Q5']).size().unstack(level=1, fill_value=0)

# Express every job-title column as percentages of that job title.
dt_jt_age = dt_jt_age.div(dt_jt_age.sum(axis=0), axis=1).mul(100)
dt_jt_age = dt_jt_age.drop(['Currently not employed'], axis = 1)

# One shade per age group; the first age group gets the darkest blue.
colorz = dict(zip(dt_jt_age.index, cm.Blues(np.linspace(0, 1, len(dt_jt_age.index)))[::-1]))
ax = dt_jt_age.T.plot.barh(stacked=True, figsize=(8, 10), color = colorz,
                           edgecolor = 'white',width = 0.7)

# Label every segment wider than 2 %. ax.containers holds one bar group per age
# group in plotting order, so group 0 is the darkest one and gets white text.
for layer, bars in enumerate(ax.containers):
    text_style = {'color': 'white'} if layer == 0 else {}
    for p in bars:
        width = p.get_width()
        if width > 2.0:
            ax.annotate(str(int(width)),
                        xy=(p.get_x() + width/2, p.get_y() + p.get_height()/2),
                        ha='center', va='center', size = 10, **text_style)

ax.legend(title='Age Group', bbox_to_anchor=(1, 0.5))
ax.set_ylabel('Job Title', fontweight = 'bold')
ax.set_xlabel('Portion from the Job Title (%)', fontweight = 'bold')

_ = plt.title('Age Group by Job Title', fontweight = 'bold')

Based on this dataset, most of the data fellows are millennials. For nearly all data-related job titles, these folks are most frequently 22-29 years old, but for Product Manager and Research Scientist positions, they are mostly 25-34 years old.

### Formal Education

In [None]:
# Respondent counts with one row per education level (Q4) and one column per job title (Q5).
dt_jt_edu = dt_nq.groupby(['Q4','Q5']).size().unstack(level=1, fill_value=0)
# Fixed row order for the stacked bars. fill_value=0 keeps a level that is
# absent from the data as a row of zeros instead of NaN.
dt_jt_edu = dt_jt_edu.reindex(index =[ 'I prefer not to answer',
                                     'No formal education past high school',
                                     'Some college/university study without earning a bachelor’s degree',
                                     'Professional degree',"Bachelor’s degree", "Master’s degree",
                                     'Doctoral degree'
                                    ], fill_value=0)

# Express every job-title column as percentages of that job title.
dt_jt_edu = dt_jt_edu.div(dt_jt_edu.sum(axis=0), axis=1).mul(100)
dt_jt_edu = dt_jt_edu.drop(['Currently not employed'], axis = 1)

# One shade per education level; the first level gets the darkest blue.
colorz = dict(zip(dt_jt_edu.index,  cm.Blues(np.linspace(0.3, 1, len(dt_jt_edu.index)))[::-1]))
ax = dt_jt_edu.T.plot.barh(stacked=True, figsize=(8, 12), color = colorz, edgecolor='white',width = 0.7)

# Label every segment wider than 2 %. ax.containers holds one bar group per
# education level in plotting order, so group 0 is the darkest one and gets white text.
for layer, bars in enumerate(ax.containers):
    text_style = {'color': 'white'} if layer == 0 else {}
    for p in bars:
        width = p.get_width()
        if width > 2.0:
            ax.annotate(str(int(width)),
                        xy=(p.get_x() + width/2, p.get_y() + p.get_height()/2),
                        ha='center', va='center', **text_style)

ax.legend(title='Formal Education', bbox_to_anchor=(1,-0.1))
ax.set_ylabel('Job Title', fontweight = 'bold')
ax.set_xlabel('Portion from the Job Title (%)', fontweight = 'bold')
_ = plt.title('Formal Education Background by Job Title', fontweight = 'bold')

It can be concluded that, for nearly every job title, most of the data folks have earned a Master's degree, while most of the Research Scientists have a Doctoral degree.

### Programming Experience

In [None]:
# Respondent counts with one row per programming-experience bracket (Q6) and
# one column per job title (Q5).
dt_jt_prog = dt_nq.groupby(['Q6','Q5']).size().unstack(level=1, fill_value=0)
# Fixed row order for the stacked bars. fill_value=0 keeps a bracket that is
# absent from the data as a row of zeros instead of NaN.
dt_jt_prog = dt_jt_prog.reindex(index =['I have never written code', '< 1 years', '1-2 years',
                     '3-5 years', '5-10 years', '10-20 years','20+ years'], fill_value=0)

# Express every job-title column as percentages of that job title.
dt_jt_prog = dt_jt_prog.div(dt_jt_prog.sum(axis=0), axis=1).mul(100)
dt_jt_prog = dt_jt_prog.drop(['Currently not employed'], axis = 1)

# One shade per bracket; the first bracket gets the darkest blue.
colorz = dict(zip(dt_jt_prog.index, cm.Blues(np.linspace(0, 1, len(dt_jt_prog.index)))[::-1]))
ax = dt_jt_prog.T.plot.barh(stacked=True, figsize=(8, 10), color = colorz,
                            edgecolor = 'white',width = 0.7)

# Label every segment wider than 1.5 %. ax.containers holds one bar group per
# bracket in plotting order, so group 0 is the darkest one and gets white text.
for layer, bars in enumerate(ax.containers):
    text_style = {'color': 'white'} if layer == 0 else {}
    for p in bars:
        width = p.get_width()
        if width > 1.5:
            ax.annotate(str(int(width)),
                        xy=(p.get_x() + width/2, p.get_y() + p.get_height()/2),
                        ha='center', va='center', **text_style)

ax.legend(title='Years of Programming', bbox_to_anchor=(1, 0.5))
ax.set_ylabel('Job Title', fontweight = 'bold')
ax.set_xlabel('Portion from the Job Title (%)', fontweight = 'bold')
_ = plt.title('Experience in Programming by Job Title', fontweight = 'bold')

Generally, 3-5 years is the most frequent length of programming experience, but a sizeable share of every job title still has less than 3 years of programming experience. The most frequent length of programming experience for Data Analysts is 1-2 years, while for Business Analysts it is less than 1 year.

### Experience Using ML Methods

In [None]:
# Respondent counts with one row per ML-experience bracket (Q15) and one
# column per job title (Q5).
dt_jt_ml = dt_nq.groupby(['Q15','Q5']).size().unstack(level=1, fill_value=0)
# Fixed row order for the stacked bars. fill_value=0 keeps a bracket that is
# absent from the data as a row of zeros instead of NaN.
dt_jt_ml = dt_jt_ml.reindex(index =['I do not use machine learning methods',
                                        'Under 1 year', '1-2 years', '2-3 years',
                                        '3-4 years', '4-5 years',
                                        '5-10 years', '10-20 years','20 or more years'], fill_value=0)

# Express every job-title column as percentages of that job title.
dt_jt_ml = dt_jt_ml.div(dt_jt_ml.sum(axis=0), axis=1).mul(100)

dt_jt_ml = dt_jt_ml.drop(['Currently not employed'], axis = 1)

# One shade per bracket; the first bracket gets the darkest blue.
colorz = dict(zip(dt_jt_ml.index, cm.Blues(np.linspace(0, 1, len(dt_jt_ml.index)))[::-1]))
ax = dt_jt_ml.T.plot.barh(stacked=True, figsize=(8, 10), color = colorz,
                            edgecolor = 'white',width = 0.7)

# Label every segment wider than 1.5 %. ax.containers holds one bar group per
# bracket in plotting order, so group 0 is the darkest one and gets white text.
for layer, bars in enumerate(ax.containers):
    text_style = {'color': 'white'} if layer == 0 else {}
    for p in bars:
        width = p.get_width()
        if width > 1.5:
            ax.annotate(str(int(width)),
                        xy=(p.get_x() + width/2, p.get_y() + p.get_height()/2),
                        ha='center', va='center', **text_style)

ax.legend(title='Years of Using ML Methods', bbox_to_anchor=(1, 0.5))
ax.set_ylabel('Job Title', fontweight = 'bold')
ax.set_xlabel('Portion from the Job Title (%)', fontweight = 'bold')
_ = plt.title('Experience in Using ML Methods by Job Title', fontweight = 'bold')

In general, less than 1 year is the most frequent length of experience in using ML methods, except for Data Scientists, Data Engineers, Machine Learning Engineers and Database Engineers. Most of the respondents with the first three job titles have 1-2 years of experience using ML methods, while 30% of Database Engineers have not used any ML methods yet.

# Getting to Know Better Your Employers



When pursuing employment in a data-related job, one might wonder about the compensation to expect as a data professional. Beyond that, information about the state of machine learning/data science adoption at prospective companies can also be insightful for job seekers.

### Current State of ML Application in Enterprises

In [None]:
# Country outlines used as the base map.
# NOTE(review): `gpd.datasets` was deprecated in geopandas 0.14 and removed in
# 1.0, so this line needs geopandas < 1.0 -- confirm the environment.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))

world['ML_state'] = 'No Data'

# Numeric code for every Q22 answer; the codes are what gets plotted.
# 'No Data' (0) marks countries that received no answer from the survey.
mls_map = {
    'No Data': 0,
    'We have well established ML methods (i.e., models in production for more than 2 years)' : 6,
    'No (we do not use ML methods)' : 2,
    'We use ML methods for generating insights (but do not put working models into production)' : 4,
    'We are exploring ML methods (and may one day put a model into production)' : 3,
    'We recently started using ML methods (i.e., models in production for less than 2 years)' : 5,
    'I do not know' : 1
}

# Store the most frequent Q22 answer of every surveyed country on the map
# rows whose name matches that country.
for c in dt_nq.Q3.dropna().unique():
    answers = dt_nq.loc[dt_nq.Q3 == c, 'Q22'].value_counts()
    if answers.empty:
        # Nobody from this country answered Q22: keep 'No Data'.
        continue
    state = answers.index[0]
    world.loc[world['name'] == c, 'ML_state'] = state

world['ML_state'] = world['ML_state'].map(lambda x: mls_map[x])

fig, ax = plt.subplots(1, figsize = (20,15))
world.plot(column='ML_state', ax=ax, categorical=True, cmap = 'Blues',
             legend_kwds={'bbox_to_anchor':(0.3, 0.5),'fontsize':9,'title':'ML Application'},
             legend = True, edgecolor="black")
ax.axis('off')

def replace_legend_items(legend, mapping):
    """Rewrite legend entries in place according to `mapping`.

    Parameters
    ----------
    legend : object or None
        Legend-like object exposing `.texts`, a sequence of text objects with
        `get_text()` / `set_text()`. `None` (no legend on the axes) is ignored.
    mapping : dict
        Maps a current legend text to its replacement. Keys are compared
        through `str(key)`, so integer codes match their printed form.

    Each entry is replaced at most once: a replacement text that happens to
    equal another key is not replaced again, so the result does not depend on
    the order of `mapping`.
    """
    if legend is None:
        return
    # Build the string lookup once; setdefault keeps the first key when two
    # keys share the same string form.
    lookup = {}
    for k, v in mapping.items():
        lookup.setdefault(str(k), v)
    for txt in legend.texts:
        current = txt.get_text()
        if current in lookup:
            txt.set_text(lookup[current])
# The map was drawn from numeric codes; show the original answers in the legend.
code_to_answer = dict(zip(mls_map.values(), mls_map.keys()))
replace_legend_items(ax.get_legend(), code_to_answer)

ax.set_title('Countries by Most Frequent Current State of ML Application in Companies', fontweight = 'bold')
plt.tight_layout()

Respondents from the US, Australia and some European countries mostly work in companies where ML methods are well established. In Asia, the companies where the respondents work are still at an early stage of applying ML methods.

### Budget on Machine Learning and/or Cloud Computing Services

It can be insightful to know how much an enterprise has spent on Machine Learning/Cloud Computing services depending on its current state of Machine Learning application.

In [None]:
#Q22 = ML incorporation
#Q25 = Budget in ML/CCS

cols = [col for col in dt_nq if col.startswith('Q22') or col.startswith('Q25') ]
# Keep only respondents who answered both questions.
dt_ent = dt_nq[cols].dropna(subset=['Q22', 'Q25'])

# Display order of the budget brackets (columns) and ML states (rows).
budget_order = ['$0 ($USD)', '$1-$99', '$100-$999','$1000-$9,999',
                '$10,000-$99,999', '$100,000 or more ($USD)']
state_order = [
    'I do not know', 'No (we do not use ML methods)',
    'We are exploring ML methods (and may one day put a model into production)',
    'We use ML methods for generating insights (but do not put working models into production)',
    'We recently started using ML methods (i.e., models in production for less than 2 years)',
    'We have well established ML methods (i.e., models in production for more than 2 years)']

# Respondent counts per (ML state, budget bracket). fill_value=0 stores empty
# combinations as 0, which keeps the table integer-typed as the heatmap's
# fmt='d' annotation format requires.
dt_agg_ent = (dt_ent.groupby(['Q22', 'Q25']).size()
              .unstack(fill_value=0)
              .reindex(index=state_order, columns=budget_order, fill_value=0))

f,ax = plt.subplots(figsize=(8, 5))
ax = sns.heatmap(dt_agg_ent, linewidths=1, linecolor='white', annot = True, fmt='d', cmap = 'Blues', ax=ax)
ax.set_ylabel('ML Incorporation in The Company', fontweight = 'bold')
ax.set_xlabel('Budget Spent for ML/Cloud Computing', fontweight = 'bold')
_  = plt.title("Budget Spent for ML/Cloud Computing\nBased on ML Incorporation in The Company", fontweight = 'bold')

Based on the plot above, there are still a lot of employers that have not applied ML methods to their business yet (which might also be reflected in their budget for ML/Cloud computing services). For the employers that have well established ML methods, 410 of them have spent around USD 100,000 or more on ML/Cloud computing services.

### Number of People Responsible for Data Science Workloads in The Company

In [None]:
world['DS People'] = 'No Data'

# Store the most frequent Q21 answer of every surveyed country on the map
# rows whose name matches that country.
for c in dt_nq.Q3.dropna().unique():
    answers = dt_nq.loc[dt_nq.Q3 == c, 'Q21'].value_counts()
    if answers.empty:
        # Nobody from this country answered Q21: keep 'No Data'.
        continue
    state = answers.index[0]
    world.loc[world['name'] == c, 'DS People'] = state

# Numeric code for every Q21 answer; the codes are what gets plotted.
# 'No Data' (0) marks countries that received no answer from the survey.
dsp_map = {'No Data': 0, '20+':7, '0':1, '5-9':4, '1-2':2, '3-4':3, '10-14':5, '15-19':6}

world['DS People'] = world['DS People'].map(lambda x: dsp_map[x])

fig, ax = plt.subplots(1, figsize = (20,15))
world.plot(column='DS People', ax=ax, categorical=True,cmap = 'Blues',
             legend_kwds={'bbox_to_anchor':(.15, .6),'fontsize':9,'title':'No. of Data Science People'},
             legend = True, edgecolor="black")

# The map was drawn from numeric codes; show the original answers in the legend.
replace_legend_items(ax.get_legend(), {v: k for k, v in dsp_map.items()})
ax.axis('off')
ax.set_title('Countries by Most Frequent Number of Data Science People in Companies', fontweight = 'bold')
plt.tight_layout()

Respondents from the US, India, South Africa, and some European countries mostly work in companies that employ 20+ people for data science roles. In this dataset, respondents from other countries work for companies that have smaller data science teams.

### Yearly Compensation by Job Title

For this part, the median is used as the measure. Let's see how much each job title earns for a living, globally.

In [None]:
# Yearly-compensation brackets in ascending order, exactly as they are spelled
# in the survey answers.
compensation_brackets = [
    '$0-999', '1,000-1,999', '2,000-2,999', '3,000-3,999',
    '4,000-4,999', '5,000-7,499', '7,500-9,999', '10,000-14,999',
    '15,000-19,999', '20,000-24,999', '25,000-29,999', '30,000-39,999',
    '40,000-49,999', '50,000-59,999', '60,000-69,999', '70,000-79,999',
    '80,000-89,999', '90,000-99,999', '100,000-124,999', '125,000-149,999',
    '150,000-199,999', '200,000-249,999', '250,000-299,999', '300,000-500,000',
    '> $500,000',
]

# Bracket -> ordinal code. The codes are consecutive (0..24) so that every
# integer between the lowest and highest code maps back to a bracket; a median
# code can then always be translated into a label.
map_value = {bracket: code for code, bracket in enumerate(compensation_brackets)}

In [None]:
# Respondents who reported a yearly compensation (Q24). `.copy()` gives an
# independent frame, so adding a column does not write into a slice of dt_nq.
dt_wg = dt_nq.loc[dt_nq['Q24'].notnull()].copy()
dt_wg['wg_cat'] = dt_wg['Q24'].map(lambda bracket: map_value[bracket])

# Median bracket code per job title, truncated to an integer code. sort=False
# keeps the job titles in order of first appearance; rows without a job title
# are left out by groupby.
median_by_job = dt_wg.groupby('Q5', sort=False)['wg_cat'].median().astype(int)
jt = median_by_job.index.tolist()
meds_20 = median_by_job.tolist()

# Reverse lookup: bracket code -> bracket label, used for the bar annotations.
bracket_of = {code: bracket for bracket, code in map_value.items()}

# set width of bar
barWidth = 0.8

# Set position of bar on X axis
r1 = np.arange(len(meds_20))

# Make the plot
plt.figure(figsize=(8,5))
plt.barh(r1, meds_20, color='lightblue', height=barWidth, edgecolor='white')
plt.yticks([r + 0.03 for r in range(len(meds_20))], jt)
# The x axis shows ordinal bracket codes, so its ticks are hidden; the bracket
# label is written on each bar instead.
plt.tick_params(
    axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False)
plt.title('Global Yearly Compensation for Each Job Title (2020)', fontweight = 'bold')
plt.xlabel('Yearly Compensation (USD)', fontweight='bold')
plt.ylabel('Job Title', fontweight='bold')
for y, x in enumerate(meds_20):
    plt.annotate(bracket_of[x] + ' ',
                 xy=(x, y), va='center', ha = 'right', fontsize=9, fontweight ='bold')

plt.show()

Data Scientists & Data Engineers have the highest median yearly compensation (among the working level with similar job title). 
Now, let's see the yearly compensation in 2019.

In [None]:
# 2019 respondents who reported a yearly compensation (Q10). `.copy()` gives an
# independent frame, so adding a column does not write into a slice of dt_nq_19.
dt_wg_19 = dt_nq_19.loc[dt_nq_19['Q10'].notnull()].copy()
dt_wg_19['wg_cat'] = dt_wg_19['Q10'].map(lambda bracket: map_value[bracket])

# Median bracket code per job title, truncated to an integer code. sort=False
# keeps the job titles in order of first appearance; rows without a job title
# are left out by groupby.
median_by_job_19 = dt_wg_19.groupby('Q5', sort=False)['wg_cat'].median().astype(int)
jt = median_by_job_19.index.tolist()
meds_19 = median_by_job_19.tolist()

# Reverse lookup: bracket code -> bracket label, used for the bar annotations.
bracket_of = {code: bracket for bracket, code in map_value.items()}

# set width of bar
barWidth = 0.8

# Set position of bar on X axis
r1 = np.arange(len(meds_19))

# Make the plot
plt.figure(figsize=(8,5))
plt.barh(r1, meds_19, color='lightblue', height=barWidth, edgecolor='white')
plt.yticks([r + 0.03 for r in range(len(meds_19))], jt)
# The x axis shows ordinal bracket codes, so its ticks are hidden; the bracket
# label is written on each bar instead.
plt.tick_params(
    axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False)

plt.title('Global Yearly Compensation for Each Job Title (2019)', fontweight = 'bold')
plt.xlabel('Yearly Compensation (USD)', fontweight='bold')
plt.ylabel('Job Title', fontweight='bold')
for y, x in enumerate(meds_19):
    plt.annotate(bracket_of[x] + ' ',
                 xy=(x, y), va='center', ha = 'right', fontsize=9, fontweight ='bold')

plt.show()

In 2019, the median yearly compensation for Data Scientists was the same as for Product/Project Managers. Yearly compensation was higher in 2019 than in 2020 for nearly every job title.

### Highest Paid Countries

This part also uses the median as the measure.

In [None]:
# Median bracket code per country, computed over respondents who reported a
# compensation (dt_wg).
median_by_country = dt_wg.groupby('Q3')['wg_cat'].median()

# Follow the order in which countries appear in dt_nq. Countries without any
# compensation answer have no median (NaN after reindex) and are left out
# rather than converted to int.
median_by_country = (median_by_country
                     .reindex(dt_nq.Q3.dropna().unique())
                     .dropna()
                     .astype(int))
country = median_by_country.index.tolist()
meds_20 = median_by_country.tolist()

dt_wg_ctry = pd.DataFrame(data = {'Country': country, '2020': meds_20})
dt_wg_20_sort = dt_wg_ctry[['Country', '2020']].sort_values('2020', ascending = False).head(10)

# Reverse lookup: bracket code -> bracket label, used for the bar annotations.
bracket_of = {code: bracket for bracket, code in map_value.items()}

fig,ax = plt.subplots(figsize=(8,5))

ax.barh(dt_wg_20_sort['Country'], dt_wg_20_sort['2020'],color='lightblue')
ax.invert_yaxis()  # highest median at the top

# The x axis shows ordinal bracket codes, so its ticks are hidden; the bracket
# label is written on each bar instead.
plt.tick_params(
    axis='x',
    which='both',
    bottom=False,
    top=False,
    labelbottom=False)
plt.title('Top 10 Countries with Highest Yearly Compensation', fontweight = 'bold')
plt.xlabel('Yearly Compensation (USD)', fontweight='bold')
plt.ylabel('Country', fontweight='bold')

for y, x in enumerate(dt_wg_20_sort['2020']):
    plt.annotate(bracket_of[x] + ' ',
                 xy=(x, y), va='center', ha='right', fontsize=9, fontweight ='bold')

# Care to Prepare

So, you now have the gist of who the data folks are and of the current state of your potential employers. What's next? Here are several things that might help you become more ready to roll.

### The Most Important Part of The Job

As a start, what do the data folks consider as the most important part of their work? 

In [None]:
job_task_col = [col for col in dt_nq if col.startswith('Q23') or col.startswith('Q5') ]
# `.copy()` gives an independent frame, so the renaming and the 0/1 encoding
# below do not write into a slice of dt_nq.
dt_job_task = dt_nq[job_task_col].copy()

job_task_cols = ['Job Title', 'Q23_Part_1', 'Q23_Part_2', 'Q23_Part_3', 'Q23_Part_4',
       'Q23_Part_5', 'Q23_Part_6', 'Q23_Part_7', 'Q23_OTHER']

dt_job_task.columns = job_task_cols

# Task columns that take part in the ranking; the last two columns
# (Q23_Part_7 and Q23_OTHER) are left out.
task_cols = job_task_cols[1:-2]

# Legend text for every task column, read from the data before it is encoded.
# NOTE(review): assumes a selected option is stored as its own text in the
# column (and NaN otherwise) -- confirm against the CSV. A column nobody
# selected falls back to its column name.
task_labels = {}
for c in task_cols:
    chosen = dt_job_task[c].dropna()
    task_labels[c] = str(chosen.iloc[0]).strip() if len(chosen) else c

# Encode every task column as 1 (selected) / 0 (not selected).
for c in task_cols:
    dt_job_task[c] = dt_job_task[c].notnull().astype(int)

# Number of respondents per job title that selected each task.
dt_agg_job_task = dt_job_task[job_task_cols[:-2]].groupby(['Job Title']).agg('sum')
dt_agg_job_task = dt_agg_job_task.drop(['Currently not employed', 'Student'], axis = 0)
# Tick offset on the y axis (0 = tick centred on the bar)
barWidth = 0

x = dt_agg_job_task.index #Job Title
y = dt_agg_job_task.max(axis = 1) #Respondents selecting the most selected task
lgds = dt_agg_job_task.idxmax(axis=1) #Most selected task (column name) per job title
x_idx = np.arange(len(x))

# One mask, one label and one colour per task that tops at least one job
# title. All three are derived from the same list, so a bar group can never
# be paired with the label or colour of another task.
top_tasks = list(lgds.unique())
masks = [(lgds == task).values for task in top_tasks]
activities = [task_labels[task] for task in top_tasks]
colors = cm.Blues(np.linspace(0.4, 1, len(top_tasks)))[::-1]

# Make the plot
f, ax = plt.subplots(figsize=(8, 6))

for mask, color, label in zip(masks, colors, activities):
    ax.barh(x_idx[mask], y[mask], color=color, edgecolor='white', label=label)

ax.set_ylabel('Job Title', fontweight='bold')
ax.set_xlabel('Total Respondents', fontweight='bold')
ax.set_yticks(x_idx + barWidth)
ax.set_yticklabels(x)
ax.set_title('What Role Is Considered The Most Important at Your Work?', fontweight='bold')

# Write the respondent count at the end of each bar.
for pos, total in enumerate(y):
    ax.annotate('' + str(total), xy=(total, pos), va='center', fontsize=9, fontweight ='bold')

# The legend is taken from the labelled bar groups above. (A bar chart has no
# Line2D artists, so `ax.lines` cannot serve as legend handles.)
ax.legend(fontsize=8, title='Tasks', bbox_to_anchor=(1, -0.1))
plt.show()

Most of these data-related job titles consider analyzing and understanding data to influence product or business decisions as the most important part of their job.

### Programming Language

Getting involved with data most of the time requires programming to deal with it. Let's see what programming languages are regularly used by the data folks. These numbers were derived from dividing the number of people from each job title selecting that programming language, by total population of each job title.

In [None]:
pl_col = [col for col in dt_nq if col.startswith('Q7') or col.startswith('Q5') ]
# `.copy()` gives an independent frame, so the renaming below does not touch dt_nq.
dt_pl = dt_nq[pl_col].copy()

pl_cols = ['Job Title', 'Python', 'R', 'SQL', 'C', 'C++', 'Java',
       'Javascript', 'Julia', 'Swift', 'Bash', 'MATLAB', 'None', 'Other']
dt_pl.columns = pl_cols
dt_pl = dt_pl.drop(['None', 'Other'], axis = 1)

# Encode every language column as 1 (selected) / 0 (not selected).
for c in dt_pl.columns[1:]:
    dt_pl[c] = dt_pl[c].notnull().astype(int)

# Share of each job title that selected each language: selections divided by
# the number of respondents with that job title. One vectorised division
# produces a float table directly, instead of writing float rows into the
# integer table of sums.
pl_by_job = dt_pl.groupby('Job Title')
dt_agg_pl = pl_by_job.agg('sum').div(pl_by_job.size(), axis=0)

dt_agg_pl = dt_agg_pl.drop(['Currently not employed'], axis = 0)

f,ax = plt.subplots(figsize=(10, 6))
ax = sns.heatmap(dt_agg_pl, linewidths=1, linecolor='white', annot = True, fmt='.0%', cmap = 'Blues', ax=ax)
ax.set_xlabel('Programming Language', fontweight = 'bold')
ax.set_ylabel('Job Title', fontweight = 'bold')
_  = plt.title("Programming Language Regularly Used\nBased on Job Title", fontweight = 'bold')

For most of these data-related job titles, Python is the most regularly used programming language. Meanwhile, Statisticians mostly use R and Database Engineers mostly use SQL, with Python as the second most regularly used programming language for both job titles.

### ML Algorithms Regularly Used

Machine learning is now widely used to solve problems that involve data. Knowing which machine learning methods are regularly used can help us decide what to learn before we are assigned data-related tasks. These numbers were also derived by dividing the number of people from each job title selecting that algorithm by the total population of each job title.

In [None]:
#Q17 = Algorithm regularly used
#Q5 = Job Title

alg_job_col = [col for col in dt_nq if col.startswith('Q17') or col.startswith('Q5') ]
# `.copy()` gives an independent frame, so the renaming below does not touch dt_nq.
dt_alg_job = dt_nq[alg_job_col].copy()

#Change Col Names
new_cols_2 = ['Job', 'Regressions', 'Decision Trees/Random Forest', 'Gradient Boosting Machines',
              'Bayesian Approaches', 'Evolutionary Approaches', 'Deep Neural Networks',
              'Convolutional Neural Networks', 'Generative Adversarial Networks',
              'Recurrent Neural Network', 'Transformer Networks', 'None', 'Other']
dt_alg_job.columns = new_cols_2

# Encode every algorithm column as 1 (selected) / 0 (not selected); the last
# two columns ('None', 'Other') are left out of the analysis.
for c in new_cols_2[1:-2]:
    dt_alg_job[c] = dt_alg_job[c].notnull().astype(int)

# Share of each job title that selected each algorithm: selections divided by
# the number of respondents with that job title. One vectorised division
# produces a float table directly, instead of writing float rows into the
# integer table of sums.
alg_by_job = dt_alg_job[new_cols_2[:-2]].groupby('Job')
dt_agg_alg_job = alg_by_job.agg('sum').div(alg_by_job.size(), axis=0)

dt_agg_alg_job = dt_agg_alg_job.drop(['Currently not employed'], axis = 0)

f,ax = plt.subplots(figsize=(10, 6))
ax = sns.heatmap(dt_agg_alg_job, linewidths=1, linecolor='white', annot = True, fmt='.0%', cmap = 'Blues', ax=ax)
ax.set_xlabel('ML Algorithms', fontweight = 'bold')
ax.set_ylabel('Job Title', fontweight = 'bold')
_  = plt.title("ML Algorithms Regularly Used\nBased on Job Title", fontweight = 'bold')

Data folks use Regression and Decision Trees/Random Forest the most for any job title. The next mostly used algorithms are the neural network groups, such as DNN, CNN, and RNN. For Machine Learning Engineers, CNN is the second most regularly used ML method.

### Okay, So Where Can I Be Introduced to These Stuffs?

So these methods can help you become more familiar with Data Science

In [None]:
#Q37 = Online course platform
#Q5 = Job Title

ol_job_col = [col for col in dt_nq if col.startswith('Q37') or col.startswith('Q5') ]
# `.copy()` gives an independent frame, so the renaming below does not touch dt_nq.
dt_ol_job = dt_nq[ol_job_col].copy()

#Change Col Names
new_cols_3 = ['Job', 'Coursera', 'edX', 'Kaggle Learn Courses',
              'DataCamp', 'Fast.ai', 'Udacity',
              'Udemy', 'LinkedIn Learning',
              'Cloud-certification Programs', 'University Courses', 'None', 'Other']
dt_ol_job.columns = new_cols_3

# Encode every platform column as 1 (selected) / 0 (not selected); the last
# two columns ('None', 'Other') are left out of the analysis.
for c in new_cols_3[1:-2]:
    dt_ol_job[c] = dt_ol_job[c].notnull().astype(int)

# Share of each job title that selected each platform: selections divided by
# the number of respondents with that job title. One vectorised division
# produces a float table directly, instead of writing float rows into the
# integer table of sums.
ol_by_job = dt_ol_job[new_cols_3[:-2]].groupby('Job')
dt_agg_ol_job = ol_by_job.agg('sum').div(ol_by_job.size(), axis=0)

dt_agg_ol_job = dt_agg_ol_job.drop(['Currently not employed'], axis = 0)

f,ax = plt.subplots(figsize=(10, 6))
ax = sns.heatmap(dt_agg_ol_job, linewidths=1, linecolor='white', annot = True, fmt='.0%', cmap = 'Blues', ax=ax)
ax.set_xlabel('Online Courses', fontweight = 'bold')
ax.set_ylabel('Job Title', fontweight = 'bold')
_  = plt.title("Online Courses Enrolled Based on Job Title", fontweight = 'bold')

Data folks relied on Coursera, Kaggle Learn Courses, and Udemy as their online courses to enrich themselves in Data Science.

In [None]:
#Q39 = Media sources on data science topics
#Q5 = Job Title

med_job_col = [col for col in dt_nq if col.startswith('Q39') or col.startswith('Q5') ]
# `.copy()` gives an independent frame, so the renaming below does not touch dt_nq.
dt_med_job = dt_nq[med_job_col].copy()

#Change Col Names
new_cols_4 = ['Job','Twitter', 'Email newsletter', 'Reddit', 'Kaggle',
              'Course Forums', 'YouTube', 'Podcasts',
              'Blogs', 'Journal Publications',
              'Slack Communities', 'None', 'Other']
dt_med_job.columns = new_cols_4

# Encode every media column as 1 (selected) / 0 (not selected); the last two
# columns ('None', 'Other') are left out of the analysis.
for c in new_cols_4[1:-2]:
    dt_med_job[c] = dt_med_job[c].notnull().astype(int)

# Share of each job title that selected each media source: selections divided
# by the number of respondents with that job title. One vectorised division
# produces a float table directly, instead of writing float rows into the
# integer table of sums.
med_by_job = dt_med_job[new_cols_4[:-2]].groupby('Job')
dt_agg_med_job = med_by_job.agg('sum').div(med_by_job.size(), axis=0)

dt_agg_med_job = dt_agg_med_job.drop(['Currently not employed'], axis = 0)

f,ax = plt.subplots(figsize=(10, 6))
ax = sns.heatmap(dt_agg_med_job, linewidths=1, linecolor='white', annot = True, fmt='.0%', cmap = 'Blues', ax=ax)
ax.set_xlabel('Media Sources', fontweight = 'bold')
ax.set_ylabel('Job Title', fontweight = 'bold')
_  = plt.title("Preferred Media Sources about Data Science Topic Based on Job Title", fontweight = 'bold')

Kaggle, YouTube and Blogs are the most preferred media sources on data science topics for every job title, while Journal Publications are also often used by Research Scientists.

==============================================================================