In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved 
# as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of 
# the current session

# Characterization of Unemployed Data Scientists/Machine Learning Engineers in 2020

## Data used in this notebook: Kaggle State of Machine Learning and Data Science 2020

The raw data for the survey results can be found at:

https://www.kaggle.com/c/kaggle-survey-2020/data

__Overview:__ This notebook examines in what respects the Kaggle users who self-identified as _Currently not employed_ on the survey (whom we assume to be _data scientists_, since they use Kaggle) differ from the population of data scientists who are currently employed. 

Based on the raw data set of 20,036 Kaggle survey respondents, this report focuses on the 1,652 (\~8%) that are unemployed and compares them to the 3,758 (\~19%) (employed) data scientists/ML engineers.

__Methodology:__ Since the job functions of data scientists and Machine Learning engineers usually overlap heavily, we decided to merge these two categories into a larger, more generic "data scientist/ML engineer" category. We sometimes refer to this category simply as _employed_ in this report.

#### Key Results
 - The heavy gender imbalance which characterizes data science is similarly reflected in the unemployed dataset, but with a statistically significantly higher fraction of women in the unemployed group than in the employed group. <br />
<br />
 -  The fractions of respondents in the age groups: 22-24, and 55-59 are statistically significantly higher in the unemployed sample than in the employed. Close to 50% of survey respondents worldwide age 55-59 are currently unemployed. This should be contrasted with an unemployment percentage of \~24% in the 30-39 age range. <br />
<br /> 
 - For the US, the highest unemployment rate (47%) is for respondents age 60-69, while the lowest (~20%) is for respondents in the 30-44 age range. <br />
<br />
 - Having a Doctoral or Master's degree seems to provide an edge to data scientists with respect to employment. The fraction of respondents with a Master's degree was \~36% in the unemployed group vs. \~49% in the employed group. The fraction of respondents with a PhD in the unemployed group (7.2%) was about half of that in the employed group (16.1%). However, a Bachelor's degree alone is far from a guarantee of employment as a data scientist: in the unemployed group, \~43% of survey respondents had a Bachelor's degree. <br />

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline
plt.style.use('ggplot')

# Verbosity level for diagnostic output while running the notebook:
#   0 (or False): only the headline prints
#   1 (or True):  also run the `if verbose:` diagnostic blocks
#   2:            additionally run the `if verbose >= 2:` blocks
# An integer is used because several cells test `verbose >= 2`, which a plain True can never satisfy.
verbose = 0

### Read-in the data file

Identify and save the questions in a series where the index is the question labels (Q1, Q2, etc.) and the values are the questions themselves. Create a DataFrame where the columns are the question labels and the values are the answers.

In [None]:
# Show the working directory, which the relative data path below is resolved against
cwd = os.getcwd()
print(cwd)

# Layout of the raw .csv file:
#   header row:     survey duration column followed by the question labels
#   first data row: the text of each question
survey = pd.read_csv("../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv", low_memory=False)

# Question text keyed by question label (the leading duration column is skipped)
questions = survey.iloc[0, 1:]
if verbose:
    print("survey_questions:\n{}".format(questions))

# Discard the question-text row: the remaining rows are the answers,
# with the question labels as column names
survey = survey.drop(index=0)
n_survey_resp = len(survey)  # number of Kaggle2020 survey respondents
print("# survey responses: {}".format(n_survey_resp))
if verbose >= 2:
    print(survey.drop(columns=["Time from Start to Finish (seconds)"]).head())
if verbose:
    print(survey.shape)

In [None]:
def test_diff_two_proportions(p1, p2, N1, N2, alpha=0.05, test_type='two_tailed', verbose=False):
    """ Function which computes the p-value associated with the hypothesis test of difference 
    between two proportions p1 and p2 derived from two independent samples of sizes N1, N2 respectively.
    Significance level (alpha) is 0.05 by default. Specify if the test should be: "one_tailed" or "two_tailed" 
    source: https://stattrek.com/hypothesis-test/difference-in-proportions.aspx
    """
    from scipy.stats import norm
    
    # First compute the pooled sample proportion p
    p = (p1 * N1 + p2 * N2)/(N1 + N2)
    if verbose:
        print("p: {}".format(round(p,3)))
    # Compute the standard error (SE) of the sampling distribution difference between two proportions
    SE = np.sqrt(p * (1 - p) * (1.0/N1 + 1.0/N2))
    if verbose:
        print("SE: {}".format(round(SE,3)))
    # Compute the test statistic (z-score)
    z_val = (p1 - p2)/SE
    if verbose:
        print("z: {}".format(round(z_val,3)))
    # If z < 0, find proba = Prob(z < z_val) from the Normal distribution, else find proba = Prob(z > z_val)
    # The probability that z < z_val is the area under the pdf to the left of z_val
    # This area is the integration of the pdf from -inf to z_val, i.e. the value of the CDF 
    # (Cumulative Density Function) at z_val
    if (z_val < 0.0):
        proba = norm(loc = 0.0, scale = 1).cdf(z_val)
    else:
        proba = 1.0 - norm(loc = 0.0, scale = 1).cdf(z_val)
    if test_type == 'two_tailed':
        # Compute the p-value as p_val = 2 * proba (for a 2-tailed test)
        p_val = 2 * proba
    else:
        p_val = proba
    if verbose:
        print("p_value: {}".format(round(p_val,3))) 
        if (p_val < alpha):
            print("We can reject the null hypothesis that the two sample proportions are equal")
        else:
            print("We cannot reject the null hypothesis that the two sample proportions are equal")
    return p_val

### Identify the unemployed survey respondents

Compute the fraction of survey respondents which identified themselves as _Currently not employed_ in Q5

In [None]:
if verbose:
    q5_answers = survey["Q5"]
    print(q5_answers.unique())
    print(q5_answers.nunique())
    print(q5_answers.count())

# Boolean masks over the respondents, built from the answers to Q5 and Q3
is_unemployed = survey["Q5"].eq("Currently not employed")
is_data_scientist = survey["Q5"].eq("Data Scientist")
is_ML_engineer = survey["Q5"].eq("Machine Learning Engineer")
is_US_based = survey["Q3"].eq("United States of America")
if verbose:
    print("US respondents: {}".format(is_US_based.sum()))

# Combined masks: the merged "Data Scientist or ML Engineer" group and the US-only subsets
is_DS_MLEng = is_data_scientist | is_ML_engineer
is_unemployed_USA = is_unemployed & is_US_based
is_DS_MLEng_USA = is_DS_MLEng & is_US_based

# Number of respondents in each subgroup, plus the merged DS/ML Eng. group
group_masks = {
    "Data_Scientist": is_data_scientist,
    "ML_Eng": is_ML_engineer,
    "Unemployed": is_unemployed,
    "DS_MLEng": is_DS_MLEng,
}
n_subjects_in_group = {label: mask.sum() for label, mask in group_masks.items()}

# N1 / N2 are the sample sizes reused by the significance tests in later cells
N1 = n_subjects_in_group["Unemployed"]
N2 = n_subjects_in_group["DS_MLEng"]
percent_unemployed = round(100 * N1 / n_survey_resp, 1)
percent_employed = round(100 * N2 / n_survey_resp, 1)
print("percent unemployed: {}%".format(percent_unemployed))
print("percent employed: {}%".format(percent_employed))
if verbose:
    print(n_subjects_in_group)

## Demographics of unemployed survey respondents

Create horizontal bar plots showing the gender identity and age demographics of the _Currently not employed_ Kaggle survey respondents.


### Gender identity of unemployed survey respondents

In [None]:
# Gender identity question is Q2: What is your gender?
survey_unemployed = survey[is_unemployed]
survey_employed = survey[is_DS_MLEng]
if verbose:
    print(survey_unemployed[["Q2"]].head())
# Number of respondents per gender answer in the unemployed and employed groups
count_unemployed = survey_unemployed.groupby("Q2")["Q2"].count()
count_employed = survey_employed.groupby("Q2")["Q2"].count()
if verbose:
    print("women unemployed: {}".format(count_unemployed["Woman"]))
    print("women employed: {}".format(count_employed["Woman"]))
# Fraction of women in each group.
# NOTE(review): the fractions are rounded to 3 decimals before the test, so the p-value
# below is computed from the rounded values.
p1 = round(count_unemployed["Woman"]/N1, 3)
p2 = round(count_employed["Woman"]/N2, 3)
p_val = test_diff_two_proportions(p1, p2, N1, N2, alpha=0.05, test_type='two_tailed', verbose=False)
print("Test statistical significance of difference in fractions of women in unemployed vs employed groups")
print("  p1: {}, p2: {}, N1: {}, N2: {}; p_val: {}".format(p1, p2, N1, N2, p_val))
# Gender breakdown of each group, in percent (one decimal)
gender_demog_unemployed = (100 * count_unemployed / n_subjects_in_group["Unemployed"]).round(1)
gender_demog_employed = (100 * count_employed / n_subjects_in_group["DS_MLEng"]).round(1)
# Sort by ascending value: barh draws the first entry at the bottom, so the largest
# category ends up as the top bar
gender_demog_unemployed = gender_demog_unemployed.sort_values(ascending=True)
if verbose:
    print(gender_demog_unemployed)
    print(gender_demog_employed)

# Plot the horizontal bar graph
bar_width = 0.5
idx_percent = np.linspace(0, 100, 11)
percent_labels = [str(int(idx)) + "%" for idx in idx_percent]
fig, axs = plt.subplots(figsize=(12, 4))
rects = axs.barh(gender_demog_unemployed.index, gender_demog_unemployed.values,
                 bar_width, color='b')
axs.set_xlim(0, 100)
if verbose >= 2:
    print('Current xlim: ',plt.xlim())
axs.set_xticks(idx_percent)
axs.set_xticklabels(percent_labels, rotation=0)
axs.set_ylabel("Gender identity")
# Raw string: the mathtext markup contains backslashes that are not valid Python escapes
axs.set_title(r"Gender identity of $\it{Currently\ not\ employed}$ Kaggle survey respondents")

def autolabel_barh(rects, ax=None):
    """Attach a text label to the right of each horizontal bar displaying its width.

    Parameters
    ----------
    rects : iterable of bar patches
        The bars to annotate, e.g. the container returned by ``Axes.barh``. Each bar must
        provide ``get_width()``, ``get_y()`` and ``get_height()``.
    ax : matplotlib Axes, optional
        Axes on which to draw the labels. Defaults to the axes the first bar belongs to
        (falling back to pyplot's current axes), so the function does not depend on a
        global axes variable.

    Notes
    -----
    Each label reads ``'<width>%'`` and is placed 2% of the upper x-limit to the right of
    the bar end, vertically centred on the bar. Bars whose width is NaN are not labelled.
    """
    bars = list(rects)
    if not bars:
        return
    if ax is None:
        ax = getattr(bars[0], "axes", None)
        if ax is None:
            ax = plt.gca()
    # Horizontal gap between the end of a bar and its label
    offset = (2 * ax.get_xlim()[1])/100.
    for rect in bars:
        width = rect.get_width()
        if np.isnan(width):
            # No value for this bar: there is nothing meaningful to print or position
            continue
        ax.text(width + offset, rect.get_y() + rect.get_height()/2.,
                '{}%'.format(width), ha='left', va='center')
        
# Annotate each bar of the gender plot with its percentage value
autolabel_barh(rects)


We see that even though women represent only 16.4% of the survey respondents who identified themselves as Data Scientists [1], and 15.3% of those who identified themselves either as Data Scientist or Machine Learning Engineer, a statistically significantly larger percentage (22.8%) of women can be found in the unemployed subset (p-value = 2.5e-11).

### Global (worldwide) age distribution of unemployed survey respondents

In [None]:
# Age range question is Q1: What is your age (# years)?
if verbose >= 2:
    print(survey_unemployed[["Q1"]].head())
# Percentage of each group that falls in each age range (one decimal)
age_demog_unemployed = (100 * survey_unemployed.groupby("Q1")["Q1"].count()
                        / n_subjects_in_group["Unemployed"]).round(1)
age_demog_employed = (100 * survey_employed.groupby("Q1")["Q1"].count()
                      / n_subjects_in_group["DS_MLEng"]).round(1)
# Put both series on the same age bins (a bin absent from one group counts as 0%), in
# reverse order so that bars are plotted from oldest (bottom) to youngest (top).
# The loop and the side-by-side bars below rely on the two series sharing one index.
age_bins = age_demog_unemployed.index.union(age_demog_employed.index).sort_values(ascending=False)
age_demog_unemployed = age_demog_unemployed.reindex(age_bins, fill_value=0.0)
age_demog_employed = age_demog_employed.reindex(age_bins, fill_value=0.0)
n_groups = len(age_demog_unemployed)
if verbose:
    print("num. age bins: {}".format(n_groups))
    print(age_demog_unemployed)
    print(age_demog_employed)
# Determine the age ranges for which the difference in the proportions for employed and unemployed
# are statistically significant (proportions are the rounded percentages divided by 100)
for age_range in age_demog_unemployed.index:
    p1 = age_demog_unemployed[age_range]/100
    p2 = age_demog_employed[age_range]/100
    p_val = test_diff_two_proportions(p1, p2, N1, N2, alpha=0.05, test_type='two_tailed', verbose=False)
    print("age_range: {}; p_val: {}".format(age_range, p_val))

# Plot horizontal bar graph showing distribution of age ranges for Unemployed
# and Employed survey respondents
bar_width = 0.40 # the width of the bars
idx_percent = np.linspace(0, 35, 8)
percent_labels = [str(int(idx)) + "%" for idx in idx_percent]
ind = np.arange(n_groups)  # the y locations for the groups
fig, axs = plt.subplots(figsize=(12, 8))
# The employed bar of each age range sits directly below the unemployed bar
rects1 = axs.barh(ind, age_demog_unemployed.values, bar_width, color='b')
rects2 = axs.barh(ind - bar_width, age_demog_employed.values, bar_width, color='orange')
axs.set_xlim(0, 35)
autolabel_barh(rects1)
autolabel_barh(rects2)
axs.set_xticks(idx_percent)
axs.set_xticklabels(percent_labels, rotation=0)
# Centre each tick label between the two bars of its age range
axs.set_yticks(ind - bar_width / 2)
axs.set_yticklabels(list(age_demog_unemployed.index))
axs.set_ylabel("Age range")
axs.legend(['Unemployed', 'Data Scientist/ML Eng.'])
# Raw string: the mathtext markup contains backslashes that are not valid Python escapes
axs.set_title(r"Age ranges of $\it{Currently\ not\ employed}$ Kaggle survey respondents (worldwide)")


We see from the above bar plot that the fractions of respondents in the two age groups: 22-24, and 55-59 are higher in the unemployed sample. These differences are statistically significant at the 0.05 level with p-values of 1e-13 and 6.8e-05 respectively. On the other hand, we can also see that the fractions of respondents in the age groups: 18-21, 30-34, 35-39 are lower in the unemployed sample. These differences are also statistically significant at the 0.05 level (p-values of 3.4e-05, 5.3e-06 and 6e-04 respectively).

The above bar plot and results show two things very starkly: 
 - Workers age 50-plus are highly under-represented in the Data Science/Machine Learning space (at least when looking at kaggle users)
 - Covid-19 has significantly impacted the employment of workers age 50-plus, possibly a reflection of ageism in the tech workplace [2]

In [None]:
# Age range question is Q1: What is your age (# years)?
if verbose >= 2:
    print(survey_unemployed[["Q1"]].head())
count_unemployed = survey_unemployed.groupby("Q1")["Q1"].count()
count_employed = survey_employed.groupby("Q1")["Q1"].count()
# Respondents per age range across both groups. An age range missing from one group
# counts as 0 there; a plain `+` would turn the whole bin into NaN.
count_all = count_unemployed.add(count_employed, fill_value=0)
# Share of unemployed respondents among (unemployed + employed DS/ML Eng.) per age range
percent_unemployed = 100 * count_unemployed.reindex(count_all.index, fill_value=0) / count_all
percent_unemployed = percent_unemployed.round(1)
if verbose >= 2:
    print(count_unemployed)
    print(count_employed)
    print(count_all)
    print(percent_unemployed)

# Plot horizontal bar graph showing percentages of unemployed survey respondents in the various age bins
bar_width = 0.5
idx_percent = np.linspace(0, 70, 8)
percent_labels = [str(int(idx)) + "%" for idx in idx_percent]
fig, axs = plt.subplots(figsize=(12, 8))
rects = axs.barh(percent_unemployed.index, percent_unemployed.values, bar_width, color='b')
axs.set_xlim(0, 70)
axs.set_xticks(idx_percent)
axs.set_xticklabels(percent_labels, rotation=0)
axs.set_ylabel("Age range")

# Raw string: the mathtext markup contains backslashes that are not valid Python escapes
axs.set_title(r"Percentage of $\it{Currently\ not\ employed}$ Kaggle survey respondents (worldwide) by age range")
autolabel_barh(rects)


The percentage of Kaggle survey respondents that identified themselves as _Currently not employed_ was over 20% in all age ranges. It was highest in the 55-59 age range (49%). The next highest percentages (\~40%) were in the 22-24 and 60-69 age ranges. The lowest percentages of unemployed respondents were in age ranges 18-21 (\~22%) and 30-39 (\~24%).

In [None]:
# Age range question is Q1: What is your age (# years)?
# Restrict both groups to US-based respondents
survey_unemployed_USA = survey[is_unemployed_USA]
survey_employed_USA = survey[is_DS_MLEng_USA]
count_unemployed = survey_unemployed_USA.groupby("Q1")["Q1"].count()
count_employed = survey_employed_USA.groupby("Q1")["Q1"].count()
# Respondents per age range across both groups. An age range missing from one group
# counts as 0 there; a plain `+` would turn the whole bin into NaN.
count_all = count_unemployed.add(count_employed, fill_value=0)
# Share of unemployed respondents among (unemployed + employed DS/ML Eng.) per age range
percent_unemployed = 100 * count_unemployed.reindex(count_all.index, fill_value=0) / count_all
percent_unemployed = percent_unemployed.round(1)
if verbose >= 2:
    print(count_unemployed)
    print(count_employed)
    print(count_all)
    print(percent_unemployed)

# Plot horizontal bar graph showing percentages of unemployed US-based survey respondents in the various age bins
bar_width = 0.5
idx_percent = np.linspace(0, 70, 8)
percent_labels = [str(int(idx)) + "%" for idx in idx_percent]
fig, axs = plt.subplots(figsize=(12, 8))
rects = axs.barh(percent_unemployed.index, percent_unemployed.values, bar_width, color='b')
axs.set_xlim(0, 70)
axs.set_xticks(idx_percent)
axs.set_xticklabels(percent_labels, rotation=0)
axs.set_ylabel("Age range")

# Raw string: the mathtext markup contains backslashes that are not valid Python escapes
axs.set_title(r"Percentage of $\it{Currently\ not\ employed}$ US-based Kaggle survey respondents by age range")
autolabel_barh(rects)

When we restrict the survey respondents to those that are US-based, we still see percentages of respondents that identified themselves as _Currently not employed_ of 20% or more in almost all age ranges (except 70+). The percentage of unemployed respondents was highest in the 60-69 age range (\~47%). The next highest percentages (\~40%) were for young adults (in the 22-24 and 18-21 age ranges). The lowest percentages of unemployed respondents (\~20%) were in the 30-44 age range. The unemployment rate for US-based 55-59 year olds (33.3%) is a little less dire than the global rate for that age range.

### Education level (worldwide) distribution of unemployed survey respondents

In [None]:
# Highest level of formal education question is Q4: What is the highest level of formal
# education that you have attained or plan to attain within the next 2 years?
if verbose >= 2:
    print(survey_unemployed[["Q4"]].head())
count_unemployed = survey_unemployed.groupby("Q4")["Q4"].count()
count_employed = survey_employed.groupby("Q4")["Q4"].count()
# Respondents per education level across both groups. A level missing from one group
# counts as 0 there; a plain `+` would turn the whole bin into NaN.
count_all = count_unemployed.add(count_employed, fill_value=0)
# Share of unemployed respondents among (unemployed + employed DS/ML Eng.) per education level
percent_unemployed = 100 * count_unemployed.reindex(count_all.index, fill_value=0) / count_all
percent_unemployed = percent_unemployed.round(1)
if verbose >= 2:
    print(count_unemployed)
    print(count_employed)
    print(count_all)
    print(percent_unemployed)
# Education breakdown of each group, in percent (one decimal)
edu_demog_unemployed = (100 * count_unemployed / n_subjects_in_group["Unemployed"]).round(1)
edu_demog_employed = (100 * count_employed / n_subjects_in_group["DS_MLEng"]).round(1)
# Reindex the two series according to a specific ordering; answers that are not
# listed here are left out of the comparison and of the plot
index_list = ["No formal education past high school",
              "Some college/university study without earning a bachelor’s degree",
              "Bachelor’s degree", "Master’s degree", "Doctoral degree",
              "Professional degree",
              ]
edu_demog_unemployed = edu_demog_unemployed.reindex(index = index_list)
edu_demog_employed = edu_demog_employed.reindex(index = index_list)
n_groups = len(edu_demog_unemployed)
if verbose:
    print("num. edu level bins: {}".format(n_groups))
    print(list(edu_demog_unemployed.index))
    print(type(edu_demog_unemployed))
    print(edu_demog_unemployed)
    print(edu_demog_employed)
# Determine the education levels for which the difference in the proportions for employed and
# unemployed are statistically significant (proportions are the rounded percentages divided by 100)
for edu_level in edu_demog_unemployed.index:
    p1 = edu_demog_unemployed[edu_level]/100
    p2 = edu_demog_employed[edu_level]/100
    p_val = test_diff_two_proportions(p1, p2, N1, N2, alpha=0.05, test_type='two_tailed', verbose=False)
    print("edu_level: {}; p_val: {}".format(edu_level, p_val))

# Plot horizontal bar graph showing distribution of education levels for Unemployed
# and Employed survey respondents
bar_width = 0.40 # the width of the bars
idx_percent = np.linspace(0, 60, 7)
percent_labels = [str(int(idx)) + "%" for idx in idx_percent]
ind = np.arange(n_groups)  # the y locations for the groups
fig, axs = plt.subplots(figsize=(12, 8))
# The employed bar of each education level sits directly below the unemployed bar
rects1 = axs.barh(ind, edu_demog_unemployed.values, bar_width, color='b')
rects2 = axs.barh(ind - bar_width, edu_demog_employed.values, bar_width, color='orange')
axs.set_xlim(0, 60)
autolabel_barh(rects1)
autolabel_barh(rects2)
axs.set_xticks(idx_percent)
axs.set_xticklabels(percent_labels, rotation=0)
# Centre each tick label between the two bars of its education level
axs.set_yticks(ind - bar_width / 2)
axs.set_yticklabels(list(edu_demog_unemployed.index))
axs.set_ylabel("Highest Education level")
axs.legend(['Unemployed', 'Data Scientist/ML Eng.'])
# Raw string: the mathtext markup contains backslashes that are not valid Python escapes
axs.set_title(r"Education levels of $\it{Currently\ not\ employed}$ Kaggle survey respondents (worldwide)")


We see from the above bar plot that the fractions of respondents with Master's or Doctoral degree are significantly lower in the unemployed sample (35.5%, 7.2%) than their peers in the employed group (49%, 16.1%). These differences are statistically significant at the 0.05 level with p-values of 3.8e-20 and 8.3e-19 respectively. 

On the other hand, we can also see that the fraction of respondents with only a Bachelor's degree is significantly higher in the unemployed sample (42.7% vs. 26.6%). This difference is also statistically significant at the 0.05 level (the p-value is so small that it is reported as 0, i.e. it lies below double-precision floating-point resolution, roughly 1e-16).

### References
[1] kaggle, State of Machine Learning and Data Science 2020, available at: https://www.kaggle.com/c/kaggle-survey-2020/data

[2] Ramsey L. Alwin and Jen Schramm, Coronavirus' Devastating Economic Impact on Workers Age 50-Plus, AARP, April 3, 2020 https://www.aarp.org/politics-society/advocacy/info-2020/coronavirus-economic-impact-older-workers.html