In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print them one per line in os.walk order.
input_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_path in input_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud
from matplotlib import pyplot as plt
import matplotlib as mpl
import matplotlib.ticker as ticker
from matplotlib.colors import ListedColormap
import seaborn as sns

!pip install pywaffle
from pywaffle import Waffle

%matplotlib inline

# Hex colour palettes shared by the notebook's figures.
mycolors = ["#00589B", "#00A0B0", "#CF5C78", "#F5DF4D", "#F0EEE9", "#939597"]
mycolors2 = ["#C2BFB5", "#F0A1BF", "#AFA4CE", "#C2BFB5", "#F5DF4D", "#939597"]
# One colour per survey year; indexed 0..3 for 2017..2020 by the per-year bar charts.
color_of_year = ["#88B04B", "#5F4B8B", "#ff6f61", "#0f4c81"]


# Introduction

It's been four years since the first **Kaggle Machine Learning & Data Science Survey** competition was held in 2017. How has this event evolved over the past years? A 4th anniversary is hardly a memorable milestone, and four years can hardly make up a **Chronicle** either, but it might still be interesting to dig into its ***growth***.

A survey is essentially made up of two parts: questions and answers (investigators and respondents). Hence, I will start with these two parts and then draw a conclusion at the end. I hope this notebook can surface some useful insights about this event.

# Questions: evolved

There are 39 questions listed in Kaggle Survey 2020; only 10 of them were also asked (some in different ways) in 2017~2019. The format of the questionnaire, the content, the choices and even the style of the questions differ from one year to another.

In [None]:
# Per survey year, the CSV file that the question texts are read from.
year_file = {
    "2017": "/kaggle/input/kaggle-survey-2017/schema.csv",
    "2018": "/kaggle/input/kaggle-survey-2018/multipleChoiceResponses.csv",
    "2019": "/kaggle/input/kaggle-survey-2019/multiple_choice_responses.csv",
    "2020": "/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv"
}


def _question_stem(text):
    """Return the part of a question text before its first "?" and first ":"."""
    return text.split("?")[0].split(":")[0]


def generate_question_text(year):
    """Build one space-joined string of the distinct question stems of a survey.

    Parameters
    ----------
    year : str
        Key into ``year_file`` ("2017" ... "2020").

    Returns
    -------
    str
        The de-duplicated question stems joined by single spaces. The first
        stem is skipped, exactly as in the original implementation.

    Notes
    -----
    For "2017" the questions come from the ``Question`` column of the file;
    for the other years they come from the first data row of the file.

    The original 2017 branch applied the ":" split to the raw ``Question``
    column again, which threw away the result of the "?" split. Both splits
    are now chained for every year, so 2017 questions that differ only after
    the "?" collapse into one entry like in the other years.
    """
    file = year_file[year]
    if year == "2017":
        questions = pd.read_csv(file)["Question"]
    else:
        # Only the first data row is needed: it holds the question texts.
        questions = pd.read_csv(file, nrows=1).iloc[0]

    # Missing cells carry no question text; skip them instead of failing on .split().
    stems = questions.dropna().astype(str).map(_question_stem).drop_duplicates()
    return " ".join(stems.tolist()[1:])


def test_color_func(word, font_size, position, orientation, font_path, random_state):
    """Colour callback for ``WordCloud.recolor``.

    Words listed in the module-level ``high_words`` are drawn in red. Every
    other word gets a randomly picked entry of the 'viridis' colormap,
    returned as an ``(r, g, b)`` tuple of ints. Only ``word`` is used; the
    remaining parameters are ignored.
    """
    if word in high_words:
        return 'red'
    rgba = plt.get_cmap('viridis')(np.random.randint(0, 256))
    # Drop the alpha channel and scale the 0..1 floats to 0..255 ints.
    return tuple(int(channel * 255) for channel in rgba[:3])

In [None]:
# 2x2 grid of word clouds, one per survey year, built from the question texts.
fig = plt.figure(figsize=(12, 8))
fig.suptitle('Questions over the years', fontsize=20, fontweight=500)

for i in range(4):
    year_label = f"{2017+i}"
    ax = fig.add_subplot(2, 2, i + 1)
    ax.axis('off')
    ax.set_title(year_label, y=-0.1, fontdict={"fontsize": 15})

    text = generate_question_text(year_label)
    wordcloud = WordCloud(max_font_size=40, scale=5, background_color='white')
    wordcloud.generate(text)

    # test_color_func reads this module-level list: the first three words of
    # the cloud are the ones it paints red.
    high_words = list(wordcloud.words_.keys())[:3]
    wordcloud.recolor(color_func=test_color_func)
    ax.imshow(wordcloud)

plt.tight_layout(h_pad=-5.0, w_pad=5.0)
plt.show()

From the wordcloud map generated from question texts of each year's survey, we can tell:
* Survey 2019 & 2020 pays more attention to Kagglers' habit of using tools and product for machine learning and data science.
* 2017, which is the first year of this event has most questions of "**<font color="#006600">data science</font>**", which only takes a small part of the Qs in the next 3 years.
* Survey 2017 & 2018 cares more about Kagglers' **<font color="#006600">work</font>** than 2019 and 2020.
* **<font color="#006600">Data</font>** is always a focus in the Kaggle Survey, as are Kagglers' future plans for the **<font color="#006600">following</font>** years.


The 10 questions shared by 4 years can be divided into 3 groups: <span style="background:#00589B; font-weight:bold; color:white">basic personal information</span>, <span style="background:#CF5C78; font-weight:bold; color:white">machine learning/data science experience</span>, <span style="background:#00A0B0; font-weight:bold; color:white">jobs & works</span>. The <span style="background:#c2c2c2; font-weight:bold">other 29 questions</span> of Survey 2020 are either asked in 2019 or in 2018 & 2019, but not in 2017.

In [None]:
# Waffle chart of the 2020 questions on a 4x10 grid. The tile counts follow the
# colours used in the text above the chart: 4 + 3 + 3 tiles for the three groups
# of shared questions, 29 grey tiles for the other questions, and 1 fully
# transparent tile so the counts add up to the 40 cells of the grid.
waffle_values = [4, 3, 3, 29, 1]
waffle_colors = ["#00589B", "#CF5C78", "#00A0B0", "#cdcdcd60", "#ffffff00"]

fig = plt.figure(
    FigureClass=Waffle,
    rows=4,
    columns=10,  # Either rows or columns could be omitted
    values=waffle_values,
    colors=waffle_colors,
)

# Answers: diversed

This part explores how one year's answers to the 10 common questions differ from those of another year. 

In [None]:
def _load_responses(path, **read_kwargs):
    """Read one survey response file, skipping the file's second line (``skiprows=[1]``)."""
    return pd.read_csv(path, skiprows=[1], low_memory=False, **read_kwargs)


survey_2017 = _load_responses(
    '/kaggle/input/kaggle-survey-2017/multipleChoiceResponses.csv',
    encoding='latin1')
survey_2018 = _load_responses(
    '/kaggle/input/kaggle-survey-2018/multipleChoiceResponses.csv')
survey_2019 = _load_responses(
    '/kaggle/input/kaggle-survey-2019/multiple_choice_responses.csv')
survey_2020 = _load_responses(
    '/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')

# Relabel the 2020 gender answers 'Man'/'Woman' as 'Male'/'Female', the labels
# the gender comparison looks up for every year.
survey_2020['Q2'] = survey_2020['Q2'].replace({'Man': 'Male', 'Woman': 'Female'})

## Basic information of Kagglers

### Country distribution

**Q: In which country do you currently reside?**

In [None]:
def rename_country(x):
    """Map long or variant country names to a short label; pass anything else through."""
    short_names = {
        "United Kingdom of Great Britain and Northern Ireland": "UK",
        "United Kingdom": "UK",
        "United States of America": "USA",
        "People 's Republic of China": "China",
    }
    return short_names.get(x, x)
    
    
# Column holding the country answer in each survey (2017..2020).
var_list = ['Country', 'Q3', 'Q3', 'Q3']
for i, source_column in enumerate(var_list):
    x = str(i + 2017)
    survey = globals()['survey_' + x]
    # Adds a harmonised "Country" column to the survey frame itself.
    survey["Country"] = survey[source_column].apply(rename_country)
    shares = pd.DataFrame(survey["Country"].value_counts(normalize=True, dropna=False))
    shares.columns = [x]
    # Ten largest shares excluding the 'Other' bucket, ascending so that the
    # largest bar ends up at the top of the horizontal bar chart.
    df = shares[shares.index != 'Other'].head(n=10).sort_values(by=[x])
    globals()['all' + x] = df


fig, ax = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Top 10 countries where Kagglers reside in', fontsize=20, y=1.05, fontweight=500)

legend_style = dict(loc='best',
                    bbox_to_anchor=(0.45, 1.02),
                    ncol=1,
                    borderaxespad=0,
                    frameon=True,
                    fontsize=14)

for i in range(4):
    x = str(i + 2017)
    df = globals()['all' + x]
    panel = ax[i // 2][i % 2]

    df[x].plot.barh(ax=panel,
                    color=[color_of_year[i]],
                    legend=True,
                    width=0.4)
    for side in ("bottom", "top", "right"):
        panel.spines[side].set_visible(False)
    panel.set_yticklabels(df.index, fontsize=12)
    panel.legend(**legend_style)
    panel.get_xaxis().set_visible(False)

    # j is the 1-based position of the bar, counted from the bottom of the chart.
    for j, p in enumerate(panel.patches, start=1):
        width = p.get_width()
        _, y = p.get_xy()
        # Percentage label just right of the bar end.
        panel.annotate(f'{width:.1%}', (width * 1.05, y), fontsize=12)

        if j in (9, 10):
            p.set_hatch("//")

        # Hand-picked highlights per year.
        if (x in ("2018", "2019") and j == 1) or (x == "2020" and j in [1, 4]):
            p.set_color('darkred')
        if (x, j) in (("2017", 1), ("2018", 3), ("2019", 2)):
            p.set_alpha(0.2)
        if x == "2019" and j == 1:
            p.set_alpha(0.6)

plt.tight_layout(pad=2.0)
plt.show()

The chart above shows how the top 10 countries change over the last four years.
* 40% of the respondents are concentrated in just the top 2 countries - India and USA.
* The top 10 list changes every year, with one or two countries moving up/down.

### Gender & Age & Education
1. **Q: What is your gender?**
2. **Q: What is your age (# years)?**
3. **Q: What is the highest level of formal education that you have attained or plan to attain within the next 2 years?**


In [None]:
# Column holding the gender answer in each survey (2017..2020).
var_list = ['GenderSelect', 'Q1', 'Q2', 'Q2']

# Collect one record per year and build the frame once: DataFrame.append was
# removed in pandas 2.0, and the survey frames are looked up through globals()
# (as the other cells do) instead of eval().
gender_rows = []
for i in range(4):
    x = str(i + 2017)
    # Counts per answer, missing answers included, kept as a Series so the
    # lookups do not depend on how value_counts names its column.
    g_cnts = globals()['survey_' + x][var_list[i]].value_counts(dropna=False)
    # "Male" and "Female" both contain 'ale'; every other answer, including
    # missing ones (na=False), counts as "Others".
    is_male_or_female = g_cnts.index.str.contains('ale', na=False)
    gender_rows.append({
        'year': 2017 + i,
        'Overall': int(g_cnts.sum()),
        'Male': int(g_cnts.get("Male", 0)),
        'Female': int(g_cnts.get("Female", 0)),
        'Others': int(g_cnts[~is_male_or_female].sum()),
    })

g_df = pd.DataFrame(gender_rows,
                    columns=["year", "Overall", "Male", "Female", "Others"])
g_df["F_percentage"] = g_df['Female'] / g_df['Overall']

In [None]:
def get_age_group(x):
    """Collapse an age-bracket label into the buckets used for the comparison.

    "18-21" becomes "21-"; "60-69", "70-79", "70+" and "80+" become "60+";
    any other value is returned unchanged.

    "70+" was missing from the original list, so respondents carrying that
    label were left out of every plotted group.
    """
    if x == "18-21":
        return "21-"
    if x in ("60-69", "70-79", "70+", "80+"):
        return "60+"
    return x


def get_age_group_2017(x):
    """Bucket a numeric age into the bracket labels used for the other years.

    Ages below 22 map to "21-", ages of 60 and above to "60+", and everything
    in between to "<lower>-<upper>" brackets bounded by ``age_list``.

    A missing age (NaN) is returned unchanged. The original loop fell through
    every comparison for NaN and produced the meaningless label "60-21".
    """
    if pd.isna(x):
        return x
    if x >= 60:
        return "60+"
    if x < 22:
        return "21-"

    age_list = [22, 25, 30, 35, 40, 45, 50, 55, 60]
    # x is now within [22, 60): return the first bracket whose upper bound exceeds it.
    for lower, upper in zip(age_list, age_list[1:]):
        if x < upper:
            return f"{lower}-{upper - 1}"

In [None]:
# Column holding the age answer in each survey (2017..2020).
var_list = ['Age', 'Q2', 'Q1', 'Q1']
age_groups = [
    "21-", "22-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54",
    "55-59", "60+"
]

# One record per year, assembled into a frame once at the end
# (DataFrame.append was removed in pandas 2.0).
age_rows = []
for i in range(4):
    x = str(i + 2017)
    df = globals()[f"survey_{x}"]
    # 2017 is bucketed with get_age_group_2017, the later years with get_age_group.
    to_group = get_age_group_2017 if x == "2017" else get_age_group
    df["AgeGroup"] = df[var_list[i]].apply(to_group)

    # Kept as a Series so lookups do not depend on value_counts' column name.
    age_cnts = df["AgeGroup"].value_counts(dropna=False)

    new_row = {'year': 2017 + i, 'Overall': int(age_cnts.sum())}
    for gp in age_groups:
        # A group nobody selected counts as 0 instead of raising KeyError.
        new_row[gp] = int(age_cnts.get(gp, 0))
    age_rows.append(new_row)

age_df = pd.DataFrame(age_rows, columns=["year"] + age_groups + ["Overall"])

age_df["Youth_percentage"] = (age_df['21-'] + age_df['22-24'] + age_df['25-29'])/age_df['Overall']
age_df["Elder_percentage"] = (age_df['60+'])/age_df['Overall']

In [None]:
def get_education_level(x):
    """Map a free-form education answer to a short level label.

    The answer is converted to ``str`` and tested against the keywords below
    in order; the first keyword found decides the label. Answers matching no
    keyword (including missing values) yield the string "NaN".
    """
    answer = str(x)
    keyword_levels = (
        ("Master", "Master’s"),
        ("Bachelor", "Bachelor’s"),
        ("Doctoral", "Doctoral"),
        ("without", "College Dropout"),
        ("Professional", "Professional"),
        ("high school", "High School"),
    )
    for keyword, level in keyword_levels:
        if keyword in answer:
            return level
    return "NaN"

In [None]:
# Column holding the education answer in each survey (2017..2020).
var_list = ['FormalEducation', 'Q4', 'Q4', 'Q4']

edu_groups = [
    "Master’s", "Bachelor’s", "Doctoral", "NaN", "College Dropout",
    "Professional", "High School"
]

# One record per year, assembled into a frame once at the end
# (DataFrame.append was removed in pandas 2.0).
edu_rows = []
for i in range(4):
    x = str(i + 2017)
    df = globals()[f"survey_{x}"]
    df["EducationLevel"] = df[var_list[i]].apply(get_education_level)

    # Kept as a Series so lookups do not depend on value_counts' column name.
    edu_cnts = df["EducationLevel"].value_counts(dropna=False)

    new_row = {'year': 2017 + i, 'Overall': int(edu_cnts.sum())}
    for gp in edu_groups:
        # A level nobody selected counts as 0 instead of raising KeyError.
        new_row[gp] = int(edu_cnts.get(gp, 0))
    edu_rows.append(new_row)

edu_df = pd.DataFrame(edu_rows, columns=["year"] + edu_groups + ["Overall"])

edu_df["HigherEducation_percentage"] = (edu_df['Master’s'] + edu_df['Bachelor’s'] +
                              edu_df['Doctoral']) / edu_df['Overall']

In [None]:
# Three panels (gender, age, education): stacked bars of the yearly counts on
# the left axis, dashed line(s) of the derived share(s) on a twin right axis.
fig, ax = plt.subplots(1, 3, figsize=(18, 5))
fig.suptitle('Gender & Age & Education distributions',
             fontsize=30,
             y=1.15,
             fontweight=500)

dfs = [g_df, age_df, edu_df]
ylabels = ["gender distribution", "age distribution", "education distribution"]
ylabels2 = [
    "female participation (%)", "youth&elderly participation (%)",
    "bachelor+ participation (%)"
]
y_groups = [["Male", "Female", "Others"], age_groups, edu_groups]
line_column = ["F_percentage", ["Youth_percentage", "Elder_percentage"], "HigherEducation_percentage"]
colors = [["#00A0B0", "#CF5C78", "#939597"],
          sns.color_palette("bwr", n_colors=12),
          sns.color_palette("coolwarm")]
line_colors = ['#CF5C78', 'crimson', 'chocolate']

for i, df in enumerate(dfs):
    stacked_colors = ListedColormap(colors[i])
    ax1 = df.plot(x="year",
                  ax=ax[i],
                  y=y_groups[i],
                  kind='bar',
                  rot=0,
                  stacked=True,
                  colormap=stacked_colors)
    ax1.set_ylabel(ylabels[i], fontsize='15')
    ax1.xaxis.label.set_visible(False)

    # The line is drawn against the frame's row index (not the year values) so
    # that it lines up with the bar positions.
    ax2 = ax1.twinx()
    ax2.plot(df.index,
             df[line_column[i]],
             color=line_colors[i],
             linestyle='--',
             marker='o')
    ax2.set_ylabel(ylabels2[i], fontsize='15')
    # Shares are fractions of 1; show them as whole percentages.
    ax2.yaxis.set_major_formatter(ticker.PercentFormatter(decimals=0, xmax=1))

plt.tight_layout(pad=2.0)
plt.show()

The three subcharts above share the same background: the total number of respondents reaches its high point in 2018 and then falls by 3,000~4,000 in 2019. The number of answers in the latest two years is almost the same. Inside each subchart:
* **Gender**: **<font color="#00A0B0">Male</font>** respondents have always been the majority of the Kaggle community, while the proportion of **<font color="#CF5C78">female</font>** respondents has increased from about 16% to nearly 20%.
* **Age**: Half of Kaggle users are **<font color="#225df8">youth</font>** (young people under 30 years old). The proportion of youth hasn't changed much over the years.
* **Education**: Most Kaggle users have successfully completed higher education (**<font color="#3298e9">bachelor/master/doctor</font>** degrees). Meanwhile, this proportion has decreased in the most recent two years, which means more and more people without a higher degree are joining the community. **Such a trend encourages the popularization of AI technologies.**


## Machine Learning & Data Science experience

### ML experience
**Q: For how many years have you used machine learning methods?**

In [None]:
def rename_experience(x):
    """Harmonise the machine-learning experience answers across survey years.

    The "under one year" variants become "0-1 year", the "never studied / do
    not use" answers become "None experience", "20 or more years" becomes
    "20+ years"; every other value is returned unchanged.
    """
    under_one_year = ("< 1 year", "< 1 years", "Under 1 year")
    no_experience = (
        "I have never studied machine learning but plan to learn in the future",
        "I have never studied machine learning and I do not plan to",
        "I do not use machine learning methods",
    )
    if x in under_one_year:
        return "0-1 year"
    if x in no_experience:
        return "None experience"
    return "20+ years" if x == "20 or more years" else x
    
def add_column_for_sort(x):
    """Return a numeric sort key for an experience label.

    "None experience" sorts as 99 and a missing value (anything whose ``str``
    is "nan") as 100; every other label sorts by the integer formed by its
    leading digits, e.g. "5-10 years" -> 5. A label without a leading digit
    raises ``ValueError``.

    NOTE(review): a second function with this same name is defined further
    down in the notebook and replaces this one once that cell has run.
    """
    if x == "None experience":
        return 99
    if str(x) == "nan":
        return 100

    # Length of the leading run of digits (the whole label if it is all digits).
    prefix_len = next(
        (pos for pos, ch in enumerate(x) if not ch.isdigit()), len(x))
    return int(x[:prefix_len])


# Column holding the experience answer in each survey (2017..2020).
var_list = ['LearningDataScienceTime', 'Q25', 'Q23', 'Q15']
for i, source_column in enumerate(var_list):
    x = str(i + 2017)
    survey = globals()['survey_' + x]
    # Adds a harmonised "MLExperience" column to the survey frame itself.
    survey["MLExperience"] = survey[source_column].apply(rename_experience)
    df = pd.DataFrame(survey["MLExperience"].value_counts(normalize=True, dropna=False))
    df.columns = [x]
    # Numeric helper column so the labels sort by duration rather than by share.
    df["order"] = [add_column_for_sort(label) for label in df.index]
    df = df.sort_values(by=["order"], ascending=False)
    globals()['MLexp_' + x] = df


fig, ax = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Machine learning experience', fontsize=20, y=1.05, fontweight=500)

legend_style = dict(loc='best',
                    bbox_to_anchor=(0.45, 1.02),
                    ncol=1,
                    borderaxespad=0,
                    frameon=True,
                    fontsize=14)

for i in range(4):
    x = str(i + 2017)
    df = globals()['MLexp_' + x]
    panel = ax[i // 2][i % 2]

    df[x].plot.barh(ax=panel,
                    color=[color_of_year[i]],
                    legend=True,
                    width=0.4)
    for side in ("bottom", "top", "right"):
        panel.spines[side].set_visible(False)
    panel.set_yticklabels(df.index, fontsize=12)
    panel.legend(**legend_style)
    panel.get_xaxis().set_visible(False)

    # j is the 0-based position of the bar, counted from the bottom of the chart.
    for j, p in enumerate(panel.patches):
        width = p.get_width()
        _, y = p.get_xy()
        # Percentage label just right of the bar end.
        panel.annotate(f'{width:.1%}',
                       (width + 0.008, y + 0.1),
                       fontsize=12)
        if j == 0:
            # Fade the bottom bar (the entry with the largest sort key).
            p.set_alpha(0.1)

plt.tight_layout(pad=1.0)
plt.show()

Although the choices offered for this question vary across the four years, those who have "0-1 year" of experience in machine learning and data science still make up the largest part of all respondents. Seen over time, this means **a large number of ML newbies enter Kaggle land every year**.

### Recommended programming language
**Q: What programming language would you recommend an aspiring data scientist to learn first?**

In [None]:
def rename_pl(x):
    """Group "C", "C++" and "C#" under one label; pass any other value through."""
    return "C/C++/C#" if x in ("C++", "C", "C#") else x
    

# Column holding the recommended-language answer in each survey (2017..2020).
var_list = ['LanguageRecommendationSelect', 'Q18', 'Q19', 'Q8']
# NOTE(review): this binds the pandas module itself to RecPL_df and nothing in
# this cell reads it - looks like an unfinished statement; confirm before removing.
RecPL_df = pd
for i, source_column in enumerate(var_list):
    x = str(i + 2017)
    survey = globals()['survey_' + x]
    # Adds a harmonised "RecommendedPL" column to the survey frame itself.
    survey["RecommendedPL"] = survey[source_column].apply(rename_pl)
    df = pd.DataFrame(survey["RecommendedPL"].value_counts(normalize=True, dropna=True))
    df.columns = [x]
    df = df.sort_values(by=[x])
    df["pp"] = df[x]
    # Ascending order, so the tail holds the five largest shares.
    globals()['RecPL_' + x] = df.tail(5)


fig, ax = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Top 5 recommended programming language', fontsize=20, y=1.05, fontweight=500)

legend_style = dict(loc='best',
                    bbox_to_anchor=(0.45, 1.02),
                    ncol=1,
                    borderaxespad=0,
                    frameon=True,
                    fontsize=14)

for i in range(4):
    x = str(i + 2017)
    df = globals()['RecPL_' + x]
    panel = ax[i // 2][i % 2]

    df[x].plot.barh(ax=panel,
                    color=[color_of_year[i]],
                    legend=True,
                    width=0.4)
    for side in ("bottom", "top", "right"):
        panel.spines[side].set_visible(False)
    panel.set_yticklabels(df.index, fontsize=12)
    panel.legend(**legend_style)
    panel.get_xaxis().set_visible(False)

    j = 0
    for p in panel.patches:
        width = p.get_width()
        _, y = p.get_xy()
        # Percentage label just right of the bar end.
        panel.annotate(f'{width:.1%}',
                       (width + 0.008, y + 0.1),
                       fontsize=12)

plt.tight_layout(pad=1.0)
plt.show()

* Python has always topped the list and it has only gained in popularity over the years.
* The top 5 recommended programming languages have always been the same 5 languages, so has the order of them: Python, R, SQL, C/C++/C# and Matlab.

### Learning platforms
**Q: On which platforms have you begun or completed data science courses? (Select all that apply)**

In [None]:
def rename_platforms(x):
    """Shorten / unify learning-platform labels; unknown values pass through unchanged."""
    short_labels = {
        "Online University Courses": "University Courses",
        "University Courses (resulting in a university degree)": "University Courses",
        "Kaggle Courses (i.e. Kaggle Learn)": "Kaggle",
        "Kaggle Learn": "Kaggle",
        "Kaggle Learn Courses": "Kaggle",
        "LinkedIn Learning": "LinkedIn",
        "Cloud-certification programs (direct from AWS, Azure, GCP, or similar)": "Cloud programs",
    }
    return short_labels.get(x, x)
    
    
def count_platforms_2017(df, year):
    """Share of mentions per platform for a column of comma-separated answers.

    ``df`` is the answer column: each non-missing entry is split on "," and
    every resulting platform is credited with the number of respondents who
    gave that exact answer. Returns a one-column frame (column ``year``,
    index = platform) whose values are normalised to sum to 1 and sorted
    ascending.
    """
    mentions = {}
    for answer, n_respondents in df.value_counts().items():
        for platform in answer.split(','):
            mentions[platform] = mentions.get(platform, 0) + n_respondents

    shares = pd.DataFrame.from_dict(mentions, orient='index', columns=[year])
    shares[year] = shares[year] / shares[year].sum()
    return shares.sort_values(by=[year], ascending=True)


def count_platforms(df, year):
    """Share of selections per platform for a block of multiple-choice columns.

    ``describe()`` is used to get, per column, the number of non-missing
    answers ("count") and the most frequent value ("top"); "top" is renamed
    with ``rename_platforms`` and becomes the index. The counts are sorted
    ascending, normalised to sum to 1 and returned under the column name
    ``year`` (the other ``describe`` columns are kept).
    """
    summary = df.describe().T
    summary["top"] = summary["top"].apply(rename_platforms)
    summary = summary.set_index(keys=["top"], drop=True)
    summary = summary.sort_values(by=["count"], ascending=True)
    summary["count"] = summary["count"] / summary["count"].sum()
    return summary.rename(columns={"count": year})


# Per survey year: the single 2017 column of comma-separated answers, then the
# blocks of one-column-per-choice answers for 2018, 2019 and 2020.
var_list = [
    'CoursePlatformSelect',
    [
        'Q36_Part_1', 'Q36_Part_2', 'Q36_Part_3', 'Q36_Part_4',
        'Q36_Part_5', 'Q36_Part_6', 'Q36_Part_7', 'Q36_Part_8',
        'Q36_Part_9', 'Q36_Part_10', 'Q36_Part_11', 'Q36_Part_12',
        'Q36_Part_13',
    ],
    [
        'Q13_Part_1', 'Q13_Part_2', 'Q13_Part_3', 'Q13_Part_4',
        'Q13_Part_5', 'Q13_Part_6', 'Q13_Part_7', 'Q13_Part_8',
        'Q13_Part_9', 'Q13_Part_10', 'Q13_Part_11', 'Q13_Part_12',
    ],
    [
        'Q37_Part_1', 'Q37_Part_2', 'Q37_Part_3', 'Q37_Part_4',
        'Q37_Part_5', 'Q37_Part_6', 'Q37_Part_7', 'Q37_Part_8',
        'Q37_Part_9', 'Q37_Part_10', 'Q37_Part_11', 'Q37_OTHER',
    ],
]

for i, source_columns in enumerate(var_list):
    x = str(i + 2017)
    survey = globals()['survey_' + x]
    # 2017 stores all selections in one comma-separated column; the later
    # years use one column per choice.
    counter = count_platforms_2017 if x == "2017" else count_platforms
    df = counter(survey[source_columns], x)
    df.index.name = ""
    globals()['pltfm_' + x] = df


fig, ax = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Platforms where Kagglers favor', fontsize=20, y=1.05, fontweight=500)

legend_style = dict(loc='best',
                    bbox_to_anchor=(0.45, 1.02),
                    ncol=1,
                    borderaxespad=0,
                    frameon=True,
                    fontsize=14)

for i in range(4):
    x = str(i + 2017)
    df = globals()['pltfm_' + x]
    panel = ax[i // 2][i % 2]

    df[x].plot.barh(ax=panel,
                    color=[color_of_year[i]],
                    legend=True,
                    width=0.4)
    for side in ("bottom", "top", "right"):
        panel.spines[side].set_visible(False)
    panel.set_yticklabels(df.index, fontsize=12)
    panel.legend(**legend_style)
    panel.get_xaxis().set_visible(False)

    length = len(panel.patches)
    # j is the 1-based position of the bar, counted from the bottom of the chart.
    for j, p in enumerate(panel.patches, start=1):
        width = p.get_width()
        _, y = p.get_xy()
        # Percentage label just right of the bar end.
        panel.annotate(f'{width:.1%}', (width * 1.05, y), fontsize=12)

        if x == "2019" and j == 9:
            p.set_color("maroon")

        # Hatch the five topmost (largest) bars for every year except 2017.
        if x != "2017" and j > length - 5:
            p.set_hatch("//")

        if x == "2018" and j == 9:
            p.set_alpha(0.2)

plt.tight_layout(pad=1.0)
plt.show()

* In 2017 only four specific platforms were listed as options; among them, over 40% of Kagglers began or completed their data science journey on "Coursera".
* "Coursera" has taken the lead over the years.
* The top 5 platforms remain the same in the last two years after "University Courses" replaced "Udacity" in 2019.

## Jobs & Works

### Jobs
**Q: Select the title most similar to your current role (or most recent title if retired)**

In [None]:
def rename_job(x):
    """Unify / shorten job-title labels across survey years; other values pass through."""
    short_titles = {
        'Researcher': 'Research Scientist',
        'Scientist/Researcher': 'Research Scientist',
        'Currently not employed': "Unemployed",
        'Software Developer/Software Engineer': 'Software Engineer',
        'Operations Research Practitioner': 'ORP',
        'Machine Learning Engineer': 'ML Engineer',
    }
    return short_titles.get(x, x)


# Column holding the job-title answer in each survey (2017..2020).
var_list = ['CurrentJobTitleSelect', 'Q6', 'Q5', 'Q5']

for i, source_column in enumerate(var_list):
    x = str(i + 2017)
    survey = globals()['survey_' + x]
    # Adds a harmonised "Job" column to the survey frame itself.
    survey["Job"] = survey[source_column].apply(rename_job)
    df = pd.DataFrame(survey["Job"].value_counts(normalize=True, dropna=True))
    df.columns = [x]
    df = df.sort_values(by=[x])
    globals()['Jobs_' + x] = df


fig, ax = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Current role in workplace', fontsize=20, y=1.05, fontweight=500)

legend_style = dict(loc='best',
                    bbox_to_anchor=(0.45, 1.02),
                    ncol=1,
                    borderaxespad=0,
                    frameon=True,
                    fontsize=14)

for i in range(4):
    x = str(i + 2017)
    df = globals()['Jobs_' + x]
    panel = ax[i // 2][i % 2]

    df[x].plot.barh(ax=panel,
                    color=[color_of_year[i]],
                    legend=True,
                    width=0.4)
    for side in ("bottom", "top", "right"):
        panel.spines[side].set_visible(False)
    panel.set_yticklabels(df.index, fontsize=12)
    panel.legend(**legend_style)
    panel.get_xaxis().set_visible(False)

    length = len(panel.patches)
    # j is the 1-based position of the bar, counted from the bottom of the chart.
    for j, p in enumerate(panel.patches, start=1):
        width = p.get_width()
        _, y = p.get_xy()
        # Percentage label just right of the bar end.
        panel.annotate(f'{width:.1%}',
                       (width + 0.008, y + 0.1),
                       fontsize=12)

        # Hatch the five topmost (largest) bars.
        if j > length - 5:
            p.set_hatch("\\")

        # Hand-picked bars to fade, per year.
        faded = ((x == "2017" and j == length - 3)
                 or (x == "2018" and j in [length - 4, length - 6])
                 or (x == "2019" and j in [6, 9])
                 or (x == "2020" and j in [9, 10]))
        if faded:
            p.set_alpha(0.2)

plt.tight_layout(pad=1.0)
plt.show()

* **Data Scientist** and **Software Engineer** remain in top 5 list over the years.
* **Student** rushes to the top once it is listed as an option in Survey 2018 and remains top 3 in the following years.
* The proportion of **Unemployed** or **Not employed** keeps rising over the years, which corresponds to the big part of the "Student" selection in job questions. This also implies the data science technologies are more accessible and more friendly to the non-professionals.

### Daily task
**Q: Select any activities that make up an important part of your role at work: (Select all that apply)**

In [None]:
def rename_tasks(x):
    """Shorten a work-activity answer to the label used on the chart.

    The value is converted to ``str`` first. The data-analysis answer is
    matched exactly; the remaining labels are picked by the first keyword
    found in the text, in the order listed. Unmatched text is returned as is.
    """
    text = str(x)
    if text == "Analyze and understand data to influence product or business decisions":
        return "Data analysis"

    keyword_labels = (
        ("prototypes", "Prototypes"),
        ("machine learning service", "ML service"),
        ("data infrastructure", "Data infrastructure"),
        ("esearch", "Research"),
        ("ML models", "ML models"),
        ("None", "None"),
    )
    for keyword, label in keyword_labels:
        if keyword in text:
            return label
    return text


def count_tasks_2017(df, year):
    """Share of respondents per answer for a single answer column.

    Returns a one-column frame (column ``year``, index = ``str`` of each
    distinct answer) with the normalised frequencies, rescaled to sum to 1
    and sorted ascending.
    """
    fractions = {
        str(answer): share
        for answer, share in df.value_counts(normalize=True).items()
    }
    table = pd.DataFrame.from_dict(fractions, orient='index', columns=[year])
    table[year] = table[year] / table[year].sum()
    return table.sort_values(by=[year], ascending=True)


def count_tasks(df, year):
    """Share of selections per activity for a block of multiple-choice columns.

    ``describe()`` yields, per column, the number of non-missing answers
    ("count") and the most frequent value ("top"); "top" becomes the index.
    The counts are sorted ascending, normalised to sum to 1 and returned
    under the column name ``year`` (the other ``describe`` columns are kept).
    """
    summary = df.describe().T
    summary = summary.set_index(keys=["top"], drop=True)
    summary = summary.sort_values(by=["count"], ascending=True)
    summary["count"] = summary["count"] / summary["count"].sum()
    return summary.rename(columns={"count": year})


# Per survey year: the single 2017 answer column, then the blocks of
# one-column-per-choice answers for 2018, 2019 and 2020.
var_list = [
    'JobFunctionSelect',
    [
        'Q11_Part_1', 'Q11_Part_2', 'Q11_Part_3', 'Q11_Part_4',
        'Q11_Part_5', 'Q11_Part_6', 'Q11_Part_7',
    ],
    [
        'Q9_Part_1', 'Q9_Part_2', 'Q9_Part_3', 'Q9_Part_4',
        'Q9_Part_5', 'Q9_Part_6', 'Q9_Part_7', 'Q9_Part_8',
    ],
    [
        'Q23_Part_1', 'Q23_Part_2', 'Q23_Part_3', 'Q23_Part_4',
        'Q23_Part_5', 'Q23_Part_6', 'Q23_Part_7', 'Q23_OTHER',
    ],
]

for i, source_columns in enumerate(var_list):
    x = str(i + 2017)
    survey = globals()['survey_' + x]
    counter = count_tasks_2017 if x == "2017" else count_tasks
    df = counter(survey[source_columns], x)
    df.index.name = ""
    globals()['tasks_' + x] = df


fig, ax = plt.subplots(2, 2, figsize=(12, 10))
fig.suptitle('Daily task in workplace', fontsize=20, y=1.05, fontweight=500)

legend_style = dict(loc='best',
                    bbox_to_anchor=(0.45, 1.02),
                    ncol=1,
                    borderaxespad=0,
                    frameon=True,
                    fontsize=14)

for i in range(4):
    x = str(i + 2017)
    df = globals()['tasks_' + x]
    panel = ax[i // 2][i % 2]

    df[x].plot.barh(ax=panel,
                    color=[color_of_year[i]],
                    legend=True,
                    width=0.4)
    # The full answer texts are long; the tick labels use the short names.
    ylabels = [rename_tasks(answer) for answer in df.index]
    for side in ("bottom", "top", "right"):
        panel.spines[side].set_visible(False)
    panel.set_yticklabels(ylabels, fontsize=12)
    panel.legend(**legend_style)
    panel.get_xaxis().set_visible(False)

    length = len(panel.patches)
    # j is the 1-based position of the bar, counted from the bottom of the chart.
    for j, p in enumerate(panel.patches, start=1):
        width = p.get_width()
        _, y = p.get_xy()
        # Percentage label just right of the bar end.
        panel.annotate(f'{width:.1%}',
                       (width + 0.008, y + 0.1),
                       fontsize=12)

        # Fade the bottom bar for 2017 and the two bottom bars for later years.
        if (x == "2017" and j == 1) or (x in ["2018", "2019", "2020"] and j in [1, 2]):
            p.set_alpha(0.2)

        # Hatch the topmost (largest) bar.
        if j == length:
            p.set_hatch("//")

plt.tight_layout(pad=1.0)
# Legend for the shortened tick labels, placed below the figure.
footnote = (
    "*Details: \n  Data analysis: Analyze and understand data to influence product or business decisions\n"
    "  Data infrastructure: Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data\n"
    "  Prototypes: Build prototypes to explore applying machine learning to new areas\n"
    "  ML service: Build and/or run a machine learning service that operationally improves my product or workflows\n"
    "  ML models: Experimentation and iteration to improve existing ML models\n"
    "  Research: Do research that advances the state of the art of machine learning\n"
    "  None: None of these activities are an important part of my role at work"
)
plt.annotate(footnote, xy=(-1.6, -0.3), xycoords='axes fraction')
plt.show()

* **Data analysis** has consistently topped the list over the years while **Research** has been the bottom.
* **Prototypes** stands still in its second place.
* There is not much difference among the rest of the tasks.

### Compensation
**Q: What is your current yearly compensation (approximate $USD)?**

In [None]:
# Ascending bracket boundaries (in USD) used by calculate_salary to turn a
# converted amount into a "<lower>-<upper>" label.
salary_list = [
    0, 1000, 2000, 3000, 4000, 5000, 7500, 10000, 15000, 20000, 25000, 30000,
    40000, 50000, 60000, 70000, 80000, 90000, 100000, 125000, 150000, 200000,
    250000, 300000, 500000
]
# Conversion table shipped with the 2017 survey; calculate_salary reads its
# "originCountry" and "exchangeRate" columns.
rate = pd.read_csv("/kaggle/input/kaggle-survey-2017/conversionRates.csv")


def calculate_salary(x, rates=None, brackets=None):
    """Map a 2017 "<amount>_<currency>" answer to a salary-bracket label.

    Parameters
    ----------
    x : object
        Value whose string form is "<amount>_<currency code>", e.g.
        "1,500_USD". Thousands separators (",") in the amount are ignored.
    rates : pandas.DataFrame, optional
        Table with "originCountry" and "exchangeRate" columns. Defaults to
        the module-level ``rate`` table.
    brackets : sequence of numbers, optional
        Ascending bracket boundaries. Defaults to the module-level
        ``salary_list``.

    Returns
    -------
    str
        * "<lower>-<upper - 1>" for the bracket containing the converted value
          (values below the second boundary land in the first bracket),
        * "<top>+" when the converted value reaches the last boundary,
        * "nan" when the answer cannot be parsed, the amount is negative, the
          currency does not match exactly one row of ``rates``, or the
          converted value is not a number.
    """
    if rates is None:
        rates = rate
    if brackets is None:
        brackets = salary_list

    try:
        # Unpacking raises ValueError unless there is exactly one "_".
        amount_text, cntry = str(x).split("_")
        amount = float(amount_text.replace(",", ""))
    except ValueError:
        return "nan"
    if amount < 0:
        return "nan"

    matches = rates.loc[rates["originCountry"] == cntry, "exchangeRate"]
    # Zero or several matching rows are ambiguous; treat the answer as unusable.
    if len(matches) != 1:
        return "nan"
    try:
        salary = amount * float(matches.iloc[0])
    except (TypeError, ValueError):
        return "nan"
    # NaN compares False with everything and would otherwise slip through
    # the bracket search and yield a nonsensical label.
    if np.isnan(salary):
        return "nan"

    top = brackets[-1]
    if salary >= top:
        return f"{top}+"
    for lower, upper in zip(brackets, brackets[1:]):
        if salary < upper:
            return f"{lower}-{upper - 1}"
    return "nan"


def add_column_for_sort(x):
    """Return an integer sort key taken from the first run of digits in a label.

    Parameters
    ----------
    x : object
        Bracket label such as "0-999", "10,000-14,999" or "500000+". It is
        converted with ``str`` before scanning.

    Returns
    -------
    int
        The first contiguous run of ASCII digits as an integer (scanning stops
        at the first non-digit after it, so "10,000-14,999" gives 10), or -1
        for "nan" and for labels that contain no digit at all.

    Notes
    -----
    Labels made only of digits previously returned ``None`` and labels that
    did not start with a digit (e.g. "$0-999") raised ``ValueError``; both now
    yield a usable key.
    """
    label = str(x)
    if label == "nan":
        return -1

    digits = ""
    for ch in label:
        if "0" <= ch <= "9":
            digits += ch
        elif digits:
            # First non-digit after the leading number: the key is complete.
            break
    return int(digits) if digits else -1


def count_salary_2017(df, year):
    """Count 2017 respondents per salary bracket.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'CompensationAmount' and 'CompensationCurrency' columns.
        It is not modified.
    year : str
        Name given to the resulting count column.

    Returns
    -------
    pandas.DataFrame
        Indexed by bracket label, with a ``year`` column holding the counts and
        an "order" column holding the numeric sort key; rows are sorted by
        "order" and the "nan" bucket (unusable answers) is removed.
    """
    # Build "<amount>_<currency>" strings without writing back into the caller's frame.
    salary_text = (df['CompensationAmount'].astype(str) + "_" +
                   df['CompensationCurrency'].astype(str))
    buckets = salary_text.apply(calculate_salary)

    # Name the column explicitly: pandas >= 2.0 would otherwise call it "count".
    counts = buckets.value_counts().to_frame(name=year)
    counts["order"] = list(map(add_column_for_sort, counts.index))
    counts = counts.sort_values(by=["order"])
    # errors="ignore": there is no "nan" row when every answer could be converted.
    return counts.drop(index="nan", errors="ignore")


# Survey column(s) holding the compensation answer, one entry per survey year
# starting with 2017.
var_list = [['CompensationAmount', 'CompensationCurrency'], 'Q9', 'Q10', 'Q24']

# Build one bracket-count table per year and publish it as `salary_<year>`.
for i, columns in enumerate(var_list):
    x = str(i + 2017)
    survey = globals()['survey_' + x]
    if x == "2017":
        # 2017 stores a free-form amount plus a currency; bin it ourselves.
        df = count_salary_2017(survey[columns].dropna(), x)
    else:
        # Later years already store a bracket label per respondent.
        # Name the column explicitly (pandas >= 2.0 would call it "count").
        df = survey[columns].value_counts(dropna=True).to_frame(name=x)
        if x == "2018":
            df = df.drop(
                index="I do not wish to disclose my approximate yearly compensation")
        else:
            # 2019 / 2020: the lowest and the open-ended top bracket are left out
            # of the chart. (A rename of these two labels used to follow the drop;
            # it could never match anything and has been removed.)
            df = df.drop(index=["$0-999", "> $500,000"])
        df["order"] = list(map(add_column_for_sort, df.index))
        df = df.sort_values(by=["order"])

    df.index.name = ""
    globals()['salary_' + x] = df
    
    
fig, ax = plt.subplots(4, 1, figsize=(12, 16))
fig.suptitle('Yearly compensation (approximate $USD)', fontsize=20, y=1.02, fontweight=500)

# Vertical gap between the top of a bar and its count label; years not listed use 30.
label_offsets = {"2017": 20., "2018": 150.}
# Number of right-most bars drawn faded for each year.
faded_tail = {"2017": 4, "2018": 5, "2019": 3, "2020": 3}

for i in range(4):
    x = str(i + 2017)
    df = globals()[f"salary_{x}"]
    df[x].plot.bar(ax=ax[i], color=[color_of_year[i]], legend=True)

    # Keep only the bottom spine and hide the y axis: counts are printed on the bars.
    for side in ("top", "right", "left"):
        ax[i].spines[side].set_visible(False)
    ax[i].get_yaxis().set_visible(False)

    height_offset = label_offsets.get(x, 30.)
    bars = ax[i].patches
    last_solid = len(bars) - faded_tail[x]

    for j, p in enumerate(bars, start=1):
        height = p.get_height()
        xx, _ = p.get_xy()
        # Only `fontsize` is passed: `size` is an alias of it, and recent
        # matplotlib raises TypeError when both are given.
        ax[i].annotate(height, (xx + .2, height + height_offset),
                       fontsize=12,
                       ha='center',
                       weight='normal')

        if j > last_solid:
            p.set_alpha(0.2)

        # Highlight the lowest bracket.
        if j == 1:
            p.set_color("maroon")

plt.tight_layout(pad=2.0)
plt.show()

* The compensation distributions in 2019 and 2020 are highly consistent.
* In 2017, most Kagglers earned between 10,000 and 200,000 USD. In 2019 and 2020, the range is from 1,000 to 200,000 USD. The large number of high-income respondents implies that data science jobs, and the AI industry in general, pay well.
* By comparison, the compensation distribution in 2018 is quite abnormal.
* The large number of respondents with a low income (less than 2,000 USD) echoes the big proportion of "Student" selections in the job question.


# Conclusion

What can be learned from this notebook is that writing "**Chronicles**" is much harder than you think. 

Anyway, here are some points worth mentioning from above:
* As the first edition, Survey 2017 is quite different from the other three surveys on both the question side and the answer side. In other words, there is some continuity from 2018~2019.
* The design of Kaggle surveys tends to focus more on what we use and how we practice than on what we use it for.
* While Kaggle remains a relatively professional platform, the survey results show that machine learning and data science techniques and skills are becoming more accessible and friendlier to non-professionals.
* Kaggle attracts a lot of young blood into this community every year.

To be honest, this work is still far from perfect with so many improvements to be done. Nevertheless, I hope you can learn something from my work.