# SLOW AND STEADY

This is a rewrite of my original [2019 kernel](https://www.kaggle.com/altprof/slow-and-steady). All the outputs are made in the same manner, so they are completely comparable.

![](https://assets.pokemon.com/assets/cms2/img/pokedex/full/079.png)

We will compare respondents who spent more time than the 3rd quartile (slow) with those who spent less time than the 1st quartile (quick). I will take only the top-5 countries by number of responses.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import patches

In [None]:
# Configure seaborn in ONE call. ``sns.set`` resets every option it is not
# given back to its default, so calling ``sns.set(font_scale=2)`` after
# set_style / set_context / set_palette would silently discard the whitegrid
# style and the colorblind palette. Passing everything together keeps them.
sns.set(
    style='whitegrid',
    context='notebook',
    palette=sns.color_palette('colorblind', 10),
    font_scale=2,
)

# Active colour cycle; index 0 is used for slow and index 1 for quick respondents.
current_palette = sns.color_palette()

In [None]:
# Load the survey responses. The row labelled 0 is removed right after loading
# (presumably the question-text row of the export -- confirm against the CSV).
data = pd.read_csv(
    '../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv',
    low_memory=False,
).drop(index=0)

# Independent per-country copies, selected on the country column Q3.
country_column = data['Q3']
india = data[country_column == 'India'].copy()
usa = data[country_column == 'United States of America'].copy()
brazil = data[country_column == 'Brazil'].copy()
japan = data[country_column == 'Japan'].copy()
russia = data[country_column == 'Russia'].copy()

In [None]:
# Total number of response rows kept after loading.
respondent_count = len(data)
print('Number of respondents: ', respondent_count)

In [None]:
# Turn the Q24 salary-bracket text into numeric bounds: remove the thousands
# separators, dollar signs and '>' markers, trim spaces, then split on '-'.
bracket_text = data['Q24'].str.replace(r'[,$>]', '', regex=True).str.strip(' ')
salary = bracket_text.str.split('-', expand=True).astype(np.float32)
salary.columns = ['min', 'max']

# Midpoint of each bracket; NaN whenever either bound is missing.
salary['midrange'] = 0.5 * (salary['max'] + salary['min'])

# Age

In [None]:
# Survey duration in seconds, cast to integers and then log-transformed twice.
duration_seconds = data['Time from Start to Finish (seconds)'].astype(np.int32)
loglogtime = np.log(np.log(duration_seconds))

# Per-country slices of the transformed durations, aligned on the row labels.
india_time = loglogtime.loc[india.index]
usa_time = loglogtime.loc[usa.index]
brazil_time = loglogtime.loc[brazil.index]
japan_time = loglogtime.loc[japan.index]
russia_time = loglogtime.loc[russia.index]

In [None]:
def _quartiles(values):
    """Return the (first, third) quartile of *values*."""
    lower, upper = np.quantile(values, [0.25, 0.75])
    return lower, upper


# Quartile cut points of the transformed durations, overall and per country.
q1_all, q3_all = _quartiles(loglogtime)
q1_india, q3_india = _quartiles(india_time)
q1_usa, q3_usa = _quartiles(usa_time)
q1_brazil, q3_brazil = _quartiles(brazil_time)
q1_japan, q3_japan = _quartiles(japan_time)
q1_russia, q3_russia = _quartiles(russia_time)

In [None]:
# "fast" = strictly below the first quartile, "slow" = strictly above the
# third quartile; each cohort is cut against its own quartiles.
fast_all, slow_all = data[loglogtime < q1_all], data[loglogtime > q3_all]

fast_india, slow_india = india[india_time < q1_india], india[india_time > q3_india]

fast_usa, slow_usa = usa[usa_time < q1_usa], usa[usa_time > q3_usa]

fast_brazil, slow_brazil = brazil[brazil_time < q1_brazil], brazil[brazil_time > q3_brazil]

fast_japan, slow_japan = japan[japan_time < q1_japan], japan[japan_time > q3_japan]

fast_russia, slow_russia = russia[russia_time < q1_russia], russia[russia_time > q3_russia]

In [None]:
def _median_minutes(frame):
    """Return the median survey duration of *frame* in whole minutes.

    The duration column is cast to integers explicitly (the same cast used
    when building ``loglogtime``) instead of relying on ``median()`` to
    coerce a non-numeric column, which newer pandas versions refuse to do.
    """
    seconds = frame['Time from Start to Finish (seconds)'].astype(np.int32)
    return seconds.median().astype(int) // 60


# Cohorts in the same order as the 'Countries' column below.
_slow_groups = [slow_all, slow_india, slow_usa, slow_brazil, slow_japan, slow_russia]
_fast_groups = [fast_all, fast_india, fast_usa, fast_brazil, fast_japan, fast_russia]

# Summary table: one row per cohort.
info_time = pd.DataFrame()
info_time['Countries'] = ['All', 'India', 'USA', 'Brazil', 'Japan', 'Russia']
# 'Size' is the number of quick respondents in each cohort.
info_time['Size'] = [group.shape[0] for group in _fast_groups]
info_time['Median duration for slow respondents, min'] = [_median_minutes(group) for group in _slow_groups]
info_time['Median duration for quick respondents, min'] = [_median_minutes(group) for group in _fast_groups]

In [None]:
# Show the summary table as the cell output.
info_time

In [None]:
# Panel titles and the matching (slow, quick) frame pairs, in the same order.
TITLES = ['All', 'India', 'USA', 'Brazil', 'Japan', 'Russia']
DATAFRAMES = [
    (slow_all, fast_all),
    (slow_india, fast_india),
    (slow_usa, fast_usa),
    (slow_brazil, fast_brazil),
    (slow_japan, fast_japan),
    (slow_russia, fast_russia),
]

In [None]:
# Age brackets in ascending order; this fixes the order of the bars.
AGE_ORDER = [
    '18-21', '22-24', '25-29', '30-34', '35-39', '40-44',
    '45-49', '50-54', '55-59', '60-69', '70+',
]

# 2x3 grid: one panel per cohort.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(30, 20))
fig.suptitle('Age comparison', x=0.5, y=1.05)

def plot_results(series, reind, **kwargs):
    """Plot the relative frequency of each value in *series*.

    The values are counted with ``normalize=True`` (shares, not raw counts),
    re-ordered to the labels in *reind*, and drawn with ``alpha=0.8``.
    Any extra keyword arguments are forwarded to ``.plot``; whatever
    ``.plot`` returns is returned unchanged.
    """
    shares = series.value_counts(normalize=True)
    ordered_shares = shares.reindex(reind)
    return ordered_shares.plot(alpha=0.8, **kwargs)

# One panel per cohort; slow and quick shares are overlaid on the same axes.
for ax, title, (slow, fast) in zip(axs.flat, TITLES, DATAFRAMES):
    ax.set_title(title)
    for frame, colour in ((slow, current_palette[0]), (fast, current_palette[1])):
        plot_results(frame['Q1'], AGE_ORDER, ax=ax, kind='bar', color=colour, rot=30)

# Figure-level legend built from colour swatches (slow first, quick second).
legend_handles = [patches.Patch(facecolor=current_palette[k]) for k in range(2)]
fig.legend(legend_handles, ['Slow respondents', 'Quick respondents'], loc='upper right')

fig.tight_layout()

The younger audience is generally faster in completing surveys (shocker)

# Gender

In [None]:
# Gender answers in display order. The labels must match the survey answers
# exactly: a label that is absent from the data is reindexed to NaN and its
# bar silently disappears (the previous 'Prefer no to say' typo did that).
GENDER_ORDER = ['Prefer to self-describe', 'Prefer not to say', 'Nonbinary', 'Woman', 'Man']

# 2x3 grid: one panel per cohort.
fig, axs = plt.subplots(2, 3, figsize=(30, 20))
fig.suptitle('Gender comparison', x=0.5, y=1.05)

# Slow and quick shares are overlaid on the same axes of each panel.
for ax, title, (slow, fast) in zip(axs.flat, TITLES, DATAFRAMES):
    ax.set_title(title)
    plot_results(slow['Q2'], GENDER_ORDER, ax=ax, kind='bar', color=current_palette[0], rot=30)
    plot_results(fast['Q2'], GENDER_ORDER, ax=ax, kind='bar', color=current_palette[1], rot=30)

# Figure-level legend built from colour swatches (slow first, quick second).
legend_handles = [patches.Patch(facecolor=current_palette[k]) for k in range(2)]
fig.legend(legend_handles, ['Slow respondents', 'Quick respondents'], loc='upper right')

fig.tight_layout()

Males have more time to waste on surveys (Russia lost its competitive edge here)

# Education

In [None]:
# Short display labels for the Q4 education answers.
EDUCATION_DICT = {
    'Master’s degree': 'Master',
    'Bachelor’s degree': 'Bachelor',
    'Doctoral degree': 'Doctor',
    'Some college/university study without earning a bachelor’s degree': 'Audition',
    'No formal education past high school': 'High School',
    'I prefer not to answer': 'Other',
    'Professional degree': 'Professional',
}

# Bar order on the x-axis; 'Other' is not listed, so it is not drawn.
EDUCATION_ORDER = ['High School', 'Professional', 'Audition', 'Bachelor', 'Master', 'Doctor']

fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(30, 20))
fig.suptitle('Education comparison', x=0.5, y=1.05)

# One panel per cohort; slow and quick shares are overlaid on the same axes.
for ax, title, (slow, fast) in zip(axs.flat, TITLES, DATAFRAMES):
    ax.set_title(title)
    for frame, colour in ((slow, current_palette[0]), (fast, current_palette[1])):
        plot_results(frame['Q4'].map(EDUCATION_DICT), EDUCATION_ORDER, ax=ax, kind='bar', color=colour, rot=30)

# Figure-level legend built from colour swatches (slow first, quick second).
legend_handles = [patches.Patch(facecolor=current_palette[k]) for k in range(2)]
fig.legend(legend_handles, ['Slow respondents', 'Quick respondents'], loc='upper right')

fig.tight_layout()

Japanese Masters are the fastest thinkers this year (sorry, Brazilian PhDs)

# Company size

In [None]:
# Company-size brackets (Q20) in ascending order.
SIZE_ORDER = [
    '0-49 employees',
    '50-249 employees',
    '250-999 employees',
    '1000-9,999 employees',
    '10,000 or more employees',
]

fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(30, 20))
fig.suptitle('Company size charts', x=0.5, y=1.05)

# One panel per cohort; slow and quick shares are overlaid on the same axes.
for ax, title, (slow, fast) in zip(axs.flat, TITLES, DATAFRAMES):
    ax.set_title(title)
    for frame, colour in ((slow, current_palette[0]), (fast, current_palette[1])):
        plot_results(frame['Q20'], SIZE_ORDER, ax=ax, kind='bar', color=colour, rot=30)

# Figure-level legend built from colour swatches (slow first, quick second).
legend_handles = [patches.Patch(facecolor=current_palette[k]) for k in range(2)]
fig.legend(legend_handles, ['Slow respondents', 'Quick respondents'], loc='upper right')

fig.tight_layout()

Small and mid-sized companies are becoming slower

# Size of data science department

In [None]:
# Data-science headcount brackets (Q21) in ascending order.
DS_NUM_ORDER = ['0', '1-2', '3-4', '5-9', '10-14', '15-19', '20+']

fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(30, 20))
fig.suptitle('Number of data scientists in a company', x=0.5, y=1.05)

# One panel per cohort; slow and quick shares are overlaid on the same axes.
for ax, title, (slow, fast) in zip(axs.flat, TITLES, DATAFRAMES):
    ax.set_title(title)
    for frame, colour in ((slow, current_palette[0]), (fast, current_palette[1])):
        plot_results(frame['Q21'], DS_NUM_ORDER, ax=ax, kind='bar', color=colour, rot=30)

# Figure-level legend built from colour swatches (slow first, quick second).
legend_handles = [patches.Patch(facecolor=current_palette[k]) for k in range(2)]
fig.legend(legend_handles, ['Slow respondents', 'Quick respondents'], loc='upper right')

fig.tight_layout()

The fastest respondents are not even data scientists

# Intensity of ML use

In [None]:
# Full Q22 answer texts, in display order.
_ML_USE_ANSWERS = [
    'I do not know',
    'No (we do not use ML methods)',
    'We are exploring ML methods (and may one day put a model into production)',
    'We use ML methods for generating insights (but do not put working models into production)',
    'We recently started using ML methods (i.e., models in production for less than 2 years)',
    'We have well established ML methods (i.e., models in production for more than 2 years)',
]

# Short label = the text before the first '(' with surrounding spaces removed.
ML_USE_DICT = {answer: answer.split('(')[0].strip() for answer in _ML_USE_ANSWERS}
# Same short labels, as an ordered list (dicts keep insertion order).
ML_USE_ORDER = list(ML_USE_DICT.values())

# One row of six horizontal-bar panels sharing the y-axis.
fig, axs = plt.subplots(nrows=1, ncols=6, figsize=(30, 20), sharey=True)
fig.suptitle('Intensivity of ML use in company', x=0.5, y=1.05)

for ax, title, (slow, fast) in zip(axs, TITLES, DATAFRAMES):
    ax.set_title(title)
    for frame, colour in ((slow, current_palette[0]), (fast, current_palette[1])):
        plot_results(frame['Q22'].map(ML_USE_DICT), ML_USE_ORDER, ax=ax, kind='barh', color=colour)

# Figure-level legend built from colour swatches (slow first, quick second).
legend_handles = [patches.Patch(facecolor=current_palette[k]) for k in range(2)]
fig.legend(legend_handles, ['Slow respondents', 'Quick respondents'], loc='upper right')

fig.tight_layout()

And they do not know what their company is doing

# Salary

In [None]:
# 2x3 grid: one panel per cohort.
fig, axs = plt.subplots(2, 3, figsize=(30, 20))
fig.suptitle('Distribution of midrange salary', x=0.5, y=1.05)

for ax, title, (slow, fast) in zip(axs.flat, TITLES, DATAFRAMES):
    ax.set_title(title)
    for frame, colour in ((slow, current_palette[0]), (fast, current_palette[1])):
        # Estimate the density of the salaries themselves. Going through
        # plot_results would first reduce them to value_counts() shares, so
        # the curve would describe bracket frequencies instead of salaries.
        midrange = salary.loc[frame.index, 'midrange'].dropna()
        midrange.plot(ax=ax, kind='kde', color=colour, alpha=0.8)

# Figure-level legend built from colour swatches (slow first, quick second).
legend_handles = [patches.Patch(facecolor=current_palette[k]) for k in range(2)]
fig.legend(legend_handles, ['Slow respondents', 'Quick respondents'], loc='upper right')

fig.tight_layout()

In [None]:
def _midrange_stats(frame):
    """Return ``(median, mean)`` of the midrange salary for the rows of *frame*."""
    midrange = salary.loc[frame.index, 'midrange']
    return midrange.median(), midrange.mean()


slow_median_all, slow_mean_all = _midrange_stats(slow_all)
fast_median_all, fast_mean_all = _midrange_stats(fast_all)

slow_median_india, slow_mean_india = _midrange_stats(slow_india)
fast_median_india, fast_mean_india = _midrange_stats(fast_india)

slow_median_usa, slow_mean_usa = _midrange_stats(slow_usa)
fast_median_usa, fast_mean_usa = _midrange_stats(fast_usa)

slow_median_brazil, slow_mean_brazil = _midrange_stats(slow_brazil)
fast_median_brazil, fast_mean_brazil = _midrange_stats(fast_brazil)

slow_median_japan, slow_mean_japan = _midrange_stats(slow_japan)
# Quick Japanese respondents come from fast_japan; this was previously
# computed from slow_brazil by a copy-paste slip, corrupting the Japan row.
fast_median_japan, fast_mean_japan = _midrange_stats(fast_japan)

slow_median_russia, slow_mean_russia = _midrange_stats(slow_russia)
fast_median_russia, fast_mean_russia = _midrange_stats(fast_russia)

# (slow, quick) pairs in the same row order as info_time['Countries'].
_mean_pairs = [
    (slow_mean_all, fast_mean_all),
    (slow_mean_india, fast_mean_india),
    (slow_mean_usa, fast_mean_usa),
    (slow_mean_brazil, fast_mean_brazil),
    (slow_mean_japan, fast_mean_japan),
    (slow_mean_russia, fast_mean_russia),
]
_median_pairs = [
    (slow_median_all, fast_median_all),
    (slow_median_india, fast_median_india),
    (slow_median_usa, fast_median_usa),
    (slow_median_brazil, fast_median_brazil),
    (slow_median_japan, fast_median_japan),
    (slow_median_russia, fast_median_russia),
]

# Salary gaps (slow minus quick): means are rounded, medians truncated to int.
info_time['Mean slow salary - Mean quick salary'] = [
    np.round(slow_value - fast_value).astype(int) for slow_value, fast_value in _mean_pairs
]

info_time['Median slow salary - Median quick salary'] = [
    int(slow_value - fast_value) for slow_value, fast_value in _median_pairs
]

In [None]:
# Show the summary table again, now with the salary-gap columns.
info_time

The difference between median salaries suggests to take it slow and breathe

# Cloud computing

In [None]:
# Spending brackets (Q25) in ascending order.
CC_SPENT_ORDER = [
    '$0 ($USD)',
    '$1-$99',
    '$100-$999',
    '$1000-$9,999',
    '$10,000-$99,999',
    '$100,000 or more ($USD)',
]

fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(30, 20))
fig.suptitle('Money spent on cloud computing in the past 5 years', x=0.5, y=1.05)

# One panel per cohort; slow and quick shares are overlaid on the same axes.
for ax, title, (slow, fast) in zip(axs.flat, TITLES, DATAFRAMES):
    ax.set_title(title)
    for frame, colour in ((slow, current_palette[0]), (fast, current_palette[1])):
        plot_results(frame['Q25'], CC_SPENT_ORDER, ax=ax, kind='bar', color=colour, rot=30)

# Figure-level legend built from colour swatches (slow first, quick second).
legend_handles = [patches.Patch(facecolor=current_palette[k]) for k in range(2)]
fig.legend(legend_handles, ['Slow respondents', 'Quick respondents'], loc='upper right')

fig.tight_layout()

# Coding experience

In [None]:
# Coding-experience brackets (Q6), least to most experienced.
CODE_ECP_ORDER = [
    'I have never written code',
    '< 1 years',
    '1-2 years',
    '3-5 years',
    '5-10 years',
    '10-20 years',
    '20+ years',
]

# One row of six horizontal-bar panels sharing the y-axis.
fig, axs = plt.subplots(nrows=1, ncols=6, figsize=(30, 20), sharey=True)
fig.suptitle('Coding experience', x=0.5, y=1.05)

for ax, title, (slow, fast) in zip(axs, TITLES, DATAFRAMES):
    ax.set_title(title)
    for frame, colour in ((slow, current_palette[0]), (fast, current_palette[1])):
        plot_results(frame['Q6'], CODE_ECP_ORDER, ax=ax, kind='barh', color=colour)

# Figure-level legend built from colour swatches (slow first, quick second).
legend_handles = [patches.Patch(facecolor=current_palette[k]) for k in range(2)]
fig.legend(legend_handles, ['Slow respondents', 'Quick respondents'], loc='upper right')

fig.tight_layout()

Coding in kaggle competitions is overrated

# TPU usage

In [None]:
# TPU-usage answers (Q13), least to most frequent.
TPU_USAGE_ORDER = ['Never', 'Once', '2-5 times', '6-25 times', 'More than 25 times']

# One row of six horizontal-bar panels sharing the y-axis.
fig, axs = plt.subplots(nrows=1, ncols=6, figsize=(30, 20), sharey=True)
fig.suptitle('TPU usage (aka what Google is really interested in)', x=0.5, y=1.05)

for ax, title, (slow, fast) in zip(axs, TITLES, DATAFRAMES):
    ax.set_title(title)
    for frame, colour in ((slow, current_palette[0]), (fast, current_palette[1])):
        plot_results(frame['Q13'], TPU_USAGE_ORDER, ax=ax, color=colour, kind='barh')

# Figure-level legend built from colour swatches (slow first, quick second).
legend_handles = [patches.Patch(facecolor=current_palette[k]) for k in range(2)]
fig.legend(legend_handles, ['Slow respondents', 'Quick respondents'], loc='upper right')

fig.tight_layout()

Let Google analytics decide what to do with this info

# ML experience

In [None]:
# Machine-learning experience brackets (Q15), least to most experienced.
ML_CODE_EXP = [
    'I do not use machine learning methods',
    'Under 1 year',
    '1-2 years',
    '2-3 years',
    '3-4 years',
    '4-5 years',
    '5-10 years',
    '10-20 years',
    '20 or more years',
]

# One row of six horizontal-bar panels sharing the y-axis.
fig, axs = plt.subplots(nrows=1, ncols=6, figsize=(30, 20), sharey=True)
fig.suptitle('Machine learning coding experience', x=0.5, y=1.05)

for ax, title, (slow, fast) in zip(axs, TITLES, DATAFRAMES):
    ax.set_title(title)
    for frame, colour in ((slow, current_palette[0]), (fast, current_palette[1])):
        plot_results(frame['Q15'], ML_CODE_EXP, ax=ax, color=colour, kind='barh')

# Figure-level legend built from colour swatches (slow first, quick second).
legend_handles = [patches.Patch(facecolor=current_palette[k]) for k in range(2)]
fig.legend(legend_handles, ['Slow respondents', 'Quick respondents'], loc='upper right')

fig.tight_layout()

In the end, we are all here to <s>machine</s> learn