# **Kaggle Data Science & Machine Learning Survey**
![Kaggle](https://upload.wikimedia.org/wikipedia/commons/7/7c/Kaggle_logo.png)

Hi! I'm glad to see you here! 

Let's look for some interesting relationships in this data. We'll also look at the data from past years and compare it with the current survey.

### **Past datasets:**
**2017:** [https://www.kaggle.com/kaggle/kaggle-survey-2017](https://www.kaggle.com/kaggle/kaggle-survey-2017)

**2018:** [https://www.kaggle.com/kaggle/kaggle-survey-2018](https://www.kaggle.com/kaggle/kaggle-survey-2018)

**2019:** [https://www.kaggle.com/c/kaggle-survey-2019/data](https://www.kaggle.com/c/kaggle-survey-2019/data)

In [None]:
!pip install seaborn --upgrade

In [None]:
# loading packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# render every matplotlib figure at 300 dpi
plt.rcParams['figure.dpi'] = 300

# ignoring warnings
import warnings
# "ignore" silences warnings of every category from this point on
warnings.simplefilter("ignore")

# print the full path of every file found below /kaggle/input
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# display the seaborn version that is actually imported in this session
sns.__version__

# A first look at the data

In [None]:
# read the four yearly survey exports; only the 2017 file is read with an
# explicit latin-1 encoding
data_2017 = pd.read_csv(
    '../input/kaggle-survey-2017/multipleChoiceResponses.csv',
    encoding = 'latin-1',
)
data_2018 = pd.read_csv('../input/kaggle-survey-2018/multipleChoiceResponses.csv')
data_2019 = pd.read_csv('../input/kaggle-survey-2019/multiple_choice_responses.csv')
data_2020 = pd.read_csv('../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')

# report (rows, columns) of every frame, one line per year
for _year, _frame in ((2017, data_2017), (2018, data_2018),
                      (2019, data_2019), (2020, data_2020)):
    print(f'Data {_year} shape: {_frame.shape}')
print('-' * 30)
print('Head of the data 2020:')
# last expression of the cell, so the notebook renders the preview
data_2020.head()

In [None]:
# for every year, count how many columns have each dtype; the per-year
# reports are separated by a dashed line
_separator = '\n' + '-' * 20 + '\n'
print(_separator.join(
    f'Data {_year} types: \n{_frame.dtypes.value_counts()}'
    for _year, _frame in ((2017, data_2017), (2018, data_2018),
                          (2019, data_2019), (2020, data_2020))
))

In [None]:
# number of columns that contain at least one missing value, per year
for _year, _frame in ((2017, data_2017), (2018, data_2018),
                      (2019, data_2019), (2020, data_2020)):
    _nan_columns = _frame.isna().any().sum()
    print(f'Columns with NaN ({_year}): {_nan_columns:d}')

Almost all columns have NaN values.

# DATA 2020 EDA

In [None]:
# the row labelled 0 holds the explanation text rather than an answer,
# so the working frame for 2020 is built without it
df = data_2020.drop(index = 0)

There are a lot of relations that we can find in this data. Let's look at some of them.

In [None]:
def bar_plot(x, y, title, palette_len, xlim = None, ylim = None, 
             xticklabels = None, yticklabels = None, 
             top_visible = False, right_visible = False, 
             bottom_visible = True, left_visible = False,
             xlabel = None, ylabel = None, figsize = (10, 4),
             axis_grid = 'y'):
    """Draw a single seaborn bar plot in the notebook's house style and show it.

    Parameters
    ----------
    x, y : passed straight to ``sns.barplot``.
    title : figure title (bold serif, size 15).
    palette_len : any sized object; ``len(palette_len)`` is the number of
        colours taken from the reversed "viridis" palette.
    xlim, ylim : axis limits handed to ``set_xlim`` / ``set_ylim``.
    xticklabels, yticklabels : explicit tick labels. When left as ``None``
        the automatic labels are kept and only switched to the serif font.
    top_visible, right_visible, bottom_visible, left_visible : whether the
        corresponding spine is drawn.
    xlabel, ylabel : axis labels; ``None`` clears the label.
    figsize : figure size passed to ``plt.subplots``.
    axis_grid : axis argument of ``ax.grid`` ('x', 'y' or 'both').
    """
    fig, ax = plt.subplots(figsize = figsize)
    ax.set_title(title, size = 15, fontweight = 'bold', fontfamily = 'serif')

    spine_visibility = {'top': top_visible, 'right': right_visible,
                        'bottom': bottom_visible, 'left': left_visible}
    for side, visible in spine_visibility.items():
        ax.spines[side].set_color('black')
        ax.spines[side].set_visible(visible)

    # materialise the reversed palette: a bare ``reversed(...)`` iterator can
    # only be consumed once
    palette = list(reversed(sns.color_palette("viridis", len(palette_len))))
    sns.barplot(x = x, y = y, edgecolor = 'black', ax = ax, palette = palette)
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    # set_*ticklabels cannot take None, so only replace the labels when the
    # caller supplied some; otherwise just restyle the automatic ones
    if xticklabels is not None:
        ax.set_xticklabels(xticklabels, fontfamily = 'serif')
    else:
        for tick_label in ax.get_xticklabels():
            tick_label.set_fontfamily('serif')
    if yticklabels is not None:
        ax.set_yticklabels(yticklabels, fontfamily = 'serif')
    else:
        for tick_label in ax.get_yticklabels():
            tick_label.set_fontfamily('serif')

    ax.set_xlabel(xlabel, fontfamily = 'serif')
    ax.set_ylabel(ylabel, fontfamily = 'serif')
    ax.grid(axis = axis_grid, linestyle = '--', alpha = 0.9)
    plt.show()

In [None]:
# switch seaborn to the "whitegrid" theme for the plots that follow
sns.set_style("whitegrid")
# number of respondents per Q1 answer, ordered by the answer label
age = df.Q1.value_counts().sort_index()

bar_plot(x = age.index, 
         y = age,
         title = 'Age distribution',
         palette_len = age.index, 
         ylim = (0, 4500), 
         xticklabels = age.index,
         # hand-written labels: every second tick is left blank
         # NOTE(review): assumes the automatic y ticks fall every 500 - confirm
         yticklabels = ['0', '', '1000', '', '2000', '', '3000', '', '4000', ''])

Then let's look at the salary distribution.

In [None]:
# Q24 answer labels listed from the lowest to the highest salary bracket;
# used below to put the counts into that order
salary_order = ['$0-999', '1,000-1,999', '2,000-2,999', '3,000-3,999', 
                '4,000-4,999', '5,000-7,499', '7,500-9,999', '10,000-14,999',
                '15,000-19,999', '20,000-24,999', '25,000-29,999', 
                '30,000-39,999', '40,000-49,999', '50,000-59,999', 
                '60,000-69,999', '70,000-79,999', '80,000-89,999', 
                '90,000-99,999', '100,000-124,999', '125,000-149,999',
                '150,000-199,999', '200,000-249,999', '250,000-299,999', 
                '300,000-500,000', '> $500,000']
# missing answers are counted as 'unknown', but selecting by salary_order
# leaves that bucket out of the plotted series
salary = df.Q24.fillna('unknown').value_counts()[salary_order]

# horizontal bars: counts on the x axis, salary brackets on the y axis
bar_plot(x = salary, 
         y = salary.index,
         title = 'Salary distribution',
         palette_len = salary.index, 
         xlim = (0, 2500), 
         xticklabels = range(0, 2501, 500),
         yticklabels = salary.index,
         left_visible = True,
         figsize = (10, 8), axis_grid = 'x')

It is interesting to analyze the distribution of salaries according to different criteria.

In [None]:
# Turn every Q24 salary-range string into a number: strip '$', rewrite the
# open-ended top bracket as a '500,000-500,000' range, drop the thousands
# separators, keep the part after the '-' (the upper bound) and add 1.
# Missing answers travel through the same steps as 'unknown-unknown' and
# are converted to NaN before the cast to float.
max_salary = df.Q24.fillna('unknown-unknown') \
    .apply(lambda x: x.replace('$', '') \
    .replace('> 500,000', '500,000-500,000') \
    .replace(',', '') \
    .split("-")[1]).replace('unknown', np.nan).astype('float64') + 1

# working frame for the salary analysis; further columns are added to it later
new_df = pd.DataFrame({'max_salary': max_salary, 'age': df.Q1})

In [None]:
def horizontal_boxplot(data, title, xlabel, order = None):
    """Show box plots of ``new_df.max_salary`` for every category in ``data``.

    The categories run along the x axis and the salary along the y axis.

    Parameters
    ----------
    data : categorical series; its number of distinct values decides how
        many colours are taken from the reversed "viridis" palette.
    title : figure title.
    xlabel : accepted for symmetry with ``vertical_boxplot`` but not used;
        the x axis label is always blank.
    order : optional category order forwarded to ``sns.boxplot``.
    """
    box_colors = reversed(sns.color_palette("viridis", data.nunique()))

    fig, ax = plt.subplots(1, 1, figsize=(10, 4))
    ax.set_title(title, size = 15, fontweight = 'bold', fontfamily = 'serif')

    # the plot is drawn without any frame
    for spine in ax.spines.values():
        spine.set_visible(False)

    sns.boxplot(x = data, y = new_df.max_salary, order = order,
                palette = box_colors, ax = ax)
    ax.set_xlabel('')
    ax.set_ylabel('Salary', fontfamily = 'serif')
    plt.xticks(fontfamily = 'serif')
    plt.yticks(fontfamily = 'serif')
    ax.grid(axis = 'y', linestyle = '--', alpha = 0.9)
    plt.show()
    
def vertical_boxplot(data, title, ylabel, order = None):
    """Show box plots of ``new_df.max_salary`` for every category in ``data``.

    The salary runs along the x axis and the categories along the y axis.

    Parameters
    ----------
    data : categorical series; its number of distinct values decides how
        many colours are taken from the reversed "viridis" palette.
    title : figure title.
    ylabel : accepted for symmetry with ``horizontal_boxplot`` but not used;
        the y axis label is always blank.
    order : optional category order forwarded to ``sns.boxplot``.
    """
    box_colors = reversed(sns.color_palette("viridis", data.nunique()))

    fig, ax = plt.subplots(1, 1, figsize=(10, 4))
    ax.set_title(title, size = 15, fontweight = 'bold', fontfamily = 'serif')

    # the plot is drawn without any frame
    for spine in ax.spines.values():
        spine.set_visible(False)

    sns.boxplot(x = new_df.max_salary, y = data, order = order,
                palette = box_colors, ax = ax)
    ax.set_xlabel('Salary', fontfamily = 'serif')
    ax.set_ylabel('')
    plt.xticks(fontfamily = 'serif')
    plt.yticks(fontfamily = 'serif')
    ax.grid(axis = 'x', linestyle = '--', alpha = 0.9)
    plt.show()

In [None]:
# salary box plot per age answer; the answers are passed in sorted order,
# presumably so the age groups are drawn from youngest to oldest
horizontal_boxplot(new_df.age.sort_values(ascending = True), 
                   'Salary by age', 'Age')

Salary rises with age, which looks logical.

In [None]:
# add the Q2 answer to the working frame
new_df['sex'] = df.Q2

# only the 'Man' and 'Woman' answers are plotted
horizontal_boxplot(new_df.sex[new_df.sex.isin(['Man', 'Woman'])], 
                   'Salary by sex', 'Sex')

In [None]:
# Number of respondents for every (sex, age) pair, restricted to the 'Man'
# and 'Woman' answers. Rows with a missing value in either column are not
# counted and pairs that never occur become 0.
# groupby/size/unstack replaces value_counts + positional pivot(..., 0),
# which depends on a count column named 0 and on positional pivot arguments
# that newer pandas releases no longer provide.
sex_age = new_df.loc[new_df.sex.isin(['Man', 'Woman']), ['sex', 'age']]
hm = sex_age.groupby(['sex', 'age']).size() \
    .unstack('age', fill_value = 0).astype('int')

plt.figure(figsize=(10, 5))
plt.title('Count by sex and age', size = 15, fontweight = 'bold', fontfamily = 'serif')
# annotate every cell with its integer count
sns.heatmap(hm, annot = True, fmt = "d", linewidths=.5)
plt.xlabel('Age', fontfamily = 'serif')
plt.ylabel('Sex', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

In [None]:
# Q4 answers in the order they should appear on the plots; answers that are
# not listed here are left out wherever this order is applied
education_order = ['No formal education past high school', 
              'Some college/university study without earning a bachelor’s degree',
              'Professional degree', 'Bachelor’s degree', 
              'Master’s degree', 'Doctoral degree']
# add the Q4 answer to the working frame
new_df['education'] = df.Q4

vertical_boxplot(new_df.education, 'Salary by education', 
                 'Education', order = education_order)

There is a clear relationship between academic degree and salary.

In [None]:
# add the Q5 answer to the working frame
new_df['position'] = df.Q5

# no explicit order is given, so seaborn decides the category order
vertical_boxplot(new_df.position, 'Salary by position', 'Position')

It looks like the average data analyst's salary is one of the lowest :(

In [None]:
# Q6 answers listed from least to most coding experience
exp_order = ['I have never written code', '< 1 years', '1-2 years', 
             '3-5 years', '5-10 years', '10-20 years', '20+ years']
# add the Q6 answer to the working frame
new_df['experience'] = df.Q6

vertical_boxplot(new_df.experience, 'Salary by experience', 
                 'Experience', order = exp_order)

As with a degree, experience is positively correlated with salary.

In [None]:
# Number of respondents for every (position, education) pair; rows with a
# missing value in either column are not counted, absent pairs become 0 and
# the columns follow education_order.
# groupby/size/unstack replaces value_counts + positional pivot(..., 0),
# which depends on a count column named 0 and on positional pivot arguments
# that newer pandas releases no longer provide.
hm = new_df.groupby(['position', 'education']).size() \
    .unstack('education', fill_value = 0).astype('int')[education_order]

plt.figure(figsize=(10, 6))
plt.title('Count by position and education', size = 15, 
          fontweight = 'bold', fontfamily = 'serif')
# annotate every cell with its integer count
sns.heatmap(hm, annot = True, fmt = "d", linewidths=.5)

plt.xlabel('Education', fontfamily = 'serif')
plt.ylabel('Position', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

In [None]:
# Mean of max_salary for every (position, education) pair, truncated to an
# integer; pairs without any salary value are shown as 0 and the columns
# follow education_order.
# unstack replaces the positional pivot(...) call, whose positional
# arguments newer pandas releases no longer accept.
hm = new_df.groupby(['position', 'education']).max_salary.mean() \
    .unstack('education').fillna(0).astype('int')[education_order]

plt.figure(figsize=(10, 6))
plt.title('The average salary by position and education', size = 15, 
          fontweight = 'bold', fontfamily = 'serif')
# annotate every cell with its integer value
sns.heatmap(hm, annot = True, fmt = "d", linewidths=.5)
plt.xlabel('Education', fontfamily = 'serif')
plt.ylabel('Position', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

In [None]:
# Number of respondents for every (position, experience) pair; rows with a
# missing value in either column are not counted, absent pairs become 0 and
# the columns follow exp_order.
# groupby/size/unstack replaces value_counts + positional pivot(..., 0),
# which depends on a count column named 0 and on positional pivot arguments
# that newer pandas releases no longer provide.
hm = new_df.groupby(['position', 'experience']).size() \
    .unstack('experience', fill_value = 0).astype('int')[exp_order]

plt.figure(figsize=(10, 6))
plt.title('Count by position and experience', size = 15, 
          fontweight = 'bold', fontfamily = 'serif')
# annotate every cell with its integer count
sns.heatmap(hm, annot = True, fmt = "d", linewidths=.5)
plt.xlabel('Experience', fontfamily = 'serif')
plt.ylabel('Position', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

In [None]:
# Mean of max_salary for every (position, experience) pair, truncated to an
# integer; pairs without any salary value are shown as 0 and the columns
# follow exp_order.
# unstack replaces the positional pivot(...) call, whose positional
# arguments newer pandas releases no longer accept.
hm = new_df.groupby(['position', 'experience']).max_salary.mean() \
    .unstack('experience').fillna(0).astype('int')[exp_order]

plt.figure(figsize=(10, 6))
plt.title('The average salary by position and experience', size = 15, 
          fontweight = 'bold', fontfamily = 'serif')
# annotate every cell with its integer value
sns.heatmap(hm, annot = True, fmt = "d", linewidths=.5)
plt.xlabel('Experience', fontfamily = 'serif')
plt.ylabel('Position', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

In [None]:
# respondents per Q3 answer, most frequent first
top_countries = df.Q3.value_counts()
# drop the 'Other' bucket, then keep the first 30 entries
top_countries = top_countries[top_countries.index != 'Other'][:30]

# horizontal bars: counts on the x axis, countries on the y axis
bar_plot(x = top_countries, 
         y = top_countries.index,
         title = 'TOP countries by Kaggle users',
         palette_len = top_countries.index, 
         xlim = (0, 6000), 
         xticklabels = range(0, 6001, 1000),
         yticklabels = top_countries.index,
         left_visible = True,
         figsize = (10, 8), axis_grid = 'x')

The absolute domination of India :)

In [None]:
# Q20 answers listed from the smallest to the largest company size
comp_order = ['0-49 employees', '50-249 employees', '250-999 employees',
              '1000-9,999 employees', '10,000 or more employees']
# respondents per company size, arranged in comp_order
comp_by_emp = df.Q20.value_counts()[comp_order]

# horizontal bars: counts on the x axis, company sizes on the y axis
bar_plot(x = comp_by_emp, 
         y = comp_by_emp.index,
         title = 'Companies by employees',
         palette_len = comp_by_emp.index, 
         xlim = (0, 5000), 
         xticklabels = range(0, 5001, 1000),
         yticklabels = comp_by_emp.index,
         left_visible = True, axis_grid = 'x')

In [None]:
# add the Q20 answer to the working frame
new_df['comp_size'] = df.Q20

vertical_boxplot(new_df.comp_size, 'Salary by company size', 
                 'Company size', order = comp_order)

Salaries in big companies are higher, which also looks logical.

In [None]:
# Mean of max_salary for every (experience, company size) pair, truncated to
# an integer; pairs without any salary value are shown as 0. Rows follow
# exp_order and columns follow comp_order.
# unstack replaces the positional pivot(...) call, whose positional
# arguments newer pandas releases no longer accept.
hm = new_df.groupby(['experience', 'comp_size']).max_salary.mean() \
    .unstack('comp_size').fillna(0).astype('int').loc[exp_order, comp_order]

plt.figure(figsize=(10, 6))
plt.title('The average salary by experience and company size', size = 15, 
          fontweight = 'bold', fontfamily = 'serif')
# annotate every cell with its integer value
sns.heatmap(hm, annot = True, fmt = "d", linewidths=.5)
plt.xlabel('Company size', fontfamily = 'serif')
plt.ylabel('Experience', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

In [None]:
# add the Q3 answer to the working frame
new_df['country'] = df.Q3

# Mean of max_salary for every (country, experience) pair, truncated to an
# integer; pairs without any salary value are shown as 0 and the columns
# follow exp_order.
# unstack replaces the positional pivot(...) call, whose positional
# arguments newer pandas releases no longer accept.
hm = new_df.groupby(['country', 'experience']).max_salary.mean() \
    .unstack('experience').fillna(0).astype('int')[exp_order]

plt.figure(figsize=(10, 12))
plt.title('The average salary by top country and experience', size = 15, 
          fontweight = 'bold', fontfamily = 'serif')
# only the rows for the countries in top_countries are drawn, in that order
sns.heatmap(hm.loc[top_countries.index], annot = True, fmt = "d", linewidths=.5)
plt.xlabel('Experience', fontfamily = 'serif')
plt.ylabel('Country', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

In [None]:
# Mean of max_salary for every (country, company size) pair, truncated to an
# integer; pairs without any salary value are shown as 0 and the columns
# follow comp_order.
# unstack replaces the positional pivot(...) call, whose positional
# arguments newer pandas releases no longer accept.
hm = new_df.groupby(['country', 'comp_size']).max_salary.mean() \
    .unstack('comp_size').fillna(0).astype('int')[comp_order]

plt.figure(figsize=(10, 12))
plt.title('The average salary by top country and company size', size = 15, 
          fontweight = 'bold', fontfamily = 'serif')
# only the rows for the countries in top_countries are drawn, in that order
sns.heatmap(hm.loc[top_countries.index], annot = True, fmt = "d", linewidths=.5)
plt.xlabel('Company size', fontfamily = 'serif')
plt.ylabel('Country', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

In [None]:
# Mean of max_salary for every (country, position) pair, truncated to an
# integer; pairs without any salary value are shown as 0.
# unstack replaces the positional pivot(...) call, whose positional
# arguments newer pandas releases no longer accept.
hm = new_df.groupby(['country', 'position']).max_salary.mean() \
    .unstack('position').fillna(0).astype('int')

plt.figure(figsize=(15, 10))
plt.title('The average salary by top country and position', size = 15, 
          fontweight = 'bold', fontfamily = 'serif')
# only the rows for the countries in top_countries are drawn, in that order
sns.heatmap(hm.loc[top_countries.index], annot = True, fmt = "d", linewidths=.5)
plt.xlabel('Position', fontfamily = 'serif')
plt.ylabel('Country', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

In [None]:
# Mean of max_salary for every (country, education) pair, truncated to an
# integer; pairs without any salary value are shown as 0 and the columns
# follow education_order.
# unstack replaces the positional pivot(...) call, whose positional
# arguments newer pandas releases no longer accept.
hm = new_df.groupby(['country', 'education']).max_salary.mean() \
    .unstack('education').fillna(0).astype('int')[education_order]

plt.figure(figsize=(10, 10))
plt.title('The average salary by top country and education', size = 15, 
          fontweight = 'bold', fontfamily = 'serif')
# only the rows for the countries in top_countries are drawn, in that order
sns.heatmap(hm.loc[top_countries.index], annot = True, fmt = "d", linewidths=.5)
plt.xlabel('Education', fontfamily = 'serif')
plt.ylabel('Country', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

Let's look at the programming languages that respondents use on a regular basis.

In [None]:
# For each of the columns at positions 7..19, take the first non-missing
# value as the language label and the number of non-missing cells as its
# respondent count.
# NOTE(review): assumes those positions are exactly the multi-select
# "languages used regularly" columns - confirm against the 2020 file
reg_pl = df.iloc[:, 7:20].apply(lambda x: [x.dropna().unique()[0], 
                                           x.count()], axis = 0).T
reg_pl.columns = ['lang', 'count']
# share of all respondents (in percent) that picked the language
reg_pl['perc'] = reg_pl['count'] / len(df) * 100
reg_pl = reg_pl.sort_values('perc', ascending = False)

# horizontal bars: percentages on the x axis, languages on the y axis
bar_plot(x = reg_pl.perc, 
         y = reg_pl.lang,
         title = 'Programming languages that respondents use on a regular basis',
         palette_len = reg_pl.lang, 
         xlim = (0, 80), 
         xticklabels = range(0, 81, 10),
         yticklabels = reg_pl.lang,
         left_visible = True, xlabel = 'Part of respondents (%)',
         figsize = (10, 5), axis_grid = 'x')

In [None]:
# respondents per Q8 answer as a two-column table
recomend_pl = df.Q8.value_counts().reset_index()
recomend_pl.columns = ['lang', 'count']
# percentage relative to all rows of df, including those without a Q8 answer
recomend_pl['perc'] = recomend_pl['count'] / len(df) * 100
recomend_pl = recomend_pl.sort_values('perc', ascending = False)

# horizontal bars: percentages on the x axis, languages on the y axis
bar_plot(x = recomend_pl.perc, 
         y = recomend_pl.lang,
         title = 'Programming languages that respondents recommend to learn first',
         palette_len = recomend_pl.lang, 
         xlim = (0, 80), 
         xticklabels = range(0, 81, 10),
         yticklabels = recomend_pl.lang,
         left_visible = True, xlabel = 'Part of respondents (%)',
         figsize = (10, 5), axis_grid = 'x')

As expected, Python is the best programming language :)

My favorite R comes second.

# 2017-2020: changes in relations

Now we'll look at how some of these relationships change over time.

In [None]:
# drop the row labelled 0 from each earlier survey, as was done for 2020
# NOTE(review): verify that every one of these files really carries a
# question-text row at label 0 - otherwise a respondent is dropped here
df_2017 = data_2017.drop(0, axis = 0)
df_2018 = data_2018.drop(0, axis = 0)
df_2019 = data_2019.drop(0, axis = 0)

# tag every frame with its survey year (stored as a string)
df_2017['year'] = '2017'
df_2018['year'] = '2018'
df_2019['year'] = '2019'
df['year'] = '2020'

In [None]:
# one row per survey year with the number of rows in that year's frame
df_all = pd.DataFrame({'year': [2017, 2018, 2019, 2020], 
                       'respondents': [len(df_2017), len(df_2018), 
                                       len(df_2019), len(df)]})

bar_plot(x = df_all.year, 
         y = df_all.respondents,
         title = 'The number of respondents',
         # bar_plot only uses len(palette_len), i.e. the number of rows here
         palette_len = df_all, 
         ylim = (0, 25000), 
         xticklabels = df_all.year,
         yticklabels = range(0, 25001, 5000))

In [None]:
def age_data_ch(data):
    """Normalise a three-column (sex, age, count) table in place and return it.

    The columns are renamed to ``sex``, ``age`` and ``count`` by position.
    A numeric age column is simply cast to int. Any other age column is
    treated as text such as ``'18-21'`` or ``'70+'``: the part before the
    first ``'-'`` is kept, ``'+'`` is removed and the result is cast to int.

    Parameters
    ----------
    data : DataFrame with exactly three columns, age in the second one.

    Returns
    -------
    The same DataFrame object, modified in place.
    """
    # decide by "is it numeric" rather than "is it object", so that
    # string-typed columns are parsed as text as well
    age_is_numeric = pd.api.types.is_numeric_dtype(data.iloc[:, 1])
    data.columns = ['sex', 'age', 'count']

    if age_is_numeric:
        data['age'] = data['age'].astype('int')
    else:
        lower_bound = data['age'].astype(str).str.split('-').str[0] \
            .str.replace('+', '', regex = False)
        data['age'] = lower_bound.astype('int')
    return data

# respondents per (sex, age) pair for every year; the column names and
# their order differ between the surveys, age_data_ch renames them by position
sex_2017 = age_data_ch(df_2017[['GenderSelect', 'Age']].value_counts().reset_index())
sex_2018 = age_data_ch(df_2018[['Q1', 'Q2']].value_counts().reset_index())
sex_2019 = age_data_ch(df_2019[['Q2', 'Q1']].value_counts().reset_index())
sex_2020 = age_data_ch(df[['Q2', 'Q1']].value_counts().reset_index())

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 5))
sns.set_style("white")
plt.suptitle('Age distributions', size = 15, 
             fontweight = 'bold', fontfamily = 'serif')

# One entry per survey year: legend label, counts table, line colour and the
# answer text used for each panel. The 2017 table is limited to ages of 18
# and above, and the 2020 table uses "Man"/"Woman" instead of "Male"/"Female".
yearly_counts = [
    ('2017', sex_2017[sex_2017.age >= 18], '#73D055FF',
     {'Male': 'Male', 'Female': 'Female'}),
    ('2018', sex_2018, '#1F968BFF', {'Male': 'Male', 'Female': 'Female'}),
    ('2019', sex_2019, '#39568CFF', {'Male': 'Male', 'Female': 'Female'}),
    ('2020', sex_2020, '#440154FF', {'Male': 'Man', 'Female': 'Woman'}),
]

for ax, panel in ((ax1, 'Male'), (ax2, 'Female')):
    for year_label, counts, colour, answer in yearly_counts:
        sns.lineplot(data = counts[counts.sex == answer[panel]],
                     x = 'age', y = 'count',
                     color = colour, label = year_label, ax = ax)
    ax.set_title(panel, size = 13, fontweight = 'bold', fontfamily = 'serif')
    ax.legend(prop={'size': 14})
    ax.set_xlabel('Age', fontfamily = 'serif')
    ax.set_ylabel('')
    # style the tick labels through the axes object: plt.xticks/plt.yticks
    # only act on the current axes, which left the first panel unstyled
    for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
        tick_label.set_fontfamily('serif')

plt.show()

In [None]:
# degree answers to keep; both apostrophe variants are listed because the
# answer text is not spelled the same way in every survey
degree = ['Professional degree', "Bachelor's degree", "Bachelor’s degree", 
          "Master's degree", "Master’s degree", 'Doctoral degree']

# Each filtered frame is copied before the Q4 column is written, so the
# assignment never targets a slice of the yearly frame. Q4 always ends up
# with the straight apostrophe, which lets the years share one category set.
degree_2017 = df_2017[df_2017.FormalEducation.isin(degree)].copy()
degree_2017['Q4'] = degree_2017.FormalEducation.str.replace("’", "'", regex = False)
degree_2018 = df_2018[df_2018.Q4.isin(degree)].copy()
degree_2018['Q4'] = degree_2018.Q4.str.replace("’", "'", regex = False)
degree_2019 = df_2019[df_2019.Q4.isin(degree)].copy()
degree_2019['Q4'] = degree_2019.Q4.str.replace("’", "'", regex = False)
degree_2020 = df[df.Q4.isin(degree)].copy()
degree_2020['Q4'] = degree_2020.Q4.str.replace("’", "'", regex = False)

In [None]:
# stack the (year, degree) columns of all four surveys into one long table
degree_by_year = pd.concat([degree_2017[['year', 'Q4']],
                            degree_2018[['year', 'Q4']],
                            degree_2019[['year', 'Q4']],
                            degree_2020[['year', 'Q4']]], 
                           axis = 0)

plt.figure(figsize=(10, 5))
plt.title('Degree by year', size = 15, 
          fontweight = 'bold', fontfamily = 'serif')

# one group of bars per degree, one bar per year placed side by side
sns.histplot(data = degree_by_year, x = 'Q4', hue = 'year',
             multiple = "dodge", shrink = .8, edgecolor = 'black',
             palette = reversed(sns.color_palette("viridis", 4)))
plt.xlabel('Degree', fontfamily = 'serif')
plt.ylabel('Count', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

In [None]:
# job titles that are compared across the surveys
positions = ['Student', 'Data Engineer', 'Software Engineer', 'Data Scientist',
           'Data Analyst', 'Research Scientist','Statistician',
           'Product/Project Manager', 'Machine Learning Engineer',
           'Business Analyst', 'DBA/Database Engineer']

# 2017 and 2018 keep the job title under another column name, so it is
# copied into a Q5 column. The filtered frames are copied first so that the
# new column is not written to a slice of the yearly frame.
positions_2017 = df_2017[df_2017.CurrentJobTitleSelect.isin(positions)].copy()
positions_2017['Q5'] = positions_2017.CurrentJobTitleSelect
positions_2018 = df_2018[df_2018.Q6.isin(positions)].copy()
positions_2018['Q5'] = positions_2018.Q6
positions_2019 = df_2019[df_2019.Q5.isin(positions)]
positions_2020 = df[df.Q5.isin(positions)]

# stack the (year, position) columns of all four surveys into one long table
position_by_year = pd.concat([positions_2017[['year', 'Q5']],
                              positions_2018[['year', 'Q5']],
                              positions_2019[['year', 'Q5']],
                              positions_2020[['year', 'Q5']]], axis = 0)

plt.figure(figsize=(10, 10))
plt.title('Some positions by year', size = 15, 
          fontweight = 'bold', fontfamily = 'serif')

# positions on the y axis, one bar per year placed side by side
sns.histplot(data = position_by_year, y = 'Q5', hue = 'year',
             multiple = "dodge", shrink = .8, edgecolor = 'black',
             palette = list(reversed(sns.color_palette("viridis", 4))))
plt.xlabel('Count', fontfamily = 'serif')
plt.ylabel('Position', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

In [None]:
# the ten most frequent 2020 countries
countries = top_countries.index[:10]

# 2017 keeps the country under another column name, so it is copied into a
# Q3 column. The filtered frame is copied first so that the new column is
# not written to a slice of df_2017.
countries_2017 = df_2017[df_2017.Country.isin(countries)].copy()
countries_2017['Q3'] = countries_2017.Country
countries_2018 = df_2018[df_2018.Q3.isin(countries)]
countries_2019 = df_2019[df_2019.Q3.isin(countries)]
countries_2020 = df[df.Q3.isin(countries)]

# stack the (year, country) columns of all four surveys into one long table
cc_by_year = pd.concat([countries_2017[['year', 'Q3']],
                        countries_2018[['year', 'Q3']],
                        countries_2019[['year', 'Q3']],
                        countries_2020[['year', 'Q3']]], axis = 0)

plt.figure(figsize=(10, 8))
plt.title('TOP-2020 Kaggle countries by year', size = 15, 
          fontweight = 'bold', fontfamily = 'serif')

# countries on the y axis, one bar per year placed side by side
sns.histplot(data = cc_by_year, y = 'Q3', hue = 'year',
             multiple = "dodge", shrink = .8, edgecolor = 'black',
             palette = list(reversed(sns.color_palette("viridis", 4))))
# the counts run along the x axis (this label used to be set on the y axis
# and was immediately overwritten by 'Country')
plt.xlabel('Count', fontfamily = 'serif')
plt.ylabel('Country', fontfamily = 'serif')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

In [None]:
def lang_func(data, year):
    """Summarise multi-select language columns for one survey year.

    Every column of ``data`` stands for one language: its first non-missing
    value is used as the language label and the number of non-missing cells
    as the number of respondents who picked it.

    Parameters
    ----------
    data : DataFrame holding only the language columns of one survey.
    year : value written to the ``year`` column of the result.

    Returns
    -------
    DataFrame indexed by the original column names with the columns
    ``lang``, ``count``, ``perc`` and ``year``, sorted by ``perc``
    (largest first).

    Raises
    ------
    IndexError if a column has no non-missing value at all.
    """
    rows = [(answers.dropna().unique()[0], answers.count())
            for _, answers in data.items()]
    reg_pl = pd.DataFrame(rows, columns = ['lang', 'count'], index = data.columns)
    # percentage of this survey's respondents: divide by the rows of the
    # frame that was passed in, not by the size of the 2020 frame
    reg_pl['perc'] = reg_pl['count'] / len(data) * 100
    reg_pl = reg_pl.sort_values('perc', ascending = False)
    reg_pl['year'] = year
    return reg_pl

In [None]:
# language columns are selected by position in every survey
# NOTE(review): the column ranges are hard-coded - confirm they match the
# multi-select language question of each year's file
lang_2018 = lang_func(df_2018.iloc[:, 65:83], 2018)
lang_2019 = lang_func(df_2019.iloc[:, 82:94], 2019)
lang_2020 = lang_func(df.iloc[:, 7:20], 2020)
lang = pd.concat([lang_2018, lang_2019, lang_2020])
# 2017 gets a placeholder row with zero count and zero percent for every
# language seen in 2018-2020, so the year still appears in the legend
lang_2017 = pd.DataFrame({'lang': lang['lang'].unique(), 
                          'count': np.zeros(len(lang['lang'].unique())),
                          'perc': np.zeros(len(lang['lang'].unique())),
                          'year' : 2017})
lang = pd.concat([lang_2017, lang])
lang = lang.sort_values('perc', ascending = False)

plt.figure(figsize=(10, 15))
plt.title('Programming languages that respondents use on a regular basis by year', 
          size = 14, fontweight = 'bold', fontfamily = 'serif')

# languages on the y axis, one bar per year
sns.barplot(y = lang['lang'], x = lang['perc'], edgecolor = 'black',
            hue = lang['year'],
            palette = reversed(sns.color_palette("viridis", 4)))
plt.legend(loc = 'center right')
plt.xlabel('Part of respondents (%)', fontfamily = 'serif')
plt.ylabel('')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

In [None]:
def recom_lang(data, year):
    """Tabulate how often each language was recommended in one survey year.

    Parameters
    ----------
    data : Series with one recommended language per respondent.
    year : value written to the ``year`` column of the result.

    Returns
    -------
    DataFrame with the columns ``lang``, ``count``, ``perc`` and ``year``,
    sorted by ``perc`` (largest first). ``perc`` is relative to ``len(data)``,
    so respondents without an answer still count towards the total.
    """
    tally = data.value_counts()
    table = pd.DataFrame({'lang': tally.index, 'count': tally.to_numpy()})
    table = table.assign(perc = table['count'] / len(data) * 100)
    ranked = table.sort_values(by = 'perc', ascending = False)
    ranked['year'] = year
    return ranked

In [None]:
# the recommendation question lives in a differently named column each year
recom_2017 = recom_lang(df_2017.LanguageRecommendationSelect, 2017)
recom_2018 = recom_lang(df_2018.Q18, 2018)
recom_2019 = recom_lang(df_2019.Q19, 2019)
recom_2020 = recom_lang(df.Q8, 2020)
# stack the four yearly tables and order the rows by percentage, largest first
recom = pd.concat([recom_2017, recom_2018, recom_2019, recom_2020])
recom = recom.sort_values('perc', ascending = False)

plt.figure(figsize=(10, 15))
plt.title('Programming languages that respondents recommend to learn first by year', 
          size = 14, fontweight = 'bold', fontfamily = 'serif')

# languages on the y axis, one bar per year
sns.barplot(y = recom['lang'], x = recom['perc'], edgecolor = 'black',
            hue = recom['year'],
            palette = reversed(sns.color_palette("viridis", 4)))
plt.legend(loc = 'center right')
plt.xlabel('Part of respondents (%)', fontfamily = 'serif')
plt.ylabel('')
plt.xticks(fontfamily = 'serif')
plt.yticks(fontfamily = 'serif')
plt.show()

The importance of Python as the most recommended programming language increases every year.

## **Work in progress...**