In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for input_path in (os.path.join(dirname, filename) for filename in filenames):
        print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw survey export; the first row is split off as question text
# below and the remaining rows are kept as the actual responses.
data = pd.read_csv('/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')

# splitting the dataset into questions and data points
# `questions` is a Series indexed by column name (transposing a Series is a
# no-op, so no `.T` is needed).
questions = data.iloc[0]
# Take an explicit copy so `data` is an independent frame rather than a slice
# of the raw table; a later cell adds the 'Q15-2' column to it.
data = data.iloc[1:].copy()

In [None]:
# print all available questions
# Print every column name next to its question text.
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same
# (label, value) pairs and is available on older pandas versions as well.
for i, q in questions.items():
    print(i, q)

**This notebook is going to be heavily using heatmaps in order to visualize various distributions over different categories. For example:**
1. Q8: What programming language would you recommend an aspiring data scientist to learn first?

**This question leads us to ask: "How do the recommendations of participants with different backgrounds differ?" We could answer it by faceting a bar plot by each background category, but a simpler approach is to plot a heatmap of the relative frequency of recommendations within each group.**

# What programming language do people with different education levels recommend?

In [None]:
# Education answers (Q4) in the order the heatmap rows should appear.
q4_order = [
    'No formal education past high school',
    'Some college/university study without earning a bachelor’s degree',
    'Bachelor’s degree',
    'Professional degree',
    'Master’s degree',
    'Doctoral degree',
    'I prefer not to answer',
]

# Count respondents for every (education, recommended language) pair ...
q4_q8 = pd.pivot_table(data, index=['Q4'], columns=['Q8'], aggfunc='size', fill_value=0).loc[q4_order]
# ... then divide each row by its total to get relative frequencies per education level.
q4_q8 = q4_q8.div(q4_q8.sum(axis=1), axis=0)

fig, ax = plt.subplots(figsize=(7, 10))
sns.heatmap(
    q4_q8,
    cmap="YlGnBu",
    square=True,
    linewidth=2.5,
    cbar=False,
    annot=True,
    fmt=".2f",
    ax=ax,
)
ax.set_ylabel('Education')
ax.set_xlabel('Recommended Language')
ax.set_title('Recommended Languages Relative Frequencies by Education');

**We can see that the majority of participants, regardless of their education, recommended Python to aspiring data scientists.**

# What programming language do people with different titles recommend?

In [None]:
# Count respondents for every (job title, recommended language) pair ...
q5_q8 = pd.pivot_table(data, index=['Q5'], columns=['Q8'], aggfunc='size', fill_value=0)
# ... then divide each row by its total to get relative frequencies per title.
q5_q8 = q5_q8.div(q5_q8.sum(axis=1), axis=0)

fig, ax = plt.subplots(figsize=(7, 11))
sns.heatmap(
    q5_q8,
    cmap="YlGnBu",
    square=True,
    linewidth=2.5,
    cbar=False,
    annot=True,
    fmt=".2f",
    ax=ax,
)
ax.set_ylabel('Title')
ax.set_xlabel('Recommended Language')
ax.set_title('Recommended Languages Relative Frequencies by Title');

**It is clear that there is a consensus about the starter language among participants with different titles, but we can see that statisticians are mostly split between Python and R, as some may argue that R is better suited for statistics.**

**We can also see that some business analysts, data analysts, data engineers and data scientists recommend starting with SQL. It might be that these participants' activities heavily involve SQL. This could be further investigated by filtering for the participants who recommended SQL and checking their most frequent daily job activities in question 28.**

# What are the most used programming languages according to the multiple-selection question?

In [None]:
# Column names of the multiple-selection question Q7 (parts 1 through 12).
q7 = [f'Q7_Part_{part}' for part in range(1, 13)]

# Stack all Q7 columns into one long 'value' column and count each distinct
# answer (value_counts ignores missing entries).
selected_langs = pd.melt(data.reset_index(), id_vars='index', value_vars=q7)['value']
lang_counts = selected_langs.value_counts()

fig, ax = plt.subplots(figsize=(10, 6))
# Reverse the descending counts so the most frequent language is drawn at the top.
lang_counts[::-1].plot(kind='barh', ax=ax)
ax.set_title('Most Used Languages')
ax.set_xlabel('Count')
ax.set_ylabel('Language');

**Since most participants recommended Python, it is no surprise that it is also the most commonly used language among them.**

# What are the most used programming languages for groups with different experience?

In [None]:
def get_lang_freq(group, lang_cols=None):
    """Calculates each language absolute frequency from multiple selections of question 7.

    Parameters
    ----------
    group : pandas.DataFrame
        Respondent rows, e.g. one group produced by ``DataFrame.groupby``.
    lang_cols : sequence of str, optional
        Columns holding the multiple-selection answers. Defaults to the
        notebook-level ``q7`` list, so existing single-argument calls such as
        ``groupby(...).apply(get_lang_freq)`` keep working unchanged.

    Returns
    -------
    pandas.Series
        Count of every distinct non-null value found in ``lang_cols``, as
        returned by ``Series.value_counts``.
    """
    if lang_cols is None:
        lang_cols = q7
    # Melt only the answer columns: this yields the same long 'value' column as
    # melting the whole frame with an id column, but does not depend on the
    # frame's index being unnamed (no reset_index / id_vars='index' needed).
    stacked_answers = group[list(lang_cols)].melt()['value']
    return stacked_answers.value_counts()


# Coding-experience answers (Q6) in ascending order; this fixes the heatmap row
# order here and is reused by a later cell to order the coding-vs-ML heatmap.
q6_order = ['< 1 years', '1-2 years', '3-5 years', '5-10 years', '10-20 years', '20+ years']

# Per-experience-group language counts, one row per group and one column per language.
# fillna(0) turns (group, language) pairs that never occur into explicit zeros,
# matching the by-title version of this table instead of leaving blank NaN cells.
groups_lang_freq = data.groupby('Q6').apply(get_lang_freq).unstack().fillna(0).loc[q6_order]
# Divide each row by its total to get relative frequencies within each experience group.
groups_lang_freq = groups_lang_freq.div(groups_lang_freq.sum(axis=1), axis=0)

fig, ax = plt.subplots(figsize=(10, 14))
sns.heatmap(
    groups_lang_freq,
    cmap="YlGnBu",
    square=True,
    linewidth=2.5,
    cbar=False,
    annot=True,
    fmt=".2f",
    ax=ax,
)
ax.set_ylabel('Experience')
ax.set_xlabel('Language')
ax.set_title('Languages Relative Frequencies by Different Experience Groups');

**We can see a trend where the relative frequency of Python users in each group falls as experience increases. This might be because Python was relatively new to people with more experience, so only some of them began using Python while others were content with what they already knew. This could be further investigated by filtering for the respondents who don't use Python and analysing their titles, their stack, etc.**

# What are the most used programming languages for groups with different titles?

In [None]:
# Per-title language counts: one row per job title (Q5), one column per language,
# with never-selected combinations stored as 0.
title_lang_counts = data.groupby('Q5').apply(get_lang_freq).unstack().fillna(0).astype(int)
# Divide each row by its total to get relative frequencies within each title.
groups_lang_freq = title_lang_counts.div(title_lang_counts.sum(axis=1), axis=0)

fig, ax = plt.subplots(figsize=(7, 11))
sns.heatmap(
    groups_lang_freq,
    cmap="YlGnBu",
    square=True,
    linewidth=2.5,
    cbar=False,
    annot=True,
    fmt=".2f",
    ax=ax,
)
ax.set_ylabel('Title')
ax.set_xlabel('Language')
ax.set_title('Languages Relative Frequencies by Title');

**We can see a difference between ML Engineers and Data Scientists: among the former, C, C++, Java and Javascript are about twice as popular, while among the latter R and SQL are more popular.**

**This difference could be further investigated by analysing how the daily activities of ML Engineers differ from Data Scientists.**

**There are 2 different questions relating to two different kinds of experience:**
1. Q6: For how many years have you been writing code and/or programming?
2. Q15: For how many years have you used machine learning methods?

**These two questions enable us to ask another question.**

# How many of the coding veterans were familiar with machine learning methods from the start?

We can see below that the two features use different categories, so it might help to bin some of the ML experience categories so that we can better compare them with coding experience.

We'll merge 2-3 years, 3-4 years and 4-5 years into a single 3-5 years bin to match the coding experience categories.

In [None]:
# One bar chart per experience question, drawn side by side.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))
panels = [
    ('Q6', 'Coding Experience Frequencies'),
    ('Q15', 'ML Experience Frequencies'),
]
for panel_ax, (column, panel_title) in zip(axes, panels):
    data[column].value_counts().plot(kind='bar', ax=panel_ax)
    panel_ax.set_title(panel_title)

In [None]:
# binning some ML experience categories to be similar to coding experience
# binning some ML experience categories to be similar to coding experience
# Labels that only need renaming to the coding-experience (Q6) spelling.
relabelled_bins = {'Under 1 year': '< 1 years', '20 or more years': '20+ years'}
# The three finer-grained ML buckets that are collapsed into the single '3-5 years' bin.
merged_bins = dict.fromkeys(['2-3 years', '3-4 years', '4-5 years'], '3-5 years')
data['Q15-2'] = data['Q15'].replace({**relabelled_bins, **merged_bins})

In [None]:
# Original Q15 answer options in ascending order (not referenced again in this cell).
q15_order = [
    'I do not use machine learning methods',
    'Under 1 year',
    '1-2 years',
    '2-3 years',
    '3-4 years',
    '4-5 years',
    '5-10 years',
    '10-20 years',
    '20 or more years',
]

# Column order for the binned ML experience: the "no ML" option followed by the
# same buckets used for coding experience.
q15_2_order = ['I do not use machine learning methods'] + q6_order

# Count respondents for every (coding experience, binned ML experience) pair ...
q6_q15_piv = pd.pivot_table(data, index=['Q6'], columns=['Q15-2'], aggfunc='size', fill_value=0).loc[q6_order, q15_2_order]
# ... then divide each row by its total to get relative frequencies per coding-experience group.
q6_q15_piv = q6_q15_piv.div(q6_q15_piv.sum(axis=1), axis=0)

fig, ax = plt.subplots(figsize=(7, 10))
sns.heatmap(
    q6_q15_piv,
    cmap="YlGnBu",
    square=True,
    linewidth=2.5,
    cbar=False,
    annot=True,
    fmt=".1f",
    ax=ax,
)
ax.set_ylabel('Coding Experience (Years)')
ax.set_xlabel('Machine Learning Experience (Years)')
ax.set_title('Machine Learning Experience Distributed against Coding Experience');

We can see that only 30% of people with 3-5 years of coding experience were familiar with ML methods from the start, and most of them were introduced to ML somewhere in the past 2 years.

It also looks like 50% of the people with 1-2 years of coding experience began using ML methods in the last year. They probably started by learning to code in general and then shifted towards data science, or they learned R or Python for statistics and were later introduced to ML.

These were some of the ideas that I had. I'll update the notebook with more questions soon.

If you liked it, please upvote. I hope you can use the code and build upon it in your own kernels :D.