In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import matplotlib.ticker as ticker
 import seaborn as sns

# Load the survey responses CSV into a DataFrame; every later cell derives from `data`.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv'
)

Before doing the main task, let's skim through the whole data. 

In [None]:
# Preview the first five rows of the raw responses.
data.head(n=5)

There are so many items in the data that I chose to extract only the information I need for this analysis. I sometimes want to know what people in circumstances similar to mine do. By looking at their choices, I can make a plan to achieve my goal.

In this EDA, at first, I have to specify what my current conditions are: Male with master's degree.

In [None]:
# Keep only the respondents whose profile matches mine: Q2 is "Man" and Q4 is a
# master's degree.
# The degree label contains a typographic apostrophe (U+2019). It is written as
# an escape because the previous literal held the mis-decoded form of that
# character ("â€™", i.e. UTF-8 bytes read as cp1252), which cannot match a
# correctly decoded column.
# NOTE(review): assumes the CSV is decoded as UTF-8 (the pd.read_csv default) - confirm.
is_man = data["Q2"] == "Man"
has_masters_degree = data["Q4"] == "Master\u2019s degree"
df_similar_to_me = data[is_man & has_masters_degree]

df_similar_to_me.head()

Let's take a glance at the age group.

In [None]:
# Respondents per Q1 answer (age bracket), ordered by bracket label.
age_group = df_similar_to_me['Q1'].value_counts().sort_index().astype(np.int64)
idx = age_group.index

fig, ax = plt.subplots(figsize=(15, 8))
ax.bar(idx, age_group, facecolor='lightgray')

# Both axis titles share the same text styling.
axis_title_style = dict(fontsize=14, fontweight='bold', color='gray', labelpad=12)
ax.set_xlabel('Age group', **axis_title_style)
ax.set_ylabel('Population', **axis_title_style)

# Write each bar's count just above the bar, horizontally centred on it.
# The bar whose count equals 1529 gets a bold label.
# NOTE(review): 1529 is a hard-coded count; presumably the highlighted bar's
# value for this particular dataset - confirm.
for bar in ax.patches:
    x0, _y0, bar_width, bar_height = bar.get_bbox().bounds
    emphasis = {'fontweight': 'bold'} if bar_height == 1529 else {}
    ax.annotate(bar_height.astype(np.int64), (x0 + bar_width / 2, bar_height + 30),
                ha='center', color='k', **emphasis)

# Highlight the third bar (position 2): yellow fill with a thick orange outline.
highlighted_bar = ax.patches[2]
highlighted_bar.set_facecolor('yellow')
highlighted_bar.set_linewidth('3')
highlighted_bar.set_edgecolor('darkorange')

It is not that interesting, but I'm glad that I don't seem to be late. Anyway, next we will analyze their current occupations.

In [None]:
# Respondents per Q5 answer (current role), ordered by role label.
current_role = df_similar_to_me['Q5'].value_counts().sort_index().astype(np.int64)

fig, ax = plt.subplots(figsize=(15, 8))
ax.bar(current_role.index, current_role, facecolor='lightgray')

axis_title_style = dict(fontsize=14, fontweight='bold', color='gray', labelpad=12)
ax.set_xlabel('Current role', **axis_title_style)
ax.set_ylabel('Population', **axis_title_style)

# Role names are long, so the tick labels are drawn vertically.
ax.tick_params(axis='x', rotation=90, labelsize=10, length=5)

# Bold count above every bar, horizontally centred on it.
for bar in ax.patches:
    x0, _y0, bar_width, bar_height = bar.get_bbox().bounds
    ax.annotate(bar_height.astype(np.int64), (x0 + bar_width / 2, bar_height + 10),
                ha='center', color='k', fontweight='bold')

# (bar position, fill colour, outline colour) for the three emphasised roles.
# The outline colour is reused for the matching x tick label.
# NOTE(review): positions 5, 6 and 9 presumably correspond to the three roles
# discussed in the text that follows; they depend on the sorted label order - confirm.
highlights = [
    (5, 'yellow', 'darkorange'),
    (6, 'lime', 'seagreen'),
    (9, 'skyblue', 'dodgerblue'),
]

for position, fill_colour, outline_colour in highlights:
    bar = ax.patches[position]
    bar.set_facecolor(fill_colour)
    bar.set_linewidth('3')
    bar.set_edgecolor(outline_colour)

for position, _fill_colour, outline_colour in highlights:
    tick_label = ax.get_xticklabels()[position]
    tick_label.set_color(outline_colour)
    tick_label.set_fontweight('bold')

Since the 3 jobs that I admire most are 'Data Scientist', 'Machine Learning Engineer' and 'Research Scientist', let's focus on those 3 groups. Maybe there are some differences in coding/programming experience between those groups.

In [None]:
# Restrict the cohort to the three roles of interest.
# .copy() makes this an independent frame, so the helper column added below is
# written to it directly instead of to a slice of df_similar_to_me (which raises
# SettingWithCopyWarning and may silently not stick).
target_roles = ["Data Scientist", "Machine Learning Engineer", "Research Scientist"]
top_3_jobs = df_similar_to_me[df_similar_to_me["Q5"].isin(target_roles)].copy()

top_3_jobs.head()

# Helper column of ones: summing it in a pivot table yields respondent counts.
top_3_jobs['count'] = 1

# Only these five Q6 brackets are shown; rows for any other Q6 answer are
# dropped by the reindex.
experience_order = ['1-2 years', '3-5 years', '5-10 years', '10-20 years', '20+ years']

# fill_value=0 (in both the pivot and the reindex) keeps empty role/experience
# combinations as 0 instead of NaN. NaN would turn the frame into floats, and
# the integer annotation format fmt="d" used below fails on floats.
jobs_exp = (
    pd.pivot_table(top_3_jobs, index=['Q6'], columns=['Q5'], values='count',
                   aggfunc='sum', fill_value=0)
    .reindex(experience_order, fill_value=0)
    .astype(int)
)

jobs_exp.head()

fig, ax = plt.subplots(figsize=(5,12))
# ax is passed explicitly so the heatmap is drawn on the axes labelled below.
sns.heatmap(jobs_exp, linewidths=2.5, annot=True, annot_kws={"size": 10, "fontweight": 'bold'},
            cmap="Greens", fmt="d", ax=ax)
ax.set_ylabel('Experience in programming',fontsize=14, fontweight='bold', color='gray', labelpad=12)
ax.set_xlabel('Current Jobs',fontsize=14, fontweight='bold', color='gray', labelpad=12)

I get a sense of how much experience I need for getting those jobs. Then, do I have to know many languages? Let's see.

In [None]:
# Per respondent, count how many Q7 language options have a (non-null) answer.
lang_answers = top_3_jobs.loc[:, 'Q7_Part_1':'Q7_OTHER']

# The 12th column of this range is the "None" option (it is labelled 'None' in
# the label list of the language bar chart), so it is excluded: a respondent who
# answered "None" uses 0 languages, not 1.
language_columns = lang_answers.columns.delete(11)
lang_sum = lang_answers[language_columns].count(axis=1).values

# assign() returns a new frame, so the column is never written into a slice of
# another DataFrame (the cause of SettingWithCopyWarning).
top_3_jobs = top_3_jobs.assign(lang_sum=lang_sum.astype(np.int64))

experience_order = ['1-2 years', '3-5 years', '5-10 years', '10-20 years', '20+ years']

g = sns.catplot(data=top_3_jobs, x='Q6', y='lang_sum', order=experience_order,
                kind='violin', inner='stick', palette='pastel')

g.set_xlabels('Experience in coding', fontsize=14, fontweight='bold', color='gray', labelpad=12)
g.set_ylabels('Number of languages', fontsize=14, fontweight='bold', color='gray', labelpad=12)

# Horizontal reference lines at 2 and 3 languages.
violin_ax = g.axes[0][0]
violin_ax.axhline(2, ls='-', color='dodgerblue', linewidth=3)
violin_ax.axhline(3, ls='-', color='dodgerblue', linewidth=3)

As the graph above shows, regardless of experience, most of them know 2 or 3 languages. Then which language should I learn?

In [None]:
df = pd.DataFrame(top_3_jobs.loc[:,'Q7_Part_1':'Q7_OTHER'])

# 1 where the respondent gave an answer for that option, 0 where the cell is null.
df_2 = df.notnull().astype('int')

# Column totals = number of respondents per option. They are computed directly
# with sum(): DataFrame.append, previously used to tack the totals on as an
# extra row, was removed in pandas 2.0.
lang_group = df_2.sum()

# Display names for the Q7 columns, in column order.
idx = ['Python', 'R', 'SQL', 'C', 'C++', 'Java', 'Javascript', 'Julia', 'Swift', 'Bash', 'MATLAB', 'None', 'Other']
assert len(idx) == len(lang_group), "label list and Q7 column range have different lengths"

fig, ax = plt.subplots(figsize=(15, 6))
ax.bar(idx, lang_group, facecolor='lightgray')
ax.set_xlabel('Language', fontsize=14, fontweight='bold', color='gray', labelpad=12)
ax.set_ylabel('Recommendations', fontsize=14, fontweight='bold', color='gray', labelpad=12)
ax.set_title('First Language to learn for aspiring data scientist', fontsize=14, fontweight='bold')

# Write each bar's count just above it; counts equal to 1626 or 861 are bold.
# NOTE(review): 1626 and 861 are hard-coded counts, presumably the values of the
# two highlighted bars for this particular dataset - confirm.
for p in ax.patches:
    left, bottom, width, height = p.get_bbox().bounds
    emphasis = {'fontweight': 'bold'} if height in (1626, 861) else {}
    ax.annotate(height.astype(np.int64), (left+width/2, height+30), ha='center', color='k', **emphasis)

# Highlight the bars at positions 0 and 2 ('Python' and 'SQL' in the label list).
for position in (0, 2):
    bar = ax.patches[position]
    bar.set_facecolor('yellow')
    bar.set_linewidth(3)
    bar.set_edgecolor('darkorange')

Luckily, Python is the most recommended language. Now for the most interesting part: it's time to talk about money.

In [None]:
# Respondents per yearly-compensation bracket, most frequent bracket first.
# The column is addressed by name rather than by position (118): a positional
# index silently points at a different question as soon as a column is inserted
# or removed ahead of it.
# NOTE(review): the later cells pivot on 'Q24' and map these labels onto 'Q24',
# so position 118 is taken to be that column - confirm against data.columns[118].
salary = top_3_jobs['Q24'].value_counts().astype(np.int64)

salary

Let's preprocess the index data for analysis. Each salary range is represented by its midpoint, so the analysis is only approximate: I don't know how salaries are distributed within each range.

In [None]:
def _bucket_midpoint(bucket):
    """Return a representative value for a cleaned salary bucket.

    `bucket` is a label with separators already stripped, either a closed range
    such as '1000-1999' or a single number such as '500000' (what remains of an
    open-ended label like '> $500,000').

    A closed range yields the mean of its two bounds. An open-ended bucket has
    no upper bound, so its lower bound is returned. Previously the missing '-'
    made the slicing drop the last digit of the number and average the result
    with the full number (275000 for '500000').
    """
    low, dash, high = bucket.partition('-')
    if not dash:
        return float(low)
    return (int(low) + int(high)) / 2


# Strip thousands separators, the currency sign and the '> ' prefix, in that order.
salary.index = [
    label.replace(',', '').replace('$', '').replace('> ', '')
    for label in salary.index
]

# Cleaned label -> representative salary, in the same order as salary.index.
new_index = {bucket: _bucket_midpoint(bucket) for bucket in salary.index}

new_index

In [None]:
# Respondent counts per (coding experience, salary bracket); empty combinations are 0.
exp_comp = pd.pivot_table(top_3_jobs, values='count', index=['Q6'], columns=['Q24'],
                          aggfunc='sum', fill_value=0)

# Attach each salary bracket to its representative value BY LABEL.
# The pivot's columns are sorted by label, while new_index follows the
# value_counts() (frequency) order, so assigning its values positionally would
# pair brackets with the wrong numbers. Zipping the raw labels, taken in that
# same value_counts() order, with the values gives a label-keyed lookup.
raw_salary_labels = top_3_jobs['Q24'].value_counts().index
assert len(raw_salary_labels) == len(new_index), "salary labels and midpoints are out of sync"
label_to_midpoint = dict(zip(raw_salary_labels, new_index.values()))
exp_comp = exp_comp.rename(columns=label_to_midpoint)

exp_comp

# Weighted mean of the representative salaries for every experience row.
# Computed for however many rows exist (no hard-coded row count) and without
# rebinding the builtin name `sum`.
midpoints = exp_comp.columns.to_numpy(dtype=float)
respondents = exp_comp.sum(axis=1)
# A row with no respondents yields NaN instead of a division by zero.
exp_comp['avg_income'] = (exp_comp.dot(midpoints) / respondents.where(respondents > 0)).round(2)

q6_order = ['I have never written code', '< 1 years', '1-2 years', '3-5 years','5-10 years', '10-20 years', '20+ years']

# Put the rows in experience order; brackets absent from the data become NaN rows.
exp_comp = exp_comp.reindex(q6_order)

fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(exp_comp.index, exp_comp['avg_income'], color = 'dodgerblue', linewidth=2.5, linestyle='--')
ax.bar(exp_comp.index, exp_comp['avg_income'], facecolor = 'lightgray', edgecolor='black')
ax.set_xlabel('Experience in coding', fontsize=14, fontweight='bold', color='gray', labelpad=12)
ax.set_ylabel('Average yearly compensation', fontsize=14, fontweight='bold', color='gray', labelpad=12)

Because all the data is categorical, drawing a line plot is not strictly justified; I only use it to show the trend. Let's break this result down by job. As in the procedure above, we first have to do some preprocessing to convert the string data to numeric data.

In [None]:
# Raw salary label -> representative salary. Both lists follow the
# value_counts() order of the salary column, so zipping them pairs each label
# with its own value.
new_salary_list = list(new_index.values())
old_salary_list = list(top_3_jobs['Q24'].value_counts().index)
idx_map = dict(zip(old_salary_list, new_salary_list))

q6_order = ['I have never written code', '< 1 years', '1-2 years', '3-5 years','5-10 years', '10-20 years', '20+ years']

# Representative number of years per experience bracket: the midpoint of each
# closed bracket and the lower bound of the open-ended '20+ years'.
# '< 1 years' is 0.5 (midpoint of 0-1) rather than 1.
integer_q6 = [0, 0.5, 1.5, 4, 7.5, 15, 20]

q6_dict = dict(zip(q6_order, integer_q6))

# Convert both categorical columns to numbers, then drop respondents who did
# not answer either question. Filling the gaps with 0 would count a missing
# salary as a real salary of 0 (and missing experience as 0 years) and drag the
# plotted means down; the pivot-based average earlier in the notebook likewise
# leaves unanswered rows out.
year_job_salary = (
    top_3_jobs[["Q6", "Q5", "Q24"]]
    .assign(
        Q6=lambda frame: frame["Q6"].map(q6_dict).astype(float),
        Q24=lambda frame: frame["Q24"].map(idx_map).astype(float),
    )
    .dropna(subset=["Q6", "Q24"])
)

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))
axs = axes.ravel()
palette = sns.color_palette("mako_r", 3)

# Left panel: mean compensation against experience, one line per job.
sns.lineplot(data=year_job_salary, x="Q6", y="Q24", hue="Q5", ci=None, palette=palette, ax=axs[0])
axs[0].legend().set_title('Jobs')
axs[0].set_xlabel('Experience in coding', fontsize=14, fontweight='bold', color='gray', labelpad=12)
axs[0].set_ylabel('Average yearly compensation', fontsize=14, fontweight='bold', color='gray', labelpad=12)

# Right panel: the same relationship with the three jobs pooled together.
sns.lineplot(data=year_job_salary, x="Q6", y="Q24", ci=None, ax=axs[1])
axs[1].set_ylabel('')
axs[1].set_xlabel('Experience in coding', fontsize=14, fontweight='bold', color='gray', labelpad=12)