In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# INTRODUCTION

As someone who is new to data science, I want to learn the programming languages that professionals use, so that I can focus on the 
subjects that are needed on the job.  In this analysis, professionals are respondents with 5 or more years of programming experience.
In addition, I want to know how much it costs to learn these languages.
I hope this analysis will help guide students who aspire to become data scientists.

In [None]:
# Load the survey responses and list the available columns.
df = pd.read_csv('/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')
print(list(df.columns))

# Give the questions used in this notebook readable column names.
readable_names = {
    'Q1': 'age',
    'Q3': 'country',
    'Q4': 'education',
    'Q5': 'role',
    'Q6': 'pg_exp',
    'Q24': 'compensation',
    'Q25': 'cost',
}
df = df.rename(columns=readable_names)

# Shorten the country names so that they match the country reference data that is merged in later.
short_country_names = {
    'United Kingdom of Great Britain and Northern Ireland': 'United Kingdom',
    'United States of America': 'United States',
    'Iran, Islamic Republic of...': 'Iran',
    'Viet Nam': 'Vietnam',
    'Republic of Korea': 'South Korea',
}
df['country'] = df['country'].replace(short_country_names)

In [None]:
# Preview the first rows of the survey frame.
df.head()

In [None]:
# List the distinct values of the country column to check the shortened names.
df['country'].unique()

In [None]:
# Summary statistics for the survey columns.
df.describe()

In [None]:
# Number of respondents per country, counted over the rows from index label 1 onwards.
countries_summary = df.loc[1:, ['country']].groupby('country')['country'].count()
countries_summary_df = countries_summary.to_frame(name='respondents').reset_index()

countries_summary_df

In [None]:
# Country reference table; its latitude/longitude columns position the markers on the maps below.
countries_df = pd.read_csv(
    '/kaggle/input/latitude-and-longitude-for-every-country-and-state/world_country_and_usa_states_latitude_and_longitude_values.csv'
)
countries_df.head()

In [None]:
# Show the survey respondents' current country of residence.
# The merge starts from the two summary columns only, so re-running this cell
# does not collide with the latitude/longitude columns added by a previous run
# (which would otherwise be renamed with _x/_y suffixes and break the plot).
countries_summary_df = pd.merge(
    countries_summary_df[['country', 'respondents']],
    countries_df,
    on='country',
)
fig = px.scatter_geo(
    countries_summary_df,
    lat="latitude",
    lon="longitude",
    color="country",
    hover_name="country",
    size="respondents",
    title="Which country do the respondents reside",
    projection="natural earth",
)
fig.show()

In [None]:
def change_range_values(x):
    '''
    Convert a money-range answer into one representative number.

      x - the answer, e.g. '1,000-1,999', '$1-$99' or '> $500,000'

    Returns the mean of the numeric bounds found in x (e.g. 1499.5 for
    '1,000-1,999'), the stated bound for the two open-ended answers, and 0
    for anything that cannot be parsed (missing values, non-text values,
    free text).
    '''
    # Missing answers arrive as NaN (a float) or None; only text can hold a range.
    if not isinstance(x, str):
        return 0

    # Open-ended top brackets have no upper bound, so their lower bound is used.
    open_ended = {
        '> $500,000': 500000,
        '$100,000 or more ($USD)': 100000,
    }
    if x in open_ended:
        return open_ended[x]

    bounds = []
    for part in x.split('-'):
        digits = part.replace(',', '').replace('$', '').replace('>', '').strip()
        # isdecimal() only accepts characters that int() can parse; isnumeric()
        # also accepts characters such as fractions, on which int() raises.
        if not digits.isdecimal():
            return 0
        bounds.append(int(digits))

    return np.mean(bounds)

# Turn the compensation and cost answers into one representative number each.
df['compensation_mean'] = df['compensation'].apply(change_range_values)
df['cost_mean'] = df['cost'].apply(change_range_values)

# Show the original answers next to their numeric values.
df[['compensation', 'compensation_mean', 'cost', 'cost_mean']]

## Average Compensation Per Country of Residence

The average compensation that I used is the mean of the compensation range. Asian countries such as Sri Lanka, Iran, Bangladesh, Philippines, Indonesia, Vietnam, Pakistan, Nepal and India and African countries such as Morocco, Kenya and Nigeria belong to those with the lowest compensation.


In [None]:
# Average compensation per country, using only respondents with a compensation above 0.
with_compensation_df = df.loc[df['compensation_mean'] > 0]
countries_income_summary_df = (
    with_compensation_df.loc[1:, ['country', 'compensation_mean']]
    .groupby('country')
    .agg({'compensation_mean': 'mean'})
    .reset_index()
)
# Attach the coordinates used to place each country on the map.
countries_income_summary_df = pd.merge(countries_income_summary_df, countries_df, on='country')

fig = px.scatter_geo(
    countries_income_summary_df,
    lat="latitude",
    lon="longitude",
    color="country_code",
    hover_name="country",
    size="compensation_mean",
    title="What is the average compensation of the respondents per country of residence?",
    projection="natural earth",
)
fig.show()

In [None]:
# Number of respondents per role, plus each role's share of all counted respondents.
role_summary = df.loc[1:, ['role']].groupby(['role'])['role'].count()
role_summary_df = (
    role_summary
    .to_frame(name='respondents')
    .reset_index()
    .assign(percentage=lambda roles: roles['respondents'] / roles['respondents'].sum())
)

role_summary_df

## Roles of the respondents

Most of the respondents are students, followed by Data Scientists and Software Engineers.

In [None]:
# Show the share of respondents in each role.
sns.set(rc={'figure.figsize':(30,20)})
ax = sns.barplot(x="percentage", y="role", data=role_summary_df)
# The trailing semicolon hides the Text(...) repr, as in the other plotting cells.
ax.set_title('What are the roles of respondents', fontsize=20);

In [None]:
def get_sum(df, q, r, p='Part', o = 'OTHER'):
    '''
    Count the answers given to each part of a multiple-choice question.

      df - the dataframe
      q - the question number e.g. Q7
      r - the part numbers to read, e.g. np.arange(1, 13) reads Q7_Part_1 up to Q7_Part_12
      p - the text between the question number and the part number in the column name
      o - the name of the OTHER choice, e.g. Q7_OTHER; accepted but not used

    Returns (k, c): k holds one label per part (the first non-null value of
    that column) and c holds the number of non-null answers in that column.
    If a column has no answers at all, its column name is used as the label
    and its count is 0.
    '''
    labels = []
    counts = []
    for i in r:
        column = '_'.join([q, p, str(i)])
        answers = df[column].dropna()
        # An option nobody selected has no value to use as a label, so fall
        # back to the column name instead of failing on an empty column.
        labels.append(answers.iloc[0] if len(answers) else column)
        counts.append(len(answers))
    return labels, counts

In [None]:
def get_language_reg(df, top):
    '''
      Return the `top` most selected answers of question Q7 (parts 1 to 12).

      df - the dataframe holding the Q7_Part_* columns
      top - the number of rows to keep

      The result has the columns language_reg, count and percentage, where
      percentage is the count divided by the sum of all Q7 part counts.
    '''
    labels, counts = get_sum(df, 'Q7', np.arange(1, 13))
    usage = pd.DataFrame({'language_reg': labels, 'count': counts})
    usage['percentage'] = usage['count'] / usage['count'].sum()
    return usage.nlargest(top, 'percentage')

In [None]:
def get_ide(df, top):
    '''
      Return the `top` most selected answers of question Q9 (parts 1 to 11).

      df - the dataframe holding the Q9_Part_* columns
      top - the number of rows to keep

      The result has the columns ide, count and percentage, where
      percentage is the count divided by the sum of all Q9 part counts.
    '''
    labels, counts = get_sum(df, 'Q9', np.arange(1, 12))
    usage = pd.DataFrame({'ide': labels, 'count': counts})
    usage['percentage'] = usage['count'] / usage['count'].sum()
    return usage.nlargest(top, 'percentage')

In [None]:
def get_notebook(df, top):
    '''
      Return the `top` most selected answers of question Q10 (parts 1 to 13).

      df - the dataframe holding the Q10_Part_* columns
      top - the number of rows to keep

      The result has the columns notebook, count and percentage, where
      percentage is the count divided by the sum of all Q10 part counts.
    '''
    labels, counts = get_sum(df, 'Q10', np.arange(1, 14))
    usage = pd.DataFrame({'notebook': labels, 'count': counts})
    usage['percentage'] = usage['count'] / usage['count'].sum()
    return usage.nlargest(top, 'percentage')

In [None]:
def get_visual(df, top):
    '''
      Return the `top` most selected answers of question Q14 (parts 1 to 11).

      df - the dataframe holding the Q14_Part_* columns
      top - the number of rows to keep

      The result has the columns visual, count and percentage, where
      percentage is the count divided by the sum of all Q14 part counts.
    '''
    labels, counts = get_sum(df, 'Q14', np.arange(1, 12))
    usage = pd.DataFrame({'visual': labels, 'count': counts})
    usage['percentage'] = usage['count'] / usage['count'].sum()
    return usage.nlargest(top, 'percentage')

# Professionals

Professionals are defined as respondents whose role is not "Student", "Other" or "Currently not employed", and who have 5 or more years of programming experience. 

In [None]:
# Professionals: role is none of the excluded ones AND programming experience is 5 years or more.
excluded_roles = ['Student', 'Other', 'Currently not employed']
experienced_levels = ['5-10 years', '10-20 years', '20+ years']

has_professional_role = ~df['role'].isin(excluded_roles)
is_experienced = df['pg_exp'].isin(experienced_levels)
pg_df = df.loc[has_professional_role & is_experienced]

## Programming Languages
The top programming languages currently used by professionals are Python, R and SQL. These are also the languages that they recommend. 
The top IDE is Jupyter Notebook, and the two most popular hosted notebooks are Colab and Kaggle.

In [None]:
# Tools regularly used (and languages recommended) by the more experienced respondents.

# Top 3 answers per tool question; 'Role' tags the rows for the later comparison with students.
language_reg_df = get_language_reg(pg_df, 3)
language_reg_df['Role'] = 'Professional'
language_reg_df = language_reg_df.drop(columns = ['count'])

# Q8 is a single-answer column, so it is counted with a plain groupby.
language_reco = pg_df.groupby('Q8')['Q8'].count()
language_reco_df = language_reco.to_frame(name='count').reset_index()
language_reco_df['percentage'] = language_reco_df['count'] / language_reco_df['count'].sum()
language_reco_df = language_reco_df.nlargest(3, 'percentage')

ide_df = get_ide(pg_df, 3)
ide_df['Role'] = 'Professional'
ide_df = ide_df.drop(columns = ['count'])

notebook_df = get_notebook(pg_df, 3)
notebook_df['Role'] = 'Professional'
notebook_df = notebook_df.drop(columns = ['count'])

visual_df = get_visual(pg_df, 3)
visual_df['Role'] = 'Professional'
visual_df = visual_df.drop(columns = ['count'])

sns.set(rc={'figure.figsize':(30,20)})
fig, axs = plt.subplots(3, 2)

# One bar chart per (data, category column, title, axes) entry.
panels = [
    (language_reg_df, "language_reg", 'Top 3 Languages Regularly Used by Users with 5 or more years of Experience', axs[0,0]),
    (language_reco_df, "Q8", 'Top 3 Languages Recommended by Users with 5 or more years of Experience', axs[0,1]),
    (ide_df, "ide", 'Top 3 IDEs Regularly Used by Users with 5 or more years of Experience', axs[1,0]),
    (notebook_df, "notebook", 'Top 3 Hosted Notebooks Regularly Used by Users with 5 or more years of Experience', axs[1,1]),
    (visual_df, "visual", 'Top 3 Visualization Tool Regularly Used by Users with 5 or more years of Experience', axs[2,0]),
]
for panel_df, panel_column, panel_title, panel_ax in panels:
    sns.barplot(x=panel_column, y="percentage", data=panel_df, ax=panel_ax)
    panel_ax.set_title(panel_title)

# Only five charts are drawn on the 3x2 grid; hide the unused sixth axes
# so that no empty frame appears in the figure.
axs[2,1].set_visible(False)

## Hardware

When learning data science, most of the professionals use only a personal computer or laptop and GPUs

In [None]:
# Hardware used by the more experienced respondents.

# Q11: share of each answer, keeping the 3 most common ones.
computing = pg_df.groupby('Q11')['Q11'].count()
computing_df = computing.to_frame(name='count').reset_index()
computing_df['percentage'] = computing_df['count'] / computing_df['count'].sum()
computing_df = computing_df.nlargest(3, 'percentage')

# Q12 parts 1 to 3: count per part and share of all selections.
k, c = get_sum(pg_df, 'Q12', np.arange(1, 4))
hw_special_df = pd.DataFrame({'hw_special': k, 'count': c})
hw_special_df['percentage'] = hw_special_df['count'] / hw_special_df['count'].sum()
hw_special_df = hw_special_df.nlargest(3, 'count')

# Q13: share of each answer (all answers are kept).
tpu = pg_df.groupby('Q13')['Q13'].count()
tpu_df = tpu.to_frame(name='count').reset_index()
tpu_df['percentage'] = tpu_df['count'] / tpu_df['count'].sum()

sns.set(rc={'figure.figsize':(30,10)})
fig, axs = plt.subplots(1, 3)

ax_q11 = sns.barplot(x="Q11", y="percentage", data=computing_df, ax=axs[0])
axs[0].set_title('Top 3 Computing Platform Most Often Used by Users with 5 or more years of Experience')
# Tilt the Q11 tick labels.
ax_q11.set_xticklabels(
    ax_q11.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
)

sns.barplot(x="hw_special", y="percentage", data=hw_special_df, ax=axs[1])
axs[1].set_title('Top 3 Specialized Hardware Regularly Used by Users with 5 or more years of Experience')

sns.barplot(x="Q13", y="percentage", data=tpu_df, ax=axs[2])
axs[2].set_title('No of times TPU is used by Users with 5 or more years of Experience');

## Machine Learning

Many of the professionals are also new to machine learning. The most popular ML frameworks they use are Scikit-learn, TensorFlow and Keras. As for ML algorithms, the most widely used are Linear or Logistic Regression, Decision Trees or Random Forests, and Convolutional Neural Networks.

In [None]:
# Advanced topics: distribution of the Q15 answers among the more experienced respondents.
machine_learning_exp = pg_df.groupby('Q15')['Q15'].count()
machine_learning_exp_df = machine_learning_exp.to_frame(name='count').reset_index()
machine_learning_exp_df['percentage'] = (
    machine_learning_exp_df['count'] / machine_learning_exp_df['count'].sum()
)

sns.set(rc={'figure.figsize':(30,30)})
ax = sns.barplot(x="percentage", y="Q15", data=machine_learning_exp_df)
ax.set_title('Years of Machine Learning Experience of Users with 5 or more years of Experience', fontsize=20);


In [None]:
# Machine learning tools and methods used by the more experienced respondents.

# Q16 parts 1 to 15: share of all selections, top 3 kept.
k, c = get_sum(pg_df, 'Q16', np.arange(1, 16))
ml_framework_df = pd.DataFrame({'ml_framework': k, 'count': c})
ml_framework_df['percentage'] = ml_framework_df['count'] / ml_framework_df['count'].sum()
ml_framework_df = ml_framework_df.nlargest(3, 'percentage')

# Q17, Q18 and Q19 are ranked and plotted by raw count, not by percentage.
k, c = get_sum(pg_df, 'Q17', np.arange(1, 12))
ml_algo_df = pd.DataFrame({'ml_algo': k, 'count': c}).nlargest(3, 'count')

k, c = get_sum(pg_df, 'Q18', np.arange(1, 7))
vision_df = pd.DataFrame({'vision': k, 'count': c}).nlargest(3, 'count')

k, c = get_sum(pg_df, 'Q19', np.arange(1, 6))
nlp_df = pd.DataFrame({'nlp': k, 'count': c}).nlargest(3, 'count')

sns.set(rc={'figure.figsize':(30,20)})
fig, axs = plt.subplots(2, 2)

sns.barplot(x="ml_framework", y="percentage", data=ml_framework_df, ax=axs[0,0])
axs[0,0].set_title('Top 3 ML Frameworks Regularly Used by Users with 5 or more years of Experience')

sns.barplot(x="ml_algo", y="count", data=ml_algo_df, ax=axs[0,1])
axs[0,1].set_title('Top 3 ML Algo  Regularly Used by Users with 5 or more years of Experience')

ax_vision = sns.barplot(x="vision", y="count", data=vision_df, ax=axs[1,0])
axs[1,0].set_title('Top 3 Computer Vision Methods Regularly Used by Users with 5 or more years of Experience')
# Tilt the tick labels of the two bottom charts.
ax_vision.set_xticklabels(
    ax_vision.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
)

ax_nlp = sns.barplot(x="nlp", y="count", data=nlp_df, ax=axs[1,1])
axs[1,1].set_title('Top 3 NLP Regularly Used by Users with 5 or more years of Experience')
ax_nlp.set_xticklabels(
    ax_nlp.get_xticklabels(),
    rotation=45,
    horizontalalignment='right',
);


## Cloud Computing

For cloud computing, the platforms and products most used by professionals are AWS and GCP. For big data, the top 3 products are MySQL, PostgreSQL and SQL Server.


In [None]:
# Cloud, machine learning and big data products used by the more experienced respondents.
# For each question: count per part, share of all selections, then the top 3 by count.

k, c = get_sum(pg_df, 'Q26_A', np.arange(1, 12))
cloud_computing_platform_df = pd.DataFrame({'cloud_computing_platform': k, 'count': c})
cloud_computing_platform_df['percentage'] = cloud_computing_platform_df['count'] / cloud_computing_platform_df['count'].sum()
cloud_computing_platform_df = cloud_computing_platform_df.nlargest(3, 'count')

k, c = get_sum(pg_df, 'Q27_A', np.arange(1, 12))
cloud_computing_products_df = pd.DataFrame({'cloud_computing_products': k, 'count': c})
cloud_computing_products_df['percentage'] = cloud_computing_products_df['count'] / cloud_computing_products_df['count'].sum()
cloud_computing_products_df = cloud_computing_products_df.nlargest(3, 'count')

k, c = get_sum(pg_df, 'Q28_A', np.arange(1, 11))
machine_learning_products_df = pd.DataFrame({'machine_learning_products': k, 'count': c})
machine_learning_products_df['percentage'] = machine_learning_products_df['count'] / machine_learning_products_df['count'].sum()
machine_learning_products_df = machine_learning_products_df.nlargest(3, 'count')

k, c = get_sum(pg_df, 'Q29_A', np.arange(1, 18))
big_data_products_df = pd.DataFrame({'big_data_products': k, 'count': c})
big_data_products_df['percentage'] = big_data_products_df['count'] / big_data_products_df['count'].sum()
big_data_products_df = big_data_products_df.nlargest(3, 'count')

sns.set(rc={'figure.figsize':(30,20)})
fig, axs = plt.subplots(2, 2)

sns.barplot(x="cloud_computing_platform", y="percentage", data=cloud_computing_platform_df, ax=axs[0,0])
axs[0,0].set_title('Top 3 Cloud Computing Platform Regularly Used by Users with 5 or more years of Experience')

sns.barplot(x="cloud_computing_products", y="percentage", data=cloud_computing_products_df, ax=axs[0,1])
axs[0,1].set_title('Top 3 Cloud Computing Products Regularly Used by Users with 5 or more years of Experience')

sns.barplot(x="machine_learning_products", y="percentage", data=machine_learning_products_df, ax=axs[1,0])
axs[1,0].set_title('Top 3 Machine Learning Products Regularly Used by Users with 5 or more years of Experience')

# This chart shows the big data products, so its title must not repeat the
# machine learning title of the chart next to it.
sns.barplot(x="big_data_products", y="percentage", data=big_data_products_df, ax=axs[1,1])
axs[1,1].set_title('Top 3 Big Data Products Regularly Used by Users with 5 or more years of Experience');

# STUDENTS

The majority of the students are from India and the US.  Most of them have either no programming experience or up to 2 years of it.

In [None]:
# Students: respondents whose role is exactly 'Student'.
is_student = df['role'].isin(['Student'])
student_df = df.loc[is_student]
student_df

In [None]:
# Number of students per country of residence.
countries_student_summary = student_df.loc[:, ['country']].groupby('country')['country'].count()
countries_student_summary_df = (
    countries_student_summary
    .to_frame(name='respondents')
    .reset_index()
)
countries_student_summary_df

In [None]:
# Map of the students' countries of residence.
# The merge starts from the two summary columns only, so re-running this cell
# does not collide with the latitude/longitude columns added by a previous run
# (which would otherwise be renamed with _x/_y suffixes and break the plot).
countries_student_summary_df = pd.merge(
    countries_student_summary_df[['country', 'respondents']],
    countries_df,
    on='country',
)
fig = px.scatter_geo(
    countries_student_summary_df,
    lat="latitude",
    lon="longitude",
    color="country_code",
    hover_name="country",
    size="respondents",
    title="Which country do the students reside",
    projection="natural earth",
)
fig.show()

In [None]:
# Distribution of the programming-experience answers among the students.
pg_exp = student_df.groupby('pg_exp')['pg_exp'].count()
pg_exp_df = pg_exp.to_frame(name='count').reset_index()
pg_exp_df['percentage'] = pg_exp_df['count'] / pg_exp_df['count'].sum()

sns.set(rc={'figure.figsize':(10,10)})
ax = sns.barplot(x="percentage", y="pg_exp", data=pg_exp_df)
ax.set_title('Years of Programming Experience', fontsize=20);

## Programming Languages

Students may need to study SQL as an additional programming language. As for the IDE, Notebook and Visualization tool, the students are using the same tools as the professionals.

In [None]:
# Programming languages used by students, side by side with the professionals' top 3.
language_reg_student_df = (
    get_language_reg(student_df, 3)
    .assign(Role='Student')
    .drop(columns=['count'])
)
merge_language_reg = pd.concat([language_reg_df, language_reg_student_df])

g = sns.catplot(
    data=merge_language_reg,
    kind="bar",
    x="language_reg",
    y="percentage",
    hue="Role",
    ci="sd",
    palette="dark",
    alpha=.6,
    height=6,
)
g.fig.suptitle('Comparison of Programming Languages Used by Professionals and Students');

In [None]:
# IDEs used by students, side by side with the professionals' top 3.
ide_student_df = (
    get_ide(student_df, 3)
    .drop(columns=['count'])
    .assign(Role='Student')
)
merge_ide_df = pd.concat([ide_df, ide_student_df])

g = sns.catplot(
    data=merge_ide_df,
    kind="bar",
    x="ide",
    y="percentage",
    hue="Role",
    ci="sd",
    palette="dark",
    alpha=.6,
    height=6,
)
g.fig.suptitle('Comparison of IDE Used by Professionals and Students')
g.set_xticklabels(rotation=90);

In [None]:
# Hosted notebooks used by students, side by side with the professionals' top 3.
notebook_student_df = get_notebook(student_df, 3)
notebook_student_df = notebook_student_df.drop(columns=['count'])
notebook_student_df['Role'] = 'Student'
merge_notebook_df = pd.concat([notebook_df, notebook_student_df])
g = sns.catplot(data=merge_notebook_df, kind="bar", x="notebook", y="percentage", hue="Role", ci="sd", palette="dark", alpha=.6, height=6);
# This chart compares hosted notebooks, so it must not reuse the
# visualization-tools title of the following chart.
g.fig.suptitle('Comparison of Hosted Notebooks Used by Professionals and Students')
g.set_xticklabels(rotation=90);

In [None]:
# Visualization tools used by students, side by side with the professionals' top 3.
visual_student_df = (
    get_visual(student_df, 3)
    .drop(columns=['count'])
    .assign(Role='Student')
)
merge_visual_df = pd.concat([visual_df, visual_student_df])

g = sns.catplot(
    data=merge_visual_df,
    kind="bar",
    x="visual",
    y="percentage",
    hue="Role",
    ci="sd",
    palette="dark",
    alpha=.6,
    height=6,
)
g.fig.suptitle('Comparison of Visualization Tools Used by Professionals and Students')
g.set_xticklabels(rotation=90);

In [None]:
# Advanced topics: distribution of the Q15 answers among the students.
machine_learning_exp = student_df.groupby('Q15')['Q15'].count()
machine_learning_exp_df = machine_learning_exp.to_frame(name='count')
# Reset the index once only; a second reset would add a meaningless 'index' column.
machine_learning_exp_df = machine_learning_exp_df.reset_index()
machine_learning_exp_df['percentage'] = machine_learning_exp_df['count'] / machine_learning_exp_df['count'].sum()

sns.set(rc={'figure.figsize':(10,10)})
ax = sns.barplot(x="percentage", y="Q15", data=machine_learning_exp_df)
ax.set_title('Years of Machine Learning Experience', fontsize=20);

# COST

The analysis below looks at the countries where the cost of learning is 45% of income. Among these countries, the 3 listed here have an average compensation that is lower than the cost.

* Bangladesh
* Iran
* Kenya

In [None]:
# Column means over the respondents who reported a cost above 0.
# numeric_only=True limits the mean to the numeric columns; without it, recent
# pandas versions raise a TypeError on the text answer columns of the survey.
total_cost_mean = df[df['cost_mean'] > 0].mean(numeric_only=True)

# Ratio of the overall average cost to each country's average compensation
# (a value above 1 means the cost exceeds the average compensation).
countries_income_summary_df['cost_over_compensation'] = total_cost_mean['cost_mean'] / countries_income_summary_df['compensation_mean']

In [None]:
sns.set(rc={'figure.figsize':(30,20)})

# Keep the countries where the overall average cost exceeds the average compensation (ratio above 1).
cost_exceeds_income = countries_income_summary_df['cost_over_compensation'] > 1
countries_more_than_xpct = countries_income_summary_df[cost_exceeds_income]

ax = sns.barplot(x="country", y="compensation_mean", data=countries_more_than_xpct)
ax.set_title('Compensation vs Cost', fontsize=20)
# Horizontal reference line at the overall average cost.
ax.axhline(total_cost_mean['cost_mean']);

In [None]:
# Average cost per country, using only respondents with a cost above 0.
with_cost_df = df.loc[df['cost_mean'] > 0]
countries_cost_summary_df = (
    with_cost_df.loc[1:, ['country', 'cost_mean']]
    .groupby('country')
    .agg({'cost_mean': 'mean'})
    .reset_index()
)
# Attach the coordinates used to place each country on the map.
countries_cost_summary_df = pd.merge(countries_cost_summary_df, countries_df, on='country')

fig = px.scatter_geo(
    countries_cost_summary_df,
    lat="latitude",
    lon="longitude",
    color="country_code",
    hover_name="country",
    size="cost_mean",
    title="How much have been spent on machine learning per country ?",
    projection="natural earth",
)
fig.show()

In [None]:
# Cost per Q20 answer, for the respondents with a cost above 0.
sns.set(rc={'figure.figsize': (15, 10)})
ax = sns.barplot(data=with_cost_df, x="Q20", y="cost_mean")
ax.set_title('How much have been spent on machine learning per company size', fontsize=20);

# CONCLUSION

  The survey shows that twenty-five percent (25%) of the respondents are students.  Most of these students are from India.  The students use almost the same programming languages, IDEs, notebooks and visualization tools as the professionals. They may need to add SQL to the languages that they are using or studying.
  In the area of machine learning/cloud computing, the respondents have spent around 20,000 USD, which is quite high. This cost is around 100% of the average compensation of professionals in 21 countries.  Most likely, this is the area where the students need support. Alternatively, they may choose to apply to a large company that has been spending a large amount on machine learning or cloud computing.