In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

# Print the full path of every file found under the Kaggle input directory
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

There are many paths into the Data Science field. The sheer number of paths and expert opinions can overwhelm students who want to start their careers as Data Scientists. The best way to understand how a field develops is to look at its trends, using a large dataset that covers many different paths and checking the patterns in it.

# Data Exploration (EDA)

In [None]:
# Load the raw survey responses and preview the first rows
survey_csv = "/kaggle/input/kaggle-survey-2020/kaggle_survey_2020_responses.csv"
data = pd.read_csv(survey_csv)
data.head()

In [None]:
# Summary statistics for the columns of the raw survey responses
data.describe()

In [None]:
# (number of rows, number of columns) of the raw survey responses
data.shape

In [None]:
# Number of missing values in each column
data.isna().sum()

It seems that the questions at the end of the survey are mostly nulls, maybe because of the length of the survey. I will handle the null values question by question, instead of handling them in advance, so as not to lose too much data before deciding which questions to use.

# What are the languages most used by Data Scientists?

In [None]:
def plot_legends(ax, title, x_label, x_shift = 0.09, hei = 20):
    """Title a chart, strip its non-data decoration and write each bar's value above it.

    Args:
        ax: the axis instance to decorate
        title: the title shown above the chart
        x_label: the label of the x axis
        x_shift: horizontal offset of each value label from the left edge of its bar
        hei: vertical offset of each value label above the top of its bar
    """
    ax.set_title(title)
    ax.set_xlabel(x_label)

    # Drop the y ticks and the frame to keep a high data-to-ink ratio
    ax.set_yticks([])
    ax.spines[:].set_visible(False)

    # Only the patches (bars) of the axis are labelled; an axis without patches gets no labels
    for bar in ax.patches:
        bar_top = bar.get_height()
        label_position = (bar.get_x() + x_shift, bar_top + hei)
        ax.annotate(str(round(bar_top)), label_position)

Plotting the frequency of each programming language used by Data Scientists to check which languages are the most used in the field.

In [None]:
# Keep only the respondents who answered 'Data Scientist' to Q5.
# .copy() makes this an independent frame: later cells assign to its Q24 column,
# which on a bare query() result would be a write to a slice of `data`.
data_scientists = data.query("Q5 == 'Data Scientist'").copy()

# Q7 is spread over one column per answer option; the last two Q7 columns are
# left out (presumably the "None"/"Other" options - TODO confirm).
lang_cols = [x for x in list(data_scientists.columns) if 'Q7' in x]

# Number of non-null answers per language column, relabelled with the language names
lang_freq = data_scientists[lang_cols[:-2]].count()
lang_freq.index = ['Python', 'R', 'SQL', 'C', 'C++', 'Java', 'Javascript', 'Julia', 'Swift', 'Bash', 'MATLAB']

In [None]:
# Bar chart of the language counts, most used first; the leading bar is drawn in red.
# No `xlabel` is passed to plot(): it expects a single string (not the list of index
# labels), and plot_legends sets the x label right after anyway.
lang_freq_sorted = lang_freq.sort_values(ascending = False)
bar_colors = ['r'] + ['b'] * (len(lang_freq_sorted) - 1)
ax = lang_freq_sorted.plot(kind = 'bar', color = bar_colors, figsize = (20, 5))

plot_legends(ax, 'Language Users Frequency', "Language Used")

# What are the most used languages among the Data Scientists with the highest salaries?

In [None]:
# Removing nonnumerical characters from the Salary field and splitting ranges to take the average

def remove_chars(chars):
    """Remove unwanted characters from the Salary column (Q24) of ``data_scientists``.

    Only string entries are edited; any other value (e.g. NaN for an unanswered
    question, or a number from a previous run of this cell) is left untouched.

    Args:
        chars: the list of characters to be removed
    """
    for char in chars:
        data_scientists['Q24'] = data_scientists['Q24'].apply(
            lambda i: i.replace(char, '') if isinstance(i, str) else i)


def _salary_midpoint(value):
    """Turn a cleaned salary range such as '1,000-1,999' into the mean of its bounds.

    A string without '-' is treated as a single bound. Non-string values are
    returned unchanged.
    """
    if not isinstance(value, str):
        return value
    bounds = [float(bound.replace(",", "")) for bound in value.split("-")]
    return np.mean(bounds)


remove_chars(['$', '>', '<'])

# Replace the column (instead of `.loc[:, 'Q24'] = ...`) so that it becomes a float
# column: on pandas >= 2.0 a `.loc[:, col]` assignment is done in place and would
# keep the object dtype of the original text column.
data_scientists['Q24'] = pd.to_numeric(data_scientists['Q24'].apply(_salary_midpoint))
data_scientists['Q24'].hist(bins = 50, grid = False, figsize = (20, 5));

In [None]:
# Mean salary (Q24) for each answer to Q15, lowest mean salary first
tmp_experience = data_scientists[['Q15','Q24']].groupby('Q15').mean().sort_values(by = 'Q24')

# Shorten the one long answer by name. Overwriting the whole index with a hand-ordered
# list would attach the wrong label to a group whenever the salary ordering changes.
# Assumes the raw answer reads 'I do not use machine learning methods' (TODO confirm);
# if it does not, the original label is simply kept.
tmp_experience = tmp_experience.rename(index = {'I do not use machine learning methods': 'I do not use ML'})

# A single line is drawn, so a single colour is enough
ax = tmp_experience.plot(kind = 'line', figsize = (20,5), color = 'r', legend = False,
                         marker = 'o', linewidth = 2, markersize = 12)

plot_legends(ax, 'Mean Salary by Machine Learning Experience', "Machine Learning Experience")

# plot_legends only annotates patches (bars), so the values of the line markers are written here
for position, mean_salary in enumerate(tmp_experience.Q24):
    ax.annotate(str(round(mean_salary)), xy = (position, mean_salary + 4e3))

# Is there an association between salaries and the programming Languages used?

In [None]:
# One column per language plus the salary column.
# .copy() gives an independent frame, so the assignments below do not write into a
# slice of data_scientists.
salary_lang_df = data_scientists[lang_cols[:-2] + ['Q24']].copy()
salary_lang_df.columns = ['Python', 'R', 'SQL', 'C', 'C++', 'Java', 'Javascript', 'Julia', 'Swift', 'Bash', 'MATLAB', 'Salary']

# Replace every cell that holds its own column name by 1.
# Iterate over the column labels without the trailing 'Salary':
# `salary_lang_df[:-1]` drops the last *row*, so iterating it still visits every
# column, Salary included.
for col in salary_lang_df.columns[:-1]:
    salary_lang_df.loc[salary_lang_df[col] == col, col] = 1

In [None]:
import matplotlib.pyplot as plt

language_names = ['Python', 'R', 'SQL', 'C', 'C++',
                  'Java', 'Javascript', 'Julia',
                  'Swift', 'Bash', 'MATLAB']

# Long format: one row per (salary, language) pair; pairs with a null value are dropped
salary_per_language = pd.melt(salary_lang_df, id_vars=["Salary"], value_vars=language_names)
salary_per_language = salary_per_language.dropna(subset = ['value'], axis = 0)
salary_per_language = salary_per_language.drop('value', axis = 1)

# Mean salary per language, highest first
tmp = salary_per_language.groupby('variable').mean().sort_values(by = 'Salary', ascending = False)
tmp['variable'] = tmp.index

# The third and sixth bars are drawn in red, all others in blue
ax = tmp.plot(
    x = 'variable',
    y = 'Salary',
    kind = 'bar',
    figsize = (20,5),
    color = ['b', 'b', 'r', 'b', 'b', 'r', 'b', 'b', 'b', 'b', 'b'],
)

plot_legends(ax, 'Mean Salary for Language used', "Language Used", 0.05, 1000)

In [None]:
counted_languages = ['Python', 'R', 'SQL', 'C', 'C++', 'Java',
                     'Javascript', 'Julia', 'Swift', 'Bash', 'MATLAB']

# Long format: one row per (salary, language) pair; pairs with a null value are dropped
language_rows = pd.melt(salary_lang_df, id_vars=["Salary"], value_vars=counted_languages)
language_rows = language_rows.dropna(subset = ['value'], axis = 0).drop('value', axis = 1)

# Number of remaining rows per language
tmp = language_rows.variable.value_counts()

# The first and third bars are drawn in red, all others in blue
ax = tmp.plot(
    kind = 'bar',
    figsize = (20,5),
    color = ['r', 'b', 'r', 'b', 'b', 'blue', 'b', 'b', 'b', 'b', 'b'],
)

plot_legends(ax, 'Number of people using different programming languages', "Language Used", 0.09, 100)

In [None]:
# Q37 is spread over one column per answer option; the last two Q37 columns are
# left out (presumably the "None"/"Other" options - TODO confirm).
course_cols = [x for x in list(data_scientists.columns) if 'Q37' in x]

# .copy() gives an independent frame, so the assignments below do not write into a
# slice of data_scientists.
salary_course_df = data_scientists[course_cols[:-2] + ['Q24']].copy()
salary_course_df.columns = ["Coursera", "edX", "Kaggle Learn Courses", "DataCamp", "Fast.ai", "Udacity", "Udemy", "LinkedIn Learning", "Cloud-certification programs (direct from AWS, Azure, GCP, or similar)", "University Courses (resulting in a university degree)", "Salary"]

# Replace every cell that holds its own column name by 1.
# Iterate over the column labels without the trailing 'Salary':
# `salary_course_df[:-1]` drops the last *row*, so iterating it still visits every
# column, Salary included.
for col in salary_course_df.columns[:-1]:
    salary_course_df.loc[salary_course_df[col] == col, col] = 1

# Updating the list of names
salary_course_df.columns = ["Coursera", "edX", "Kaggle Learn Courses",
                            "DataCamp", "Fast.ai", "Udacity", "Udemy",
                            "LinkedIn Learning", "Cloud-certification",
                            "University Courses", "Salary"]

In [None]:
platform_names = ["Coursera", "edX", "Kaggle Learn Courses", "DataCamp", "Fast.ai",
                  "Udacity", "Udemy", "LinkedIn Learning", "Cloud-certification",
                  "University Courses"]

# Long format: one row per (salary, platform) pair; pairs with a null value are dropped
val = pd.melt(salary_course_df, id_vars=["Salary"], value_vars=platform_names)
val = val.dropna(subset = ['value'], axis = 0).drop('value', axis = 1)

# Mean salary per platform divided by the number of rows of that platform
mean_salary = val.groupby('variable').Salary.mean()
user_counts = val.variable.value_counts()[list(mean_salary.index)]
val2 = mean_salary / user_counts

ax = val2.sort_values(ascending = False).plot(figsize = (20, 5), kind = 'bar')

plot_legends(ax, 'Mean Salary for Data Scientists who started with the platform divided by the number of users for the platform', "Platform Used", 0.05, 1000)

In [None]:
# Q4 answers of the data scientists with a non-null Q7_Part_8
# (presumably the Julia option of Q7 - TODO confirm)
data_scientists.loc[data_scientists['Q7_Part_8'].notna(), 'Q4'].value_counts()

In [None]:
# Q15 answers of the data scientists with a non-null Q7_Part_8
# (presumably the Julia option of Q7 - TODO confirm)
data_scientists.loc[data_scientists['Q7_Part_8'].notna(), 'Q15'].value_counts()

In [None]:
# Count the rows whose Julia indicator is 1
julia_users = salary_lang_df.query("Julia == 1")
print('The number of people that uses Julia = {}'.format(len(julia_users)))

In [None]:
# Count the rows whose Julia and Python indicators are both 1
# (the stray double quote that was printed inside the message is removed)
print('The number of people that uses Julia & Python = {}'.format(salary_lang_df.query("Julia == 1 & Python == 1").shape[0]))

## Results
Although it seems that people who use Julia earn more than people who use other languages like Python, further investigation showed that 95% of the people who use Julia also use Python.

It was also noted that 33 of the 71 people who know Julia have a Master's degree and 21 have a PhD, which helps explain why their salary is higher on average than that of other data scientists.

There are many more variables that need to be investigated to reach a better understanding.