In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import re
import matplotlib.pyplot as plt

!pip install pycountry_convert
import pycountry_convert

from sklearn.impute import KNNImputer

In [None]:
# Path of the survey CSV that the rest of the notebook works from.
survey_csv_path = '../input/datarelated-developers-survey-by-stack-overflow/survey_final.csv'
# low_memory=False: infer each column's dtype from the whole file instead of chunk by chunk.
data = pd.read_csv(survey_csv_path, low_memory=False)

In [None]:
# Summary statistics for every column, numeric and non-numeric alike.
data.describe(include='all')

In the next cell, I clean the data and make the answer labels consistent across survey years.

In [None]:
# Make all data fields consistent between survey years.
# Each result is assigned back to its column: calling replace(..., inplace=True)
# on `data[col]` is a chained in-place operation, which pandas warns about and
# which no longer updates `data` once Copy-on-Write is enabled.

# Education level: collapse the per-year wordings onto one short label each.
edlevel_labels = {
    "Master's degree (M.A., M.S., M.Eng., MBA, etc.)": "Master's degree",
    "Bachelor's degree (B.A., B.S., B.Eng., etc.)": "Bachelor's degree",
    "Secondary school": "Secondary school",
    "Professional degree (JD, MD, etc.)": "Professional degree",
    "Some college/university study without earning a degree": "Some college/university study without earning a bachelor's degree",
    "Associate degree (A.A., A.S., etc.)": "Associate degree",
    "Other doctoral degree (Ph.D., Ed.D., etc.)": "Doctoral degree",
    "Bachelor's degree (BA, BS, B.Eng., etc.)": "Bachelor's degree",
    "Master's degree (MA, MS, M.Eng., MBA, etc.)": "Master's degree",
    "Other doctoral degree (Ph.D, Ed.D., etc.)": "Doctoral degree",
    # The next two keys contain a mis-encoded apostrophe exactly as it appears in the data.
    "Bachelor‚'s degree (BA, BS, B.Eng., etc.)": "Bachelor's degree",
    "Master‚'s degree (MA, MS, M.Eng., MBA, etc.)": "Master's degree",
    "Secondary school (e.g. American high school, German Realschule or Gymnasium, etc.)": "Secondary school",
    " ": "I prefer not to answer",
    "Full-stack developer": "I prefer not to answer",
}
data['EdLevel'] = data['EdLevel'].replace(edlevel_labels)

# Organisation size: both small-company wordings become one bucket.
orgsize_labels = {
    '2-9 employees': '2 to 9 employees',
    'Fewer than 10 employees': '2 to 9 employees',
}
data['OrgSize'] = data['OrgSize'].replace(orgsize_labels)

# Undergraduate major: collapse the per-year wordings onto one short label each.
undergrad_major_labels = {
    "Computer science, computer engineering, or software engineering": "Computer science",
    "Another engineering discipline (such as civil, electrical, mechanical, etc.)": "Another engineering discipline",
    "A humanities discipline (such as literature, history, philosophy, etc.)": "Humanities",
    "A health science (such as nursing, pharmacy, radiology, etc.)": "Health science",
    "Information systems, information technology, or system administration": "Information systems",
    "A natural science (such as biology, chemistry, physics, etc.)": "Natural science",
    "Fine arts or performing arts (such as graphic design, music, studio art, etc.)": "Fine arts or performing arts",
    "A social science (such as anthropology, psychology, political science, etc.)": "Social science",
    "A business discipline (such as accounting, finance, marketing, etc.)": "Business",
    "Another engineering discipline (ex. civil, electrical, mechanical)": "Another engineering discipline",
    "A business discipline (ex. accounting, finance, marketing)": "Business",
    "A natural science (ex. biology, chemistry, physics)": "Natural science",
    "A social science (ex. anthropology, psychology, political science)": "Social science",
    "A humanities discipline (ex. literature, history, philosophy)": "Humanities",
    "Fine arts or performing arts (ex. graphic design, music, studio art)": "Fine arts or performing arts",
    "A health science (ex. nursing, pharmacy, radiology)": "Health science",
    "Computer science or software engineering": "Computer science",
    "A non-computer-focused engineering discipline": "Another engineering discipline",
    "A social science": "Social science",
    "A natural science": "Natural science",
    "A business discipline": "Business",
    "Information technology, networking, or system administration": "Information systems",
    "Fine arts or performing arts": "Fine arts or performing arts",
    "Management information systems": "Information systems",
    "A humanities discipline": "Humanities",
    "Psychology": "Social science",
    "A health science": "Health science",
}
data['UndergradMajor'] = data['UndergradMajor'].replace(undergrad_major_labels)

def jobsat(x):
    """Translate one job-satisfaction answer to a number on a 1-9 scale.

    Text answers from 'Extremely dissatisfied' to 'Extremely satisfied' map to
    1..9. The numeric answers 10 and 0 are mapped to 9 and 1 respectively.
    Any other value (other numbers, unknown strings, missing values) is
    returned unchanged.
    """
    label_scores = {
        'Extremely satisfied': 9,
        'Very satisfied': 8,
        'Moderately satisfied': 7,
        'Slightly satisfied': 6,
        'Neither satisfied nor dissatisfied': 5,
        'Slightly dissatisfied': 4,
        'Moderately dissatisfied': 3,
        'Very dissatisfied': 2,
        'Extremely dissatisfied': 1,
    }
    # Text answers: look the label up, falling back to the answer itself.
    if isinstance(x, str):
        return label_scores.get(x, x)
    # Numeric answers: only the two extremes are remapped.
    if x == 10:
        return 9
    if x == 0:
        return 1
    return x

# Convert every JobSat answer with jobsat(); astype(float) raises if any answer is left non-numeric.
data['JobSat'] = data['JobSat'].apply(jobsat).astype(float) 


def yearcodeconvert(x):
    """Convert a years-of-professional-coding answer to an integer.

    Parameters
    ----------
    x : str or any
        One raw answer. Non-string values (numbers, NaN) are returned as is.

    Returns
    -------
    int, float or the input
        * two numbers in the text (a range such as "1 to 2 years"): the
          second number, i.e. the upper bound;
        * otherwise the first number in the text, capped at 30;
        * NaN when the text contains no digits at all;
        * the input itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    numbers = re.findall('[0-9]+', x)
    if not numbers:
        # No digits to parse: treat the answer as missing instead of failing on numbers[0].
        return float('nan')
    if len(numbers) == 2:
        return int(numbers[1])
    # Parse the extracted digits rather than the whole string, so answers
    # such as "5 years" convert too; min() applies the cap at 30.
    return min(int(numbers[0]), 30)
# Text answers that stand for a fixed number of years.
years_code_labels = {'Less than 1 year': 1,
                     'Less than a year': 1,
                     '20 or more years': 20,
                     '30 or more years': 30,
                     'More than 50 years': 30}

# Assign the result back instead of replace(..., inplace=True) on `data[col]`:
# that chained in-place call is warned about by pandas and no longer updates
# `data` once Copy-on-Write is enabled.
data['YearsCodePro'] = (data['YearsCodePro']
                        .replace(years_code_labels)
                        .apply(yearcodeconvert))

In [None]:
# Keep one slice per survey year; the DevType processing below handles each year separately.
survey_year = data['Year']
data_2017 = data[survey_year == 2017]
data_2018 = data[survey_year == 2018]
data_2019 = data[survey_year == 2019]
data_2020 = data[survey_year == 2020]

## DevType separation

This step converts the values of the `DevType` column into dummy variables, one per developer type. I then keep only the data-related developer types:
- Data scientist or machine learning specialist
- Database administrator
- Data or business analyst
- Engineer, data

### 2017

In [None]:
# Rows whose DevType mentions a data / machine-learning / business role.
mask = data_2017['DevType'].str.contains(r'data|machine|business', case=False, na=False)
df = data_2017[mask].copy()
# Remove leading whitespace. The result is assigned back because an in-place
# replace on `df['DevType']` is a chained operation that stops updating `df`
# once pandas Copy-on-Write is enabled.
df['DevType'] = df['DevType'].replace(r'^\s+', '', regex=True)
# Split the text on '; ' (the 2017 separator) into one indicator column per developer type.
split_2017 = df['DevType'].str.get_dummies(sep='; ')
# 2017 offers different choices from 2018-2020, so merge the two 2017 choices
# into the combined category used by the later years.
split_2017['Data scientist or machine learning specialist'] = (
    split_2017['Data scientist'] | split_2017['Machine learning specialist']
)
# Get the desired columns
select_type_2017 = ['Data scientist or machine learning specialist',
                    'Database administrator']
# Index labels of the rows that have at least one selected job type.
# (Taking `.index` of the boolean Series directly would return every row.)
has_selected_type = split_2017.loc[:, select_type_2017].sum(axis=1) != 0
candidate_2017 = has_selected_type[has_selected_type].index

In [None]:
# Preview the 2017 developer-type indicator columns.
split_2017.head()

### 2018

In [None]:
# Rows whose DevType mentions a data / machine-learning / business role.
mask = data_2018['DevType'].str.contains(r'data|machine|business', case=False, na=False)
df = data_2018[mask].copy()
# Remove leading whitespace. The result is assigned back because an in-place
# replace on `df['DevType']` is a chained operation that stops updating `df`
# once pandas Copy-on-Write is enabled.
df['DevType'] = df['DevType'].replace(r'^\s+', '', regex=True)
# Split the text on ';' into one indicator column per developer type.
split_2018 = df['DevType'].str.get_dummies(sep=';')
# Get the desired columns
select_type_2018 = ['Data or business analyst',
                    'Data scientist or machine learning specialist',
                    'Database administrator']
# Index labels of the rows that have at least one selected job type.
# (Taking `.index` of the boolean Series directly would return every row.)
has_selected_type = split_2018.loc[:, select_type_2018].sum(axis=1) != 0
candidate_2018 = has_selected_type[has_selected_type].index

### 2019

In [None]:
# Rows whose DevType mentions a data / machine-learning / business role.
mask = data_2019['DevType'].str.contains(r'data|machine|business', case=False, na=False)
df = data_2019[mask].copy()
# Remove leading whitespace. The result is assigned back because an in-place
# replace on `df['DevType']` is a chained operation that stops updating `df`
# once pandas Copy-on-Write is enabled.
df['DevType'] = df['DevType'].replace(r'^\s+', '', regex=True)
# Split the text on ';' into one indicator column per developer type.
split_2019 = df['DevType'].str.get_dummies(sep=';')
# Get the desired columns
select_type_2019 = ['Data or business analyst',
                    'Data scientist or machine learning specialist',
                    'Database administrator',
                    'Engineer, data']
# Index labels of the rows that have at least one selected job type.
# (Taking `.index` of the boolean Series directly would return every row.)
has_selected_type = split_2019.loc[:, select_type_2019].sum(axis=1) != 0
candidate_2019 = has_selected_type[has_selected_type].index

### 2020

In [None]:
# Rows whose DevType mentions a data / machine-learning / business role.
mask = data_2020['DevType'].str.contains(r'data|machine|business', case=False, na=False)
df = data_2020[mask].copy()
# Remove leading whitespace. The result is assigned back because an in-place
# replace on `df['DevType']` is a chained operation that stops updating `df`
# once pandas Copy-on-Write is enabled.
df['DevType'] = df['DevType'].replace(r'^\s+', '', regex=True)
# Split the text on ';' into one indicator column per developer type.
split_2020 = df['DevType'].str.get_dummies(sep=';')
# Get the desired columns
select_type_2020 = ['Data or business analyst',
                    'Data scientist or machine learning specialist',
                    'Database administrator',
                    'Engineer, data']
# Index labels of the rows that have at least one selected job type.
# (Taking `.index` of the boolean Series directly would return every row.)
has_selected_type = split_2020.loc[:, select_type_2020].sum(axis=1) != 0
candidate_2020 = has_selected_type[has_selected_type].index

- Now we merge the split `DevType` columns of all four years. From here on, we only use the data of developers who have a data-related job.
- We then also convert the features that contain lists of values into dummy variables: `DatabaseDesireNextYear`, `DatabaseWorkedWith`, `LanguageDesireNextYear` and `LanguageWorkedWith`.

In [None]:
# Stack the selected job-type indicator columns of the four survey years.
yearly_job_dummies = [split_2017[select_type_2017],
                      split_2018[select_type_2018],
                      split_2019[select_type_2019],
                      split_2020[select_type_2020]]
dm_dev_type = pd.concat(yearly_job_dummies, axis=0)

# Shorter column names for the four job types.
short_job_names = {'Data scientist or machine learning specialist': 'DS_MLspecialist',
                   'Database administrator': 'DB_Admin',
                   'Data or business analyst': 'DA_BAnalyst',
                   'Engineer, data': 'DataEngineer'}
dm_dev_type = dm_dev_type.rename(columns=short_job_names)

# From here on only the rows present in the job-type dummies are considered.
data = data.loc[dm_dev_type.index, :]
# Function to convert 
def dummies_converter(df, col):
    """Expand a delimiter-separated multi-answer column into dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Year' column and the column named by `col`.
    col : str
        Name of the text column holding the delimited answers.

    Returns
    -------
    pandas.DataFrame
        The dummies of the 2017 rows (answers separated by '; ') stacked on
        top of the dummies of the rows with Year >= 2018 (separated by ';').
        Columns that exist in only one of the two parts hold NaN in the other.

    Notes
    -----
    Leading whitespace is removed on a local copy of the column; `df` itself
    is not modified. The earlier in-place replace on `df[col]` was a chained
    operation that pandas warns about and that has no effect under
    Copy-on-Write.
    """
    # Remove whitespace at the beginning of the text.
    cleaned = df[col].replace(r'^\s+', '', regex=True)
    year = df['Year']
    # The separator differs between 2017 and the later years.
    dm_2017 = cleaned[year == 2017].str.get_dummies(sep='; ')
    dm_later = cleaned[year >= 2018].str.get_dummies(sep=';')
    return pd.concat([dm_2017, dm_later], axis=0)
# Multi-answer features; DevType is last and is not converted in this cell.
feat_for_dm = ['DatabaseDesireNextYear', 'DatabaseWorkedWith', 'LanguageDesireNextYear', 'LanguageWorkedWith', 'DevType']
# One dummy frame per multi-answer feature, in the order listed above (DevType excluded).
dm_db_nextyear, dm_db_work, dm_language_nextyear, dm_language_work = [
    dummies_converter(data, feature) for feature in feat_for_dm[:-1]
]
# The raw text columns are no longer needed once converted.
data = data.drop(columns=feat_for_dm, errors='ignore')

**Now we have 5 data frames of dummies features**
- dm_dev_type
- dm_db_nextyear
- dm_db_work
- dm_language_nextyear
- dm_language_work

In [None]:
# Preview the job-type indicator columns.
dm_dev_type.head()

In [None]:
# Work on a copy so that the filters applied to `df` below leave `data` untouched.
df = data.copy()
# Summary statistics for every column of the working copy.
df.describe(include='all')

**The target variable of this analysis is the annual salary, `ConvertedComp`, so let's take a look at that feature first.**

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Salary distribution per survey year, split by employment status (before any filtering).
fig, ax = plt.subplots(figsize=(18, 5))
box_spec = dict(x="Year", y="ConvertedComp", hue='Employment')
sns.boxplot(data=df, ax=ax, **box_spec)
sns.despine(offset=10, trim=True)
plt.show()

To make this analysis as practical as possible, we only consider people who currently hold a data-related job. We therefore exclude respondents without salary information, as well as those who are not employed, are retired, or preferred not to state their employment status.

In [None]:
# Employment answers that are excluded from the analysis.
excluded_employment = ['Not employed, and not looking for work',
                       'Not employed, but looking for work',
                       'I prefer not to say', 'Retired']
is_excluded = df['Employment'].isin(excluded_employment)
df = df.loc[~is_excluded, :]
# Keep only respondents with salary information.
has_salary = df["ConvertedComp"].notna()
df = df.loc[has_salary, :]

In [None]:
# Same box plot again, now on the rows that passed the employment and salary filters.
fig, ax = plt.subplots(figsize=(18, 5))
box_spec = dict(x="Year", y="ConvertedComp", hue='Employment')
sns.boxplot(data=df, ax=ax, **box_spec)
sns.despine(offset=10, trim=True)
plt.show()

The box plot shows many outliers in the annual salary. To keep the analysis relevant, we only keep respondents whose annual salary is above 0 and below $300,000.

In [None]:
# Keep salaries strictly between 0 and 300,000.
salary = df['ConvertedComp']
within_range = salary.gt(0) & salary.lt(300000)
df = df.loc[within_range, :]
# Box plot of the remaining salaries per year and employment status.
fig, ax = plt.subplots(figsize=(18, 5))
box_spec = dict(x="Year", y="ConvertedComp", hue='Employment')
sns.boxplot(data=df, ax=ax, **box_spec)
sns.despine(offset=10, trim=True)
plt.show()

In [None]:
# Percentage of the rows of `data` that are still in `df` after the employment,
# missing-salary and outlier filters above.
len(df)/len(data)*100

## Missing values treatment

In [None]:
# Number of missing values remaining per column of `df` (the dummy frames are not included).
df.isna().sum()

In [None]:
# Boolean mask with one entry per row: True where the row has at least one missing value.
nans_index = df.isna().any(axis=1)
# Percentage of rows that would be lost by dropping every incomplete row
# (the original note quoted 14.5% - TODO confirm against the current data).
len(df[nans_index])/len(df)*100

In [None]:
# Column dtypes of the filtered frame.
print(df.dtypes)

In [None]:
# NaN in a dummy frame means the option was not offered for that row's survey
# year, so it is replaced by 0. Keep in mind that, for example, the 2017
# database options do not include all options of the later years, and that
# the data/business analyst and data engineer DevType options do not exist in 2017.
dm_db_nextyear = dm_db_nextyear.fillna(value=0)
dm_dev_type = dm_dev_type.fillna(value=0)
dm_db_work = dm_db_work.fillna(value=0)
dm_language_nextyear = dm_language_nextyear.fillna(value=0)
dm_language_work = dm_language_work.fillna(value=0)

In [None]:
# Join the filtered survey rows with all dummy frames, keeping only the rows present in every frame.
frames_to_join = [df, dm_dev_type, dm_db_nextyear, dm_db_work, dm_language_nextyear, dm_language_work]
df_all = pd.concat(frames_to_join, axis=1, join='inner')
# Export the combined data (without the index).
filename = 'stack.csv'
df_all.to_csv(filename, index=False)
# Preview
df_all.head()

# Analysis

In [None]:
# Express the salary in thousands for easier interpretation of the plots below.
# NOTE(review): this overwrites the column in place, so re-running this cell
# divides by 1000 again - restart and run all cells after changing anything above.
df['ConvertedComp'] = df['ConvertedComp']/1000

## 1. What is the impact of education major on salary

## 2. What is the impact of Job type on salary

In [None]:
# Attach the job-type dummies to the filtered survey rows (rows present in both frames only).
job_frames = [df, dm_dev_type]
df_job = pd.concat(job_frames, axis=1, join='inner')
df_job.head()

Percentage of respondents by title

In [None]:
# Count the respondents per job type and survey year...
df_dev_type = pd.concat([data['Year'], dm_dev_type], axis=1, join='inner')
df_measured = df_dev_type.groupby('Year').sum()
# ...and divide each year's counts by the total number of respondents of that year.
respondents_per_year = {2017: data_2017.shape[0],
                        2018: data_2018.shape[0],
                        2019: data_2019.shape[0],
                        2020: data_2020.shape[0]}
for year, n_respondents in respondents_per_year.items():
    df_measured.loc[year, :] = df_measured.loc[year, :] / n_respondents

In [None]:
# Share of each year's respondents holding each data-related job type.
df_measured

### Average salary by title

In [None]:
# Set the style before the axes are created; seaborn styles only affect axes created afterwards.
sns.set_style("white")
fig, axes = plt.subplots(nrows=1, ncols=4, sharex="all", figsize=(16, 4))
names = ["Data scientist \n Machine learning specialist", 'Database administrator' , 'Data or business analyst', 'Engineer, data']
job_types = ['DS_MLspecialist', 'DB_Admin', 'DA_BAnalyst', 'DataEngineer']
# One panel per job type; the hue separates respondents with (1) and without (0) that job type.
for job_type, name, ax in zip(job_types, names, axes.flatten()):
    # `legend_out` was dropped here: it is an option of catplot/FacetGrid,
    # not of the axes-level pointplot, which either ignores or rejects it
    # depending on the seaborn version.
    sns.pointplot(x="Year", y="ConvertedComp", hue=job_type,
                  capsize=.2, palette="rocket",
                  data=df_job, ax=ax)
    ax.set_title(name, fontsize=14)
    # Fixed y-range so the four panels are directly comparable
    # (in the units of ConvertedComp after the rescaling above).
    ax.set_ylim(bottom=50, top=75)

### Median salary by average job satisfaction

In [None]:
# Mean job satisfaction per survey year, one column per job type.
satisfaction_by_job = [
    df_job.loc[df_job[job_flag] == 1].groupby('Year')['JobSat'].mean()
    for job_flag in ('DS_MLspecialist', 'DB_Admin', 'DA_BAnalyst', 'DataEngineer')
]
df_toconcat = pd.concat(satisfaction_by_job, axis=1)
df_toconcat.head()

In [None]:
def _stat_by_year_and_job(value_col, stat):
    """Return a Year x job-type table of `stat` of `value_col`, using the rows flagged 1 for each job type."""
    per_job = [df_job.loc[df_job[job] == 1].groupby('Year')[value_col].agg(stat)
               for job in job_types]
    return pd.concat(per_job, axis=1, keys=job_types)


def _to_long(wide, value_name):
    """Reshape a Year x job-type table into rows of (year, value, type) with a fresh 0..n-1 index."""
    pieces = [pd.DataFrame({'year': wide.index,
                            value_name: wide[job].values,
                            'type': job})  # scalar is broadcast to however many years exist
              for job in wide.columns]
    return pd.concat(pieces, axis=0, ignore_index=True)


# Median salary per year and job type, in long format.
median_salary = _stat_by_year_and_job('ConvertedComp', 'median')
df_test = _to_long(median_salary, 'medianSalary')

# Mean job satisfaction per year and job type. It is computed here rather than
# taken from a frame built in another cell; `df_measured` ends up holding this
# table, as before.
df_measured = _stat_by_year_and_job('JobSat', 'mean')
df_test1 = _to_long(df_measured, 'averageSatisfation')


In [None]:
# Median salary against average satisfaction: colour = job type, marker size = survey year.
fig, axes = plt.subplots(figsize=(6,6))
sns.set_style("whitegrid")
# Put the satisfaction column next to the salary table (both are in the same row order).
salary_vs_satisfaction = pd.concat([df_test, df_test1['averageSatisfation']], axis=1)
sns.scatterplot(data=salary_vs_satisfaction,
                x='averageSatisfation', y='medianSalary',
                hue='type', size="year", palette="tab10",
                s=200,
                ax=axes)

## Data job in the world

In [None]:
# Country list with ISO 3166 codes and average latitude/longitude.
# keep_default_na=False stops pandas from reading the ISO code "NA" (Namibia)
# as a missing value; na_values=[''] still treats empty fields as missing.
coordinate = pd.read_csv(
    'https://raw.githubusercontent.com/albertyw/avenews/master/old/data/average-latitude-longitude-countries.csv',
    keep_default_na=False,
    na_values=[''],
)
coordinate.head()

In [None]:
# Median salary per country, joined with the country coordinates on the country name.
median_comp_by_country = df_job.groupby('Country')['ConvertedComp'].median()
country_coordinates = coordinate.set_index('Country')
df_map = pd.merge(median_comp_by_country, country_coordinates,
                  left_index=True, right_index=True)
# Expose the country name as a regular column as well.
df_map['Country'] = df_map.index
df_map

In [None]:
# References:
# 1. https://python-visualization.github.io/folium/quickstart.html
# 2. https://towardsdatascience.com/creating-a-simple-folium-map-covid-19-worldwide-total-case-a0a1429c6e7c
import folium
from folium.plugins import MarkerCluster

# GeoJSON with the country outlines.
url = 'https://raw.githubusercontent.com/python-visualization/folium/master/examples/data'
country_shapes = f'{url}/world-countries.json'

# Start from an empty world map.
world_map = folium.Map(tiles="cartodbpositron")

# Choropleth layer: each country is coloured by its median salary.
median_salary_layer = folium.Choropleth(
    # The GeoJSON data that represents the countries of the world
    geo_data=country_shapes,
    name='Median salary',
    data=df_map,
    bins=9,
    # Two columns: the country name (key) and the numerical value to colour by
    columns=['Country', 'ConvertedComp'],
    # GeoJSON property that is matched against the key column
    key_on='feature.properties.name',
    fill_color='YlOrRd',
    fill_opacity=0.9,
    nan_fill_color='white',
    nan_fill_opacity=0.1,
    line_weight=0.5,
)
median_salary_layer.add_to(world_map)

# Show the map
world_map

See by continent

In [None]:
#function to convert to alpah2 country codes and continents
#from pycountry_convert import country_alpha2_to_continent_code, country_name_to_country_alpha2

def get_continent(col):
    """Return the continent code for an alpha-2 country code, or 'Unknown'.

    Parameters
    ----------
    col : str
        Alpha-2 country code handed to
        ``pycountry_convert.country_alpha2_to_continent_code``.

    Returns
    -------
    str
        The continent code returned by pycountry_convert, or 'Unknown' when
        the lookup raises (for example for a missing or unrecognised code).
    """
    try:
        cn_continent = pycountry_convert.country_alpha2_to_continent_code(col)
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt and SystemExit still propagate.
        cn_continent = 'Unknown'
    return cn_continent

In [None]:
# Median salary per (year, country), joined with the country coordinates.
yearly_country_median = df_job.groupby(['Year','Country'])['ConvertedComp'].median()
df_map_1 = pd.merge(yearly_country_median, coordinate.set_index('Country'),
                    left_index=True, right_index=True)
df_map_1 = df_map_1.reset_index()
# Map each ISO country code to its continent and drop the countries that could not be mapped.
df_map_1 = df_map_1.assign(Continent=df_map_1['ISO 3166 Country Code'].map(get_continent))
df_map_1 = df_map_1[df_map_1['Continent'].ne('Unknown')]


In [None]:
# Set the style before the axes are created; seaborn styles only affect axes created afterwards.
sns.set_style("whitegrid")
# sharey=True puts all six panels on one common y-scale, so bar heights can be
# compared across continents; matplotlib then shows the y tick labels on the
# first panel only. (Copying the first panel's tick positions onto the others
# and hiding their labels did not align the y-limits.)
fig, axes = plt.subplots(nrows=1, ncols=6, sharex="all", sharey=True, figsize=(24, 8))
continents = ['OC','NA', 'EU', 'AS','AF', 'SA']
continent_names = [ 'Oceania','North America', 'Europe', 'Asia', 'Africa', 'South America']
bar_colors = sns.color_palette("autumn")
for i, (continent, name, ax) in enumerate(zip(continents, continent_names, axes.flatten())):
    # One bar per year, summarising the per-country medians of this continent.
    sns.barplot(x="Year", y="ConvertedComp",
                color=bar_colors[i], capsize=0.2,
                data=df_map_1[df_map_1['Continent'] == continent], ax=ax)
    ax.set_title(name, fontsize=24)
    ax.tick_params(axis='both', which='major', labelsize=18)
    ax.set_xlabel('')
    ax.set_ylabel('')
    sns.despine(right=True, left=True)

## 3. What is the impact of Year of experience on Salary

## 4. What is the impact of Programming language on Salary

In [None]:
# Join the filtered survey rows with the database and language dummies (rows present in every frame only).
language_frames = [df, dm_db_nextyear, dm_db_work, dm_language_nextyear, dm_language_work]
df_p_language = pd.concat(language_frames, axis=1, join='inner')
df_p_language.head()