# Stack Overflow Survey 2017 Insights

We are going to look at how answers differ between developers from a handful of countries of interest.
What makes a company attractive to potential employees? How do salaries compare across countries?
Let's start by opening our toolbox and loading the survey data.

In [None]:
import numpy as np
import pandas
from matplotlib import pyplot as plt
from matplotlib.pyplot import yticks
import plotly.plotly as py
from plotly.graph_objs import *
import seaborn as sns
%matplotlib inline

# Load the survey answers, plus the schema file indexed by its 'Column' field
# so that column names can be looked up directly.
df = pandas.read_csv('../input/survey_results_public.csv')
schema = pandas.read_csv('../input/survey_results_schema.csv', index_col='Column')

## What's the avg importance per job assessment attribute per country of interest?

In [None]:
def to_importance_value(label):
    """Translate an importance answer into its ordinal score.

    The five known answers map to 0 ('Not at all important') through
    4 ('Very important'). Any other label raises ``KeyError``.
    """
    ordered_labels = (
        'Not at all important',
        'Not very important',
        'Somewhat important',
        'Important',
        'Very important',
    )
    scores = {text: rank for rank, text in enumerate(ordered_labels)}
    return scores[label]

In [None]:
# Countries compared throughout the notebook.
countries_of_interest = ['Germany', 'Switzerland', 'Spain', 'Ukraine', 'United States']
assess_job_df = pandas.DataFrame({'Country': countries_of_interest})

# Add one column per AssessJob* question, holding the mean importance score
# of the non-missing answers given in each country.
for column in schema.index.values:
    if not column.startswith('AssessJob'):
        continue
    country_means = []
    for country in countries_of_interest:
        answers = df.loc[df['Country'] == country, column].dropna()
        country_means.append(answers.apply(to_importance_value).mean())
    assess_job_df[column] = country_means

assess_job_df.head()

## How important are technologies used, job role, compensation and the product to be worked on for developers from the US, Spain, Ukraine and the DACH region?

In [None]:
def retain_cols_of_interest(df):
    """Keep the four job-assessment columns and turn the answers into scores.

    Rows missing any of the four answers are dropped first; every remaining
    label is then converted with ``to_importance_value``.
    """
    cols_of_interest = ['AssessJobTech', 'AssessJobRole', 'AssessJobCompensation', 'AssessJobProduct']
    complete_answers = df[cols_of_interest].dropna()

    def score_column(answers):
        # Called once per column; returns the scores as a plain list.
        return [to_importance_value(answer) for answer in answers]

    return complete_answers.apply(score_column)

def plot_comparison(df1, df2, df1_label, df2_label):
    """Compare two groups of importance scores in a 2x2 grid of plots.

    The top row shows a bar plot per group (seaborn's default mean estimate),
    the bottom row a violin plot of the same data. The left column belongs to
    ``df1`` and the right column to ``df2``.

    Parameters
    ----------
    df1, df2 : DataFrame
        Score frames whose four columns are labelled, in order, Technology,
        Job Role, Compensation and Product on the x-axis.
    df1_label, df2_label : str
        Titles for the left and right bar plots.
    """
    # A seaborn style only affects axes created after it is set, so it has to
    # be chosen before plt.subplots() for the grid to show up in this figure.
    sns.set_style("whitegrid")
    fig, ((ax11, ax12), (ax21, ax22)) = plt.subplots(2, 2, figsize=(13,10))
    ax11.set_title(df1_label)
    ax11.set_ylabel('mean importance')
    ax12.set_title(df2_label)
    sns.barplot(data=df1, ax=ax11)
    sns.barplot(data=df2, ax=ax12)
    sns.violinplot(data=df1, ax=ax21)
    sns.violinplot(data=df2, ax=ax22)
    for ax in (ax11, ax12, ax21, ax22):
        ax.set_xticklabels(('Technology', 'Job Role', 'Compensation', 'Product'))
    # The bar plots are cropped to the same y-range so that both groups can
    # be compared directly; the violins show the full 0-4 scale with margin.
    ax11.set_ybound(1.5,3.4)
    ax12.set_ybound(1.5,3.4)
    ax21.set_ybound(0,5)
    ax22.set_ybound(0,5)

In [None]:
# Score the job-assessment answers of the DACH countries and of Spain,
# then plot the two groups side by side.
dach_countries = ['Switzerland', 'Germany', 'Austria']
dach = retain_cols_of_interest(df[df['Country'].isin(dach_countries)])
spanish = retain_cols_of_interest(df[df['Country'] == 'Spain'])

plot_comparison(dach, spanish, 'DACH', 'Spain')

In [None]:
# Same comparison for respondents from the United States and from Ukraine.
us = retain_cols_of_interest(df[df['Country'] == 'United States'])
ukraine = retain_cols_of_interest(df[df['Country'] == 'Ukraine'])

plot_comparison(us, ukraine, 'United States', 'Ukraine')

## How satisfied are developers with their job and career?

In [None]:
# Restrict the survey to the countries of interest. reset_index() renumbers
# the rows and keeps the previous row labels as a regular column.
in_countries_of_interest = df['Country'].isin(countries_of_interest)
coi_df = df[in_countries_of_interest].reset_index()

# One figure per satisfaction question.
fig = plt.figure(figsize=(8, 6))
sns.boxplot(data=coi_df, x="Country", y="JobSatisfaction")
fig = plt.figure(figsize=(8, 6))
sns.boxplot(data=coi_df, x="Country", y="CareerSatisfaction")

## Preparing the annual salary data

In [None]:
# Summary statistics of the salaries as reported, before any cleaning.
coi_df['Salary'].describe()

Obviously the salary data needs cleaning.

Salaries were reported in different currencies, so we need to convert all salaries to the same currency.
We also need to drop obvious outliers.
For that we will fill missing currencies and replace weird currency values with the most frequently reported currency for the respondents' countries before converting all salaries to the same currency (we'll be using exchange rates from March 2017 when the survey was active).

In [None]:
# Per country: how many currency answers exist and which one is most frequent.
coi_df.groupby('Country')[['Currency']].describe()

In [None]:
# Conversion factors into Swiss francs for the currencies that are kept.
exchange_rates = {
    'U.S. dollars ($)': 0.99152,
    'Euros (€)': 1.07065,
    'Swiss francs': 1
}

# Fallback currency per group of countries. It replaces a missing currency
# and any currency that has no exchange rate above.
fallback_currencies = (
    (['Germany', 'Spain'], 'Euros (€)'),
    (['United States', 'Ukraine'], 'U.S. dollars ($)'),
    (['Switzerland'], 'Swiss francs'),
)
for fallback_countries, fallback_currency in fallback_currencies:
    in_group = coi_df['Country'].isin(fallback_countries)
    has_known_currency = coi_df['Currency'].isin(list(exchange_rates))
    coi_df.loc[in_group & ~has_known_currency, 'Currency'] = fallback_currency


In [None]:
# Lower cut-off: the 0.5 % quantile of the salaries as reported.
q1 = coi_df.Salary.quantile(q=0.005)

def normalize_salary(cols):
    """Turn one (salary, currency) pair into a cleaned salary in CHF.

    Parameters
    ----------
    cols : sequence of length 2
        The reported salary and its currency label, in that order.

    Returns
    -------
    float or None
        ``None`` when the salary is missing, lies below the ``q1`` cut-off
        (compared in the reported currency) or cannot be converted by
        ``to_chf``. Otherwise the CHF amount, multiplied by 12 when it is
        below 9000.
    """
    salary, currency = cols
    if np.isnan(salary) or salary < q1:
        return None
    salary = to_chf(salary, currency)
    if salary is None:
        # to_chf() returns None for a currency without an exchange rate;
        # comparing None with a number would raise TypeError below.
        return None
    if salary < 9000: # correction for monthly salary
        return salary * 12
    return salary


def to_chf(salary, currency):
    """Convert ``salary`` from ``currency`` into Swiss francs.

    Returns ``None`` when ``currency`` has no entry in ``exchange_rates``
    or when ``salary`` is NaN.
    """
    if currency not in exchange_rates:
        return None
    if np.isnan(salary):
        return None
    rate = exchange_rates[currency]
    return rate * salary

# Replace the reported salaries with their cleaned CHF values, row by row.
# Note that this overwrites the Salary column in place.
salary_inputs = coi_df[['Salary', 'Currency']]
coi_df['Salary'] = salary_inputs.apply(normalize_salary, axis=1)
coi_df['Salary'].describe()

## Salary comparison by country

In [None]:
# Mean salary and mean satisfaction scores per country.
coi_df.groupby('Country')[['Salary', 'JobSatisfaction', 'CareerSatisfaction']].mean()

In [None]:
# Distribution of the cleaned salaries per country.
fig = plt.figure(figsize=(10, 8))
sns.boxplot(data=coi_df, x="Country", y="Salary")

## How many Big Macs can you buy each year when you work as a developer in one of those countries?

The salary alone doesn't provide much value in this comparison.
What does it look like when we relate these numbers to the [Big-Mac-Index](http://www.economist.com/content/big-mac-index)?

In [None]:
# Price of one Big Mac per country.
# NOTE(review): these figures look like USD prices while Salary was converted
# to CHF in the salary-cleaning cell - confirm that the units line up.
bm_index = {
    'United States': 5.3,
    'Switzerland': 6.74,
    'Germany': 4.45,
    'Ukraine': 1.7,
    'Spain': 4.34
}

# Look the price up by column label rather than by position within each row:
# positional access on a label-indexed row is deprecated in pandas, and the
# vectorised form yields NaN instead of an error for a country without a price.
big_mac_price = coi_df['Country'].map(bm_index)
coi_df['BigMacs'] = (coi_df['Salary'] / big_mac_price).round()
coi_df[['Country', 'BigMacs']].groupby('Country').median()

The purchasing power in Spain seems quite low, at only approx. 61.5 % of the purchasing power in Germany.
However, this is consistent with the purchasing power for these two countries as determined in the [GfK study](http://www.gfk.com/insights/press-release/pp-europe/) (62.5%).

In [None]:
# Distribution of affordable Big Macs per year, by country.
fig = plt.figure(figsize=(10, 8))
sns.boxplot(data=coi_df, x="Country", y="BigMacs")

## Is there a correlation between copying code from Stack Overflow and the feeling of being overpaid?
Maybe people who are paid an engineer's salary but just copy and paste answers from Stack Overflow consider themselves overpaid.

In [None]:
# Share of each answer to the Overpaid question across all respondents.
fig = plt.figure(figsize=(7, 7))
overpaid_counts = df['Overpaid'].value_counts()
overpaid_counts.plot.pie()

In [None]:
# Ordinal scales: the position of a label in its tuple is its score.
labels_overpaid = ('Greatly underpaid', 'Somewhat underpaid', 'Neither underpaid nor overpaid', 'Somewhat overpaid', 'Greatly overpaid')
labels_copied_code = ("Haven't done at all", 'Once or twice', 'Several times', 'At least once each week', 'At least once each day')

# Missing answers are skipped here and stay NaN once the scores are aligned
# back onto coi_df; a label outside the scale raises ValueError.
coi_df['OverpaidScore'] = coi_df['Overpaid'].dropna().map(labels_overpaid.index)
coi_df['CopyCodeScore'] = coi_df['StackOverflowCopiedCode'].dropna().map(labels_copied_code.index)

pairplot_columns = ['JobSatisfaction', 'OverpaidScore', 'CopyCodeScore']
sns.pairplot(coi_df[pairplot_columns].dropna(), markers=["+"])

In [None]:
# Pairwise correlation between satisfaction and the two derived scores.
score_columns = ['JobSatisfaction', 'OverpaidScore', 'CopyCodeScore']
coi_df[score_columns].corr()

## What about gender pay equality?

In [None]:
# Keep respondents whose Gender answer is exactly 'Male' or exactly 'Female',
# then count each group.
is_male_or_female = coi_df['Gender'].isin(['Male', 'Female'])
mf = coi_df[is_male_or_female]
mf.groupby('Gender').size()

Women in this field are definitely underrepresented.

In [None]:
# Mean number of affordable Big Macs by gender; the y-axis is cropped to
# 8000-17000, so bar heights are not proportional to the values.
fig = plt.figure(figsize=(6, 6))
g = sns.barplot(data=mf, x="Gender", y="BigMacs")
g.set_ybound(lower=8000, upper=17000)

## Developer attitude differences by language and country

Replace the answer labels with numeric representations and derive a new feature with fewer categories from the list of languages each respondent has worked with.

In [None]:
# replace agree/disagree questions with numeric values
# The tuple is ordered from weakest to strongest agreement, because the
# position of a label is used as its score: 'Somewhat agree' has to rank
# below 'Agree'.
agree_labels = ('Strongly disagree', 'Disagree', 'Somewhat agree', 'Agree', 'Strongly agree')
# Survey columns that were answered on the agreement scale above.
agree_columns = ['ProblemSolving', 'BoringDetails', 'LearningNewTech', 'JobSecurity',
                'DiversityImportant', 'AnnoyingUI', 'FriendsDevelopers', 'RightWrongWay',
                'UnderstandComputers', 'SeriousWork', 'InvestTimeTools', 'WorkPayCare',
                'ChallengeMyself', 'CompetePeers', 'ChangeWorld']

def dev_language(langs):
    """Reduce a '; '-separated language list to one coarse category.

    Parameters
    ----------
    langs : str or other
        Languages joined by ``"; "``. Anything that is not a string (for
        example a missing value) yields ``None``.

    Returns
    -------
    str or None
        The first matching category, in this order: 'Java and JS/TS',
        'Python and JS/TS', 'JS/TS', 'Java', 'Python', 'others'.
    """
    if not isinstance(langs, str):
        return None
    used = set(langs.split("; "))
    uses_js_or_ts = 'JavaScript' in used or 'TypeScript' in used
    if 'Java' in used and uses_js_or_ts:
        return 'Java and JS/TS'
    # The JS/TS test is evaluated as one unit: without that grouping,
    # `and` binds tighter than `or` and every TypeScript user would be
    # classified as a Python user.
    if 'Python' in used and uses_js_or_ts:
        return 'Python and JS/TS'
    if uses_js_or_ts:
        return 'JS/TS'
    if 'Java' in used:
        return 'Java'
    if 'Python' in used:
        return 'Python'
    return 'others'

# Score every agree/disagree answer by its position in agree_labels. A dict
# lookup keeps the score 0 of the first label; a truthiness-based fallback
# would turn that 0 into a missing value. Unknown labels become NaN.
agree_scores = {label: score for score, label in enumerate(agree_labels)}
# Only respondents who answered every attitude question are scored; all other
# rows end up NaN in these columns once the result is aligned back by index.
coi_df[agree_columns] = coi_df[agree_columns].dropna().apply(lambda answers: answers.map(agree_scores))
coi_df['DevLang'] = coi_df['HaveWorkedLanguage'].apply(dev_language)
coi_df[['Country', 'DevLang']].groupby('Country').describe()

In [None]:
# Attitude frame: grouping columns followed by the scored agreement columns.
att = coi_df.loc[:, ['Country', 'Gender', 'DevLang'] + agree_columns]

In [None]:
# display a correlation matrix for some attitude/opinion attributes
corr = att[['BoringDetails','SeriousWork', 'InvestTimeTools', 'WorkPayCare',
                'ChallengeMyself', 'CompetePeers', 'ChangeWorld']].corr()
colormap = plt.cm.hot
plt.figure(figsize=(12,12))
plt.title('Pearson Correlation of Attitude Features', y=1.05, size=15)
sns.heatmap(corr,linewidths=0.1,vmax=1.0, square=True, cmap=colormap, linecolor='white', annot=True)

In [None]:
# Mean agreement score per country; the y-axis is cropped to 2-3.2.
g = sns.barplot(data=att, x="Country", y="JobSecurity")
g.set_title('Job security is important to me')
g.set_ybound(lower=2, upper=3.2)
g.figure.set_size_inches(12, 7)

In [None]:
# Mean agreement score per country; the y-axis is cropped to 2-3.2.
g = sns.barplot(data=att, x="Country", y="AnnoyingUI")
g.set_title('It annoys me when software has a poor UI')
g.set_ybound(lower=2, upper=3.2)
g.figure.set_size_inches(12, 7)

In [None]:
# Mean agreement score per country; the y-axis is cropped to 2-3.2.
g = sns.barplot(data=att, x="Country", y="ChangeWorld")
g.set_title('I want to change the world')
g.set_ybound(lower=2, upper=3.2)
g.figure.set_size_inches(12, 7)

In [None]:
# Mean agreement score per country; the y-axis is cropped to 2-3.5.
g = sns.barplot(data=att, x="Country", y="ProblemSolving")
g.set_title('I love solving problems')
g.set_ybound(lower=2, upper=3.5)
g.figure.set_size_inches(12, 7)

In [None]:
# Mean agreement score per language group; the y-axis is cropped to 1-2.5.
g = sns.barplot(data=att, x="DevLang", y="UnderstandComputers")
g.set_title("Honestly, there's a lot about computers that I just don't understand")
g.set_ybound(lower=1, upper=2.5)
g.figure.set_size_inches(12, 7)

In [None]:
# Mean agreement score per country; the y-axis is cropped to 1-2.1.
g = sns.barplot(data=att, x="Country", y="WorkPayCare")
g.set_title("I don't really care what I work on, so long as I'm paid well")
g.set_ybound(lower=1, upper=2.1)
g.figure.set_size_inches(12, 7)

## Suppose you could choose your own working hours for an 8-hour day. What time would you start work for the day? 

In [None]:
# Count respondents per (Country, WorkStart) pair. The grouping keys are
# passed as a list: pandas interprets a tuple as one single key.
workstart = coi_df[['Country', 'WorkStart', 'Respondent']].groupby(['Country', 'WorkStart']).count()
# Share of every start time within its country, in percent. transform('sum')
# returns the country total aligned to each row, so the index is unchanged.
country_totals = workstart.groupby(level='Country').transform('sum')
workstart_pcts = 100 * workstart / country_totals
workstart_pcts.columns = ['Percent']
# show the most popular times
# Sorting and taking the head of each group keeps the 'Country' column,
# which groupby.apply on the grouping column no longer guarantees.
workstart_pcts = (
    workstart_pcts.reset_index()
    .sort_values(['Country', 'Percent'], ascending=[True, False])
    .groupby('Country')
    .head(5)
)
workstart_pcts.set_index(['Country', 'WorkStart'])

In [None]:
# plot the two most popular starting times by country
# Sorting and taking the head of each group keeps the 'Country' column that
# the plot needs; groupby.apply on the grouping column is deprecated.
workstart_pcts = (
    workstart_pcts
    .sort_values(['Country', 'Percent'], ascending=[True, False])
    .groupby('Country')
    .head(2)
)
fig = plt.figure(figsize=(13,8))
g = sns.barplot(data=workstart_pcts, x='Country', y='Percent', hue='WorkStart')
# The y-axis is cropped to 10-33 percent, so bar heights are not proportional.
g.set_ybound(10, 33)
g.set_ylabel('Percent')