In [None]:
import pandas as pd #allows reading tabular and multi-dimentional data
import matplotlib.pyplot as plt #visulisation package
import seaborn as sns #visulisation package
import numpy as np #linear algebra
from string import ascii_letters #helps searching for a specifit charactor(s) in string 
import math
import plotly.express as px
import plotly.graph_objects as go
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Display limits for rendered frames in this notebook.
pd.set_option("Display.max_columns", 21)
pd.set_option("Display.max_rows", 61)

# Survey answers, indexed by respondent. "ConvertedComp" is renamed to
# "Salary_USD" and the two raw compensation columns are dropped in one chain.
df = (
    pd.read_csv(
        "../input/stack-overflow-developer-survey-2020/developer_survey_2020/survey_results_public.csv",
        index_col="Respondent",
    )
    .rename(columns={"ConvertedComp": "Salary_USD"})
    .drop(columns=["CompTotal", "CompFreq"])
)

# Survey schema, indexed by column name.
df_schema = pd.read_csv(
    "../input/stack-overflow-developer-survey-2020/developer_survey_2020/survey_results_schema.csv",
    index_col="Column",
)

In [None]:
#Code from https://www.kaggle.com/yashvi/data-analyst-jobs-visualization#Revenue-of-different-Industries
def missing_values_table(df):
    """Summarise the missing values of ``df`` column by column.

    Prints how many columns the frame has and how many of them contain
    missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by column name, with the columns ``'Missing Values'`` (null
        count) and ``'% of Total Values'`` (share of rows that are null,
        rounded to one decimal). Only columns with at least one missing
        value are listed, sorted by percentage in descending order.
    """
    # Count the nulls once and reuse the result for the percentage.
    mis_val = df.isnull().sum()

    # A frame without rows has nothing missing; report 0% instead of letting
    # 0/0 produce NaN, which would list every column as "missing".
    n_rows = len(df)
    if n_rows:
        mis_val_percent = 100 * mis_val / n_rows
    else:
        mis_val_percent = mis_val.astype(float)

    # Side-by-side table: column 0 = counts, column 1 = percentages.
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
        columns={0: 'Missing Values', 1: '% of Total Values'})

    # Keep only columns that actually miss something, worst first.
    has_missing = mis_val_table_ren_columns['% of Total Values'] != 0
    mis_val_table_ren_columns = (
        mis_val_table_ren_columns[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    # Print some summary information
    print("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"
          "There are " + str(mis_val_table_ren_columns.shape[0]) +
          " columns that have missing values.")

    return mis_val_table_ren_columns

In [None]:
# Display the per-column missing-value summary for the survey frame.
missing_values_table(df)

In [None]:
# Summary statistics of the frame as loaded, before any cleaning.
df.describe()

In [None]:
# Open-ended text answers are mapped to sentinel numbers just outside the
# numeric range so the three columns can be cast to float.
text_to_number = {
    "YearsCode": {"More than 50 years": 51, "Less than 1 year": 0},
    "YearsCodePro": {"More than 50 years": 51, "Less than 1 year": 0},
    "Age1stCode": {"Younger than 5 years": 4, "Older than 85": 86},
}

# Assign the replaced column back instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series
# and does not update `df` when pandas Copy-on-Write is active.
for column, mapping in text_to_number.items():
    df[column] = df[column].replace(mapping)

df = df.astype({"YearsCode": float, "YearsCodePro": float, "Age1stCode": float})

We have now changed all the objects in these columns to numerical values.


In [None]:
# Summary statistics again, now that the three columns above are floats.
df.describe()


We'll treat the missing values by replacing them with a value taken from similar rows. For example, here in the UK you are less likely to work more than 40 hrs/week, whereas this is not the case in the US. Therefore, it makes sense to fill missing values based on similar rows within the same country or within the same salary (the more hours you work, the more likely you are to earn more!).

In [None]:
# Fill missing weekly hours and salary with the mean of the respondent's country.
# The grouped columns are selected with a list: selecting several columns with
# a bare tuple (`["a", "b"]` written without the inner brackets) is rejected by
# pandas 2.x.
hours_salary_cols = ["WorkWeekHrs", "Salary_USD"]
country_means = df.groupby("Country")[hours_salary_cols].transform("mean")
df[hours_salary_cols] = df[hours_salary_cols].fillna(country_means)

In [None]:
# Fill missing experience/age values with the mean of rows sharing the same salary.
# The grouped columns are selected with a list: selecting several columns with
# a bare tuple is rejected by pandas 2.x.
experience_age_cols = ["YearsCode", "YearsCodePro", "Age1stCode", "Age"]
salary_group_means = df.groupby("Salary_USD")[experience_age_cols].transform("mean")
df[experience_age_cols] = df[experience_age_cols].fillna(salary_group_means)

In [None]:
# Summary statistics after the group-mean imputation above.
df.describe()

In [None]:
# How often each reported age below 16 occurs.
df.loc[df["Age"] < 16, "Age"].value_counts()

In [None]:
# How often each "age when first coding" below 15 occurs.
df.loc[df["Age1stCode"] < 15, "Age1stCode"].value_counts()

It is hard to make a judgement about these columns, as there might be people out there who are (or were) coding from a very young age! Therefore, I'll make some adjustments that I **feel** are sensible.

In [None]:
# Raise every age below 14 to 14; missing ages are left untouched.
age = df["Age"]
df["Age"] = age.mask(age < 14, 14)

# Number of respondents now recorded as exactly 14.
df["Age"].value_counts().loc[14]

In [None]:
# Cap weekly working hours at 70; missing values are left untouched.
weekly_hours = df["WorkWeekHrs"]
df["WorkWeekHrs"] = weekly_hours.mask(weekly_hours > 70, 70)

In [None]:
# Cap ages above 85 at 85; missing values are left untouched.
age = df["Age"]
df["Age"] = age.mask(age > 85, 85)

# Summary statistics after clamping Age and WorkWeekHrs.
df.describe()

We now have a much cleaner dataset, as we have gotten rid of some erroneous/implausible responses, such as working 475 hours per week or being 279 years old.

In [None]:
# Histogram of respondent age, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 4))
df["Age"].hist(ax=ax, bins=40, edgecolor="black")
ax.set_xticks(list(range(15, 80, 5)))
ax.set_title("Age Distribution")
ax.set_ylabel("Number of Participants", fontsize=16)
ax.set_xlabel("Age", fontsize=16)
plt.show()

The majority of respondents are between 20 and 40 years old.

In [None]:
# Histogram of the age at which respondents first wrote code.
fig, ax = plt.subplots(figsize=(8, 4))
df["Age1stCode"].hist(ax=ax, bins=15, edgecolor="black")
ax.set_xticks(list(range(0, 60, 5)))
ax.set_title("Age When First started Coding")
ax.set_ylabel("Frequncy", fontsize=16)
plt.show()

In [None]:
# Donut chart of the "Hobbyist" answers: a pie chart with a white disc
# drawn over its centre.
data = df["Hobbyist"]
fig, ax = plt.subplots(figsize=(5, 5))
data.value_counts().plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    colors=sns.color_palette("dark", 5),
    startangle=90,
    wedgeprops={"linewidth": 2, "edgecolor": "white"},
)
ax.set_title("Do You Code As a Hobby?")
ax.add_artist(plt.Circle((0, 0), 0.7, color="white"))
plt.show()

In [None]:
# Frequency of each raw "Gender" answer (respondents could select several).
df["Gender"].value_counts()

I really don't like doing this next step, but for the sake of simplicity I will put every gender other than man and woman into one category. Gender is fluid and there are more than two genders out there, but we will do what's best for this analysis.

In [None]:
# Keep only the text before the first comma, then fold every multi-selection
# and "Non-binary" into a single "Non-binary/Other" category.
gender_to_group = {
    'Man;Non-binary': "Non-binary/Other",
    'Woman;Man': "Non-binary/Other",
    'Woman;Man;Non-binary': "Non-binary/Other",
    'Woman;Non-binary': "Non-binary/Other",
    "Non-binary": "Non-binary/Other",
}
# Assign the result back rather than using `df["Gender"].replace(..., inplace=True)`:
# that form mutates a temporary Series and does not update `df` when pandas
# Copy-on-Write is active.
df["Gender"] = df["Gender"].str.split(',').str[0].replace(gender_to_group)

Converting the salary to pound sterling.

In [None]:
def USD_GBP(x, rate=0.72):
    """Convert an amount in US dollars to pounds sterling.

    Parameters
    ----------
    x : number
        Amount in USD.
    rate : float, optional
        GBP received per 1 USD. Defaults to 0.72, the rate as of 19th of March.

    Returns
    -------
    float
        ``x`` multiplied by ``rate``.
    """
    return x * float(rate)
# Convert each salary with USD_GBP, then rename the column to match its new currency.
salary_in_gbp = df["Salary_USD"].apply(USD_GBP)
df["Salary_USD"] = salary_in_gbp
df.rename(columns={"Salary_USD": "Salary_GBP"}, inplace=True)

In [None]:
# Bar chart of salary per gender group, with rotated tick labels for legibility.
plt.figure(figsize=(7, 5))
chart = sns.barplot(x="Gender", y="Salary_GBP", data=df, palette="Set1")
tick_labels = chart.get_xticklabels()
chart = chart.set_xticklabels(tick_labels, rotation=65, horizontalalignment="right")

In [None]:
# Share of respondents in each gender group (fractions, not counts).
df["Gender"].value_counts(normalize=True)

The error bar for the non-binary group is rather large and therefore not very reliable. Men and women seem to be earning the same. However, as just under 8% of the data comes from people who identify as a woman, it cannot be concluded that women earn the same as men in this industry. Clearly, we need more women for a fairer representation of genders.

In [None]:
# Horizontal bar chart of the 15 most common undergraduate majors.
plt.subplots(figsize=(12,10))
coun_deg=df["UndergradMajor"].value_counts()[:15].sort_values(ascending=False).to_frame()
# The counts are taken by position because the name of the value_counts()
# column differs between pandas versions, and x/y are passed by keyword
# because newer seaborn releases reject positional data arguments.
sns.barplot(x=coun_deg.iloc[:, 0], y=coun_deg.index, palette="mako")
plt.title("Top 15 Degree Subject by the Number of Respondants",size=15)
plt.xlabel("# Participants", size=10)
plt.show()

Unsurprisingly, the majority have studied a computer science or a computer-related degree. I definitely expected more people from a mathematical background, though.

In [None]:
# Median salary per undergraduate major, highest first, limited to 20 rows.
sal_ed = (
    df.groupby("UndergradMajor")["Salary_GBP"]
    .median()
    .to_frame()
    .sort_values(by="Salary_GBP", ascending=False)
    .head(20)
)

In [None]:
# Horizontal bars of median salary per major, each annotated with its value.
# x/y are passed by keyword because newer seaborn releases reject positional
# data arguments.
ax=sns.barplot(x=sal_ed.Salary_GBP, y=sal_ed.index, palette=sns.color_palette("inferno",20))
plt.title("Top 20 Median Salary by Degree Subject",size=15)
for i, v in enumerate(sal_ed.Salary_GBP):
    # Write the value just inside the left edge of bar i.
    ax.text(.5, i, v,fontsize=10,color="white",weight="bold")
fig=plt.gcf()
fig.set_size_inches(10,10)
plt.show()

In [None]:
# Share of respondents per undergraduate major (fractions, not counts).
df["UndergradMajor"].value_counts(normalize=True)

This is interesting!! Although it can be argued that a graphic design degree is somewhat relevant to the field of computer science, the social sciences, performing arts, and humanities disciplines are not directly related to programming. Yet people from these backgrounds are amongst the highest earners! It is worth mentioning that the top 4 account for less than 5 percent of the data, so it is safe to say that we need more data from people with these academic backgrounds before drawing a firm conclusion.

In [None]:
# Keep only the first selected option of each multi-select "Sexuality" answer.
sexuality_options = df["Sexuality"].str.split(";")
df["Sexuality"] = sexuality_options.str.get(0)

# Share of respondents per (first-listed) option.
df["Sexuality"].value_counts(normalize=True)

In [None]:
# Bar chart of salary per sexuality group, with rotated tick labels for legibility.
plt.figure(figsize=(7, 5))
chart = sns.barplot(x="Sexuality", y="Salary_GBP", data=df, palette="Set1")
tick_labels = chart.get_xticklabels()
chart = chart.set_xticklabels(tick_labels, rotation=65, horizontalalignment="right")

Despite our plot indicating that people who identify as "straight" might earn less than people of other sexual identities, we simply don't have enough data to support this claim! The large error bars also hint that our graph might not be very reliable.

In [None]:
plt.subplots(figsize=(12,10))
coun=df["Country"].value_counts()[:15].sort_values(ascending=False).to_frame()
sns.barplot(coun.Country,coun.index,palette='inferno')
plt.title("Top 15 Countries By Number Respondants",size=15)
plt.ylabel("Country", size=10)
plt.xlabel("# Participants", size=10)
plt.show()

In [None]:
# Median salary per country, highest first, limited to 20 rows.
sal_job=df.groupby("Country")["Salary_GBP"].median().to_frame().sort_values(by="Salary_GBP",ascending=False).head(20)
# x/y are passed by keyword because newer seaborn releases reject positional
# data arguments.
ax=sns.barplot(x=sal_job.Salary_GBP, y=sal_job.index, palette=sns.color_palette("icefire",20))
plt.title("Top 20 Median Salary by Country",size=15)
for i, v in enumerate(sal_job.Salary_GBP):
    # Write the value just inside the left edge of bar i.
    ax.text(.5, i, v,fontsize=10,color="white",weight="bold")
fig=plt.gcf()
fig.set_size_inches(10,10)
plt.show()

In [None]:
# Number of respondents from each of a hand-picked list of countries.
countries = [
    "Andorra",
    "Lao People's Democratic Republic",
    "Nomadic",
    "Bahamas",
    "Uzbekistan",
    "Hong Kong (S.A.R.)",
    "Denmark",
    "Afghanistan",
]
country_filter = df["Country"].isin(countries)
df[["Country"]][country_filter].value_counts()

Again, we cannot really rely on the data from Afghanistan, Nomadic, Andorra, Lao, Bahamas or Uzbekistan, as we simply have insufficient evidence that these countries are amongst the highest payers. So we can repeat the process but drop these countries from the list.

In [None]:
# Countries excluded from the ranking because they have too few respondents.
low_sample_countries = ["Afghanistan", "Nomadic", "Andorra",
                        "Uzbekistan", "Lao People's Democratic Republic", "Bahamas"]
# Exclude them first, then take the top 20. The previous form
# (`.head(26).drop([...])`) raised a KeyError whenever one of the listed
# countries fell outside the top 26; the result is identical when it did not.
sal_job2 = (
    df.groupby("Country")["Salary_GBP"].median().to_frame()
    .sort_values(by="Salary_GBP", ascending=False)
    .drop(low_sample_countries, errors="ignore")
    .head(20)
)
# x/y are passed by keyword because newer seaborn releases reject positional
# data arguments.
ax=sns.barplot(x=sal_job2.Salary_GBP, y=sal_job2.index, palette=sns.color_palette("icefire",20))
plt.title("Top 20 Median Salary by Country",size=15)
for i, v in enumerate(sal_job2.Salary_GBP):
    # Write the value just inside the left edge of bar i.
    ax.text(.5, i, v,fontsize=10,color="white",weight="bold")
fig=plt.gcf()
fig.set_size_inches(10,10)
plt.show()

In [None]:
plt.subplots(figsize=(12,10))
coun=df["Ethnicity"].value_counts()[:10].sort_values(ascending=False).to_frame()
sns.barplot(coun.Ethnicity,coun.index,palette='inferno')
plt.title("Ethnic Group Per #Respondants",size=15)
plt.ylabel("Ethnicity", size=10)
plt.xlabel("# Participants", size=10)
plt.show()

In [None]:
# Frequency of each raw "DevType" answer, before any clean-up.
df["DevType"].value_counts()

We need to tidy up this column as it is very messy.

In [None]:
# Reduce each multi-select "DevType" answer to its first listed role.
listed_roles = df["DevType"].str.split(';')
df["DevType"] = listed_roles.str.get(0)

# The 20 most frequent first-listed roles.
df["DevType"].value_counts().head(20)

**Much better!**

In [None]:
# Mean salary per developer role, highest first, limited to 20 rows.
sal_job=df.groupby("DevType")["Salary_GBP"].mean().to_frame().sort_values(by="Salary_GBP",ascending=False).head(20)
# x/y are passed by keyword because newer seaborn releases reject positional
# data arguments.
ax=sns.barplot(x=sal_job.Salary_GBP, y=sal_job.index, palette=sns.color_palette('inferno',20))
plt.title("Mean-Salary by Profession",size=15)
plt.ylabel("Developer Role")
for i, v in enumerate(sal_job.Salary_GBP):
    # Write the value just inside the left edge of bar i.
    ax.text(.5, i, v,fontsize=10,color='white',weight='bold')
fig=plt.gcf()
fig.set_size_inches(10,10)
plt.show()

In [None]:
# Median salary per education level, highest first.
sal_ed=df.groupby("EdLevel")["Salary_GBP"].median().to_frame().sort_values(by="Salary_GBP",ascending=False).head(20)
# x/y are passed by keyword because newer seaborn releases reject positional
# data arguments.
ax=sns.barplot(x=sal_ed.Salary_GBP, y=sal_ed.index, palette=sns.color_palette("inferno",20))
plt.title("Median-Salary by Education Level",size=15)
# The bars are education levels, so label the axis accordingly (the label
# previously read "Developer Role").
plt.ylabel("Education Level")
for i, v in enumerate(sal_ed.Salary_GBP):
    # Write the value just inside the left edge of bar i.
    ax.text(.5, i, v,fontsize=10,color='white',weight='bold')
fig=plt.gcf()
fig.set_size_inches(10,10)
plt.show()

It is no surprise that you should earn more if you have higher qualifications and the evidence of that is shown here.

In [None]:
# Donut chart of the "JobSat" answers: a pie chart with a white disc drawn
# over its centre and the default y-label removed.
data = df["JobSat"]
fig, ax = plt.subplots(figsize=(10, 8))
data.value_counts().plot.pie(
    ax=ax,
    autopct='%1.1f%%',
    colors=sns.color_palette('Paired', 10),
    startangle=90,
    wedgeprops={'linewidth': 2, 'edgecolor': 'white'},
)
ax.set_title("Job Satisfaction Pie-chart")
ax.add_artist(plt.Circle((0, 0), 0.7, color='white'))
ax.set_ylabel('')
plt.show()

Really impressive results, in my opinion! Only 8.3% didn't like their job.

In [None]:
# Frequency of each "OrgSize" answer.
df["OrgSize"].value_counts()

In [None]:
# Shorten the long freelancer label. The result is assigned back rather than
# using `df["OrgSize"].replace(..., inplace=True)`: that form mutates a
# temporary Series and does not update `df` when pandas Copy-on-Write is active.
df["OrgSize"] = df["OrgSize"].replace(
    {"Just me - I am a freelancer, sole proprietor, etc.": "Self-employed"}
)

In [None]:

# Count of respondents per organisation size, most common category first.
plt.subplots(figsize=(24, 14))
orgsize_by_frequency = df["OrgSize"].value_counts().index
sns.countplot(y=df["OrgSize"], order=orgsize_by_frequency)
plt.show()

There is definitely a real mix of data here. But the majority seem to be working for smaller organisations with fewer than 500 employees!

In [None]:
# Rows with complete salary / organisation size / experience information.
# NOTE(review): `salary` is not used by any cell visible here — confirm whether it is still needed.
salary = df.loc[:, ["Salary_GBP", "OrgSize", "YearsCode", "YearsCodePro"]].dropna()

# Mean salary per organisation size, highest first.
sal_org = (
    df.groupby("OrgSize")["Salary_GBP"]
    .mean()
    .to_frame()
    .sort_values(by="Salary_GBP", ascending=False)
    .head(20)
)

In [None]:
# Horizontal bars of mean salary per organisation size, each annotated with its value.
# x/y are passed by keyword because newer seaborn releases reject positional
# data arguments.
ax=sns.barplot(x=sal_org.Salary_GBP, y=sal_org.index, palette=sns.color_palette("ocean",20))
plt.title("Mean-Salary by Organisation Size",size=15)
plt.ylabel("Organisation Size")
plt.xlabel("Salary")
# Annotate with the values that are actually plotted (sal_org); the loop
# previously iterated sal_job, i.e. the developer-role salaries.
for i, v in enumerate(sal_org.Salary_GBP):
    ax.text(.5, i, v,fontsize=10,color='white',weight='bold')
fig=plt.gcf()
fig.set_size_inches(10,10)
plt.show()

This graph is a strong indication that bigger organisations tend to pay a lot more in **average** salary than smaller organisations. Which is not very surprising at all!

In [None]:
# For both multi-select language columns, keep only the first listed language.
for source_col, target_col in (
    ("LanguageWorkedWith", "pop_languages"),
    ("LanguageDesireNextYear", "pop_languages_desired"),
):
    df[target_col] = df[source_col].str.split(';').str[0]

In [None]:
# Two side-by-side count plots: the 20 most frequent values of each column.
# NOTE(review): "DatabaseWorkedWith" is plotted as-is, whereas the language
# column was reduced to its first entry — confirm this is intended.
f, ax = plt.subplots(1, 2, figsize=(28, 18))
panels = (
    ("DatabaseWorkedWith", "Most Popular Databases"),
    ("pop_languages", "Most Popular Languages"),
)
for axis, (column, title) in zip(ax, panels):
    top_values = df[column].value_counts().head(20).index
    sns.countplot(y=df[column], ax=axis, order=top_values)
    axis.set_title(title)
    axis.set_ylabel('')
plt.subplots_adjust(wspace=0.4)
plt.show()

In [None]:
# Keep only the first listed entry of the multi-select "DatabaseDesireNextYear" answers.
desired_databases = df["DatabaseDesireNextYear"].str.split(';')
df["pop_databases_desired"] = desired_databases.str.get(0)

In [None]:
# Two side-by-side count plots: the 20 most frequent first-listed desired
# databases and languages.
f, ax = plt.subplots(1, 2, figsize=(28, 17))
panels = (
    ("pop_databases_desired", "Most Desired Databases"),
    ("pop_languages_desired", "Most Desired Languages"),
)
for axis, (column, title) in zip(ax, panels):
    top_values = df[column].value_counts().head(20).index
    sns.countplot(y=df[column], ax=axis, order=top_values)
    axis.set_title(title)
    axis.set_ylabel('')
plt.subplots_adjust(wspace=0.4)
plt.show()

In [None]:
# Bucket both experience columns into the same ten-year intervals; the lowest
# edge (0) is included in the first interval.
experience_bins = [0, 10, 20, 30, 40, 50, 60]
for raw_col, binned_col in (("YearsCode", "years_code"), ("YearsCodePro", "years_code_pro")):
    df[binned_col] = pd.cut(df[raw_col], bins=experience_bins, include_lowest=True)

In [None]:
# Two side-by-side count plots of the binned experience columns, ordered by frequency.
f, ax = plt.subplots(1, 2, figsize=(28, 18))
panels = (
    ("years_code", "Years Coded"),
    ("years_code_pro", "Years Coded Professionally"),
)
for axis, (column, title) in zip(ax, panels):
    bins_by_frequency = df[column].value_counts().head(20).index
    sns.countplot(y=df[column], ax=axis, order=bins_by_frequency)
    axis.set_title(title)
plt.subplots_adjust(wspace=0.4)
plt.show()