In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

In [3]:
def create_year_dataframe(year):
    """Load the public survey results for one year into a DataFrame.

    Args:
        year: Survey year; it is converted with ``str`` and spliced into the
            file name ``survey_results_public_<year>.csv``, which is resolved
            relative to the current working directory.

    Returns:
        The DataFrame produced by ``pandas.read_csv`` for that file.
    """
    csv_name = f"survey_results_public_{year!s}.csv"
    # low_memory=False: parse the file in one pass instead of in chunks.
    return pd.read_csv(csv_name, low_memory=False)

In [4]:
# Survey years to load: 2015 through 2019 inclusive (range's stop is exclusive).
years_data = list(range(2015, 2020))

In [5]:
def create_all_dataframes(all_years):
    """Load the survey results for every requested year.

    Args:
        all_years: Iterable of survey years to load.

    Returns:
        dict mapping each year to the DataFrame returned by
        ``create_year_dataframe`` for that year; empty when no years are given.
    """
    # Iterate over the argument rather than the notebook-level ``years_data``
    # list, so the function loads exactly what the caller asks for.
    return {year: create_year_dataframe(year) for year in all_years}

In [6]:
# Load every survey year once; the result is a dict keyed by year.
all_years = create_all_dataframes(years_data)

In [7]:
# Peek at the first rows of the 2015 data as loaded.
all_years[2015].head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Select all that apply,Unnamed: 9,...,Unnamed: 212,Unnamed: 213,Select all that apply.7,Unnamed: 215,Unnamed: 216,Unnamed: 217,Unnamed: 218,Unnamed: 219,Unnamed: 220,Unnamed: 221
0,Country,Age,Gender,Tabs or Spaces,Years IT / Programming Experience,Occupation,Desktop Operating System,Desktop Operating System: write-in,Current Lang & Tech: Android,Current Lang & Tech: Arduino,...,Why use Stack Overflow: I don't use Stack Over...,How often are Stack Overflow's answers helpful,Why answer: Help a programmer in need,Why answer: Help future programmers,Why answer: Demonstrate expertise,Why answer: Self promotion,Why answer: Sense of responsibility to developers,Why answer: No idea,Why answer: I don't answer and I don't want to,Why answer: I don't answer but I want to
1,Croatia,25-29,Male,Tabs,2 - 5 years,Back-end web developer,Ubuntu,,,,...,,Usually,,,It feels good to demonstrate my expertise.,Demonstrating my expertise will benefit me,I feel a sense of responsibility to the develo...,,,
2,France,20-24,Male,Spaces,1 - 2 years,Back-end web developer,Windows 7,,,,...,,Usually,,My answer will help lots of people who have th...,It feels good to demonstrate my expertise.,,,,,
3,India,20-24,Male,Tabs,1 - 2 years,Back-end web developer,Windows 7,,,,...,,Rarely,,,,Demonstrating my expertise will benefit me,,,,
4,Latvia,25-29,Male,It depends,6 - 10 years,Back-end web developer,Ubuntu,,,,...,,Usually,It feels good to help a programmer in need,My answer will help lots of people who have th...,It feels good to demonstrate my expertise.,Demonstrating my expertise will benefit me,I feel a sense of responsibility to the develo...,,,


In [8]:
# Cleaning 2015 headers: as loaded, the real column names sit in the first data
# row, so promote that row to the header and drop it from the data.
# The guard keeps this cell idempotent: re-running it after the header has
# already been promoted would otherwise turn a genuine response row into column
# names and discard it. "Occupation" is one of the promoted names and is the
# 2015 column the analysis reads later on.
if "Occupation" not in all_years[2015].columns:
    columns_2015 = all_years[2015].iloc[0]
    all_years[2015] = all_years[2015].iloc[1:]
    all_years[2015].columns = columns_2015

<h2>Question 1: How has the survey changed over the years?</h2>

In [9]:
def get_yearly_shape(dictionary, year):
    """Return the dimensions of one year's survey table.

    Args:
        dictionary: Mapping from year to an object exposing ``.shape``
            (the notebook passes DataFrames).
        year: Key of the table to measure.

    Returns:
        tuple ``(shape[0], shape[1])`` -- for a DataFrame, (rows, columns).
    """
    dims = dictionary[year].shape
    return dims[0], dims[1]


In [10]:
# (rows, columns) of the 2019 survey table.
get_yearly_shape(all_years, 2019)

(88883, 85)

<h2>Question 2: How have the most common occupations changed in the United States over the last 3 years?</h2>

In [68]:
def create_value_df(df, column):
    """Summarise how often each value of ``column`` occurs in ``df``.

    Args:
        df: DataFrame to summarise.
        column: Name of the column whose values are counted.

    Returns:
        DataFrame with columns [column, "Count", "Percentage"], ordered by
        "Percentage" descending. "Percentage" is the fraction (0-1) of the
        counted values; ``value_counts`` leaves missing values out by default,
        so they are in neither the counts nor the denominator.
    """
    tally = df[column].value_counts()
    summary = tally.to_frame().reset_index()
    summary.columns = [column, "Count"]
    total = summary["Count"].sum()
    summary["Percentage"] = summary["Count"] / total
    return summary.sort_values("Percentage", ascending=False)

In [69]:
# Value counts of the occupation column for 2015 and 2016. Note the column
# name differs in capitalisation between the two years.
occupation_2015 = create_value_df(all_years[2015], "Occupation")
occupation_2016 = create_value_df(all_years[2016], "occupation")

In [82]:
# Show the 2015 occupation counts and their share of the counted answers.
occupation_2015

Unnamed: 0,Occupation,Count,Percentage
0,Full-stack web developer,6765,0.324445
1,Student,2845,0.136444
2,Back-end web developer,2104,0.100906
3,Desktop developer,1735,0.083209
4,Front-end web developer,1242,0.059565
5,Mobile developer - Android,847,0.040622
6,Mobile developer - iOS,634,0.030406
7,Embedded application developer,609,0.029207
8,Enterprise level services developer,599,0.028728
9,Developer with a statistics or mathematics bac...,464,0.022253


In [73]:
def split_words(df, column):
    """Tally the individual answers in a ';'-separated multi-select column.

    Each string cell is split on ';', and every piece is stripped of
    surrounding whitespace and counted once. Missing cells (None, NaN, pd.NA)
    are all counted under a single NaN label, so they still contribute to the
    total. Any other non-string value is counted under its own value.

    Args:
        df: DataFrame holding the survey responses.
        column: Name of the column to tally.

    Returns:
        DataFrame with columns [column, "Count", "Percentage"], sorted by
        "Percentage" in descending order. "Percentage" is each label's
        fraction (0-1) of all counted entries, the missing bucket included.
        The index keeps each label's first-seen position. An empty input
        yields an empty frame with the same three columns.
    """
    all_the_words = defaultdict(int)
    # One shared NaN object: NaN never compares equal to itself, so separate NaN
    # objects would otherwise end up as separate dictionary keys (and rows).
    missing = float("nan")
    for row in df[column]:
        if isinstance(row, str):
            for word in row.split(";"):
                all_the_words[word.strip()] += 1
        elif pd.isna(row):
            all_the_words[missing] += 1
        else:
            all_the_words[row] += 1
    # Building from (label, count) pairs with explicit column names also works
    # when nothing was counted.
    new_df = pd.DataFrame(list(all_the_words.items()), columns=[column, "Count"])
    # Keep an integer dtype even for the empty frame so the division is numeric.
    new_df["Count"] = new_df["Count"].astype("int64")
    new_df["Percentage"] = new_df["Count"] / new_df["Count"].sum()
    return new_df.sort_values("Percentage", ascending=False)

In [75]:
# 2017: tally "DeveloperType" with split_words, which splits each answer on ';'
# (presumably a multi-select question -- confirm against the survey schema).
occupation_2017 = split_words(all_years[2017], "DeveloperType")

In [76]:
# Show the 2017 developer-type tally; the NaN row is the missing-answer bucket.
occupation_2017

Unnamed: 0,DeveloperType,Count,Percentage
5,Web developer,26235,0.283542
0,,15267,0.165002
4,Desktop applications developer,10435,0.112779
2,Mobile developer,8326,0.089986
12,Database administrator,5192,0.056114
11,Developer with a statistics or mathematics bac...,4091,0.044215
13,Systems administrator,4086,0.044161
7,DevOps specialist,4015,0.043393
6,Embedded applications/devices developer,3352,0.036228
9,Data scientist,3045,0.03291


In [79]:
# 2018: the column read here is "DevType" (2017 used "DeveloperType").
occupation_2018 = split_words(all_years[2018], "DevType")

In [80]:
# Show the 2018 developer-type tally.
occupation_2018

Unnamed: 0,DevType,Count,Percentage
10,Back-end developer,53300,0.188431
0,Full-stack developer,44353,0.156801
11,Front-end developer,34822,0.123106
14,Mobile developer,18804,0.066478
6,Desktop or enterprise applications developer,15807,0.055882
9,Student,15732,0.055617
1,Database administrator,13216,0.046722
12,Designer,12019,0.042491
3,System administrator,10375,0.036679
2,DevOps specialist,9549,0.033759


In [81]:
# 2019: same "DevType" column as 2018; tally it and show the result.
occupation_2019 = split_words(all_years[2019], "DevType")
occupation_2019

Unnamed: 0,DevType,Count,Percentage
5,"Developer, full-stack",42222,0.160884
4,"Developer, back-end",40665,0.154951
2,"Developer, front-end",26649,0.101544
1,"Developer, desktop or enterprise applications",17316,0.065982
7,"Developer, mobile",14698,0.056006
17,Student,11921,0.045424
10,Database administrator,9520,0.036275
3,Designer,9182,0.034987
20,System administrator,8929,0.034023
14,DevOps specialist,8862,0.033768
