In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

In [None]:
def create_year_dataframe(year):
    """Read one year's survey export into a DataFrame.

    The file is looked up in the current working directory under the name
    ``survey_results_public_<year>.csv``.

    Args:
        year: Survey year; its string form is inserted into the file name.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    csv_name = f"survey_results_public_{year!s}.csv"
    # low_memory=False makes pandas parse the file in one pass instead of chunks.
    return pd.read_csv(csv_name, low_memory=False)

In [None]:
# Survey years to load: 2015 through 2019 inclusive.
years_data = list(range(2015, 2020))

In [None]:
def create_all_dataframes(all_years):
    """Load the survey results for every requested year.

    Args:
        all_years: Iterable of survey years to load.

    Returns:
        dict mapping each year in ``all_years`` to the DataFrame returned by
        ``create_year_dataframe`` for that year.
    """
    # Iterate the argument rather than the module-level ``years_data`` list,
    # so the function loads exactly the years the caller asked for.
    return {year: create_year_dataframe(year) for year in all_years}

In [None]:
# Load every survey year listed in years_data into a {year: DataFrame} dict.
all_years = create_all_dataframes(years_data)

In [None]:
# Clean the 2015 headers: take the first data row of the 2015 frame as the
# column labels, drop that row from the data, then install the labels.
# NOTE(review): this cell is not idempotent — running it a second time
# promotes (and drops) one more data row.
columns_2015 = all_years[2015].iloc[0]
all_years[2015] = all_years[2015].iloc[1:]
all_years[2015].columns = columns_2015

In [None]:
def segment_by_column_and_value(df, column, value):
    """Select the rows of ``df`` whose ``column`` entry equals ``value``.

    Args:
        df: DataFrame to filter.
        column: Name of the column to compare.
        value: Value the column must equal for a row to be kept.

    Returns:
        The matching rows, with their original index labels and all columns.
    """
    row_mask = df[column] == value
    return df[row_mask]

In [None]:
# Rename the 2016 frame's lower-case "country" column to "Country", the
# column name the later per-country filtering is called with for every year.
# Re-assigning the renamed frame (instead of inplace=True) avoids mutating
# an object in place across cells.
all_years[2016] = all_years[2016].rename(columns={"country": "Country"})

In [None]:
def create_column_df(df_dict, column, value):
    """Filter every frame in ``df_dict`` to the rows where ``column == value``.

    Args:
        df_dict: Mapping of key (here: survey year) -> DataFrame.
        column: Column name to compare in each frame.
        value: Value the column must equal for a row to be kept.

    Returns:
        A new dict with the same keys, each mapped to the filtered frame
        returned by ``segment_by_column_and_value``.
    """
    return {
        key: segment_by_column_and_value(frame, column, value)
        for key, frame in df_dict.items()
    }

In [None]:
# Per-year frames restricted to the rows whose "Country" is "United States".
united_states = create_column_df(all_years, "Country", "United States")

<h2> Question 1: How has the survey changed over the years? </h2>

In [None]:
def get_yearly_shape(dictionary, year):
    """Return ``(n_rows, n_columns)`` of the frame stored under ``year``.

    Args:
        dictionary: Mapping of year -> object exposing a ``.shape`` attribute
            (here: a DataFrame).
        year: Key of the frame to inspect.

    Returns:
        Tuple of the first two entries of that frame's ``shape``.
    """
    frame_shape = dictionary[year].shape
    return frame_shape[0], frame_shape[1]

In [None]:
# (rows, columns) of the 2019 survey frame.
get_yearly_shape(all_years, 2019)

<h2> Question 2: How have the most common occupations changed in the United States over the last 3 years? </h2>

In [None]:
def segment_by_column_and_value(df, column, value):
    """Return the rows of ``df`` where ``column`` equals ``value``.

    NOTE(review): this cell re-defines a function of the same name that an
    earlier cell already defines with the same behavior; one of the two
    definitions can be removed.
    """
    matches = df[column] == value
    return df.loc[matches]

In [None]:
def create_value_df(df, year, column, column_name):
    """Tabulate how often each value of ``column`` occurs in one year's frame.

    Any label containing "mobile" (case-insensitive) is collapsed into the
    single label "Mobile" before counting, so all of its variants are
    reported as one row.

    Args:
        df: Mapping of year -> DataFrame.
        year: Key of the frame to summarise; also written to the "Year" column.
        column: Name of the column in ``df[year]`` whose values are counted.
        column_name: Name given to the label column of the result.

    Returns:
        DataFrame with columns ``[column_name, "Count", "Percentage", "Year"]``
        sorted by "Percentage" in descending order. "Percentage" is each
        label's fraction (0-1) of the counted values; missing values are not
        counted because ``value_counts`` skips them.
    """
    # Merge the "mobile" variants first so no later re-grouping is needed.
    # (Previously the merge was done with groupby("DeveloperType"), which
    # raised KeyError for any other column_name.)
    labels = df[year][column].replace("(?i).*mobile.*", "Mobile", regex=True)
    # rename_axis + reset_index(name=...) yields the same two columns on every
    # pandas version, independent of how value_counts names its result.
    new_df = (
        labels.value_counts()
        .rename_axis(column_name)
        .reset_index(name="Count")
    )
    new_df["Percentage"] = new_df["Count"] / new_df["Count"].sum()
    new_df["Year"] = year
    return new_df.sort_values("Percentage", ascending=False)

In [None]:
def split_words(df, year, column, column_name):
    """Count the individual ";"-separated entries of one year's column.

    Each cell of ``df[year][column]`` is split on ";", every piece is
    stripped of surrounding whitespace, and the pieces are tallied.

    Missing values (NaN) are skipped, so they are neither reported as a label
    nor included in the percentage denominator. This matches
    ``create_value_df``, whose ``value_counts`` call ignores them too.

    Args:
        df: Mapping of year -> DataFrame.
        year: Key of the frame to summarise; also written to the "Year" column.
        column: Name of the multi-valued column in ``df[year]``.
        column_name: Name given to the label column of the result.

    Returns:
        DataFrame with columns ``[column_name, "Count", "Percentage", "Year"]``
        sorted by "Percentage" in descending order. "Percentage" is each
        entry's fraction (0-1) of all counted entries, not of respondents.
    """
    word_counts = defaultdict(int)
    for response in df[year][column].dropna():
        # str() keeps a stray non-string cell from crashing on .split().
        for word in str(response).split(";"):
            word_counts[word.strip()] += 1

    # Build the frame from explicit columns so an empty tally still produces
    # the expected column layout.
    new_df = pd.DataFrame(
        {
            column_name: list(word_counts.keys()),
            "Count": list(word_counts.values()),
        },
        columns=[column_name, "Count"],
    )
    new_df["Percentage"] = new_df["Count"] / new_df["Count"].sum()
    new_df["Year"] = year
    return new_df.sort_values("Percentage", ascending=False)

In [None]:
# Per-year developer-type summaries for the United States subset.
# 2015/2016 are tallied with create_value_df; 2018/2019 go through
# split_words, which splits each answer on ";" before counting.
occupation_2015 = create_value_df(united_states, 2015, "Occupation", "DeveloperType")
occupation_2016 = create_value_df(united_states, 2016, "occupation", "DeveloperType")
# NOTE(review): 2017 is left out of the comparison; the disabled call below
# reads from all_years rather than united_states — confirm before re-enabling.
#occupation_2017 = split_words(all_years, 2017, "DeveloperType", "DeveloperType")
occupation_2018 = split_words(united_states, 2018, "DevType", "DeveloperType")
occupation_2019 = split_words(united_states, 2019, "DevType", "DeveloperType")

In [None]:
# Stack the per-year summaries into one long table (one row per developer
# type per year). pd.concat replaces the chained DataFrame.append calls:
# append was deprecated in pandas 1.4 and removed in pandas 2.0.
occupation_df = pd.concat(
    [occupation_2015, occupation_2016, occupation_2018, occupation_2019],
    # The per-year frames repeat the same index labels; renumber the rows so
    # the combined frame has a unique index.
    ignore_index=True,
)

In [None]:
def standardize_developer_types(df, regexes, words):
    """Return a copy of ``df`` with regex matches replaced by fixed labels.

    Args:
        df: DataFrame to clean; it is not modified.
        regexes: List of regular-expression patterns to search for.
        words: List of replacement strings; ``words[i]`` replaces matches of
            ``regexes[i]``.

    Returns:
        New DataFrame produced by ``DataFrame.replace`` with ``regex=True``.
    """
    return df.replace(to_replace=regexes, value=words, regex=True)

In [None]:
# Case-insensitive patterns, each matching any label that contains the given
# keyword. Paired by position with the replacement labels in `words`.
regexes = [
    "(?i).*full-stack.*",
    "(?i).*back-end.*",
    "(?i).*front-end.*",
    "(?i).*embedded.*",
    "(?i).*desktop.*",
    "(?i).*executive.*",
    "(?i).*devops.*",
    "(?i).*mobile.*",
]

In [None]:
# Short replacement labels, paired by position with the patterns in `regexes`.
words = [
    "Full-stack",
    "Back-end",
    "Front-end",
    "Embedded",
    "Desktop",
    "Executive",
    "DevOps",
    "Mobile",
]

In [None]:
# Collapse the differently worded developer-type labels onto the short names in `words`.
standardized_data_df = standardize_developer_types(occupation_df, regexes, words)

In [None]:
# Developer-type labels to keep for the comparison chart.
developer_types = [
    "Full-stack",
    "Front-end",
    "Back-end",
    "Mobile",
    "Student",
    "Desktop",
]

In [None]:
# Keep only the rows whose label is one of the developer types chosen for the chart.
graph_data = standardized_data_df.loc[
    standardized_data_df["DeveloperType"].isin(developer_types)
]

In [None]:
# Grouped bars: one cluster per developer type, one bar per survey year.
# An explicit Axes plus title and axis labels lets the figure stand alone.
fig, ax = plt.subplots()
sns.barplot(x="DeveloperType", y="Percentage", hue="Year", data=graph_data, ax=ax)
ax.set(
    title="Developer types among United States respondents, by survey year",
    xlabel="Developer type",
    # The "Percentage" column holds fractions between 0 and 1.
    ylabel="Share of counted responses (fraction)",
);

In [None]:
# NOTE(review): looks like an exploratory cell — it runs the same regex
# replacement over every column of the 2019 United States frame, and in the
# cells visible here `test` is only displayed afterwards. Confirm it is needed.
test = standardize_developer_types(united_states[2019], regexes, words)

In [None]:
# Preview the first rows instead of rendering the whole frame as cell output.
test.head()