In [17]:
# import statements
import pandas as pd
import numpy as np
import json

# functions
def most_frequent(List):
    counter = 0
    num = List[0]
     
    for i in List:
        curr_frequency = List.count(i)
        if(curr_frequency> counter):
            counter = curr_frequency
            num = i
 
    return num
def selectOccupations(df, columnName):
    """
    UK wage data is provided as a CSV where the occupation is provided with increasing granularity (e.g. managers > managers in transport > managers in transport and distribution)
    I only want to store the most granular occupations with wage data so that involves only selecting the categories with the most tabs.
    This should return a DataFrame that has only the more granular occupations WITH mean | median salary data.
    """
    diff = []

    for i in range(len(df)):
        A = df.loc[i, columnName].split(" ")
        B = df.loc[i, columnName].strip().split(" ")
        df.loc[i, columnName] = df.loc[i, columnName].strip()
        diff.append(len(A)-len(B))

    most_freq = most_frequent(diff) # we are selecting the most frequent difference instead of the maximum difference after a peek at the data shows some random outliers that might just be entry error

    # create columns to base selections for df
    df["diff"] = diff
    df['medianSeries'] = pd.to_numeric(df.Median, errors='coerce').notnull()
    df['meanSeries'] = pd.to_numeric(df.Mean, errors='coerce').notnull()
    df = df[(df["medianSeries"] | df["meanSeries"]) & (df["diff"]==most_freq)].reset_index(drop=True)
    df = df.replace('x', np.NaN)
    return df.iloc[:,0:4]
def jsonifyOccupations(df, yearInput):
    """
    Given a dataframe of occupations with mean | median salary data for a given year, write a json string with the following format:
    Yeah, there's probably a way to do this with a pandas grouping or pivot but I hate pandas.
    {
        'jobA': {
            'year_1': {
                'median': 0000000,
                'mean': 0000000
            }
        },
        
        'jobB': {
            'year_1': {
                'median': 0000000,
                'mean': 0000000
            }
        },
        ...
        
        'jobZ': {
            'year_1': {
                'median': 0000000,
                'mean': 0000000
            }
        },
        
    }
    """
    d = {}

    for i in range(len(df)):
        content = dict.fromkeys(['median', 'mean'])
        content['median'] = df.loc[i,'Median']
        content['mean'] = df.loc[i,'Mean']
        year = {str(yearInput): content}
        d[df.loc[i,'Description']] = [year]
    return d
def merge_dictionaries(dictA, dictB):
    """
    Bruh does this function already exist wtf. This sounds like a homework problem but anyway
    Take dictB. 
    If a key in dictB is present in dictA, append the contents of the key to dictA[key].
    Otherwise, add dictB[key] and it's contents as a new key in dictA.
    """
    for key in dictB:
        if key in dictA.keys():
            dictA[key].append(dictB[key][0])
        else:
            dictA[key] = dictB[key]
    return dictA

# calls
UK2017 = jsonifyOccupations(selectOccupations(pd.read_csv("datasets/UK 2017.csv"), "Description"), 2017)
UK2018 = jsonifyOccupations(selectOccupations(pd.read_csv("datasets/UK 2018.csv"), "Description"), 2018)
UK2019 = jsonifyOccupations(selectOccupations(pd.read_csv("datasets/UK 2019.csv"), "Description"), 2019)
UK2020 = jsonifyOccupations(selectOccupations(pd.read_csv("datasets/UK 2020.csv"), "Description"), 2020)
UK2021 = jsonifyOccupations(selectOccupations(pd.read_csv("datasets/UK 2021.csv"), "Description"), 2021)

temp = merge_dictionaries(UK2017, UK2018)
temp = merge_dictionaries(temp, UK2019)
temp = merge_dictionaries(temp, UK2020)
temp = merge_dictionaries(temp, UK2021)
UK_JSON = json.dumps(temp, indent = 4) 

print(UK_JSON)





{
    "Chief executives and senior officials": [
        {
            "2017": {
                "median": "86264",
                "mean": "122047"
            }
        },
        {
            "2018": {
                "median": "90000",
                "mean": "137815"
            }
        },
        {
            "2019": {
                "median": "91646",
                "mean": "142199"
            }
        },
        {
            "2020": {
                "median": "79633",
                "mean": "111760"
            }
        },
        {
            "2021": {
                "median": "90242",
                "mean": "137745"
            }
        }
    ],
    "Elected officers and representatives": [
        {
            "2017": {
                "median": NaN,
                "mean": "17260"
            }
        }
    ],
    "Production managers and directors in manufacturing": [
        {
            "2017": {
                "median": "45184",
                "mean