In [95]:
import pandas as pd
import numpy as np
import json

def filterOccupations(df, descriptionColumn, contents):
    """
    Clean a Statistics Canada wage table and scale its wage columns by 52.

    Canada wage data is presented in a table with no easy separator for granularity:
    every job title is followed by two spaces and its NOC code in square brackets,
    e.g. "Some occupation  [0011]". For each row we keep only the title, and we drop
    the rows whose bracketed code is a single character, because those are the broad
    categories rather than individual occupations.
    See https://www.statcan.gc.ca/en/subjects/standard/noc/2021/introductionV1 for more details.

    Parameters
    ----------
    df : the dataframe containing information from the statscan website (lightly
        filtered to get rid of annoying headers). It is not modified; a cleaned
        copy is returned.
    descriptionColumn : name of the job title column. The column keeps this name in
        the result (jsonifyOccupations expects it to be called 'Description').
    contents : whether the table contains data about the mean or median wages.
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    A new dataframe with a fresh 0..n-1 index and the same columns as the input:

    <descriptionColumn>  |   <col 1>   |   <col 2>   |   ...
    --------------------------------------------------------
    JobA                 |    wages    |    wages    |   ...
    JobB                 |    wages    |    wages    |   ...

    where every column other than descriptionColumn has been converted to numbers,
    multiplied by 52 (presumably weekly -> annual wages; confirm against the raw
    data) and rounded to 2 decimals.

    Raises
    ------
    ValueError if a wage cell cannot be parsed as a number by pd.to_numeric.
    """
    # Strip thousands separators so "1,234.5" parses as a number. Note that this
    # removes commas from every string cell, job titles included.
    # reset_index lets the function work on frames that do not carry a default index.
    df = df.replace(',', '', regex=True).reset_index(drop=True)

    # "<title>  [<code>]" -> ["<title>", "[<code>]"]; a cell without the double-space
    # separator yields a single piece that serves as both the title and the "code".
    splitTitles = [title.split("  ") for title in df[descriptionColumn]]
    df[descriptionColumn] = [pieces[0] for pieces in splitTitles]

    # A one-character code marks a broad category, which we do not want to keep.
    codes = [pieces[-1].replace('[', '').replace(']', '') for pieces in splitTitles]
    isNotCategory = pd.Series([len(code) != 1 for code in codes], index=df.index, dtype=bool)
    df = df[isNotCategory].reset_index(drop=True)

    # Assign whole columns by label (rather than in place through iloc) so each wage
    # column really ends up with a numeric dtype on every pandas version.
    for column in df.columns:
        if column == descriptionColumn:
            continue
        df[column] = (pd.to_numeric(df[column]) * 52).round(2)
    return df

def jsonifyOccupations(medianDF, meanDF):
    """
    Merge the median and mean wage tables into one nested, JSON-ready dictionary.

    medianDF should contain the median annual wage data for occupations across time (with the year being the column title)
    meanDF should contain the mean annual wage data for occupations across time (with the year being the column title)

    Both frames are read by position: the first column is skipped as the occupation
    column, and cell (i, j) of meanDF is paired with cell (i, j) of medianDF. The two
    frames are therefore assumed to list the same occupations in the same row order
    and the same years in the same column order. Occupation names and year labels are
    taken from medianDF, which must have a 'Description' column.

    Returns a dictionary with the following format to match other data sets
    {
        'jobA': {
            'year_1': {
                'median': 0000000,
                'mean': 0000000
            },
            ...
            'year_5': {
                'median': 0000000,
                'mean': 0000000
            }
        },
        ...
        'jobZ': {
            ...
        }
    }
    Wage values are plain Python numbers so the result can be passed to json.dump.
    If an occupation name appears more than once, the last row wins.
    """
    def _toNative(value):
        # .iloc returns NumPy scalars; np.int64 in particular is rejected by json.dump,
        # so unwrap every NumPy scalar into the matching built-in Python number.
        return value.item() if isinstance(value, np.generic) else value

    # (position, label) of every year column; position 0 is the occupation column.
    yearColumns = list(enumerate(medianDF.columns))[1:]

    d = {}
    # Iterate through rows (occupations) by position so that any index is accepted.
    for i, occupation in enumerate(medianDF['Description']):
        d[occupation] = {
            year: {
                'median': _toNative(medianDF.iloc[i, j]),
                'mean': _toNative(meanDF.iloc[i, j]),
            }
            for j, year in yearColumns
        }

    return d

# Load the raw StatCan exports and clean them. The third argument labels which
# statistic the table holds; filterOccupations does not currently read it, but each
# table is tagged with its own statistic so the calls stay correct if it ever does.
CAN_mean = filterOccupations(pd.read_csv('raw data/CAN mean.csv'), 'Description', 'mean')
CAN_median = filterOccupations(pd.read_csv('raw data/CAN median.csv'), 'Description', 'median')

# Merge both tables into the nested {occupation: {year: {median, mean}}} layout and
# write it out; ensure_ascii=False keeps accented characters readable in the file.
with open('processed data/CAN_2017-2021.json', 'w', encoding='utf-8') as f:
    json.dump(jsonifyOccupations(CAN_median, CAN_mean), f, ensure_ascii=False, indent=4)