In [1]:
# Import Dependecies

import pandas as pd
import numpy as np
import csv 
import matplotlib.pyplot as plt
from pathlib import Path
import scipy.stats as st

In [2]:
# Creating path to CSV file
employment_file = Path('../../Resources/employment_data.csv')

# Reading the CSV
employ_df = pd.read_csv(employment_file)

# Display CSV
employ_df

Unnamed: 0,REF_DATE,GEO,DGUID,Labour force characteristics,Sex,Age group,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
0,1976,Canada,2016A000011124,Population,Both sexes,15 years and over,Persons,249,units,0,v1063541342,1.1.1.1,17058.0,,,,1
1,1976,Canada,2016A000011124,Population,Both sexes,15 to 24 years,Persons,249,units,0,v1063541343,1.1.1.2,4550.4,,,,1
2,1976,Canada,2016A000011124,Population,Both sexes,25 years and over,Persons,249,units,0,v1063541344,1.1.1.3,12507.6,,,,1
3,1976,Canada,2016A000011124,Population,Both sexes,25 to 44 years,Persons,249,units,0,v1063541345,1.1.1.4,6259.7,,,,1
4,1976,Canada,2016A000011124,Population,Both sexes,45 to 64 years,Persons,249,units,0,v1063541363,1.1.1.22,4376.9,,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
409459,2022,British Columbia,2016A000259,Employment rate,Females,55 to 59 years,Percentage,242,units,0,v1063549916,11.10.3.17,68.6,,,,1
409460,2022,British Columbia,2016A000259,Employment rate,Females,60 to 64 years,Percentage,242,units,0,v1063549917,11.10.3.18,49.9,,,,1
409461,2022,British Columbia,2016A000259,Employment rate,Females,65 years and over,Percentage,242,units,0,v1063549918,11.10.3.19,11.5,,,,1
409462,2022,British Columbia,2016A000259,Employment rate,Females,65 to 69 years,Percentage,242,units,0,v1063549919,11.10.3.20,24.7,,,,1


In [3]:
# Check and browse columns
employ_df.columns

Index(['REF_DATE', 'GEO', 'DGUID', 'Labour force characteristics', 'Sex',
       'Age group', 'UOM', 'UOM_ID', 'SCALAR_FACTOR', 'SCALAR_ID', 'VECTOR',
       'COORDINATE', 'VALUE', 'STATUS', 'SYMBOL', 'TERMINATED', 'DECIMALS'],
      dtype='object')

In [4]:
# Drop irrelevant columns and rearrange remaining
reduced_df = employ_df[['VECTOR','REF_DATE','GEO', 'Labour force characteristics','Sex',
        'UOM','VALUE', 'COORDINATE']]

reduced_df

Unnamed: 0,VECTOR,REF_DATE,GEO,Labour force characteristics,Sex,UOM,VALUE,COORDINATE
0,v1063541342,1976,Canada,Population,Both sexes,Persons,17058.0,1.1.1.1
1,v1063541343,1976,Canada,Population,Both sexes,Persons,4550.4,1.1.1.2
2,v1063541344,1976,Canada,Population,Both sexes,Persons,12507.6,1.1.1.3
3,v1063541345,1976,Canada,Population,Both sexes,Persons,6259.7,1.1.1.4
4,v1063541363,1976,Canada,Population,Both sexes,Persons,4376.9,1.1.1.22
...,...,...,...,...,...,...,...,...
409459,v1063549916,2022,British Columbia,Employment rate,Females,Percentage,68.6,11.10.3.17
409460,v1063549917,2022,British Columbia,Employment rate,Females,Percentage,49.9,11.10.3.18
409461,v1063549918,2022,British Columbia,Employment rate,Females,Percentage,11.5,11.10.3.19
409462,v1063549919,2022,British Columbia,Employment rate,Females,Percentage,24.7,11.10.3.20


In [5]:
# Rename columns 
renamed_df = reduced_df.rename(columns={
     "VECTOR" : "Vector ID",
     "REF_DATE" : "Year",
     "GEO" : "Province",
     "UOM" : "Unit of Measure",
     "VALUE": "Value",
     "COORDINATE" : "Coordinate"
})

renamed_df.head()


Unnamed: 0,Vector ID,Year,Province,Labour force characteristics,Sex,Unit of Measure,Value,Coordinate
0,v1063541342,1976,Canada,Population,Both sexes,Persons,17058.0,1.1.1.1
1,v1063541343,1976,Canada,Population,Both sexes,Persons,4550.4,1.1.1.2
2,v1063541344,1976,Canada,Population,Both sexes,Persons,12507.6,1.1.1.3
3,v1063541345,1976,Canada,Population,Both sexes,Persons,6259.7,1.1.1.4
4,v1063541363,1976,Canada,Population,Both sexes,Persons,4376.9,1.1.1.22


# Data Clean up: Limit data to years 2019-2021 only and remove combined values in columns (such as 'Canada' in 'Province) 

In [6]:
# Filtering by the years 2019-2021
years_df = renamed_df.loc[renamed_df['Year'].isin([2019, 2020, 2021])]
years_df

Unnamed: 0,Vector ID,Year,Province,Labour force characteristics,Sex,Unit of Measure,Value,Coordinate
374616,v1063541342,2019,Canada,Population,Both sexes,Persons,30694.8,1.1.1.1
374617,v1063541343,2019,Canada,Population,Both sexes,Persons,4482.9,1.1.1.2
374618,v1063541344,2019,Canada,Population,Both sexes,Persons,26211.9,1.1.1.3
374619,v1063541345,2019,Canada,Population,Both sexes,Persons,9977.6,1.1.1.4
374620,v1063541363,2019,Canada,Population,Both sexes,Persons,9955.2,1.1.1.22
...,...,...,...,...,...,...,...,...
400747,v1063549916,2021,British Columbia,Employment rate,Females,Percentage,67.8,11.10.3.17
400748,v1063549917,2021,British Columbia,Employment rate,Females,Percentage,49.6,11.10.3.18
400749,v1063549918,2021,British Columbia,Employment rate,Females,Percentage,10.1,11.10.3.19
400750,v1063549919,2021,British Columbia,Employment rate,Females,Percentage,21.1,11.10.3.20


In [7]:
# Drop all rows where the 'Province' is has string values of 'Canada'
years_df = years_df.query('Province != "Canada"')
years_df

Unnamed: 0,Vector ID,Year,Province,Labour force characteristics,Sex,Unit of Measure,Value,Coordinate
375408,v1063542134,2019,Newfoundland and Labrador,Population,Both sexes,Persons,445.8,2.1.1.1
375409,v1063542135,2019,Newfoundland and Labrador,Population,Both sexes,Persons,55.4,2.1.1.2
375410,v1063542136,2019,Newfoundland and Labrador,Population,Both sexes,Persons,390.4,2.1.1.3
375411,v1063542137,2019,Newfoundland and Labrador,Population,Both sexes,Persons,119.0,2.1.1.4
375412,v1063542155,2019,Newfoundland and Labrador,Population,Both sexes,Persons,162.4,2.1.1.22
...,...,...,...,...,...,...,...,...
400747,v1063549916,2021,British Columbia,Employment rate,Females,Percentage,67.8,11.10.3.17
400748,v1063549917,2021,British Columbia,Employment rate,Females,Percentage,49.6,11.10.3.18
400749,v1063549918,2021,British Columbia,Employment rate,Females,Percentage,10.1,11.10.3.19
400750,v1063549919,2021,British Columbia,Employment rate,Females,Percentage,21.1,11.10.3.20


In [8]:
# Drop rows where 'Sex' = 'Both sexes' due to repitition of data
sex_df = years_df.query('Sex != "Both sexes"')
sex_df

Unnamed: 0,Vector ID,Year,Province,Labour force characteristics,Sex,Unit of Measure,Value,Coordinate
375430,v1063542156,2019,Newfoundland and Labrador,Population,Males,Persons,219.2,2.1.2.1
375431,v1063542157,2019,Newfoundland and Labrador,Population,Males,Persons,28.9,2.1.2.2
375432,v1063542158,2019,Newfoundland and Labrador,Population,Males,Persons,190.4,2.1.2.3
375433,v1063542159,2019,Newfoundland and Labrador,Population,Males,Persons,58.6,2.1.2.4
375434,v1063542177,2019,Newfoundland and Labrador,Population,Males,Persons,79.6,2.1.2.22
...,...,...,...,...,...,...,...,...
400747,v1063549916,2021,British Columbia,Employment rate,Females,Percentage,67.8,11.10.3.17
400748,v1063549917,2021,British Columbia,Employment rate,Females,Percentage,49.6,11.10.3.18
400749,v1063549918,2021,British Columbia,Employment rate,Females,Percentage,10.1,11.10.3.19
400750,v1063549919,2021,British Columbia,Employment rate,Females,Percentage,21.1,11.10.3.20


In [9]:
# Resetting the index
df = sex_df.reset_index(drop=True)
df

Unnamed: 0,Vector ID,Year,Province,Labour force characteristics,Sex,Unit of Measure,Value,Coordinate
0,v1063542156,2019,Newfoundland and Labrador,Population,Males,Persons,219.2,2.1.2.1
1,v1063542157,2019,Newfoundland and Labrador,Population,Males,Persons,28.9,2.1.2.2
2,v1063542158,2019,Newfoundland and Labrador,Population,Males,Persons,190.4,2.1.2.3
3,v1063542159,2019,Newfoundland and Labrador,Population,Males,Persons,58.6,2.1.2.4
4,v1063542177,2019,Newfoundland and Labrador,Population,Males,Persons,79.6,2.1.2.22
...,...,...,...,...,...,...,...,...
15835,v1063549916,2021,British Columbia,Employment rate,Females,Percentage,67.8,11.10.3.17
15836,v1063549917,2021,British Columbia,Employment rate,Females,Percentage,49.6,11.10.3.18
15837,v1063549918,2021,British Columbia,Employment rate,Females,Percentage,10.1,11.10.3.19
15838,v1063549919,2021,British Columbia,Employment rate,Females,Percentage,21.1,11.10.3.20


In [10]:
# Confirming datatypes
df.dtypes

Vector ID                        object
Year                              int64
Province                         object
Labour force characteristics     object
Sex                              object
Unit of Measure                  object
Value                           float64
Coordinate                       object
dtype: object

# Employement data can now easily be accessed and filtered by 'Labour force characteristics' types (unemployement,part-time employment etc) 

In [11]:
# Store cleaned file, to a new CSV 
df.to_csv('../../Output/employement_clean.csv', header= True, index= True)