# Information regarding the data that this notebook refers to

# Fill out the following fields. Check that the input data and notebook name match.

In [1]:
# Identity of this notebook; the file names below are derived from these two values.
notebook_id = '014'
name_string = '_s_CA_1975_1998_cl_STATCAN'

# File names: the notebook itself, the input workbook it reads and the output workbook it writes.
nb_name = f'{notebook_id}{name_string}.ipynb'
nb_input_workbook = f'in{name_string}.xlsx'
nb_output_workbook = f'{notebook_id}.xlsx'

# Descriptive fields for the dataset handled by this notebook.
nb_stock_or_flow = 'stock'
nb_geography = 'Canada'
nb_start_time, nb_stop_time = '1975', '1998'
nb_attribute_1, nb_attribute_2 = 'class', 'registered'
nb_attribute_3 = nb_attribute_4 = ''  # unused attribute slots stay empty

# Where the data comes from.
nb_data_source = 'STATCAN'
nb_data_source_url = 'https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=2310023501'
nb_comment = ''

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import pickle

In [3]:
# Collect the notebook metadata in a one-row data frame indexed by the notebook id
metadata_fields = {
    'notebook_name': nb_name,
    'input_file': nb_input_workbook,
    'output_file': nb_output_workbook,
    'source': nb_data_source,
    'geography': nb_geography,
    'start_time': nb_start_time,
    'stop_time': nb_stop_time,
    'attribute1': nb_attribute_1,
    'attribute2': nb_attribute_2,
    'attribute3': nb_attribute_3,
    'attribute4': nb_attribute_4,
    'source_url': nb_data_source_url,
    'comment': nb_comment,
}
notebook_metadata = pd.DataFrame(metadata_fields, index=[notebook_id])

## Reading in the Excel data and merging the sheets into one DataFrame with category info attached to row data

In [4]:
# Open the input workbook named in the configuration cell; the next cell reads its 'data' sheet from this handle.
xls = pd.ExcelFile(nb_input_workbook)

In [5]:
# Load the 'data' sheet of the input workbook into a DataFrame
df=pd.read_excel(xls, sheet_name= 'data', )

 #### replace class titles with codes
  

In [6]:
# Read the class_metadata sheet, which holds each source's own class labels
# (source_label) next to the corresponding class codes (code)
metaxls = pd.ExcelFile('metadata_vehicle_fleet.xlsx')
classMetadata = pd.read_excel(metaxls, sheet_name='class_metadata', index_col=0)

# Limit the metadata to the rows that belong to this notebook's data source
sourceClassDict = classMetadata[classMetadata.source == nb_data_source]

# Parallel lists: the label at position i is replaced by the code at position i
sourceClassLabel = sourceClassDict.source_label.to_list()
sourceClassCode = sourceClassDict.code.to_list()

# Assign the result back to the column instead of calling
# df['class'].replace(..., inplace=True): the in-place call acts on an
# intermediate Series, which raises a chained-assignment warning in pandas 2.x
# and leaves df unchanged once Copy-on-Write is enabled.
df['class'] = df['class'].replace(to_replace=sourceClassLabel, value=sourceClassCode)

# Show the rows whose class is still not one of the source's codes
df.loc[~df['class'].isin(sourceClassCode)]

Unnamed: 0,class,1975,1976,1977,1978,1979,1980,1981,1982,1983,...,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998
0,all,11278513,11786309,12547247,12819457,13338630,13717152,13851732,14310717,14619609,...,16719558,16981130,16443808,16580960,16717501,16971747,17048297,17182626,17477742,17988141


## rearrange data into proper format

In [7]:
# Preview the first rows of the frame before it is reshaped
df.head()

Unnamed: 0,class,1975,1976,1977,1978,1979,1980,1981,1982,1983,...,1989,1990,1991,1992,1993,1994,1995,1996,1997,1998
0,all,11278513,11786309,12547247,12819457,13338630,13717152,13851732,14310717,14619609,...,16719558,16981130,16443808,16580960,16717501,16971747,17048297,17182626,17477742,17988141
1,CANPASS,8692821,9016258,9509290,9661405,9985146,10255511,10199638,10530355,10731520,...,12380258,12622038,12577578,12781067,12925122,13131434,13192272,13251146,13486957,13887270
2,CANTRUCK,2129467,2268214,2494704,2652357,2854217,2902730,3137987,3239341,3307746,...,3826963,3867385,3399449,3349502,3345214,3392811,3410862,3476193,3526933,3625818
3,CANBUS,47943,50437,52003,52350,54157,52569,55429,55299,56437,...,62494,63962,63898,63912,63651,64457,64339,64550,64261,68259
4,CANMC,356122,395780,437800,392000,389057,456187,446263,473139,509573,...,377997,359015,349549,338504,336305,329784,320381,311781,319183,333502


In [8]:
# Reshape from wide to long: every column other than 'class' becomes its own row,
# with the former column header stored under 'year' and the cell content under 'value'
melted = pd.melt(df, id_vars=['class'], var_name='year')
melted

Unnamed: 0,class,year,value
0,all,1975,11278513
1,CANPASS,1975,8692821
2,CANTRUCK,1975,2129467
3,CANBUS,1975,47943
4,CANMC,1975,356122
...,...,...,...
139,CANPASS,1998,13887270
140,CANTRUCK,1998,3625818
141,CANBUS,1998,68259
142,CANMC,1998,333502


## Structuring the data into the format of the target data structure


In [9]:
# Work on a copy of the melted frame. Assigning `df = melted` would alias it, so
# the index rewrite below would also change `melted`, and running this cell a
# second time would prepend the notebook id to already-prefixed ids.
df = melted.copy()

# Build the row ids: notebook id followed by the row number zero-padded to 9 digits
df.index = notebook_id + df.index.astype(str).str.zfill(9)

# Rename or add all columns required by the output structure
df['year_of_measurement'] = df['year']
# date_of_measurement is left empty in this notebook
df['date_of_measurement'] = ''

df['geo'] = nb_geography
df['process'] = 'r'
df['vehicle_class'] = df['class']
df['vehicle_segment'] = 'all'
df['motor_energy'] = 'all'
df['model_year'] = ''
# NOTE: the spelling 'registraion' is kept on purpose; the combined stock table
# displayed at the end of the notebook uses the same column name.
df['year_of_first_registraion'] = ''
# 'value' already exists: it is the value column produced by melt
df['unit'] = 'nr'
df['source'] = nb_data_source
df['accessed'] = '2020-08-20'
df['notebook'] = nb_name
df['footnote'] = ''

# Create the finalized output frame with the columns in the required order
heading_list = [
    'year_of_measurement', 'date_of_measurement', 'geo', 'process',
    'vehicle_class', 'vehicle_segment', 'motor_energy', 'model_year',
    'year_of_first_registraion', 'value', 'unit', 'source', 'accessed',
    'notebook', 'footnote',
]
df_out = df[heading_list]

In [10]:
# Check df_out (the frame with the final column order) before it is written to disk
df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
014000000000,1975,,Canada,r,all,all,all,,,11278513,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,
014000000001,1975,,Canada,r,CANPASS,all,all,,,8692821,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,
014000000002,1975,,Canada,r,CANTRUCK,all,all,,,2129467,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,
014000000003,1975,,Canada,r,CANBUS,all,all,,,47943,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,
014000000004,1975,,Canada,r,CANMC,all,all,,,356122,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
014000000139,1998,,Canada,r,CANPASS,all,all,,,13887270,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,
014000000140,1998,,Canada,r,CANTRUCK,all,all,,,3625818,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,
014000000141,1998,,Canada,r,CANBUS,all,all,,,68259,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,
014000000142,1998,,Canada,r,CANMC,all,all,,,333502,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,


## At this point, restart the kernel and run all cells

In [11]:
# Write an output workbook named after the notebook id, with the data on one
# sheet and the notebook metadata on another.
# The context manager saves and closes the workbook on exit, also when a write
# fails; it replaces writer.save(), which was removed in pandas 2.0.
with pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter') as writer:
    df_out.to_excel(writer, sheet_name='data', merge_cells=False)
    notebook_metadata.to_excel(writer, sheet_name='notebook_metadata')

In [12]:
# Add the data from this notebook to the stock DataFrame and notebook metadata pickles

# NOTE(review): pickle is already imported in the imports cell and is not referenced
# directly here; the loading below goes through pd.read_pickle.
import pickle
# Load the stock DataFrame pickle.
# NOTE(review): unpickling can execute arbitrary code, so 'stock_df.pickle' must come from a trusted source.
stock_pickle = pd.read_pickle('stock_df.pickle')

In [13]:
# Put this notebook's rows in front of the stored stock rows and drop rows that
# are identical in every column
stock_frames = [df_out, stock_pickle]
stock_df = pd.concat(stock_frames).drop_duplicates()

# Persist the updated stock table back to its pickle
stock_df.to_pickle('stock_df.pickle')

# Same procedure for the notebook metadata: load, append this notebook's row,
# drop identical rows, write back
metadata_pickle = pd.read_pickle('metadata_df.pickle')
metadata_frames = [metadata_pickle, notebook_metadata]
metadata_df = pd.concat(metadata_frames).drop_duplicates()
metadata_df.to_pickle('metadata_df.pickle')

In [14]:
# Rewrite the combined stock workbook with the updated data and notebook metadata.
# The context manager saves and closes the workbook on exit, also when a write
# fails; it replaces writer.save(), which was removed in pandas 2.0.
with pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter') as writer:
    stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
    metadata_df.to_excel(writer, sheet_name='notebook_metadata')

In [15]:
# Display the combined stock table: this notebook's rows come first, followed by the rows that were already stored
stock_df

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
014000000000,1975,,Canada,r,all,all,all,,,11278513,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,
014000000001,1975,,Canada,r,CANPASS,all,all,,,8692821,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,
014000000002,1975,,Canada,r,CANTRUCK,all,all,,,2129467,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,
014000000003,1975,,Canada,r,CANBUS,all,all,,,47943,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,
014000000004,1975,,Canada,r,CANMC,all,all,,,356122,nr,STATCAN,2020-08-20,014_s_CA_1975_1998_cl_STATCAN.ipynb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
004000003800,2018,,Sweden,r,EUM1,all,ICE,all,,212504,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
004000003801,2018,,United Kingdom of Great Britain and Northern I...,r,EUM1,all,ICE,all,,343,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,"definition differs, see eurostat metadata"
004000003803,2018,,Norway,r,EUM1,all,ICE,all,,1008,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
004000003804,2018,,Switzerland,r,EUM1,all,ICE,all,,3700,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
