# Information regarding the data that this notebook refers to

# Fill out the following fields. Check that the input data file and the notebook name match.

In [1]:
# --- identity of this notebook -------------------------------------------
notebook_id = '013'
name_string = '_s_CA_1998-2019_cl_sg_STATCAN'

# --- file names derived from the id and the name string ------------------
nb_name = '{}{}.ipynb'.format(notebook_id, name_string)
nb_input_workbook = 'in{}.xlsx'.format(name_string)
nb_output_workbook = '{}.xlsx'.format(notebook_id)

# --- description of the data set ------------------------------------------
nb_stock_or_flow = 'stock'
nb_geography = 'Canada'
nb_start_time = '1998'
nb_stop_time = '2019'

# attributes by which the data are broken down (empty string = unused slot)
nb_attribute_1 = 'class'
nb_attribute_2 = 'registered'
nb_attribute_3 = 'some segment'
nb_attribute_4 = ''

# --- provenance ------------------------------------------------------------
nb_data_source = 'STATCAN'
nb_data_source_url = 'https://www150.statcan.gc.ca/t1/tbl1/en/tv.action?pid=2310006701'
nb_comment = ''

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import pickle

In [3]:
# Collect the notebook metadata in a one-row data frame indexed by the
# notebook id; each key becomes a column.
metadata_fields = {
    'notebook_name': nb_name,
    'input_file': nb_input_workbook,
    'output_file': nb_output_workbook,
    'source': nb_data_source,
    'geography': nb_geography,
    'start_time': nb_start_time,
    'stop_time': nb_stop_time,
    'attribute1': nb_attribute_1,
    'attribute2': nb_attribute_2,
    'attribute3': nb_attribute_3,
    'attribute4': nb_attribute_4,
    'source_url': nb_data_source_url,
    'comment': nb_comment,
}
notebook_metadata = pd.DataFrame(metadata_fields, index=[notebook_id])

## Read the Excel data and merge the sheets into one dataframe, with category info attached to each row

In [4]:
# open the input workbook named in the metadata cell; its 'data' sheet is parsed in the next cell
xls = pd.ExcelFile(nb_input_workbook)

In [5]:
# parse the 'data' sheet of the opened input workbook into the working data frame
df = pd.read_excel(xls, sheet_name='data')

#### Replace class and segment labels with codes
  

In [6]:
# Read the class_metadata sheet, which lists the class labels used by each
# source next to the corresponding class codes. The context manager closes
# the workbook handle once the sheet has been parsed.
with pd.ExcelFile('metadata_vehicle_fleet.xlsx') as metaxls:
    classMetadata = pd.read_excel(metaxls, sheet_name='class_metadata', index_col=0)

# keep only the rows that belong to this notebook's data source
sourceClassDict = classMetadata[classMetadata.source == nb_data_source]
sourceClassLabel = sourceClassDict.source_label.to_list()
sourceClassCode = sourceClassDict.code.to_list()

# Assign the result back to the column. Calling replace(..., inplace=True)
# on df['class'] operates on a selection of df: pandas 2.x warns about it and
# with Copy-on-Write enabled df itself would be left unchanged.
df['class'] = df['class'].replace(to_replace=sourceClassLabel, value=sourceClassCode)

# show the rows whose class is still not one of the source's codes
df.loc[~df['class'].isin(sourceClassCode)]

Unnamed: 0,class,segment,1999,2000,2001,2002,2003,2004,2005,2006,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,all,all,17534315,17882229,18101675,18617413,18883584,19156055,19515295,20065171,...,21387132,21847601,22246916,22366270,23006222,23538817,23923806,24269868,24566696,25060399
1,all,"Vehicles weighing less than 4,500 kilograms",16538054,16832180,17054798,17543659,17768773,17989919,18275275,18738941,...,19876990,20267982,20608101,20651993,21261660,21729596,22067778,22410030,22678328,23137203
2,all,"Vehicles weighing 4,500 kilograms to 14,999 ki...",386804,391285,387330,366962,379079,393528,415764,442607,...,503505,480780,505702,533824,550572,575363,591897,590023,605353,620481
3,all,"Vehicles weighing 15,000 kilograms or more",262326,270148,267129,277339,282420,285942,301574,318272,...,326190,396232,415422,431614,432684,455004,464322,462908,471541,481182


In [7]:
# Read the segment_metadata sheet, which lists the segment labels used by
# each source next to the corresponding segment codes. The context manager
# closes the workbook handle once the sheet has been parsed.
with pd.ExcelFile('metadata_vehicle_fleet.xlsx') as metaxls:
    segmentMetadata = pd.read_excel(metaxls, sheet_name='segment_metadata', index_col=0)

# keep only the rows that belong to this notebook's data source
sourceSegmentDict = segmentMetadata[segmentMetadata.source == nb_data_source]
sourceSegmentLabel = sourceSegmentDict.source_label.to_list()
sourceSegmentCode = sourceSegmentDict.code.to_list()

# Assign the result back to the column. Calling replace(..., inplace=True)
# on df['segment'] operates on a selection of df: pandas 2.x warns about it
# and with Copy-on-Write enabled df itself would be left unchanged.
df['segment'] = df['segment'].replace(to_replace=sourceSegmentLabel, value=sourceSegmentCode)

# show the rows whose segment is still not one of the source's codes
df.loc[~df['segment'].isin(sourceSegmentCode)]

Unnamed: 0,class,segment,1999,2000,2001,2002,2003,2004,2005,2006,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,all,all,17534315,17882229,18101675,18617413,18883584,19156055,19515295,20065171,...,21387132,21847601,22246916,22366270,23006222,23538817,23923806,24269868,24566696,25060399
4,CANBUS,all,73174,77341,74086,79364,79948,77842,78962,80447,...,85579,86327,86594,87387,88878,90650,90551,90643,90925,91906
5,CANMC,all,273957,311275,318330,350088,373362,408822,443718,484903,...,594866,616280,631097,661452,672428,688204,709258,716264,720549,729627


## rearrange data into proper format

In [8]:
# preview the first five rows of the working data frame
df.head()

Unnamed: 0,class,segment,1999,2000,2001,2002,2003,2004,2005,2006,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,all,all,17534315,17882229,18101675,18617413,18883584,19156055,19515295,20065171,...,21387132,21847601,22246916,22366270,23006222,23538817,23923806,24269868,24566696,25060399
1,all,<4500_kg,16538054,16832180,17054798,17543659,17768773,17989919,18275275,18738941,...,19876990,20267982,20608101,20651993,21261660,21729596,22067778,22410030,22678328,23137203
2,all,4500-14999_kg,386804,391285,387330,366962,379079,393528,415764,442607,...,503505,480780,505702,533824,550572,575363,591897,590023,605353,620481
3,all,>15000_kg,262326,270148,267129,277339,282420,285942,301574,318272,...,326190,396232,415422,431614,432684,455004,464322,462908,471541,481182
4,CANBUS,all,73174,77341,74086,79364,79948,77842,78962,80447,...,85579,86327,86594,87387,88878,90650,90551,90643,90925,91906


In [9]:
# Reshape from wide to long: 'class' and 'segment' stay as identifier
# columns, every other column header moves into the new 'year' column and
# its cell contents into 'value'.
key_columns = ['class', 'segment']
melted = pd.melt(df, id_vars=key_columns, var_name='year')
melted

Unnamed: 0,class,segment,year,value
0,all,all,1999,17534315
1,all,<4500_kg,1999,16538054
2,all,4500-14999_kg,1999,386804
3,all,>15000_kg,1999,262326
4,CANBUS,all,1999,73174
...,...,...,...,...
115,all,<4500_kg,2018,23137203
116,all,4500-14999_kg,2018,620481
117,all,>15000_kg,2018,481182
118,CANBUS,all,2018,91906


## Structure the data into the format of the target data structure


In [10]:
# Work on a copy so that `melted` stays untouched. With a plain `df = melted`
# both names refer to the same object, so re-running this cell would prepend
# the notebook id to the already-prefixed index a second time.
df = melted.copy()

# row ids: the notebook id followed by the row number zero-padded to 9 digits
df.index = notebook_id + df.index.astype(str).str.zfill(9)

# Add the columns required by the output structure. Keyword order is the
# column order in which they are appended to df.
df = df.assign(
    year_of_measurement=df['year'],
    # left blank here; an earlier version derived it as '<year>-12-31'
    date_of_measurement='',
    geo=nb_geography,
    process='r',
    vehicle_class=df['class'],
    vehicle_segment=df['segment'],
    motor_energy='all',
    model_year='',
    # NOTE: the misspelling 'registraion' is kept on purpose; the shown
    # stock data frame output uses the same column name
    year_of_first_registraion='',
    unit='nr',
    source=nb_data_source,
    accessed='2020-08-20',
    notebook=nb_name,
    footnote='',
)

# finalized data frame for output, with the columns in their required order
# ('value' is the column produced by the melt step)
heading_list = [
    'year_of_measurement',
    'date_of_measurement',
    'geo',
    'process',
    'vehicle_class',
    'vehicle_segment',
    'motor_energy',
    'model_year',
    'year_of_first_registraion',
    'value',
    'unit',
    'source',
    'accessed',
    'notebook',
    'footnote',
]
df_out = df[heading_list]

In [11]:
# inspect the final output frame before it is written to disk
df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
013000000000,1999,,Canada,r,all,all,all,,,17534315,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,
013000000001,1999,,Canada,r,all,<4500_kg,all,,,16538054,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,
013000000002,1999,,Canada,r,all,4500-14999_kg,all,,,386804,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,
013000000003,1999,,Canada,r,all,>15000_kg,all,,,262326,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,
013000000004,1999,,Canada,r,CANBUS,all,all,,,73174,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
013000000115,2018,,Canada,r,all,<4500_kg,all,,,23137203,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,
013000000116,2018,,Canada,r,all,4500-14999_kg,all,,,620481,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,
013000000117,2018,,Canada,r,all,>15000_kg,all,,,481182,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,
013000000118,2018,,Canada,r,CANBUS,all,all,,,91906,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,


## at this point restart kernel and run all cells

In [12]:
# Write this notebook's data and metadata to a workbook named after the
# notebook id. The context manager saves and closes the file on exit; the
# explicit writer.save() call it replaces was deprecated in pandas 1.5 and
# removed in pandas 2.0.
with pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter') as writer:
    df_out.to_excel(writer, sheet_name='data', merge_cells=False)
    notebook_metadata.to_excel(writer, sheet_name='notebook_metadata')

In [13]:
# Add the data from this notebook to the stock data frame and notebook
# metadata pickles. (pickle is already imported in the import cell at the
# top, and pandas does the reading here, so no further import is needed.)

# Load the stock DataFrame pickle.
# NOTE: unpickling can execute code stored in the file; only load
# stock_df.pickle from a trusted location.
stock_pickle = pd.read_pickle('stock_df.pickle')

In [14]:
# Put this notebook's rows in front of the rows loaded from the pickle and
# drop repeated rows. drop_duplicates() compares the column values only (the
# index is not considered) and keeps the first occurrence.
stock_frames = [df_out, stock_pickle]
stock_df = pd.concat(stock_frames).drop_duplicates()

# persist the updated stock data frame
stock_df.to_pickle('stock_df.pickle')

# Same procedure for the notebook metadata; here the stored rows come first
# and this notebook's row is appended after them.
metadata_pickle = pd.read_pickle('metadata_df.pickle')
metadata_frames = [metadata_pickle, notebook_metadata]
metadata_df = pd.concat(metadata_frames).drop_duplicates()

metadata_df.to_pickle('metadata_df.pickle')

In [15]:
# Rewrite the combined workbook with the updated stock data and notebook
# metadata. The context manager saves and closes the file on exit; the
# explicit writer.save() call it replaces was deprecated in pandas 1.5 and
# removed in pandas 2.0.
with pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter') as writer:
    stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
    metadata_df.to_excel(writer, sheet_name='notebook_metadata')

In [16]:
# display the combined stock data frame (this notebook's rows plus the rows loaded from the pickle)
stock_df

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
013000000000,1999,,Canada,r,all,all,all,,,17534315,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,
013000000001,1999,,Canada,r,all,<4500_kg,all,,,16538054,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,
013000000002,1999,,Canada,r,all,4500-14999_kg,all,,,386804,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,
013000000003,1999,,Canada,r,all,>15000_kg,all,,,262326,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,
013000000004,1999,,Canada,r,CANBUS,all,all,,,73174,nr,STATCAN,2020-08-20,013_s_CA_1998-2019_cl_sg_STATCAN.ipynb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
004000003800,2018,,Sweden,r,EUM1,all,ICE,all,,212504,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
004000003801,2018,,United Kingdom of Great Britain and Northern I...,r,EUM1,all,ICE,all,,343,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,"definition differs, see eurostat metadata"
004000003803,2018,,Norway,r,EUM1,all,ICE,all,,1008,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
004000003804,2018,,Switzerland,r,EUM1,all,ICE,all,,3700,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
