# Information regarding the data that this notebook refers to

# fill out the following fields. Check that input data and notebook name match.

# reference vehicle_fleet_data.xlsx notebook metadata to select notebook_id

In [1]:
notebook_id= '009'
name_string= '_s_NO_1950_2007_cl_statbank'

nb_name= notebook_id + name_string +'.ipynb'
nb_input_workbook= 'in' + name_string + '.xlsx'
nb_output_workbook= notebook_id + '.xlsx'
nb_stock_or_flow= 'stock'
nb_geography= 'Norway'
nb_start_time= '1950'
nb_stop_time= '2007'
nb_attribute_1= 'class'
nb_attribute_2= 'registered'
nb_attribute_3= ''
nb_attribute_4= ''
nb_data_source= 'STATBANK'
nb_data_source_url= 'https://www.ssb.no/en/statbank/table/01960/'
nb_comment= ''

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as plt

In [3]:
# write the notbook metadata into a data frame
notebook_metadata = pd.DataFrame({'notebook_name':nb_name,
                                 'input_file': nb_input_workbook, 
                                 'output_file': nb_output_workbook, 
                                 'source': nb_data_source, 
                                 'geography': nb_geography,
                                 'start_time': nb_start_time,
                                 'stop_time': nb_stop_time, 
                                 'attribute1': nb_attribute_1, 
                                 'attribute2': nb_attribute_2,
                                 'attribute3': nb_attribute_3,
                                 'attribute4': nb_attribute_4,
                                 'source_url' : nb_data_source_url,
                                 'comment': nb_comment}, index=[notebook_id])

## Reading in the excel data and merging the sheets into one dataframe with category info attached to row data

In [13]:
# read in data,
xls = pd.ExcelFile(nb_input_workbook)

In [14]:
# extract the data into a dataframe or several
df = pd.read_excel(xls, sheet_name= 'Personbiler1')
df

Unnamed: 0,class,1950,1951,1952,1953,1954,1955,1956,1957,1958,...,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007
0,Passenger cars,65028,69509,78643,90771,108034,122143,134111,153391,172630,...,1786404,1813642,1851929,1872862,1899767,1933660,1977922,2028909,2084193,2154837
1,Buses,3797,3998,4207,4269,4499,4633,4715,4860,4911,...,36218,37039,36686,35667,34110,32374,30592,28783,26954,25204
2,Vans,15808,19519,25105,28917,30671,32597,35515,40671,46135,...,208955,219731,233248,249939,258133,268326,284029,302956,331052,361911
3,Lorries,31055,35880,39031,41219,44580,46956,47951,47938,48213,...,74653,75959,76224,76414,77442,79356,80623,82778,83609,84742
4,Combined vehicles,1960,1892,1857,1932,1950,1920,1936,2009,1990,...,107221,107349,104868,100624,95453,90600,85149,79705,73904,67020
5,Tractors etc,2140,3101,4291,5706,7700,8756,8722,8646,8582,...,215379,218287,220135,222805,225905,229688,229265,231630,234243,237484
6,Special purpose vehicles,..,..,..,..,..,..,..,..,..,...,8705,8949,9069,9116,9086,9038,8547,8361,8255,8208
7,Mopeds,..,..,826,1510,2770,7122,14276,36789,55012,...,113868,114868,115892,121159,130528,140796,144855,148161,151670,156287
8,Light motor cycles,8162,8733,8593,8754,15545,17669,18418,19811,21197,...,6389,7541,8785,9583,10510,11521,12485,13630,14975,16589
9,Heavy motor cycles,16768,18284,21239,27533,30086,36755,42164,48515,52370,...,64090,70592,76887,80685,84135,87279,91231,95708,101900,109618


 #### replace class titles with codes
  

In [15]:
# read in the class_metatdata sheet which has equivalent terms for class types according to source
metaxls = pd.ExcelFile('metadata_vehicle_fleet.xlsx')
classMetadata = pd.read_excel(metaxls, sheet_name= 'class_metadata', index_col= 0)

In [16]:
classMetadata

Unnamed: 0_level_0,code,label,description,footnote,source,sourcelink1,sourcelink2,Unnamed: 8,Unnamed: 9,Unnamed: 10,PCMO,SML0,classification scheme3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1.0,EUM1,passenger car,"Designed for the carriage of passengers, with ...",,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,p,m,
2.0,EUM2,light bus,"Designed for the carriage of passengers, carry...",,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,c,l,
3.0,EUM3,heavy bus,"Designed for the carriage of passengers, havin...",,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,c,l,
4.0,EUN1,light commercial vehicle,"Designed for the carriage of goods, having a m...",,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,c,m,
5.0,EUN2,heavy commercial vehicle,"Designed for the carriage of goods, having a m...",,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,c,l,
6.0,EUN3,transport truck,"Designed for the carriage of goods, having a m...",,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,c,l,
7.0,EUL,"motorcycles, tricycles, and quadracycles",Motor vehicles with less than four wheels and ...,,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,m,s,
8.0,OIPC,passenger car,"Passenger cars are road motor vehicles, other ...",,OICA,http://www.oica.net/wp-content/uploads//DEFINI...,,,,,p,m,
9.0,OICV,commercial vehicle,Commercial vehicles include light commercial v...,,OICA,http://www.oica.net/wp-content/uploads//DEFINI...,,,,,c,l,
10.0,USLDS,"light duty vehicle, short wheel base","Light duty vehicle, short wheel base replaced ...",,USDOT,https://www.bts.gov/content/number-us-aircraft...,https://www.bts.gov/sites/bts.dot.gov/files/do...,,,,p,m,


In [17]:
# limit the data to the rows specific to relevent source
sourceClassDF= classMetadata[classMetadata.source == nb_data_source] 

In [18]:
sourceClassDF

Unnamed: 0_level_0,code,label,description,footnote,source,sourcelink1,sourcelink2,Unnamed: 8,Unnamed: 9,Unnamed: 10,PCMO,SML0,classification scheme3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
24.0,NOPC,Private cars,A private car is a car mainly with the purpose...,,STATBANK,https://www.ssb.no/a/metadata/conceptvariable/...,,,,,p,p,
25.0,NOAMB,Ambulances,Motor vehicle specialised for transport of sic...,,STATBANK,https://www.ssb.no/a/metadata/conceptvariable/...,,,,,c,m,
26.0,NOBUS,Buses,A bus is a vehicle for passenger transport wit...,,STATBANK,https://www.ssb.no/a/metadata/conceptvariable/...,,,,,c,m,
,NOVAN,Vans,A van is a car for transport of goods with max...,,STATBANK,https://www.ssb.no/a/metadata/conceptvariable/...,,,,,c,m,
27.0,NOLO,Lorries,A lorry is a vehicle for transport of goods,,STATBANK,https://www.ssb.no/a/metadata/conceptvariable/...,,,,,c,l,
28.0,NOCV,Combines vehicles,Vehicle mainly with the purpose of transportin...,,STATBANK,https://www.ssb.no/a/metadata/conceptvariable/...,,,,,c,m,
29.0,NOTR,Tractors,A tractor is a motor vehicle mostly used for a...,,STATBANK,https://www.ssb.no/a/metadata/conceptvariable/...,,,,,o,o,
30.0,NOMP,Mopeds,"Includes all kind of mopeds, both two wheel-, ...",,STATBANK,https://www.ssb.no/a/metadata/conceptvariable/...,,,,,m,s,
31.0,NOLMC,Light motor cycles,A light motor cycle is a motorcycle with cylin...,,STATBANK,https://www.ssb.no/a/metadata/conceptvariable/...,,,,,m,s,
32.0,NOHMC,Heavy motor cycles,Motor cycle with a cylinder capacity above 125...,,STATBANK,https://www.ssb.no/a/metadata/conceptvariable/...,,,,,m,s,


In [19]:
sourceLabel= sourceClassDF.label.to_list()

In [20]:
sourceCode = sourceClassDF.code.to_list()


df['class'].replace(to_replace=sourceLabel, value=sourceCode, inplace=True )

#find codes which did not match 
df.loc[~df['class'].isin(sourceCode)]

Unnamed: 0,class,1950,1951,1952,1953,1954,1955,1956,1957,1958,...,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007
0,Passenger cars,65028,69509,78643,90771,108034,122143,134111,153391,172630,...,1786404,1813642,1851929,1872862,1899767,1933660,1977922,2028909,2084193,2154837
4,Combined vehicles,1960,1892,1857,1932,1950,1920,1936,2009,1990,...,107221,107349,104868,100624,95453,90600,85149,79705,73904,67020
5,Tractors etc,2140,3101,4291,5706,7700,8756,8722,8646,8582,...,215379,218287,220135,222805,225905,229688,229265,231630,234243,237484
6,Special purpose vehicles,..,..,..,..,..,..,..,..,..,...,8705,8949,9069,9116,9086,9038,8547,8361,8255,8208


## rearrange data into proper format

In [21]:
df.head()

Unnamed: 0,class,1950,1951,1952,1953,1954,1955,1956,1957,1958,...,1998,1999,2000,2001,2002,2003,2004,2005,2006,2007
0,Passenger cars,65028,69509,78643,90771,108034,122143,134111,153391,172630,...,1786404,1813642,1851929,1872862,1899767,1933660,1977922,2028909,2084193,2154837
1,NOBUS,3797,3998,4207,4269,4499,4633,4715,4860,4911,...,36218,37039,36686,35667,34110,32374,30592,28783,26954,25204
2,NOVAN,15808,19519,25105,28917,30671,32597,35515,40671,46135,...,208955,219731,233248,249939,258133,268326,284029,302956,331052,361911
3,NOLO,31055,35880,39031,41219,44580,46956,47951,47938,48213,...,74653,75959,76224,76414,77442,79356,80623,82778,83609,84742
4,Combined vehicles,1960,1892,1857,1932,1950,1920,1936,2009,1990,...,107221,107349,104868,100624,95453,90600,85149,79705,73904,67020


In [24]:
#.melt() pivots the table bringing the column headers into a new attribute
melted = df.melt(id_vars=['class'], var_name= 'year')
melted

Unnamed: 0,class,year,value
0,Passenger cars,1950,65028
1,NOBUS,1950,3797
2,NOVAN,1950,15808
3,NOLO,1950,31055
4,Combined vehicles,1950,1960
...,...,...,...
575,Tractors etc,2007,237484
576,Special purpose vehicles,2007,8208
577,NOMP,2007,156287
578,NOLMC,2007,16589


## structuring the data into format of datastructure


In [25]:
#rename the data frame to df
df=melted
# add in a column of indexes
df.index = notebook_id + df.index.astype(str).str.zfill(9)

In [26]:
#rename or add all necessary columns
df.loc[:,'year_of_measurement']= df.loc[:,'year']
# date of measurement is concatonated to year of measurement to achieve desired format
df.loc[:,'date_of_measurement']=df['year_of_measurement'].astype(str) + '-12-31'
df.head(2)

Unnamed: 0,class,year,value,year_of_measurement,date_of_measurement
9000000000,Passenger cars,1950,65028,1950,1950-12-31
9000000001,NOBUS,1950,3797,1950,1950-12-31


In [35]:
df.loc[:,'geo']= 'Norway'
df.loc[:,'process']= 'r'
df.loc[:,'vehicle_class']= df['class']
df.loc[:,'vehicle_segment']='all'
df.loc[:,'motor_energy']= 'all'
df.loc[:,'model_year']= 'all'
df.loc[:,'year_of_first_registraion']=''
df.loc[:,'value']= df['value']
df.loc[:,'unit']= 'nr'
df.loc[:,'source']= nb_data_source
df.loc[:,'accessed']= '2020-08-04'
df.loc[:,'notebook']= nb_name
df.loc[:,'footnote']= ''

In [36]:
# create a finalized dataframe for output with columns in arranged order
heading_list= ['year_of_measurement','date_of_measurement','geo','process','vehicle_class','vehicle_segment','motor_energy','model_year','year_of_first_registraion','value','unit','source','accessed','notebook','footnote']
df_out= df[heading_list]

In [37]:
#check df_out before saving
df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
009000000000,1950,1950-12-31,Norway,r,Passenger cars,all,all,all,,65028,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,
009000000001,1950,1950-12-31,Norway,r,NOBUS,all,all,all,,3797,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,
009000000002,1950,1950-12-31,Norway,r,NOVAN,all,all,all,,15808,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,
009000000003,1950,1950-12-31,Norway,r,NOLO,all,all,all,,31055,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,
009000000004,1950,1950-12-31,Norway,r,Combined vehicles,all,all,all,,1960,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
009000000575,2007,2007-12-31,Norway,r,Tractors etc,all,all,all,,237484,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,
009000000576,2007,2007-12-31,Norway,r,Special purpose vehicles,all,all,all,,8208,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,
009000000577,2007,2007-12-31,Norway,r,NOMP,all,all,all,,156287,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,
009000000578,2007,2007-12-31,Norway,r,NOLMC,all,all,all,,16589,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,


## at this point restart kernel and run all cells

In [38]:
# write an output file named according to notebook id with relevant data and metadata
writer = pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter')
df_out.to_excel(writer, sheet_name='data', merge_cells=False)
notebook_metadata.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [39]:
# add the data from the notebook to the stock dataframe and notebook metadata pickles

import pickle
# load in the stock DataFrame pickle
stock_pickle = pd.read_pickle('stock_df.pickle')

In [40]:
# concatenate the out_df to the stock_pickle and remove any duplicate rows
stock_df = pd.concat([df_out,stock_pickle]).drop_duplicates(subset=['geo','notebook','source','value','year_of_measurement'])

# write the updated stock dataframe to pickle
stock_df.to_pickle('stock_df.pickle')

# repeat the process for the metadata
metadata_pickle = pd.read_pickle('metadata_df.pickle')

metadata_df = pd.concat([notebook_metadata,metadata_pickle]).drop_duplicates(subset= 'notebook_name' )

metadata_df.to_pickle('metadata_df.pickle')

In [41]:
# update the stock metadata 
writer = pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter')
stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
metadata_df.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [42]:
stock_df.notebook.unique()

array(['009_s_NO_1950_2007_cl_statbank.ipynb',
       '007_s_NO_2008_2015_cl_me_statbank.ipynb',
       '003_s_GL_2005_2015_cl(cv)_oica.ipynb',
       '006_s_IN_2001_2016_cl_inmorth.ipynb',
       '008_s_NO_2016_2019_cl_me_statbank.ipynb',
       '005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb',
       'oica_st_2005_2015_iu',
       '004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb'], dtype=object)