# Information regarding the data that this notebook refers to

# fill out the following fields. Check that input data and notebook name match.

In [1]:
# Identity of this notebook: numeric id plus the descriptive suffix shared with the input file.
notebook_id = '008'
name_string = '_s_NO_2016_2019_cl_me_statbank'

# File names derived from the id and suffix above.
nb_name = f'{notebook_id}{name_string}.ipynb'
nb_input_workbook = f'in{name_string}.csv'
nb_output_workbook = f'{notebook_id}.xlsx'

# Descriptive metadata about the data set handled by this notebook.
nb_stock_or_flow = 'stock'
nb_geography = 'Norway'
nb_start_time = '2016'
nb_stop_time = '2019'
nb_attribute_1 = 'class'
nb_attribute_2 = 'registered'
nb_attribute_3 = 'motor energy'
nb_attribute_4 = ''

# Provenance of the data.
nb_data_source = 'STATBANK'
nb_data_source_url = 'https://www.ssb.no/en/statbank/table/11823/'
nb_comment = 'hybrids in separate categories'

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import pickle

In [3]:
# Collect the notebook metadata in a one-row data frame indexed by the notebook id.
# The dict keys become the column names, in the order listed here.
metadata_fields = {
    'notebook_name': nb_name,
    'input_file': nb_input_workbook,
    'output_file': nb_output_workbook,
    'source': nb_data_source,
    'geography': nb_geography,
    'start_time': nb_start_time,
    'stop_time': nb_stop_time,
    'attribute1': nb_attribute_1,
    'attribute2': nb_attribute_2,
    'attribute3': nb_attribute_3,
    'attribute4': nb_attribute_4,
    'source_url': nb_data_source_url,
    'comment': nb_comment,
}
notebook_metadata = pd.DataFrame(metadata_fields, index=[notebook_id])

## Reading in the CSV data into one dataframe and renaming the columns

In [4]:
# Load the semicolon-separated input file; the first two lines of the file are skipped.
df = pd.read_csv(nb_input_workbook, sep=';', skiprows=2)

In [5]:
# Map the source column headers onto the column names used in the rest of the notebook.
source_column_names = {
    'type of fuel': 'motor_energy',
    'contents': 'class',
    '11823: Registered vehicles,': 'value',
}
df = df.rename(columns=source_column_names)

In [6]:
# preview the first two rows to check the renamed columns
df.head(2)


Unnamed: 0,class,motor_energy,year,value
0,Private cars,Petrol,2016,1196148
1,Private cars,Petrol,2017,1139998


In [7]:
# MOTOR ENERGY code replacement
# Read the motor energy dictionary sheet; the context manager closes the workbook
# handle once the sheet has been loaded.
with pd.ExcelFile('metadata_vehicle_fleet.xlsx') as metadataXls:
    motorEnergyDictionary = pd.read_excel(metadataXls, sheet_name='motor_energy_dictionary', index_col=0)

# keep only the dictionary rows that belong to this notebook's data source
sourceMEDict = motorEnergyDictionary.loc[motorEnergyDictionary['source'] == nb_data_source]

# parallel lists: the i-th source label is replaced by the i-th output code
motorEnergySource = sourceMEDict['source_code'].to_list()
motorEnergyCode = sourceMEDict['output_code'].to_list()

# Assign the result back to the column. Calling replace(..., inplace=True) on
# df['motor_energy'] is chained assignment: it warns in recent pandas and has no
# effect on df when Copy-on-Write is enabled.
df['motor_energy'] = df['motor_energy'].replace(to_replace=motorEnergySource, value=motorEnergyCode)

# find codes which did not match (an empty result means every row was mapped)
df.loc[~df['motor_energy'].isin(motorEnergyCode)]

Unnamed: 0,class,motor_energy,year,value


 #### replace class titles with codes
  

In [8]:
df = df.rename(columns={'class': 'vehicle_class'})

# Read the class_metadata sheet, which has equivalent terms for class types according
# to source; the context manager closes the workbook handle after loading.
with pd.ExcelFile('metadata_vehicle_fleet.xlsx') as metaxls:
    classMetadata = pd.read_excel(metaxls, sheet_name='class_metadata', index_col=0)

# limit the data to the rows specific to the relevant source
sourceClassDict = classMetadata[classMetadata.source == nb_data_source]

# parallel lists: the i-th source label is replaced by the i-th class code
sourceClassLabel = sourceClassDict.source_label.to_list()
sourceClassCode = sourceClassDict.code.to_list()

# Assign the result back to the column. Calling replace(..., inplace=True) on
# df['vehicle_class'] is chained assignment: it warns in recent pandas and has no
# effect on df when Copy-on-Write is enabled.
# The trailing space in 'Grand Total ' is part of the label being matched.
df['vehicle_class'] = (
    df['vehicle_class']
    .replace(to_replace=sourceClassLabel, value=sourceClassCode)
    .replace(to_replace='Grand Total ', value='all')
)

# find codes which did not match (an empty result means every row was mapped)
# NOTE(review): this checks against the codes of every source in classMetadata,
# not only this source's codes - confirm that is intended.
df.loc[~df['vehicle_class'].isin(classMetadata.code)]

Unnamed: 0,vehicle_class,motor_energy,year,value


## rearrange data into proper format

In [9]:
# preview the first row to check the class and motor energy codes
df.head(1)

Unnamed: 0,vehicle_class,motor_energy,year,value
0,NOPC,ICE,2016,1196148


## structuring the data into format of datastructure


In [10]:
# Build the row ids: the notebook id followed by the current index label,
# left-padded with zeros to nine characters.
padded_row_numbers = df.index.astype(str).str.zfill(9)
df.index = notebook_id + padded_row_numbers

In [11]:
# add the measurement time columns
df['year_of_measurement'] = df['year']
# the date of measurement is the year of measurement with '-12-31' appended
year_as_text = df['year_of_measurement'].astype(str)
df['date_of_measurement'] = year_as_text + '-12-31'
df.head(2)

Unnamed: 0,vehicle_class,motor_energy,year,value,year_of_measurement,date_of_measurement
8000000000,NOPC,ICE,2016,1196148,2016,2016-12-31
8000000001,NOPC,ICE,2017,1139998,2017,2017-12-31


In [12]:
# Add the descriptor columns that hold the same value for every row of this notebook.
# vehicle_class, motor_energy and value already exist in df and are left untouched.
df.loc[:, 'geo'] = nb_geography  # taken from the notebook header so data and metadata cannot drift apart
df.loc[:, 'process'] = 'r'  # presumably 'r' = registered - TODO confirm against the process code list
df.loc[:, 'vehicle_segment'] = 'all'
df.loc[:, 'model_year'] = 'all'
# NOTE: the misspelling 'registraion' is kept on purpose; heading_list and the stock
# table printed at the end of the notebook use the same column name.
df.loc[:, 'year_of_first_registraion'] = ''
df.loc[:, 'unit'] = 'nr'
df.loc[:, 'source'] = nb_data_source
df.loc[:, 'accessed'] = '2020-08-04'
df.loc[:, 'notebook'] = nb_name
df.loc[:, 'footnote'] = 'hybrids in own motor energy category'

In [13]:
# Output column order; df_out keeps exactly these columns in this order.
heading_list = [
    'year_of_measurement',
    'date_of_measurement',
    'geo',
    'process',
    'vehicle_class',
    'vehicle_segment',
    'motor_energy',
    'model_year',
    'year_of_first_registraion',
    'value',
    'unit',
    'source',
    'accessed',
    'notebook',
    'footnote',
]
df_out = df.loc[:, heading_list]

In [14]:
# check df_out before saving: inspect the column order, the row ids and the constant columns
df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
008000000000,2016,2016-12-31,Norway,r,NOPC,all,ICE,all,,1196148,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category
008000000001,2017,2017-12-31,Norway,r,NOPC,all,ICE,all,,1139998,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category
008000000002,2018,2018-12-31,Norway,r,NOPC,all,ICE,all,,1075179,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category
008000000003,2019,2019-12-31,Norway,r,NOPC,all,ICE,all,,1031207,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category
008000000004,2016,2016-12-31,Norway,r,NOPC,all,ICE,all,,1276947,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
008000000391,2019,2019-12-31,Norway,r,NOHMC,all,HEV,all,,0,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category
008000000392,2016,2016-12-31,Norway,r,NOHMC,all,OTH,all,,6,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category
008000000393,2017,2017-12-31,Norway,r,NOHMC,all,OTH,all,,5,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category
008000000394,2018,2018-12-31,Norway,r,NOHMC,all,OTH,all,,5,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category


## at this point restart kernel and run all cells

In [15]:
# Write an output file named according to the notebook id, with the data on one
# sheet and the notebook metadata on another.
# The context manager saves and closes the workbook on exit; ExcelWriter.save()
# no longer exists in pandas 2.x.
with pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter') as writer:
    df_out.to_excel(writer, sheet_name='data', merge_cells=False)
    notebook_metadata.to_excel(writer, sheet_name='notebook_metadata')

In [16]:
# add the data from the notebook to the stock dataframe and notebook metadata pickles

# load in the stock DataFrame pickle
# NOTE(review): unpickling can execute arbitrary code, so 'stock_df.pickle' must
# only ever be a file written by this project's own notebooks.
stock_pickle = pd.read_pickle('stock_df.pickle')

In [17]:
# Concatenate df_out to the stock pickle and remove duplicate rows, so that
# re-running this notebook does not add its rows a second time.
# A row is identified by all of its descriptive dimensions. Keying only on
# geo/notebook/source/value/year would collapse distinct vehicle class and motor
# energy combinations that share a count in the same year (for example several
# categories with value 0), silently dropping data.
duplicate_key = [
    'geo', 'notebook', 'source', 'process',
    'vehicle_class', 'vehicle_segment', 'motor_energy',
    'model_year', 'year_of_first_registraion',
    'year_of_measurement', 'value',
]
stock_df = pd.concat([df_out, stock_pickle]).drop_duplicates(subset=duplicate_key)

# write the updated stock dataframe to pickle
stock_df.to_pickle('stock_df.pickle')

# repeat the process for the metadata; here a duplicate is a row equal in every column
metadata_pickle = pd.read_pickle('metadata_df.pickle')

metadata_df = pd.concat([metadata_pickle, notebook_metadata]).drop_duplicates()

metadata_df.to_pickle('metadata_df.pickle')

In [18]:
# Update the combined stock workbook with the merged data and the merged notebook metadata.
# The context manager saves and closes the workbook on exit; ExcelWriter.save()
# no longer exists in pandas 2.x.
with pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter') as writer:
    stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
    metadata_df.to_excel(writer, sheet_name='notebook_metadata')

In [19]:
# display the combined stock table: this notebook's rows followed by the rows loaded from the pickle
stock_df

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
008000000000,2016,2016-12-31,Norway,r,NOPC,all,ICE,all,,1196148,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category
008000000001,2017,2017-12-31,Norway,r,NOPC,all,ICE,all,,1139998,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category
008000000002,2018,2018-12-31,Norway,r,NOPC,all,ICE,all,,1075179,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category
008000000003,2019,2019-12-31,Norway,r,NOPC,all,ICE,all,,1031207,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category
008000000004,2016,2016-12-31,Norway,r,NOPC,all,ICE,all,,1276947,nr,STATBANK,2020-08-04,008_s_NO_2016_2019_cl_me_statbank.ipynb,hybrids in own motor energy category
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
004000003800,2018,,Sweden,r,EUM1,all,ICE,all,,212504,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
004000003801,2018,,United Kingdom of Great Britain and Northern I...,r,EUM1,all,ICE,all,,343,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,"definition differs, see eurostat metadata"
004000003803,2018,,Norway,r,EUM1,all,ICE,all,,1008,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
004000003804,2018,,Switzerland,r,EUM1,all,ICE,all,,3700,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
