# Information regarding the data that this notebook refers to

# fill out the following fields. Check that input data and notebook name match.

In [12]:
notebook_id= '011'
name_string= '_s_USA_1960_2018_cl_usdot'

nb_name= notebook_id + name_string +'.ipynb'
nb_input_workbook= 'in' + name_string + '.xlsx'
nb_output_workbook= notebook_id + '.xlsx'
nb_stock_or_flow= 'stock'
nb_geography= 'USA'
nb_start_time= '1960'
nb_stop_time= '2018'
nb_attribute_1= 'class'
nb_attribute_2= 'registered'
nb_attribute_3= ''
nb_attribute_4= ''
nb_data_source= 'USDOT'
nb_data_source_url= 'https://www.bts.gov/content/number-us-aircraft-vehicles-vessels-and-other-conveyances'
nb_comment= 'multiple footnotes, see source metadata sheet'

In [13]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import pickle

In [14]:
# write the notbook metadata into a data frame
notebook_metadata = pd.DataFrame({'notebook_name':nb_name,
                                 'input_file': nb_input_workbook, 
                                 'output_file': nb_output_workbook, 
                                 'source': nb_data_source, 
                                 'geography': nb_geography,
                                 'start_time': nb_start_time,
                                 'stop_time': nb_stop_time, 
                                 'attribute1': nb_attribute_1, 
                                 'attribute2': nb_attribute_2,
                                 'attribute3': nb_attribute_3,
                                 'attribute4': nb_attribute_4,
                                 'source_url' : nb_data_source_url,
                                 'comment': nb_comment}, index=[notebook_id])

## Reading in the excel data and merging the sheets into one dataframe with category info attached to row data

In [33]:
# read in data,
xls = pd.ExcelFile(nb_input_workbook)

In [34]:
df=pd.read_excel(xls, sheet_name= 'data')

In [48]:
df

Unnamed: 0,class,footnote,1960,1965,1970,1975,1980,1985,1990,1991,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,"Highway, total (registered vehicles)",,73857768,90357667,111242295,137912779,161490159,177133282,193057376,192313834,...,254212611,250070048,253215681,253639386,255876822,260350938,263610219,268799083,272480899,273602100
1,USLDS,"c,d",61671390,75257588,92067655,111670004,127294783,133329597,137959958,132476966,...,193979654,190202782,183522635,183171882,184497490,187554928,189618308,192774508,193672370,192856211
2,Motorcycle,d,U,U,U,U,U,U,U,U,...,7929724,8009503,8437502,8454939,8404687,8417718,8600936,8679380,8715204,8666185
3,USLDL,"c,d",U,U,14210591,20418250,27875934,37213863,48274555,53033443,...,40488025,40241658,50318787,50588676,51512740,52600309,53298884,54870473,56880878,57853642
4,"Truck, single-unit 2-axle 6-tire or more","e,f",U,13999285,3681405,4231622,4373784,4593071,4486981,4480815,...,8356097,8217189,7819055,8190286,8126007,8328759,8456302,8746518,9336998,10327899
5,"Truck, combination","e,f",11914249,786510,905082,1130747,1416869,1403266,1708895,1691331,...,2617118,2552865,2451638,2469094,2471349,2577197,2746882,2752043,2892218,2906011
6,Bus,,272129,314284,377562,462156,528789,593485,626987,631279,...,841993,846051,666064,764509,864549,872027,888907,976161,983231,992152


 #### replace class titles with codes
  

In [55]:
# read in the class_metatdata sheet which has equivalent terms for class types according to source
metaxls = pd.ExcelFile('metadata_vehicle_fleet.xlsx')
classMetadata = pd.read_excel(metaxls, sheet_name= 'class_metadata', index_col= 0)

# limit the data to the rows specific to relevent source
sourceClassDict= classMetadata[classMetadata.source == nb_data_source] 

sourceClassLabel= sourceClassDict.label.to_list()

sourceClassCode = sourceClassDict.code.to_list()


df['class'].replace(to_replace=sourceClassLabel, value=sourceClassCode, inplace=True )

#find codes which did not match 
df.loc[~df['class'].isin(sourceClassCode)]

## rearrange data into proper format

In [61]:
df.head()

Unnamed: 0,class,footnote,1960,1965,1970,1975,1980,1985,1990,1991,...,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018
0,all,,73857768,90357667,111242295,137912779,161490159,177133282,193057376,192313834,...,254212611,250070048,253215681,253639386,255876822,260350938,263610219,268799083,272480899,273602100
1,USLDS,"c,d",61671390,75257588,92067655,111670004,127294783,133329597,137959958,132476966,...,193979654,190202782,183522635,183171882,184497490,187554928,189618308,192774508,193672370,192856211
2,USMCY,d,U,U,U,U,U,U,U,U,...,7929724,8009503,8437502,8454939,8404687,8417718,8600936,8679380,8715204,8666185
3,USLDL,"c,d",U,U,14210591,20418250,27875934,37213863,48274555,53033443,...,40488025,40241658,50318787,50588676,51512740,52600309,53298884,54870473,56880878,57853642
4,USTSU,"e,f",U,13999285,3681405,4231622,4373784,4593071,4486981,4480815,...,8356097,8217189,7819055,8190286,8126007,8328759,8456302,8746518,9336998,10327899


In [69]:
#.melt() pivots the table bringing the column headers into a new attribute
melted = df.melt(id_vars=['class','footnote'], var_name= 'year')
melted

Unnamed: 0,class,footnote,year,value
0,all,,1960,73857768
1,USLDS,"c,d",1960,61671390
2,USMCY,d,1960,U
3,USLDL,"c,d",1960,U
4,USTSU,"e,f",1960,U
...,...,...,...,...
240,USMCY,d,2018,8666185
241,USLDL,"c,d",2018,57853642
242,USTSU,"e,f",2018,10327899
243,USTCO,"e,f",2018,2906011


## There is an opportunity to do a lambda function here to associate the proper footnotes with the appropriate years
##

## structuring the data into format of datastructure


In [70]:
df = melted
df.head(2)

Unnamed: 0,class,footnote,year,value
0,all,,1960,73857768
1,USLDS,"c,d",1960,61671390


In [71]:
# add in a column of indexes
df.index = notebook_id + df.index.astype(str).str.zfill(9)

In [72]:
#rename or add all necessary columns
df.loc[:,'year_of_measurement']= df.loc[:,'year']
# date of measurement is concatonated to year of measurement to achieve desired format
df.loc[:,'date_of_measurement']=''#df['year_of_measurement'].astype(str) + '-12-31'
df.head(2)

Unnamed: 0,class,footnote,year,value,year_of_measurement,date_of_measurement
11000000000,all,,1960,73857768,1960,
11000000001,USLDS,"c,d",1960,61671390,1960,


In [73]:
df.loc[:,'geo']= 'United States of America'
df.loc[:,'process']= 'r'
df.loc[:,'vehicle_class']= df['class']
df.loc[:,'vehicle_segment']= ''#df['segment']
df.loc[:,'motor_energy']= 'all'
df.loc[:,'model_year']= ''
df.loc[:,'year_of_first_registraion']=''
# df.loc[:,'value']= df['value']
df.loc[:,'unit']= 'nr'
df.loc[:,'source']= nb_data_source
df.loc[:,'accessed']= '2020-06-29'
df.loc[:,'notebook']= nb_name
# df.loc[:,'footnote']= ''

In [74]:
# create a finalized dataframe for output with columns in arranged order
heading_list= ['year_of_measurement','date_of_measurement','geo','process','vehicle_class','vehicle_segment','motor_energy','model_year','year_of_first_registraion','value','unit','source','accessed','notebook','footnote']
df_out= df[heading_list]

In [75]:
#check df_out before saving
df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
011000000000,1960,,United States of America,r,all,,all,,,73857768,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,
011000000001,1960,,United States of America,r,USLDS,,all,,,61671390,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,"c,d"
011000000002,1960,,United States of America,r,USMCY,,all,,,U,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,d
011000000003,1960,,United States of America,r,USLDL,,all,,,U,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,"c,d"
011000000004,1960,,United States of America,r,USTSU,,all,,,U,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,"e,f"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
011000000240,2018,,United States of America,r,USMCY,,all,,,8666185,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,d
011000000241,2018,,United States of America,r,USLDL,,all,,,57853642,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,"c,d"
011000000242,2018,,United States of America,r,USTSU,,all,,,10327899,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,"e,f"
011000000243,2018,,United States of America,r,USTCO,,all,,,2906011,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,"e,f"


## at this point restart kernel and run all cells

In [76]:
# write an output file named according to notebook id with relevant data and metadata
writer = pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter')
df_out.to_excel(writer, sheet_name='data', merge_cells=False)
notebook_metadata.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [77]:
# add the data from the notebook to the stock dataframe and notebook metadata pickles

import pickle
# load in the stock DataFrame pickle
stock_pickle = pd.read_pickle('stock_df.pickle')

In [78]:
# concatenate the out_df to the stock_pickle and remove any duplicate rows
stock_df = pd.concat([df_out,stock_pickle]).drop_duplicates(subset=['geo','notebook','source','value','year_of_measurement'])

# write the updated stock dataframe to pickle
stock_df.to_pickle('stock_df.pickle')

# repeat the process for the metadata
metadata_pickle = pd.read_pickle('metadata_df.pickle')

metadata_df = pd.concat([metadata_pickle, notebook_metadata]).drop_duplicates()

metadata_df.to_pickle('metadata_df.pickle')

In [79]:
# update the stock metadata 
writer = pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter')
stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
metadata_df.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [80]:
stock_df

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
011000000000,1960,,United States of America,r,all,,all,,,73857768,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,
011000000001,1960,,United States of America,r,USLDS,,all,,,61671390,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,"c,d"
011000000002,1960,,United States of America,r,USMCY,,all,,,U,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,d
011000000005,1960,,United States of America,r,USTCO,,all,,,11914249,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,"e,f"
011000000006,1960,,United States of America,r,USBUS,,all,,,272129,nr,USDOT,2020-06-29,011_s_USA_1960_2018_cl_usdot.ipynb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
004000003800,2018,,Sweden,r,EUM1,all,ICE,all,,212504,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
004000003801,2018,,United Kingdom of Great Britain and Northern I...,r,EUM1,all,ICE,all,,343,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,"definition differs, see eurostat metadata"
004000003803,2018,,Norway,r,EUM1,all,ICE,all,,1008,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
004000003804,2018,,Switzerland,r,EUM1,all,ICE,all,,3700,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
