# Information regarding the data that this notebook refers to

# fill out the following fields. Check that input data and notebook name match.

In [1]:
# Identity of this notebook: a numeric id plus the descriptive suffix shared with the input workbook.
notebook_id = '012'
name_string = '_s_Japan_1970_2019_cl_yofr_JPAMA'

# File names derived from the two identifiers above.
nb_name = f'{notebook_id}{name_string}.ipynb'
nb_input_workbook = f'in{name_string}.xlsx'
nb_output_workbook = f'{notebook_id}.xlsx'

# Coverage of the data set.
nb_stock_or_flow = 'stock'
nb_geography = 'Japan'
nb_start_time = '1970'
nb_stop_time = '2019'

# Attributes by which the data is broken down (unused slots stay empty).
nb_attribute_1 = 'class'
nb_attribute_2 = 'registered'
nb_attribute_3 = 'year of first registration'
nb_attribute_4 = ''

# Provenance of the data.
nb_data_source = 'JPAMA'
nb_data_source_url = 'http://www.jama-english.jp/publications/The_Motor_Industry_of_Japan_2020.pdf'
nb_comment = ''

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import pickle

In [3]:
# Gather the notebook metadata into a one-row data frame indexed by the notebook id.
notebook_metadata = pd.DataFrame(
    data=dict(
        notebook_name=nb_name,
        input_file=nb_input_workbook,
        output_file=nb_output_workbook,
        source=nb_data_source,
        geography=nb_geography,
        start_time=nb_start_time,
        stop_time=nb_stop_time,
        attribute1=nb_attribute_1,
        attribute2=nb_attribute_2,
        attribute3=nb_attribute_3,
        attribute4=nb_attribute_4,
        source_url=nb_data_source_url,
        comment=nb_comment,
    ),
    index=[notebook_id],
)

## Reading in the Excel data and reshaping it into one DataFrame with the category information attached to each row

In [4]:
# Open the input workbook; the handle `xls` is reused by the cells that read its individual sheets.
xls = pd.ExcelFile(nb_input_workbook)

In [5]:
# Read the 'data' sheet of the input workbook into the working data frame.
df=pd.read_excel(xls, sheet_name= 'data')

#### Replace class titles with codes
  

In [6]:
# Read the class_metadata sheet, which lists the equivalent class terms for each source.
# The context manager closes the workbook handle as soon as the sheet has been read.
with pd.ExcelFile('metadata_vehicle_fleet.xlsx') as metaxls:
    classMetadata = pd.read_excel(metaxls, sheet_name='class_metadata', index_col=0)

# Limit the metadata to the rows that belong to this notebook's data source.
sourceClassDict = classMetadata[classMetadata.source == nb_data_source]
sourceClassLabel = sourceClassDict.source_label.to_list()
sourceClassCode = sourceClassDict.code.to_list()

# Assign the result back to the column. Calling replace(..., inplace=True) on df['class']
# operates on a temporary Series and leaves df unchanged under pandas Copy-on-Write.
df['class'] = df['class'].replace(to_replace=sourceClassLabel, value=sourceClassCode)

# Show the rows whose class is not one of the codes in the metadata sheet.
df.loc[~df['class'].isin(classMetadata.code)]

Unnamed: 0,class,segment,1970,1975,1980,1985,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019


In [7]:
# Read the segment_metadata sheet, which lists the equivalent segment terms for each source.
# The context manager closes the workbook handle as soon as the sheet has been read.
with pd.ExcelFile('metadata_vehicle_fleet.xlsx') as metaxls:
    segmentMetadata = pd.read_excel(metaxls, sheet_name='segment_metadata', index_col=0)

# Limit the metadata to the rows that belong to this notebook's data source.
sourceSegmentDict = segmentMetadata[segmentMetadata.source == nb_data_source]
sourceSegmentLabel = sourceSegmentDict.source_label.to_list()
sourceSegmentCode = sourceSegmentDict.code.to_list()

# Assign the result back to the column. Calling replace(..., inplace=True) on df['segment']
# operates on a temporary Series and leaves df unchanged under pandas Copy-on-Write.
df['segment'] = df['segment'].replace(to_replace=sourceSegmentLabel, value=sourceSegmentCode)

# Show the rows whose segment is not one of this source's segment codes.
df.loc[~df['segment'].isin(sourceSegmentCode)]

Unnamed: 0,class,segment,1970,1975,1980,1985,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
3,JPPASS,all,8778972,17236321,23659520,27844580,34924172,44680037,52437375,57090789,58347387,58670314,59421009,60035297,60667517,60987342,61403630,61803118,62025916,62140475
7,JPTRUCK,all,8281759,10043853,13177479,17139806,21321439,20430149,18225508,16733871,15284625,14970422,14835120,14703886,14624986,14503218,14411953,14321167,14296113,14297185
10,JPBUS,all,187980,226284,230020,231228,245668,243095,235483,231733,227271,225948,226079,225927,226944,229389,232321,233466,233223,232166
11,JPSP,all,333132,584100,789155,941647,1206390,1500219,1750733,1630062,1502593,1646203,1643325,1653956,1669019,1684382,1702616,1720118,1734185,1746765
12,all,all,17581843,28090558,37856174,46157261,57697669,66853500,72649099,75686455,75361876,75512887,76125533,76619066,77188466,77404331,77750520,78077869,78289437,78416591


## rearrange data into proper format

In [8]:
# Preview the first rows of the wide table (one column per year) before it is reshaped.
df.head()

Unnamed: 0,class,segment,1970,1975,1980,1985,1990,1995,2000,2005,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,JPPASS,>2000_cc,77374,207511,472314,711914,1784594,7874189,13942626,16634529,16890402,17039684,17294021,17509103,17714352,17935861,18357734,18799713,19198666,19603788
1,JPPASS,660-2000_cc,6457181,14417680,21011096,25116179,30554652,31030462,28593491,26254546,23470003,23143892,22868749,22435835,21974741,21547282,21195621,20842558,20383197,19858361
2,JPPASS,<660_cc,2244417,2611130,2176110,2016487,2584926,5775386,9901258,14201714,17986982,18486738,19258239,20090359,20978424,21504199,21850275,22160847,22444053,22678326
3,JPPASS,all,8778972,17236321,23659520,27844580,34924172,44680037,52437375,57090789,58347387,58670314,59421009,60035297,60667517,60987342,61403630,61803118,62025916,62140475
4,JPTRUCK,>2000_cc,798256,1158465,1494464,1668852,2176488,2574433,2596421,2474378,2281711,2266420,2266836,2270812,2294449,2316208,2337230,2356279,2382877,2413551


In [9]:
# Reshape from wide to long: each year column header becomes a value of the new
# 'year' column, while 'class' and 'segment' stay as identifier columns.
melted = pd.melt(df, id_vars=['class', 'segment'], var_name='year')
melted

Unnamed: 0,class,segment,year,value
0,JPPASS,>2000_cc,1970,77374
1,JPPASS,660-2000_cc,1970,6457181
2,JPPASS,<660_cc,1970,2244417
3,JPPASS,all,1970,8778972
4,JPTRUCK,>2000_cc,1970,798256
...,...,...,...,...
229,JPBUS,JPlargeBus,2019,112169
230,JPBUS,JPsmallBus,2019,119997
231,JPBUS,all,2019,232166
232,JPSP,all,2019,1746765


## Structuring the data into the format of the target data structure


In [10]:
# Work on a copy so that `melted` stays untouched and re-running this cell
# does not prepend the notebook id to the index a second time.
df = melted.copy()

# Build the record ids: the notebook id followed by the row number zero-padded to 9 digits.
df.index = notebook_id + df.index.astype(str).str.zfill(9)

# Rename or add all columns required by the output structure.
df['year_of_measurement'] = df['year']
# The date of measurement is the year of measurement with '-12-31' appended.
df['date_of_measurement'] = df['year_of_measurement'].astype(str) + '-12-31'
df['geo'] = nb_geography
df['process'] = 'r'
df['vehicle_class'] = df['class']
df['vehicle_segment'] = df['segment']
df['motor_energy'] = 'all'
df['model_year'] = ''
# The column name keeps its existing spelling because it is a key of the output structure.
df['year_of_first_registraion'] = ''
# 'value' is already present: it is produced by the melt step.
df['unit'] = 'nr'
df['source'] = nb_data_source
df['accessed'] = '2020-08-19'
df['notebook'] = nb_name
df['footnote'] = ''

# Create the finalized data frame for output, with the columns in the arranged order.
heading_list = ['year_of_measurement', 'date_of_measurement', 'geo', 'process', 'vehicle_class',
                'vehicle_segment', 'motor_energy', 'model_year', 'year_of_first_registraion',
                'value', 'unit', 'source', 'accessed', 'notebook', 'footnote']
df_out = df[heading_list]

In [11]:
# Inspect df_out before it is combined with further data and saved.
df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
012000000000,1970,1970-12-31,Japan,r,JPPASS,>2000_cc,all,,,77374,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,
012000000001,1970,1970-12-31,Japan,r,JPPASS,660-2000_cc,all,,,6457181,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,
012000000002,1970,1970-12-31,Japan,r,JPPASS,<660_cc,all,,,2244417,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,
012000000003,1970,1970-12-31,Japan,r,JPPASS,all,all,,,8778972,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,
012000000004,1970,1970-12-31,Japan,r,JPTRUCK,>2000_cc,all,,,798256,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
012000000229,2019,2019-12-31,Japan,r,JPBUS,JPlargeBus,all,,,112169,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,
012000000230,2019,2019-12-31,Japan,r,JPBUS,JPsmallBus,all,,,119997,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,
012000000231,2019,2019-12-31,Japan,r,JPBUS,all,all,,,232166,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,
012000000232,2019,2019-12-31,Japan,r,JPSP,all,all,,,1746765,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,


In [12]:
# Read the 'year of first registration' sheet from the same workbook, using its first column as the index.
df2= pd.read_excel(xls, sheet_name= 'year of first registration', index_col=0)

In [13]:
# Preview the first two rows of the year-of-first-registration sheet as read.
df2.head(2)

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
234,2019,2020-12-31,Japan,r,Passenger Cars,all,all,,2019,2836599,nr,JPAMA,19-08-2020,,year of first registration is counted 12 month...
235,2019,2021-01-01,Japan,r,Passenger Cars,all,all,,2018,2839306,nr,JPAMA,19-08-2021,,year of first registration is counted 12 month...


In [14]:
# Read the class_metadata sheet, which lists the equivalent class terms for each source.
# The context manager closes the workbook handle as soon as the sheet has been read.
with pd.ExcelFile('metadata_vehicle_fleet.xlsx') as metaxls:
    classMetadata = pd.read_excel(metaxls, sheet_name='class_metadata', index_col=0)

# Limit the metadata to the rows that belong to this notebook's data source.
sourceClassDict = classMetadata[classMetadata.source == nb_data_source]
sourceClassLabel = sourceClassDict.source_label.to_list()
sourceClassCode = sourceClassDict.code.to_list()

# Assign the result back to the column. Calling replace(..., inplace=True) on df2['vehicle_class']
# operates on a temporary Series and leaves df2 unchanged under pandas Copy-on-Write.
df2['vehicle_class'] = df2['vehicle_class'].replace(to_replace=sourceClassLabel, value=sourceClassCode)

# Show the rows whose vehicle_class is not one of the codes in the metadata sheet.
df2.loc[~df2['vehicle_class'].isin(classMetadata.code)]

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote


In [15]:
# Preview df2 after the class labels were replaced with codes.
df2.head()

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
234,2019,2020-12-31,Japan,r,JPPASS,all,all,,2019,2836599,nr,JPAMA,19-08-2020,,year of first registration is counted 12 month...
235,2019,2021-01-01,Japan,r,JPPASS,all,all,,2018,2839306,nr,JPAMA,19-08-2021,,year of first registration is counted 12 month...
236,2019,2021-01-02,Japan,r,JPPASS,all,all,,2017,2800925,nr,JPAMA,19-08-2022,,year of first registration is counted 12 month...
237,2019,2021-01-03,Japan,r,JPPASS,all,all,,2016,2491284,nr,JPAMA,19-08-2023,,year of first registration is counted 12 month...
238,2019,2021-01-04,Japan,r,JPPASS,all,all,,2015,2459920,nr,JPAMA,19-08-2024,,year of first registration is counted 12 month...


In [16]:
# Work on a copy so that df2 keeps its original index and re-running this cell
# does not prepend the notebook id a second time.
df2_structured = df2.copy()

# Build the record ids: the notebook id followed by the sheet's row id zero-padded to 9 digits.
df2_structured.index = notebook_id + df2_structured.index.astype(str).str.zfill(9)

# Overwrite the bookkeeping columns. The sheet already supplies year_of_measurement,
# vehicle_class, vehicle_segment, model_year, year_of_first_registraion and value.
# The date of measurement is the year of measurement with '-12-31' appended.
df2_structured['date_of_measurement'] = df2_structured['year_of_measurement'].astype(str) + '-12-31'
df2_structured['geo'] = nb_geography
df2_structured['process'] = 'r'
df2_structured['motor_energy'] = 'all'
df2_structured['unit'] = 'nr'
df2_structured['source'] = nb_data_source
df2_structured['accessed'] = '2020-08-19'
df2_structured['notebook'] = nb_name
df2_structured['footnote'] = 'year of first registration is counted 12 months previous to March 31 of the year given'

# Create the finalized data frame for output, with the columns in the arranged order.
heading_list = ['year_of_measurement', 'date_of_measurement', 'geo', 'process', 'vehicle_class',
                'vehicle_segment', 'motor_energy', 'model_year', 'year_of_first_registraion',
                'value', 'unit', 'source', 'accessed', 'notebook', 'footnote']
df2_out = df2_structured[heading_list]
df2_out.head(2)

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
12000000234,2019,2019-12-31,Japan,r,JPPASS,all,all,,2019,2836599,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,year of first registration is counted 12 month...
12000000235,2019,2019-12-31,Japan,r,JPPASS,all,all,,2018,2839306,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,year of first registration is counted 12 month...


In [17]:
# Stack the year-of-first-registration records on top of the class/segment records,
# then discard rows that exactly repeat an earlier one.
df_out = (
    pd.concat([df2_out, df_out])
    .drop_duplicates(keep='first')
)
df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
012000000234,2019,2019-12-31,Japan,r,JPPASS,all,all,,2019,2836599,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,year of first registration is counted 12 month...
012000000235,2019,2019-12-31,Japan,r,JPPASS,all,all,,2018,2839306,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,year of first registration is counted 12 month...
012000000236,2019,2019-12-31,Japan,r,JPPASS,all,all,,2017,2800925,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,year of first registration is counted 12 month...
012000000237,2019,2019-12-31,Japan,r,JPPASS,all,all,,2016,2491284,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,year of first registration is counted 12 month...
012000000238,2019,2019-12-31,Japan,r,JPPASS,all,all,,2015,2459920,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,year of first registration is counted 12 month...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
012000000229,2019,2019-12-31,Japan,r,JPBUS,JPlargeBus,all,,,112169,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,
012000000230,2019,2019-12-31,Japan,r,JPBUS,JPsmallBus,all,,,119997,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,
012000000231,2019,2019-12-31,Japan,r,JPBUS,all,all,,,232166,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,
012000000232,2019,2019-12-31,Japan,r,JPSP,all,all,,,1746765,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,


## At this point, restart the kernel and run all cells

In [18]:
# Write the output workbook, named after the notebook id, with one sheet for the
# data and one for the notebook metadata. The context manager saves and closes the
# file on exit; ExcelWriter.save() is no longer available in pandas 2.x.
with pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter') as writer:
    df_out.to_excel(writer, sheet_name='data', merge_cells=False)
    notebook_metadata.to_excel(writer, sheet_name='notebook_metadata')

In [19]:
# First step of adding this notebook's data to the stock DataFrame and notebook metadata pickles.

import pickle
# Load the accumulated stock DataFrame from its pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
stock_pickle = pd.read_pickle('stock_df.pickle')

In [20]:
# Replace the rows that an earlier run of this notebook contributed, instead of
# de-duplicating on (year_of_measurement, value, year_of_first_registraion) only.
# That three-column key also collapsed unrelated records from other countries or
# classes that happen to share a year and a value (for example placeholder values).
stock_from_other_notebooks = stock_pickle[stock_pickle['notebook'] != nb_name]

# This notebook's rows come first, as before; only rows identical in every column are dropped.
stock_df = pd.concat([df_out, stock_from_other_notebooks]).drop_duplicates()

# Write the updated stock data frame back to its pickle.
stock_df.to_pickle('stock_df.pickle')

# Repeat the process for the metadata: drop any earlier entry stored under this
# notebook id so that an edited metadata row replaces the stale one.
metadata_pickle = pd.read_pickle('metadata_df.pickle')

metadata_df = pd.concat(
    [metadata_pickle.drop(index=notebook_id, errors='ignore'), notebook_metadata]
).drop_duplicates()

metadata_df.to_pickle('metadata_df.pickle')

In [21]:
# Rewrite the combined stock workbook with the updated data and notebook metadata.
# The context manager saves and closes the file on exit; ExcelWriter.save() is no
# longer available in pandas 2.x.
with pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter') as writer:
    stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
    metadata_df.to_excel(writer, sheet_name='notebook_metadata')

In [22]:
# Display the combined stock table after this notebook's rows were merged in.
stock_df

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
012000000234,2019,2019-12-31,Japan,r,JPPASS,all,all,,2019,2836599,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,year of first registration is counted 12 month...
012000000235,2019,2019-12-31,Japan,r,JPPASS,all,all,,2018,2839306,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,year of first registration is counted 12 month...
012000000236,2019,2019-12-31,Japan,r,JPPASS,all,all,,2017,2800925,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,year of first registration is counted 12 month...
012000000237,2019,2019-12-31,Japan,r,JPPASS,all,all,,2016,2491284,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,year of first registration is counted 12 month...
012000000238,2019,2019-12-31,Japan,r,JPPASS,all,all,,2015,2459920,nr,JPAMA,2020-08-19,012_s_Japan_1970_2019_cl_yofr_JPAMA.ipynb,year of first registration is counted 12 month...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
004000003800,2018,,Sweden,r,EUM1,all,ICE,all,,212504,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
004000003801,2018,,United Kingdom of Great Britain and Northern I...,r,EUM1,all,ICE,all,,343,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,"definition differs, see eurostat metadata"
004000003803,2018,,Norway,r,EUM1,all,ICE,all,,1008,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
004000003804,2018,,Switzerland,r,EUM1,all,ICE,all,,3700,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,


In [23]:
# List the rows of the combined stock table whose vehicle_class is not one of the codes in class_metadata.
stock_df[~stock_df.vehicle_class.isin(classMetadata.code)]


Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
009000000000,1950,1950-12-31,Norway,r,Passenger cars,all,all,all,,65028,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,
009000000004,1950,1950-12-31,Norway,r,Combined vehicles,all,all,all,,1960,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,
009000000005,1950,1950-12-31,Norway,r,Tractors etc,all,all,all,,2140,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,
009000000006,1950,1950-12-31,Norway,r,Special purpose vehicles,all,all,all,,..,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,
009000000010,1951,1951-12-31,Norway,r,Passenger cars,all,all,all,,69509,nr,STATBANK,2020-08-04,009_s_NO_1950_2007_cl_statbank.ipynb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
006000000059,2012,,India,r,M,all,all,all,,7.65839e+06,nr,MOSPI,2020-06-30,006_s_IN_2001_2016_cl_inmorth.ipynb,
006000000060,2013,,India,r,M,all,all,all,,8.59676e+06,nr,MOSPI,2020-06-30,006_s_IN_2001_2016_cl_inmorth.ipynb,
006000000061,2014,,India,r,M,all,all,all,,8.69754e+06,nr,MOSPI,2020-06-30,006_s_IN_2001_2016_cl_inmorth.ipynb,
006000000062,2015,,India,r,M,all,all,all,,9.34446e+06,nr,MOSPI,2020-06-30,006_s_IN_2001_2016_cl_inmorth.ipynb,
