# Information regarding the data that this notebook refers to

# fill out the following fields. Check that input data and notebook name match.

In [1]:
notebook_id= '010'
name_string= '_s_china_1949_2018_cl_sg_chinastat'

nb_name= notebook_id + name_string +'.ipynb'
nb_input_workbook= 'in' + name_string + '.xlsx'
nb_output_workbook= notebook_id + '.xlsx'
nb_stock_or_flow= 'stock'
nb_geography= 'China'
nb_start_time= '1949'
nb_stop_time= '2018'
nb_attribute_1= 'class'
nb_attribute_2= 'registered'
nb_attribute_3= 'segment'
nb_attribute_4= ''
nb_data_source= 'NBSCHINA'
nb_data_source_url= 'http://www.stats.gov.cn/tjsj/ndsj/2019/indexeh.htm'
nb_comment= 'source describes change in category details see source metadata'

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import pickle

In [3]:
# write the notbook metadata into a data frame
notebook_metadata = pd.DataFrame({'notebook_name':nb_name,
                                 'input_file': nb_input_workbook, 
                                 'output_file': nb_output_workbook, 
                                 'source': nb_data_source, 
                                 'geography': nb_geography,
                                 'start_time': nb_start_time,
                                 'stop_time': nb_stop_time, 
                                 'attribute1': nb_attribute_1, 
                                 'attribute2': nb_attribute_2,
                                 'attribute3': nb_attribute_3,
                                 'attribute4': nb_attribute_4,
                                 'source_url' : nb_data_source_url,
                                 'comment': nb_comment}, index=[notebook_id])

## Reading in the excel data and merging the sheets into one dataframe with category info attached to row data

In [4]:
# read in data,
xls = pd.ExcelFile(nb_input_workbook)

In [5]:
df=pd.read_excel(xls, sheet_name= 'data')

In [27]:
df.tail()

Unnamed: 0,class,segment,Unit,year,value,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,unit,source,accessed,notebook,footnote
10000002054,CHCOM,all,10000.0,2019,,2019,,China,r,CHCOM,all,all,all,,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
10000002055,CHCOM,all passenger,10000.0,2019,,2019,,China,r,CHCOM,all passenger,all,all,,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
10000002056,CHCOM,all truck,10000.0,2019,,2019,,China,r,CHCOM,all truck,all,all,,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
10000002057,Note: business vehicles only included those ow...,,,2019,,2019,,China,r,Note: business vehicles only included those ow...,,all,all,,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
10000002058,"Note: Since 2002, there has been adjustment to...",,,2019,,2019,,China,r,"Note: Since 2002, there has been adjustment to...",,all,all,,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"


 #### replace class titles with codes
  

In [7]:
# read in the class_metatdata sheet which has equivalent terms for class types according to source
metaxls = pd.ExcelFile('metadata_vehicle_fleet.xlsx')
classMetadata = pd.read_excel(metaxls, sheet_name= 'class_metadata', index_col= 0)

In [8]:
classMetadata

Unnamed: 0_level_0,code,label,description,footnote,source,sourcelink1,sourcelink2,Unnamed: 8,Unnamed: 9,Unnamed: 10,PCMO,SML0,classification scheme3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1.0,EUM1,passenger car,"Designed for the carriage of passengers, with ...",,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,p,m,
2.0,EUM2,light bus,"Designed for the carriage of passengers, carry...",,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,c,l,
3.0,EUM3,heavy bus,"Designed for the carriage of passengers, havin...",,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,c,l,
4.0,EUN1,light commercial vehicle,"Designed for the carriage of goods, having a m...",,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,c,m,
5.0,EUN2,heavy commercial vehicle,"Designed for the carriage of goods, having a m...",,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,c,l,
6.0,EUN3,transport truck,"Designed for the carriage of goods, having a m...",,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,c,l,
7.0,EUL,"motorcycles, tricycles, and quadracycles",Motor vehicles with less than four wheels and ...,,European Comission,https://ec.europa.eu/growth/sectors/automotive...,,,,,m,s,
8.0,OIPC,passenger car,"Passenger cars are road motor vehicles, other ...",,OICA,http://www.oica.net/wp-content/uploads//DEFINI...,,,,,p,m,
9.0,OICV,commercial vehicle,Commercial vehicles include light commercial v...,,OICA,http://www.oica.net/wp-content/uploads//DEFINI...,,,,,c,l,
10.0,USLDS,"light duty vehicle, short wheel base","Light duty vehicle, short wheel base replaced ...",,USDOT,https://www.bts.gov/content/number-us-aircraft...,https://www.bts.gov/sites/bts.dot.gov/files/do...,,,,p,m,


In [9]:
# limit the data to the rows specific to relevent source
sourceClassDict= classMetadata[classMetadata.source == nb_data_source] 

In [10]:
sourceClassDict

Unnamed: 0_level_0,code,label,description,footnote,source,sourcelink1,sourcelink2,Unnamed: 8,Unnamed: 9,Unnamed: 10,PCMO,SML0,classification scheme3
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
35.0,CHPRIV,private vehicles,not used for business or government purposes,,NBSCHINA,http://data.stats.gov.cn/english/,,,,,,,
36.0,CHCIV,civil vehicles,"includes private, agriculatural, commercial, a...",private and business are believed to be includ...,NBSCHINA,http://data.stats.gov.cn/english/,,,,,,,
37.0,CHCOM,business vehicles,vehicles used for the commercial purposes,Note: business vehicles only included those ow...,NBSCHINA,http://data.stats.gov.cn/english/,,,,,,,


In [11]:
sourceClassLabel= sourceClassDict.label.to_list()

In [12]:
sourceClassCode = sourceClassDict.code.to_list()


df['class'].replace(to_replace=sourceClassLabel, value=sourceClassCode, inplace=True )

#find codes which did not match 
df.loc[~df['class'].isin(sourceClassCode)]

Unnamed: 0,class,segment,Unit,1949,1950,1951,1952,1953,1954,1955,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
27,Note: business vehicles only included those ow...,,,,,,,,,,...,,,,,,,,,,
28,"Note: Since 2002, there has been adjustment to...",,,,,,,,,,...,,,,,,,,,,


## rearrange data into proper format

In [13]:
df.head()

Unnamed: 0,class,segment,Unit,1949,1950,1951,1952,1953,1954,1955,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,CHCIV,all,10000.0,5.09,5.43,6.19,6.63,7.81,8.95,10.14,...,7801.83,9356.32,10933.09,12670.14,14598.11,16284.45,18574.54,20906.67,23231.23,25387.2


In [14]:
#.melt() pivots the table bringing the column headers into a new attribute
melted = df.melt(id_vars=['class','segment','Unit'], var_name= 'year')
melted

Unnamed: 0,class,segment,Unit,year,value
0,CHCIV,all,10000.0,1949,5.09
1,CHCIV,all passenger,10000.0,1949,1.71
2,CHCIV,large passenger vehicles,10000.0,1949,
3,CHCIV,medium passenger vehicles,10000.0,1949,
4,CHCIV,small passenger vehicles,10000.0,1949,
...,...,...,...,...,...
2054,CHCOM,all,10000.0,2019,
2055,CHCOM,all passenger,10000.0,2019,
2056,CHCOM,all truck,10000.0,2019,
2057,Note: business vehicles only included those ow...,,,2019,


In [15]:
melted.value = 10000 * melted.value


## structuring the data into format of datastructure


In [16]:
df = melted
df.head(2)

Unnamed: 0,class,segment,Unit,year,value
0,CHCIV,all,10000.0,1949,50900.0
1,CHCIV,all passenger,10000.0,1949,17100.0


In [17]:
# add in a column of indexes
df.index = notebook_id + df.index.astype(str).str.zfill(9)

In [18]:
#rename or add all necessary columns
df.loc[:,'year_of_measurement']= df.loc[:,'year']
# date of measurement is concatonated to year of measurement to achieve desired format
df.loc[:,'date_of_measurement']=''#df['year_of_measurement'].astype(str) + '-12-31'
df.head(2)

Unnamed: 0,class,segment,Unit,year,value,year_of_measurement,date_of_measurement
10000000000,CHCIV,all,10000.0,1949,50900.0,1949,
10000000001,CHCIV,all passenger,10000.0,1949,17100.0,1949,


In [19]:
df.loc[:,'geo']= 'China'
df.loc[:,'process']= 'r'
df.loc[:,'vehicle_class']= df['class']
df.loc[:,'vehicle_segment']= df['segment']
df.loc[:,'motor_energy']= 'all'
df.loc[:,'model_year']= 'all'
df.loc[:,'year_of_first_registraion']=''
# df.loc[:,'value']= df['value']
df.loc[:,'unit']= 'nr'
df.loc[:,'source']= nb_data_source
df.loc[:,'accessed']= '2020-08-04'
df.loc[:,'notebook']= nb_name
df.loc[:,'footnote']= 'missing data means unknown, not zero'

In [20]:
# create a finalized dataframe for output with columns in arranged order
heading_list= ['year_of_measurement','date_of_measurement','geo','process','vehicle_class','vehicle_segment','motor_energy','model_year','year_of_first_registraion','value','unit','source','accessed','notebook','footnote']
df_out= df[heading_list]

In [21]:
#check df_out before saving
df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
010000000000,1949,,China,r,CHCIV,all,all,all,,50900.0,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
010000000001,1949,,China,r,CHCIV,all passenger,all,all,,17100.0,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
010000000002,1949,,China,r,CHCIV,large passenger vehicles,all,all,,,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
010000000003,1949,,China,r,CHCIV,medium passenger vehicles,all,all,,,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
010000000004,1949,,China,r,CHCIV,small passenger vehicles,all,all,,,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
010000002054,2019,,China,r,CHCOM,all,all,all,,,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
010000002055,2019,,China,r,CHCOM,all passenger,all,all,,,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
010000002056,2019,,China,r,CHCOM,all truck,all,all,,,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
010000002057,2019,,China,r,Note: business vehicles only included those ow...,,all,all,,,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"


## at this point restart kernel and run all cells

In [22]:
# write an output file named according to notebook id with relevant data and metadata
writer = pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter')
df_out.to_excel(writer, sheet_name='data', merge_cells=False)
notebook_metadata.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [23]:
# add the data from the notebook to the stock dataframe and notebook metadata pickles

import pickle
# load in the stock DataFrame pickle
stock_pickle = pd.read_pickle('stock_df.pickle')

In [24]:
# concatenate the out_df to the stock_pickle and remove any duplicate rows
stock_df = pd.concat([df_out,stock_pickle]).drop_duplicates(subset=['geo','notebook','source','value','year_of_measurement'])

# write the updated stock dataframe to pickle
stock_df.to_pickle('stock_df.pickle')

# repeat the process for the metadata
metadata_pickle = pd.read_pickle('metadata_df.pickle')

metadata_df = pd.concat([metadata_pickle, notebook_metadata]).drop_duplicates()

metadata_df.to_pickle('metadata_df.pickle')

In [25]:
# update the stock metadata 
writer = pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter')
stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
metadata_df.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [26]:
stock_df

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
010000000000,1949,,China,r,CHCIV,all,all,all,,50900,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
010000000001,1949,,China,r,CHCIV,all passenger,all,all,,17100,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
010000000002,1949,,China,r,CHCIV,large passenger vehicles,all,all,,,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
010000000006,1949,,China,r,CHCIV,all truck,all,all,,32500,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
010000000011,1949,,China,r,CHCIV,other,all,all,,1300,nr,NBSCHINA,2020-08-04,010_s_china_1949_2018_cl_sg_chinastat.ipynb,"missing data means unknown, not zero"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
004000003800,2018,,Sweden,r,EUM1,all,ICE,all,,212504,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
004000003801,2018,,United Kingdom of Great Britain and Northern I...,r,EUM1,all,ICE,all,,343,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,"definition differs, see eurostat metadata"
004000003803,2018,,Norway,r,EUM1,all,ICE,all,,1008,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
004000003804,2018,,Switzerland,r,EUM1,all,ICE,all,,3700,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,
