# Information regarding the data that this notebook refers to

In [1]:
# fill out the following fields. Check that input data and notebook name match.

# reference vehicle_fleet_data.xlsx notebook metadata to select notebook_id
notebook_id= '005'
name_string= '_s_EU_1996_2018_cl(pc)_wt_eurostat'

nb_name= notebook_id + name_string +'.ipynb'
nb_input_workbook= 'in' + name_string + '.xlsx'
nb_output_workbook= notebook_id + '.xlsx'
nb_stock_or_flow= 'stock'
nb_geography= 'Europe'
nb_start_time= '1996'
nb_stop_time= '2018'
nb_attribute_1= 'passenger cars'
nb_attribute_2= 'registered'
nb_attribute_3= 'motor energy'
nb_attribute_4= ''
nb_data_source= 'EUROSTAT'
nb_data_source_url= 'https://appsso.eurostat.ec.europa.eu/nui/show.do?dataset=road_eqs_carpda&lang=en'
nb_comment= ''

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import pickle

In [3]:
# write the notbook metadata into a data frame
notebook_metadata = pd.DataFrame({'notebook_name':nb_name,
                                 'input_file': nb_input_workbook, 
                                 'output_file': nb_output_workbook, 
                                 'source': nb_data_source, 
                                 'geography': nb_geography,
                                 'start_time': nb_start_time,
                                 'stop_time': nb_stop_time, 
                                 'attribute1': nb_attribute_1, 
                                 'attribute2': nb_attribute_2,
                                 'attribute3': nb_attribute_3,
                                 'attribute4': nb_attribute_4,
                                 'source_url' : nb_data_source_url,
                                 'comment': nb_comment}, index=[notebook_id])

## Reading in the excel data and merging the sheets into one dataframe with category info attached to row data

In [4]:
# read in data,
xls = pd.ExcelFile(nb_input_workbook)

In [5]:
# extract the data into a dataframe or several
df = pd.read_excel(xls)
df

Unnamed: 0,UNIT,GEO,WEIGHT,TIME,Value,Flag and Footnotes
0,NR,Austria,KG_LT1000,1996,:,
1,NR,Austria,KG_LT1000,1997,:,
2,NR,Austria,KG_LT1000,1998,:,
3,NR,Austria,KG_LT1000,1999,:,
4,NR,Austria,KG_LT1000,2000,:,
...,...,...,...,...,...,...
2203,NR,Kosovo (under United Nations Security Council ...,KG_GE1500,2014,:,
2204,NR,Kosovo (under United Nations Security Council ...,KG_GE1500,2015,:,
2205,NR,Kosovo (under United Nations Security Council ...,KG_GE1500,2016,:,
2206,NR,Kosovo (under United Nations Security Council ...,KG_GE1500,2017,45993,


## Check that country names are found on the list of country names.

In [7]:
# rename region to 'geo'
df.rename(columns={'GEO':'geo'},inplace= True)
df.rename(columns={'Flag and Footnotes':'footnote'},inplace= True)
df.rename(columns={'WEIGHT':'weight'},inplace= True)
df.rename(columns={'TIME':'year_of_measurement'},inplace= True)

In [8]:
xls = pd.ExcelFile('metadata_vehicle_fleet.xlsx')
geographyMetadata = xls.parse('geography_metadata', skiprows=1, index_col=None)
geographyMetadata.drop('id', axis=1, inplace=True)

missing = []
edited = []

for i in range(1, len(df['geo'])):
    for j in range(0, len(geographyMetadata['name'])): # we need to use different indexes here because the dataframes might have different lengths
        if df['geo'][i].lower() not in geographyMetadata['name'].str.lower().tolist(): # if the country is not in name we try checking in the other cols
            missing.append(df['geo'][i])

            if df['geo'][i].lower() == geographyMetadata['alternate name1'][j].lower():
                df['geo'][i] = geographyMetadata['name'][j]
                edited.append(geographyMetadata['name'][j])
            elif df['geo'][i].lower() == geographyMetadata['alternate name2'][j].lower():
                df['geo'][i] = geographyMetadata['name'][j]
                edited.append(geographyMetadata['name'][j])
            elif df['geo'][i].lower() == geographyMetadata['alternate name3'][j].lower():
                df['geo'][i] = geographyMetadata['name'][j]
                edited.append(geographyMetadata['name'][j])
            elif df['geo'][i].lower() == geographyMetadata['alternate name4'][j].lower():
                df['geo'][i] = geographyMetadata['name'][j]
                edited.append(geographyMetadata['name'][j]) 

print('missing:'+ str(set(missing)))    # This is the list of countries that are not in the names column of the metadata sheet
print('edited:' + str(set(edited)))    # This is the list of countries that were edited in the loop


missing:set()
edited:set()


In [9]:
#find contries which did not match 
df.loc[~df['geo'].str.lower().isin(geographyMetadata.name.str.lower())]

Unnamed: 0,UNIT,geo,weight,year_of_measurement,Value,footnote


In [10]:
#FLAGS and FOOTNOTES
# read in motor energy dictionary metadata,
metadataXls = pd.ExcelFile('metadata_vehicle_fleet.xlsx')
footnoteDictionary= pd.read_excel(metadataXls, sheet_name= 'footnote_metadata', index_col= 0)

footnoteDictionaryRelevant = footnoteDictionary.loc[footnoteDictionary['source']== nb_data_source]

sourceCodeFootnote = footnoteDictionaryRelevant['source_code'].to_list()
outputCodeFootnote = footnoteDictionaryRelevant['output_code'].to_list()

df['footnote'].replace(to_replace=sourceCodeFootnote, value= outputCodeFootnote, inplace=True )
#find codes which did not match 
df.loc[~df['footnote'].isin(outputCodeFootnote)]

Unnamed: 0,UNIT,geo,weight,year_of_measurement,Value,footnote


### Map weight codes

In [11]:
# WEIGHT CODES FOUND IN SEGMENT METADATA 

### mark no data values

In [12]:
'''
what does the missing data mean? Does it mean there were no vehicles of that type? or does it mean that there is no data recorded for that type? """
'''
df = df[df['Value']!= ':']
df

Unnamed: 0,UNIT,geo,weight,year_of_measurement,Value,footnote
10,NR,Austria,KG_LT1000,2006,829000,'
11,NR,Austria,KG_LT1000,2007,772000,'
12,NR,Austria,KG_LT1000,2008,728000,'
13,NR,Austria,KG_LT1000,2009,688000,'
14,NR,Austria,KG_LT1000,2010,657000,'
...,...,...,...,...,...,...
2161,NR,Kosovo (under United Nations Security Council ...,KG1000-1249,2018,35517,'
2183,NR,Kosovo (under United Nations Security Council ...,KG1250-1499,2017,58321,'
2184,NR,Kosovo (under United Nations Security Council ...,KG1250-1499,2018,56957,'
2206,NR,Kosovo (under United Nations Security Council ...,KG_GE1500,2017,45993,'


## structuring the data into format of datastructure


In [13]:
#rename the data frame to df

# add in a column of indexes
df.index = notebook_id + df.index.astype(str).str.zfill(9)

In [14]:
#rename or add all necessary columns

# date of measurement is concatonated to year of measurement to achieve desired format
df.loc[:,'date_of_measurement']='' #df['year_of_measurement']#.astype(str) + '-03-31'
df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,UNIT,geo,weight,year_of_measurement,Value,footnote,date_of_measurement
5000000010,NR,Austria,KG_LT1000,2006,829000,',
5000000011,NR,Austria,KG_LT1000,2007,772000,',


In [15]:
# df.loc[:,'geo']= 
df.loc[:,'process']= 'r'
df.loc[:,'vehicle_class']= 'EUM1'
df.loc[:,'vehicle_segment']=df.weight
df.loc[:,'motor_energy']= 'all'
df.loc[:,'model_year']= 'all'
df.loc[:,'year_of_first_registraion']=''
df.loc[:,'value']= df['Value']
df.loc[:,'unit']= 'nr'
df.loc[:,'source']= nb_data_source
df.loc[:,'accessed']= '2020-07-06'
df.loc[:,'notebook']= nb_name
# df.loc[:,'footnote']= 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [16]:
# create a finalized dataframe for output with columns in arranged order
heading_list= ['year_of_measurement','date_of_measurement','geo','process','vehicle_class','vehicle_segment','motor_energy','model_year','year_of_first_registraion','value','unit','source','accessed','notebook','footnote']
df_out= df[heading_list]

df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
005000000010,2006,,Austria,r,EUM1,KG_LT1000,all,all,,829000,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,'
005000000011,2007,,Austria,r,EUM1,KG_LT1000,all,all,,772000,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,'
005000000012,2008,,Austria,r,EUM1,KG_LT1000,all,all,,728000,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,'
005000000013,2009,,Austria,r,EUM1,KG_LT1000,all,all,,688000,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,'
005000000014,2010,,Austria,r,EUM1,KG_LT1000,all,all,,657000,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
005000002161,2018,,Kosovo (under United Nations Security Council ...,r,EUM1,KG1000-1249,all,all,,35517,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,'
005000002183,2017,,Kosovo (under United Nations Security Council ...,r,EUM1,KG1250-1499,all,all,,58321,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,'
005000002184,2018,,Kosovo (under United Nations Security Council ...,r,EUM1,KG1250-1499,all,all,,56957,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,'
005000002206,2017,,Kosovo (under United Nations Security Council ...,r,EUM1,KG_GE1500,all,all,,45993,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,'


## at this point restart kernal and run all code

In [17]:
# write an output file named according to notebook id with relevant data and metadata
writer = pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter')
df_out.to_excel(writer, sheet_name='data', merge_cells=False)
notebook_metadata.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [27]:
# add the data from the notebook to the stock dataframe and notebook metadata pickles

import pickle
# load in the stock DataFrame pickle
stock_pickle = pd.read_pickle('stock_df.pickle')

# concatenate the out_df to the stock_pickle and remove any duplicate rows
stock_df = pd.concat([stock_pickle,df_out], sort=False).drop_duplicates(subset=['notebook','source','value','year_of_measurement'])

# write the updated stock dataframe to pickle
stock_df.to_pickle('stock_df.pickle')

# repeat the process for the metadata
metadata_pickle = pd.read_pickle('metadata_df.pickle')

metadata_df = pd.concat([metadata_pickle, notebook_metadata], sort=False).drop_duplicates(subset=['output_file'])

metadata_df.to_pickle('metadata_df.pickle')

In [28]:
# update the stock metadata 
writer = pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter')
stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
metadata_df.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [29]:
metadata_df

Unnamed: 0,notebook_name,input_file,output_file,source,geography,start_time,stop_time,attribute1,attribute2,attribute3,attribute4,source_url,comment
9,009_s_NO_1950_2007_cl_statbank.ipynb,in_s_NO_1950_2007_cl_statbank.xlsx,009.xlsx,STATBANK,Norway,1950,2007,class,registered,,,https://www.ssb.no/en/statbank/table/01960/,
7,007_s_NO_2008_2015_cl_me_statbank.ipynb,in_s_NO_2008_2015_cl_me_statbank.xlsx,007.xlsx,STATBANK,Norway,2008,2015,class,registered,motor energy,,https://www.ssb.no/en/statbank/table/01963,hybrids included in OTH category
5,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,in_s_EU_1996_2018_cl(pc)_wt_eurostat.xlsx,005.xlsx,EUROSTAT,150,1996,2018,passenger cars,registered,motor energy,,https://appsso.eurostat.ec.europa.eu/nui/show....,
2,002_s_GL_2005_2015_cl(pc)_oica.ipynb,in_s_GL_2005_2015_cl(pc)_oica.xlsx,002.xlsx,OICA,001,2005,2015,passenger cars,registered,,,http://www.oica.net/category/vehicles-in-use/,original data believed to be in thousands
3,003_s_GL_2005_2015_cl(cv)_oica.ipynb,in_s_GL_2005_2015_cl(cv)_oica.xlsx,003.xlsx,OICA,001,2005,2015,commercial vehicles,registered,,,http://www.oica.net/category/vehicles-in-use/,original data believed to be in units of thous...
4,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,in_s_EU_2012_2018_cl(pc)_me_eurostat.xlsx,004.xlsx,EUROSTAT,150,2012,2018,passenger cars,registered,motor energy,,https://appsso.eurostat.ec.europa.eu/nui/show....,
6,006_s_IN_2001_2016_cl_inmorth.ipynb,in_s_IN_2001_2016_cl_inmorth.xlsx,006.xlsx,MOSPI,356,2001,2016,class,registered,,,http://mospi.nic.in/statistical-year-book-indi...,
8,008_s_NO_2016_2019_cl_me_statbank.ipynb,in_s_NO_2016_2019_cl_me_statbank.csv,008.xlsx,STATBANK,Norway,2016,2019,class,registered,motor energy,,https://www.ssb.no/en/statbank/table/11823/,hybrids in separate categories
10,010_s_china_1949_2018_cl_sg_chinastat.ipynb,in_s_china_1949_2018_cl_sg_chinastat.xlsx,010.xlsx,NBSCHINA,China,1949,2018,class,registered,segment,,http://www.stats.gov.cn/tjsj/ndsj/2019/indexeh...,source describes change in category details se...
