# Information regarding the data that this notebook refers to

In [1]:
# fill out the following fields. Check that input data and notebook name match.

# reference vehicle_fleet_data.xlsx notebook metadata to select notebook_id
notebook_id= '004'
name_string= '_s_EU_2012_2018_cl(pc)_me_eurostat'

nb_name= notebook_id + name_string +'.ipynb'
nb_input_workbook= 'in' + name_string + '.xlsx'
nb_output_workbook= notebook_id + '.xlsx'
nb_stock_or_flow= 'stock'
nb_geography= 'Europe'
nb_start_time= '2012'
nb_stop_time= '2018'
nb_attribute_1= 'passenger cars'
nb_attribute_2= 'registered'
nb_attribute_3= 'motor energy'
nb_attribute_4= ''
nb_data_source= 'EUROSTAT'
nb_data_source_url= 'https://appsso.eurostat.ec.europa.eu/nui/show.do?dataset=road_eqs_carpda&lang=en'
nb_comment= ''

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as plt

In [3]:
# write the notbook metadata into a data frame
notebook_metadata = pd.DataFrame({'notebook_name':nb_name,
                                 'input_file': nb_input_workbook, 
                                 'output_file': nb_output_workbook, 
                                 'source': nb_data_source, 
                                 'geography': nb_geography,
                                 'start_time': nb_start_time,
                                 'stop_time': nb_stop_time, 
                                 'attribute1': nb_attribute_1, 
                                 'attribute2': nb_attribute_2,
                                 'attribute3': nb_attribute_3,
                                 'attribute4': nb_attribute_4,
                                 'source_url' : nb_data_source_url,
                                 'comment': nb_comment}, index=[notebook_id])

## Reading in the excel data and merging the sheets into one dataframe with category info attached to row data

In [4]:
nb_input_workbook

'in_s_EU_2012_2018_cl(pc)_me_eurostat.xlsx'

In [5]:
# read in data,
xls = pd.ExcelFile(nb_input_workbook)

In [6]:
# extract the data into a dataframe or several
df = pd.read_excel(xls)
df.head(2)

Unnamed: 0,TIME,MOT_NRG,GEO,UNIT,Value,Flag and Footnotes
0,2012,Total,Belgium,NR,5444000,
1,2012,Total,Bulgaria,NR,2807000,


## Import dictionary for country codes and replace country names with 3 digit codes

In [7]:
# rename region to 'geo'
df.rename(columns={'GEO':'geo'},inplace= True)
df.rename(columns={'Flag and Footnotes':'footnote'},inplace= True)
df.rename(columns={'MOT_NRG':'motor_energy'},inplace= True)
df.rename(columns={'TIME':'year_of_measurement'},inplace= True)

In [8]:
xls = pd.ExcelFile('metadata_vehicle_fleet.xlsx')
geographyMetadata = xls.parse('geography_metadata', skiprows=1, index_col=None)
geographyMetadata.drop('id', axis=1, inplace=True)

missing = []
edited = []

for i in range(1, len(df['geo'])):
    for j in range(0, len(geographyMetadata['name'])): # we need to use different indexes here because the dataframes might have different lengths
        if df['geo'][i].lower() not in geographyMetadata['name'].str.lower().tolist(): # if the country is not in name we try checking in the other cols
            missing.append(df['geo'][i])

            if df['geo'][i].lower() == geographyMetadata['alternate name1'][j].lower():
                df['geo'][i] = geographyMetadata['name'][j]
                edited.append(geographyMetadata['name'][j])
            elif df['geo'][i].lower() == geographyMetadata['alternate name2'][j].lower():
                df['geo'][i] = geographyMetadata['name'][j]
                edited.append(geographyMetadata['name'][j])
            elif df['geo'][i].lower() == geographyMetadata['alternate name3'][j].lower():
                df['geo'][i] = geographyMetadata['name'][j]
                edited.append(geographyMetadata['name'][j])
            elif df['geo'][i].lower() == geographyMetadata['alternate name4'][j].lower():
                df['geo'][i] = geographyMetadata['name'][j]
                edited.append(geographyMetadata['name'][j]) 

print('missing:'+ str(set(missing)))    # This is the list of countries that are not in the names column of the metadata sheet
print('edited:' + str(set(edited)))    # This is the list of countries that were edited in the loop


missing:set()
edited:set()


In [9]:
#find contries which did not match 
df.loc[~df['geo'].str.lower().isin(geographyMetadata.name.str.lower())]

Unnamed: 0,year_of_measurement,motor_energy,geo,UNIT,Value,footnote


In [10]:
# Drop codes that are not in code list (after checking for completeness)
df = df.loc[df['geo'].str.lower().isin(geographyMetadata.name.str.lower())]

In [11]:
# MOTOR ENERGY code replacement
# read in motor energy dictionary metadata,
metadataXls = pd.ExcelFile('metadata_vehicle_fleet.xlsx')
motorEnergyDictionary= pd.read_excel(metadataXls, sheet_name= 'motor_energy_dictionary', index_col= 0)

sourceMEDict = motorEnergyDictionary.loc[motorEnergyDictionary['source']== nb_data_source]

motorEnergySource = sourceMEDict['source_code'].to_list()
motorEnergyCode = sourceMEDict['output_code'].to_list()

df['motor_energy'].replace(to_replace=motorEnergySource, value= motorEnergyCode, inplace=True )
#find codes which did not match 
df.loc[~df['motor_energy'].isin(motorEnergyCode)]

Unnamed: 0,year_of_measurement,motor_energy,geo,UNIT,Value,footnote
0,2012,Total,Belgium,NR,5444000,
1,2012,Total,Bulgaria,NR,2807000,
2,2012,Total,Czechia,NR,4706000,d
3,2012,Total,Denmark,NR,:,
4,2012,Total,Germany,NR,43431000,
...,...,...,...,...,...,...
3803,2018,Other,Norway,NR,1008,
3804,2018,Other,Switzerland,NR,3700,
3805,2018,Other,North Macedonia,NR,0,
3806,2018,Other,Turkey,NR,38815,d


In [15]:
#FLAGS and FOOTNOTES
# read in motor energy dictionary metadata,
metadataXls = pd.ExcelFile('metadata_vehicle_fleet.xlsx')
footnoteDictionary= pd.read_excel(metadataXls, sheet_name= 'footnote_metadata', index_col= 0)

footnoteDictionaryRelevant = footnoteDictionary.loc[footnoteDictionary['source']== nb_data_source]

sourceCodeFootnote = footnoteDictionaryRelevant['source_code'].to_list()
outputCodeFootnote = footnoteDictionaryRelevant['output_code'].to_list()

df['footnote'].replace(to_replace=sourceCodeFootnote, value= outputCodeFootnote, inplace=True )
#find codes which did not match 
df.loc[~df['footnote'].isin(outputCodeFootnote)]

Unnamed: 0,year_of_measurement,motor_energy,geo,UNIT,Value,footnote


In [16]:
'''
what does the missing data mean? Does it mean there were no vehicles of that type? or does it mean that there is no data recorded for that type? """
'''

df=df[df['Value']!= ':']
df

Unnamed: 0,year_of_measurement,motor_energy,geo,UNIT,Value,footnote
0,2012,Total,Belgium,NR,5444000,'
1,2012,Total,Bulgaria,NR,2807000,'
2,2012,Total,Czechia,NR,4706000,"definition differs, see eurostat source links"
4,2012,Total,Germany,NR,43431000,'
5,2012,Total,Estonia,NR,602100,'
...,...,...,...,...,...,...
3803,2018,Other,Norway,NR,1008,'
3804,2018,Other,Switzerland,NR,3700,'
3805,2018,Other,North Macedonia,NR,0,'
3806,2018,Other,Turkey,NR,38815,"definition differs, see eurostat source links"


## structuring the data into format of datastructure


In [17]:
#rename the data frame to df

# add in a column of indexes
df.index = notebook_id + df.index.astype(str).str.zfill(9)

In [19]:
#rename or add all necessary columns
# df.loc[:,'year_of_measurement']= df.loc[:,'year']
# date of measurement is concatonated to year of measurement to achieve desired format
df['date_of_measurement']=''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_of_measurement']=''


In [20]:
# df.loc[:,'geo']= 
df.loc[:,'process']= 'r'
df.loc[:,'vehicle_class']= 'EUM1'
df.loc[:,'vehicle_segment']='all'
df.loc[:,'motor_energy']= df.motor_energy
df.loc[:,'model_year']= 'all'
df.loc[:,'year_of_first_registraion']=''
df.loc[:,'value']= df['Value']
df.loc[:,'unit']= 'nr'
df.loc[:,'source']= nb_data_source
df.loc[:,'accessed']= '2020-07-03'
df.loc[:,'notebook']= nb_name
# df.loc[:,'footnote']= ''

In [21]:
# create a finalized dataframe for output with columns in arranged order
heading_list= ['year_of_measurement','date_of_measurement','geo','process','vehicle_class','vehicle_segment','motor_energy','model_year','year_of_first_registraion','value','unit','source','accessed','notebook','footnote']
df_out= df[heading_list]

df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
004000000000,2012,,Belgium,r,EUM1,all,Total,all,,5444000,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,'
004000000001,2012,,Bulgaria,r,EUM1,all,Total,all,,2807000,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,'
004000000002,2012,,Czechia,r,EUM1,all,Total,all,,4706000,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,"definition differs, see eurostat source links"
004000000004,2012,,Germany,r,EUM1,all,Total,all,,43431000,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,'
004000000005,2012,,Estonia,r,EUM1,all,Total,all,,602100,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,'
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
004000003803,2018,,Norway,r,EUM1,all,Other,all,,1008,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,'
004000003804,2018,,Switzerland,r,EUM1,all,Other,all,,3700,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,'
004000003805,2018,,North Macedonia,r,EUM1,all,Other,all,,0,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,'
004000003806,2018,,Turkey,r,EUM1,all,Other,all,,38815,nr,EUROSTAT,2020-07-03,004_s_EU_2012_2018_cl(pc)_me_eurostat.ipynb,"definition differs, see eurostat source links"


In [22]:
# write an output file named according to notebook id with relevant data and metadata
writer = pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter')
df_out.to_excel(writer, sheet_name='data', merge_cells=False)
notebook_metadata.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [23]:
# add the data from the notebook to the stock dataframe and notebook metadata pickles

import pickle
# load in the stock DataFrame pickle
stock_pickle = pd.read_pickle('stock_df.pickle')

# concatenate the out_df to the stock_pickle and remove any duplicate rows
stock_df = pd.concat([stock_pickle,df_out], sort=False).drop_duplicates()

# write the updated stock dataframe to pickle
stock_df.to_pickle('stock_df.pickle')

# repeat the process for the metadata
metadata_pickle = pd.read_pickle('metadata_df.pickle')

metadata_df = pd.concat([metadata_pickle, notebook_metadata], sort=False).drop_duplicates()

metadata_df.to_pickle('metadata_df.pickle')

In [24]:
# update the stock metadata 
writer = pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter')
stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
metadata_df.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()