# Information regarding the data that this notebook refers to

In [1]:
# fill out the following fields. Check that input data and notebook name match.

# reference vehicle_fleet_data.xlsx notebook metadata to select notebook_id
notebook_id= '005'
name_string= '_s_EU_1996_2018_cl(pc)_wt_eurostat'

nb_name= notebook_id + name_string +'.ipynb'
nb_input_workbook= 'in' + name_string + '.xlsx'
nb_output_workbook= notebook_id + '.xlsx'
nb_stock_or_flow= 'stock'
nb_geography= '150'
nb_start_time= '1996'
nb_stop_time= '2018'
nb_attribute_1= 'passenger cars'
nb_attribute_2= 'registered'
nb_attribute_3= 'motor energy'
nb_attribute_4= ''
nb_data_source= 'EUROSTAT'
nb_data_source_url= 'https://appsso.eurostat.ec.europa.eu/nui/show.do?dataset=road_eqs_carpda&lang=en'
nb_comment= ''

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as plt
import pickle

In [3]:
# write the notbook metadata into a data frame
notebook_metadata = pd.DataFrame({'notebook_name':nb_name,
                                 'input_file': nb_input_workbook, 
                                 'output_file': nb_output_workbook, 
                                 'source': nb_data_source, 
                                 'geography': nb_geography,
                                 'start_time': nb_start_time,
                                 'stop_time': nb_stop_time, 
                                 'attribute1': nb_attribute_1, 
                                 'attribute2': nb_attribute_2,
                                 'attribute3': nb_attribute_3,
                                 'attribute4': nb_attribute_4,
                                 'source_url' : nb_data_source_url,
                                 'comment': nb_comment}, index=[notebook_id])

## Reading in the excel data and merging the sheets into one dataframe with category info attached to row data

In [4]:
# read in data,
xls = pd.ExcelFile(nb_input_workbook)

In [5]:
# extract the data into a dataframe or several
df = pd.read_excel(xls)
df

Unnamed: 0,UNIT,GEO,WEIGHT,TIME,Value,Flag and Footnotes
0,NR,Austria,KG_LT1000,1996,:,
1,NR,Austria,KG_LT1000,1997,:,
2,NR,Austria,KG_LT1000,1998,:,
3,NR,Austria,KG_LT1000,1999,:,
4,NR,Austria,KG_LT1000,2000,:,
...,...,...,...,...,...,...
2203,NR,Kosovo (under United Nations Security Council ...,KG_GE1500,2014,:,
2204,NR,Kosovo (under United Nations Security Council ...,KG_GE1500,2015,:,
2205,NR,Kosovo (under United Nations Security Council ...,KG_GE1500,2016,:,
2206,NR,Kosovo (under United Nations Security Council ...,KG_GE1500,2017,45993,


## Import dictionary for country codes and replace country names with 3 digit codes

In [6]:
# rename region to 'geo'
df.rename(columns={'GEO':'geo'},inplace= True)
df.rename(columns={'Flag and Footnotes':'footnote'},inplace= True)
df.rename(columns={'WEIGHT':'weight'},inplace= True)
df.rename(columns={'TIME':'year_of_measurement'},inplace= True)

In [7]:
# read in data, skip rows so that headings become column names
dictxls = pd.ExcelFile('vehicle_fleet_dictionary.xlsx')

# read in data, set the header to be the desired column titles
geo_dict = pd.read_excel(dictxls, sheet_name= 'geo_dictionary', index_col= 0)
geo_name = geo_dict['name'].str.lower().to_list()
country_code = geo_dict['country_code'].astype(str).to_list()
region_code = geo_dict['region_code'].astype(str).to_list()
geo_dict.head(2)

Unnamed: 0_level_0,name,country_code,region_code,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
282,World,1,1,,,,
314,all countries,1,1,,,,


In [8]:
# lowercase the strings in the geo column and replace them with the 3 number codes
df['geo']= df.geo.str.lower()
df.replace(to_replace= geo_name, value= country_code, inplace=True )
df

Unnamed: 0,UNIT,geo,weight,year_of_measurement,Value,footnote
0,NR,40,KG_LT1000,1996,:,
1,NR,40,KG_LT1000,1997,:,
2,NR,40,KG_LT1000,1998,:,
3,NR,40,KG_LT1000,1999,:,
4,NR,40,KG_LT1000,2000,:,
...,...,...,...,...,...,...
2203,NR,955,KG_GE1500,2014,:,
2204,NR,955,KG_GE1500,2015,:,
2205,NR,955,KG_GE1500,2016,:,
2206,NR,955,KG_GE1500,2017,45993,


In [9]:
#find contries which did not match 
df.loc[~df['geo'].isin(country_code)]

Unnamed: 0,UNIT,geo,weight,year_of_measurement,Value,footnote


In [10]:
#FLAGS and FOOTNOTES
# read in data, skip rows so that headings become column names
dictxls = pd.ExcelFile('vehicle_fleet_dictionary.xlsx')
# assemble drive train code dictionary
flag_dict = pd.read_excel(dictxls, sheet_name= 'footnote', index_col= 0)
flag_eurostat = flag_dict['flag_eurostat'].to_list()
flag_code = flag_dict['flag_code'].to_list()
flag_dict.tail(2)

df['footnote'].replace(to_replace=flag_code, value=flag_eurostat, inplace=True )
#find codes which did not match 
df.loc[~df['footnote'].isin(flag_eurostat)]

Unnamed: 0,UNIT,geo,weight,year_of_measurement,Value,footnote
0,NR,40,KG_LT1000,1996,:,
1,NR,40,KG_LT1000,1997,:,
2,NR,40,KG_LT1000,1998,:,
3,NR,40,KG_LT1000,1999,:,
4,NR,40,KG_LT1000,2000,:,
...,...,...,...,...,...,...
2203,NR,955,KG_GE1500,2014,:,
2204,NR,955,KG_GE1500,2015,:,
2205,NR,955,KG_GE1500,2016,:,
2206,NR,955,KG_GE1500,2017,45993,


### Map weight codes

In [11]:
# WEIGHT CODES

# read in data, skip rows so that headings become column names
dictxls = pd.ExcelFile('vehicle_fleet_dictionary.xlsx')
# assemble drive train code dictionary
weight_dict = pd.read_excel(dictxls, sheet_name= 'weight', index_col= 0)
weight_range = weight_dict['weight_range'].to_list()
weight_code = weight_dict['weight_code'].to_list()
weight_dict.tail(2)

df['weight'].replace(to_replace=weight_range, value=weight_code, inplace=True )

#find codes which did not match 
df.loc[~df['weight'].isin(weight_code)]

Unnamed: 0,UNIT,geo,weight,year_of_measurement,Value,footnote


### mark no data values

In [12]:
'''
what does the missing data mean? Does it mean there were no vehicles of that type? or does it mean that there is no data recorded for that type? """
'''
df = df[df['Value']!= ':']
df

Unnamed: 0,UNIT,geo,weight,year_of_measurement,Value,footnote
10,NR,40,b100,2006,829000,
11,NR,40,b100,2007,772000,
12,NR,40,b100,2008,728000,
13,NR,40,b100,2009,688000,
14,NR,40,b100,2010,657000,
...,...,...,...,...,...,...
2161,NR,955,b125,2018,35517,
2183,NR,955,b150,2017,58321,
2184,NR,955,b150,2018,56957,
2206,NR,955,a151,2017,45993,


## structuring the data into format of datastructure


In [13]:
#rename the data frame to df

# add in a column of indexes
df.index = notebook_id + df.index.astype(str).str.zfill(9)

In [14]:
#rename or add all necessary columns

# date of measurement is concatonated to year of measurement to achieve desired format
df.loc[:,'date_of_measurement']='' #df['year_of_measurement']#.astype(str) + '-03-31'
df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Unnamed: 0,UNIT,geo,weight,year_of_measurement,Value,footnote,date_of_measurement
5000000010,NR,40,b100,2006,829000,,
5000000011,NR,40,b100,2007,772000,,


In [15]:
# df.loc[:,'geo']= 
df.loc[:,'process']= 'r'
df.loc[:,'vehicle_class']= 'EUM1'
df.loc[:,'vehicle_segment']=df.weight
df.loc[:,'motor_energy']= 'all'
df.loc[:,'model_year']= 'all'
df.loc[:,'year_of_first_registraion']=''
df.loc[:,'value']= df['Value']
df.loc[:,'unit']= 'nr'
df.loc[:,'source']= nb_data_source
df.loc[:,'accessed']= '2020-07-06'
df.loc[:,'notebook']= nb_name
# df.loc[:,'footnote']= 

In [16]:
# create a finalized dataframe for output with columns in arranged order
heading_list= ['year_of_measurement','date_of_measurement','geo','process','vehicle_class','vehicle_segment','motor_energy','model_year','year_of_first_registraion','value','unit','source','accessed','notebook','footnote']
df_out= df[heading_list]

df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
005000000010,2006,,40,r,EUM1,b100,all,all,,829000,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,
005000000011,2007,,40,r,EUM1,b100,all,all,,772000,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,
005000000012,2008,,40,r,EUM1,b100,all,all,,728000,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,
005000000013,2009,,40,r,EUM1,b100,all,all,,688000,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,
005000000014,2010,,40,r,EUM1,b100,all,all,,657000,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
005000002161,2018,,955,r,EUM1,b125,all,all,,35517,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,
005000002183,2017,,955,r,EUM1,b150,all,all,,58321,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,
005000002184,2018,,955,r,EUM1,b150,all,all,,56957,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,
005000002206,2017,,955,r,EUM1,a151,all,all,,45993,nr,EUROSTAT,2020-07-06,005_s_EU_1996_2018_cl(pc)_wt_eurostat.ipynb,


## at this point restart kernal and run all code

In [17]:
# write an output file named according to notebook id with relevant data and metadata
writer = pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter')
df_out.to_excel(writer, sheet_name='data', merge_cells=False)
notebook_metadata.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [18]:
# add the data from the notebook to the stock dataframe and notebook metadata pickles

import pickle
# load in the stock DataFrame pickle
stock_pickle = pd.read_pickle('stock_df.pickle')

# concatenate the out_df to the stock_pickle and remove any duplicate rows
stock_df = pd.concat([stock_pickle,df_out], sort=False).drop_duplicates()

# write the updated stock dataframe to pickle
stock_df.to_pickle('stock_df.pickle')

# repeat the process for the metadata
metadata_pickle = pd.read_pickle('metadata_df.pickle')

metadata_df = pd.concat([metadata_pickle, notebook_metadata], sort=False).drop_duplicates()

metadata_df.to_pickle('metadata_df.pickle')

In [19]:
# update the stock metadata 
writer = pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter')
stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
metadata_df.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()