# Information regarding the data that this notebook refers to

In [1]:
# fill out the following fields. Check that input data and notebook name match.

# reference vehicle_fleet_data.xlsx notebook metadata to select notebook_id
notebook_id= '005'
name_string= '_s_EU_1996_2018_cl(pc)_wt_eurostat'

nb_name= notebook_id + name_string +'.ipynb'
nb_input_workbook= 'in' + name_string + '.xlsx'
nb_output_workbook= notebook_id + '.xlsx'
nb_stock_or_flow= 'stock'
nb_geography= '150'
nb_start_time= '1996'
nb_stop_time= '2018'
nb_attribute_1= 'passenger cars'
nb_attribute_2= 'registered'
nb_attribute_3= 'motor energy'
nb_attribute_4= ''
nb_data_source= 'EUROSTAT'
nb_data_source_url= 'https://appsso.eurostat.ec.europa.eu/nui/show.do?dataset=road_eqs_carpda&lang=en'
nb_comment= ''

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as plt

In [3]:
# write the notbook metadata into a data frame
notebook_metadata = pd.DataFrame({'notebook_name':nb_name,
                                 'input_file': nb_input_workbook, 
                                 'output_file': nb_output_workbook, 
                                 'source': nb_data_source, 
                                 'geography': nb_geography,
                                 'start_time': nb_start_time,
                                 'stop_time': nb_stop_time, 
                                 'attribute1': nb_attribute_1, 
                                 'attribute2': nb_attribute_2,
                                 'attribute3': nb_attribute_3,
                                 'attribute4': nb_attribute_4,
                                 'source_url' : nb_data_source_url,
                                 'comment': nb_comment}, index=[notebook_id])

## Reading in the excel data and merging the sheets into one dataframe with category info attached to row data

In [4]:
# read in data,
xls = pd.ExcelFile(nb_input_workbook)

In [5]:
# extract the data into a dataframe or several
df = pd.read_excel(xls,sheetname= ,header=17, usecols='c,g',nrows=30)
df

SyntaxError: invalid syntax (<ipython-input-5-ca1fcb828c24>, line 2)

In [None]:
# use .dropna() to drop the rows and columns with no data. 
# thresh=2 drops columns that do contain up to 2 pieces of non NAN values
df.dropna(axis=1, thresh=2, inplace=True)
df.dropna(axis=0, thresh=2, inplace=True)
#drop columns or rows that will not be useful
df.drop(columns= 'Variation 2015/2014', inplace= True)

In [None]:
# drop rows by location
# pd.drop(index=[24,25,26,29], inplace=True)

In [None]:
# if there are multiple sheets to be concatonated into one use pd.concat:
df2 = pd.concat([dfa,dfb,dfc,dfe], sort=False)

## Import dictionary for country codes and replace country names with 3 digit codes

In [None]:
# rename region to 'geo'
df.rename(columns={'REGIONS/COUNTRIES':'geo'},inplace= True)

In [None]:
# read in data, skip rows so that headings become column names
dictxls = pd.ExcelFile('vehicle_fleet_dictionary.xlsx')

# read in data, set the header to be the desired column titles
geo_dict = pd.read_excel(dictxls, sheet_name= 'geo_dictionary', index_col= 0)
geo_name = geo_dict['name'].str.lower().to_list()
country_code = geo_dict['country_code'].astype(str).to_list()
region_code = geo_dict['region_code'].astype(str).to_list()
geo_dict.head(2)

In [None]:
# lowercase the strings in the geo column and replace them with the 3 number codes
df['geo']= df.geo.str.lower()
df.replace(to_replace= geo_name, value= country_code, inplace=True )
df

In [None]:
#find contries which did not match 
df.loc[~df['geo'].isin(country_code)]

In [None]:
#alternately could use
# df = df.loc[df['geo'].isin(country_code)]

In [None]:
# MOTOR ENERGY code replacement
# read in data, skip rows so that headings become column names
dictxls = pd.ExcelFile('vehicle_fleet_dictionary.xlsx')
# assemble drive train code dictionary
me_dict = pd.read_excel(dictxls, sheet_name= 'fuel_types', index_col= 0)
me_eurostat = me_dict['eurostat'].to_list()
me_code = me_dict['dt_code'].to_list()

df['motor_energy'].replace(to_replace=dt_eurostat, value=dt_code, inplace=True )
#find codes which did not match 
df.loc[~df['motor_energy'].isin(dt_code)]

In [None]:
#FLAGS and FOOTNOTES
# read in data, skip rows so that headings become column names
dictxls = pd.ExcelFile('vehicle_fleet_dictionary.xlsx')
# assemble drive train code dictionary
flag_dict = pd.read_excel(dictxls, sheet_name= 'footnote', index_col= 0)
flag_eurostat = flag_dict['flag_eurostat'].to_list()
flag_code = flag_dict['flag_code'].to_list()
flag_dict.tail(2)

df['footnote'].replace(to_replace=flag_code, value=flag_eurostat, inplace=True )
#find codes which did not match 
df.loc[~df['footnote'].isin(flag_eurostat)]

### Map weight codes

In [None]:
# WEIGHT CODES
# rename to 'weight'
df.rename(columns={'WEIGHT':'weight'},inplace= True)
# read in data, skip rows so that headings become column names
dictxls = pd.ExcelFile('vehicle_fleet_dictionary.xlsx')
# assemble drive train code dictionary
weight_dict = pd.read_excel(dictxls, sheet_name= 'weight', index_col= 0)
weight_range = weight_dict['weight_range'].to_list()
weight_code = weight_dict['weight_code'].to_list()
weight_dict.tail(2)

df['weight'].replace(to_replace=weight_range, value=weight_code, inplace=True )

#find codes which did not match 
df.loc[~df['weight'].isin(weight_code)]

### mark no data values

In [None]:
'''
what does the missing data mean? Does it mean there were no vehicles of that type? or does it mean that there is no data recorded for that type? """
'''
df.replace(to_replace=':', value='no_data', inplace=True )


## rearrange data into proper format

In [None]:
#.melt() pivots the table bringing the column headers into a new attribute
melt = df.melt(id_vars=['geo'], var_name= 'year')
melt

## structuring the data into format of datastructure


In [None]:
#rename the data frame to df
df=melted
# add in a column of indexes
df.index = notebook_id + df.index.astype(str).str.zfill(9)

In [None]:
#rename or add all necessary columns
df.loc[:,'year_of_measurement']= df.loc[:,'year']
# date of measurement is concatonated to year of measurement to achieve desired format
df.loc[:,'date_of_measurement']=df['year_of_measurement'].astype(str) + '-03-31'
df.head(2)

In [None]:
# df.loc[:,'geo']= 
df.loc[:,'process']= 'r'
df.loc[:,'vehicle_class']= 'all'
df.loc[:,'vehicle_segment']='all'
df.loc[:,'motor_energy']= 'all'
df.loc[:,'model_year']= 'all'
df.loc[:,'year_of_first_registraion']=''
df.loc[:,'value']= df['value']
df.loc[:,'unit']= 'nr'
df.loc[:,'source']= nb_data_source
df.loc[:,'accessed']= '2020-06-30'
df.loc[:,'notebook']= nb_name
df.loc[:,'footnote']= ''

In [70]:
# create a finalized dataframe for output with columns in arranged order
heading_list= ['year_of_measurement','date_of_measurement','geo','process','vehicle_class','vehicle_segment','motor_energy','model_year','year_of_first_registraion','value','unit','source','accessed','notebook','footnote']
df_out= df[heading_list]

In [71]:
#check df_out before saving
df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,drive_train,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
003000000000,2005,2005-03-31,150,r,OICV,all,all,all,,45053.2,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000000001,2005,2005-03-31,40,r,OICV,all,all,all,,367,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000000002,2005,2005-03-31,56,r,OICV,all,all,all,,674.465,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000000003,2005,2005-03-31,208,r,OICV,all,all,all,,479,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000000004,2005,2005-03-31,246,r,OICV,all,all,all,,86.69,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
003000001590,2015,2015-03-31,788,r,OICV,all,all,all,,460,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000001591,2015,2015-03-31,800,r,OICV,all,all,all,,340,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000001592,2015,2015-03-31,894,r,OICV,all,all,all,,120,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000001593,2015,2015-03-31,716,r,OICV,all,all,all,,110,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,


## at this point restart kernel and run all cells

In [68]:
# write an output file named according to notebook id with relevant data and metadata
writer = pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter')
df_out.to_excel(writer, sheet_name='data', merge_cells=False)
notebook_metadata.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [66]:
# add the data from the notebook to the stock dataframe and notebook metadata pickles

import pickle
# load in the stock DataFrame pickle
stock_pickle = pd.read_pickle('stock_df.pickle')

# concatenate the out_df to the stock_pickle and remove any duplicate rows
stock_df = pd.concat([stock_pickle,df_out]).drop_duplicates()

# write the updated stock dataframe to pickle
stock_df.to_pickle('stock_df.pickle')

# repeat the process for the metadata
metadata_pickle = pd.read_pickle('metadata_df.pickle')

metadata_df = pd.concat([metadata_pickle, notebook_metadata]).drop_duplicates()

metadata_df.to_pickle('metadata_df.pickle')

In [65]:
# update the stock metadata 
writer = pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter')
stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
metadata_df.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [None]:
# this code initializes the pickle files 
# df_out.to_pickle('stock_df.pickle')
# notebook_metadata.to_pickle('metadata_df.pickle')