# Information regarding the data that this notebook refers to

In [1]:
# Notebook identity and provenance. Fill in every field and make sure the
# input workbook and the notebook file name agree with them.

# notebook_id is selected from the notebook metadata in vehicle_fleet_data.xlsx
notebook_id = '003'
name_string = '_s_GL_2005_2015_cl(cv)_oica'

# file names derived from the id and the descriptive name
nb_name = '{}{}.ipynb'.format(notebook_id, name_string)
nb_input_workbook = 'in{}.xlsx'.format(name_string)
nb_output_workbook = '{}.xlsx'.format(notebook_id)

# what the data describes
nb_stock_or_flow = 'stock'
nb_geography = '001'
nb_start_time, nb_stop_time = '2005', '2015'
nb_attribute_1 = 'commercial vehicles'
nb_attribute_2 = 'registered'
nb_attribute_3 = nb_attribute_4 = ''

# where the data came from
nb_data_source = 'OICA'
nb_data_source_url = 'http://www.oica.net/category/vehicles-in-use/'
nb_comment = 'original data believed to be in units of thousands'


In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as plt

In [3]:
# collect the notebook metadata in a one-row data frame indexed by notebook id
metadata_fields = [
    ('notebook_name', nb_name),
    ('input_file', nb_input_workbook),
    ('output_file', nb_output_workbook),
    ('source', nb_data_source),
    ('geography', nb_geography),
    ('start_time', nb_start_time),
    ('stop_time', nb_stop_time),
    ('attribute1', nb_attribute_1),
    ('attribute2', nb_attribute_2),
    ('attribute3', nb_attribute_3),
    ('attribute4', nb_attribute_4),
    ('source_url', nb_data_source_url),
    ('comment', nb_comment),
]
notebook_metadata = pd.DataFrame(dict(metadata_fields), index=[notebook_id])

## Reading in the excel data and merging the sheets into one dataframe with category info attached to row data

In [4]:
# Open the input workbook named in the configuration cell. The resulting
# handle `xls` is what the next cell passes to pd.read_excel.
xls = pd.ExcelFile(nb_input_workbook)

In [5]:
# parse the workbook, taking the column titles from row 5 (0-indexed)
header_row = 5
df = pd.read_excel(xls, header=header_row)
df.head(2)  # preview the first two rows

Unnamed: 0,REGIONS/COUNTRIES,Unnamed: 1,Unnamed: 2,Unnamed: 3,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,Unnamed: 15,Variation 2015/2014
0,,,,,in thousand units,,,,,,,,,,,,
1,EUROPE,,,,45053.2,46598.161,47999.341,49363.389,49562.42,50241.362,51097.143,52228.084,52649.729,53293.14,53966.294,,0.012631


In [6]:
# Keep only the columns, and then only the rows, that hold at least two
# non-NaN values (thresh is the minimum number of non-NaN entries required
# to survive), then discard the year-on-year variation column, which is not
# needed further on.
df = (
    df.dropna(axis=1, thresh=2)
      .dropna(axis=0, thresh=2)
      .drop(columns='Variation 2015/2014')
)

## Import the dictionary of country codes and replace country names with 3-digit codes

In [7]:
# the region/country column is called 'geo' from here on
df = df.rename(columns={'REGIONS/COUNTRIES': 'geo'})

In [8]:
# open the geography dictionary workbook
dictxls = pd.ExcelFile('dictionary_vehicle_fleet.xlsx')

# load the 'geo_dictionary' sheet, using its first column as the index
geo_dict = pd.read_excel(dictxls, sheet_name='geo_dictionary', index_col=0)

# parallel lists: lower-cased names and the matching codes as strings
geo_name = geo_dict['name'].str.lower().tolist()
country_code, region_code = (
    geo_dict[code_column].astype(str).tolist()
    for code_column in ('country_code', 'region_code')
)
geo_dict.head(2)

Unnamed: 0_level_0,name,country_code,region_code,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
282,World,1,1,,,,
314,all countries,1,1,,,,


In [9]:
# Lower-case the geo labels so they compare equal to the lower-cased
# dictionary names, then swap each matched name for its country code.
# The replacement is restricted to the 'geo' column: a frame-wide df.replace
# also scans every year column, and a NaN entry in geo_name would then
# overwrite missing numeric values with a code string.
# Labels without a dictionary entry are left unchanged.
df['geo'] = df['geo'].str.lower().replace(to_replace=geo_name, value=country_code)
df

Unnamed: 0,geo,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
1,150,45053.188000,46598.161000,47999.341000,49363.389000,49562.420000,50241.362,51097.143000,52228.084000,52649.729,53293.140,53966.294
2,eu 28 countries + efta,34748.824000,35793.506000,36583.103000,37375.604000,37452.621000,37782.755,38258.398000,38264.743000,38384.255,38653.294,39182.700
3,eu 15 countries + efta,29441.385000,30359.283000,30788.574000,31131.387000,31086.289000,31282.767,31544.028000,31470.890000,31475.788,31710.740,31984.333
4,40,367.000000,374.000000,382.000000,391.000000,398.000000,406.000,417.000000,426.000000,434.331,444.500,453.702
5,56,674.465000,691.094000,712.271000,730.696000,743.333000,761.264,778.745000,791.033000,801.722,817.089,838.721
...,...,...,...,...,...,...,...,...,...,...,...,...
151,788,122.000000,126.000000,220.000000,390.000000,400.000000,411.000,414.000000,417.000000,438.900,450.000,460.000
152,800,171.000000,176.000000,212.000000,219.000000,259.000000,265.000,270.000000,280.000000,300.000,320.000,340.000
153,894,73.000000,74.000000,80.000000,83.000000,108.000000,110.000,110.000000,110.000000,113.000,120.000,120.000
154,716,89.000000,91.000000,90.000000,93.000000,95.000000,97.000,100.000000,100.000000,103.000,110.000,110.000


In [10]:
# list the rows whose geo label was not translated into a country code
is_known_code = df['geo'].isin(country_code)
df.loc[~is_known_code]

Unnamed: 0,geo,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
2,eu 28 countries + efta,34748.824,35793.506,36583.103,37375.604,37452.621,37782.755,38258.398,38264.743,38384.255,38653.294,39182.7
3,eu 15 countries + efta,29441.385,30359.283,30788.574,31131.387,31086.289,31282.767,31544.028,31470.89,31475.788,31710.74,31984.333
22,europe new members,5307.439,5434.223,5794.529,6244.217,6366.332,6499.988,6714.37,6793.853,6908.467,6942.554,7198.367
36,"russia, turkey & other europe",10304.364,10804.655,11416.238,11987.785,12109.799,12458.607,12838.745,13963.341,14265.474,14639.846,14783.594
49,nafta,112824.0,118210.0,122523.0,123625.0,126560.0,129471.0,131954.0,141371.459,143623.173,148373.844,153435.383
53,central & south america,12359.055,12977.394,13849.887,14934.235,15581.608,16714.118,17976.278,19128.881,20033.167,21195.292,22029.622
82,asia/oceania/middle east,59429.880935,60709.613875,64465.811725,66367.137667,69086.885333,72183.531,75716.050667,81663.431333,86759.792,91673.85,91765.805


In [11]:
# Any country listed above that should be kept needs a new row in
# dictionary_vehicle_fleet.xlsx; after adding it, re-run the cells above.
# Rows whose geo is still not a known country code are discarded here.
has_country_code = df['geo'].isin(country_code)
df = df.loc[has_country_code]

## rearrange data into proper format

In [12]:
# reshape from wide to long: 'geo' is kept as the identifier, the former
# year column headers go into 'year' and the cell contents into 'value'
melt = pd.melt(df, id_vars=['geo'], var_name='year')
melt

Unnamed: 0,geo,year,value
0,150,2005,45053.188
1,40,2005,367.000
2,56,2005,674.465
3,208,2005,479.000
4,246,2005,86.690
...,...,...,...
1590,788,2015,460.000
1591,800,2015,340.000
1592,894,2015,120.000
1593,716,2015,110.000


## Structuring the data into the format of the target data structure


In [13]:
# Continue with the long-format table under the name df. A copy is taken so
# that `melt` keeps its original integer index: with a plain alias the index
# assignment below would also rewrite `melt`, and re-running this cell would
# prefix the notebook id a second time.
df = melt.copy()
# build the row ids: notebook id followed by the row number zero-padded to 9 digits
df.index = notebook_id + df.index.astype(str).str.zfill(9)

In [14]:
# year_of_measurement duplicates the melted 'year' column
df['year_of_measurement'] = df['year']
# date_of_measurement is the year as text followed by the fixed suffix '-03-31'
df['date_of_measurement'] = df['year_of_measurement'].astype(str) + '-03-31'
df.head(2)

Unnamed: 0,geo,year,value,year_of_measurement,date_of_measurement
3000000000,150,2005,45053.188,2005,2005-03-31
3000000001,40,2005,367.0,2005,2005-03-31


In [15]:
# Fill the remaining columns of the output table with values that are the
# same for every row of this notebook. 'geo' and 'value' already hold the
# melted data, so they are not reassigned here.
# NOTE(review): the source sheet is labelled "in thousand units" and
# nb_comment says the same, yet 'unit' is set to 'nr' and 'value' is left
# unscaled. Confirm whether the values should be multiplied by 1000.
constant_columns = [
    ('process', 'r'),
    ('vehicle_class', 'OICA'),
    ('vehicle_segment', 'all'),
    ('motor_energy', 'all'),
    ('model_year', 'all'),
    ('year_of_first_registraion', ''),
    ('unit', 'nr'),
    ('source', nb_data_source),
    ('accessed', '2020-06-30'),
    ('notebook', nb_name),
    ('footnote', ''),
]
for column_name, constant in constant_columns:
    df[column_name] = constant

In [16]:
# Translate the geo codes back into readable names using the
# 'geography_metadata' sheet of the metadata workbook.
geographyMetadata = pd.read_excel('metadata_vehicle_fleet.xlsx', sheet_name='geography_metadata', skiprows=1, index_col=[0])

# parallel lists of the codes (as strings) and the names they stand for
geoCode = geographyMetadata.code.astype(str).tolist()
geoName = geographyMetadata.name.tolist()

# Assign the result back to the column. Calling
# df['geo'].replace(..., inplace=True) acts on an intermediate Series: pandas
# warns about this pattern, and with Copy-on-Write enabled df is left
# unchanged.
df['geo'] = df['geo'].replace(to_replace=geoCode, value=geoName)

# any rows shown here carry a geo that is not among the metadata names
df.loc[~df['geo'].isin(geoName)]

Unnamed: 0,geo,year,value,year_of_measurement,date_of_measurement,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,unit,source,accessed,notebook,footnote


In [17]:
# columns of the output table, in the order they are written out
heading_list = [
    'year_of_measurement',
    'date_of_measurement',
    'geo',
    'process',
    'vehicle_class',
    'vehicle_segment',
    'motor_energy',
    'model_year',
    'year_of_first_registraion',
    'value',
    'unit',
    'source',
    'accessed',
    'notebook',
    'footnote',
]
# final table: only the listed columns, in the listed order
df_out = df.loc[:, heading_list]

In [18]:
# Display df_out for a visual check before it is written to file.
df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
003000000000,2005,2005-03-31,Europe,r,OICA,all,all,all,,45053.188,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000000001,2005,2005-03-31,Austria,r,OICA,all,all,all,,367.000,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000000002,2005,2005-03-31,Belgium,r,OICA,all,all,all,,674.465,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000000003,2005,2005-03-31,Denmark,r,OICA,all,all,all,,479.000,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000000004,2005,2005-03-31,Finland,r,OICA,all,all,all,,86.690,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
003000001590,2015,2015-03-31,Tunisia,r,OICA,all,all,all,,460.000,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000001591,2015,2015-03-31,Uganda,r,OICA,all,all,all,,340.000,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000001592,2015,2015-03-31,Zambia,r,OICA,all,all,all,,120.000,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,
003000001593,2015,2015-03-31,Zimbabwe,r,OICA,all,all,all,,110.000,nr,OICA,2020-06-30,003_s_GL_2005_2015_cl(cv)_oica.ipynb,


## at this point restart kernel and run all cells

In [19]:
# Write the data and the notebook metadata to the output workbook named after
# the notebook id, one sheet each. The context manager saves and closes the
# file on exit; ExcelWriter.save() was deprecated in pandas 1.5 and removed
# in pandas 2.0.
with pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter') as writer:
    df_out.to_excel(writer, sheet_name='data', merge_cells=False)
    notebook_metadata.to_excel(writer, sheet_name='notebook_metadata')

In [20]:
# merge this notebook's results into the pickled stock table and the pickled
# notebook-metadata table

import pickle

# NOTE(review): unpickling can execute arbitrary code; only load pickle
# files that this project wrote itself.
# previously stored stock table
stock_pickle = pd.read_pickle('stock_df.pickle')

# Stack the new rows on top of the stored ones and keep one row per
# (geo, notebook, value, year_of_measurement) combination. The choice of key
# columns decides which rows count as duplicates, so pick them with care.
dedup_keys = ['geo', 'notebook', 'value', 'year_of_measurement']
combined_stock = pd.concat([df_out, stock_pickle])
stock_df = combined_stock.drop_duplicates(subset=dedup_keys)

# store the updated stock table back in the pickle
stock_df.to_pickle('stock_df.pickle')

# same procedure for the metadata: load, append this notebook's row,
# drop fully identical rows, store
metadata_pickle = pd.read_pickle('metadata_df.pickle')
combined_metadata = pd.concat([metadata_pickle, notebook_metadata], sort=False)
metadata_df = combined_metadata.drop_duplicates()
metadata_df.to_pickle('metadata_df.pickle')

In [24]:
# Drop repeated rows from the combined stock table. 'geo' is part of the key
# so that two different geographies reporting the same value for the same
# year in the same notebook are both kept; without it one of them would be
# silently discarded.
stock_df = stock_df.drop_duplicates(subset=['geo', 'notebook', 'value', 'year_of_measurement'])

In [25]:
# Write the combined stock table and the combined notebook metadata to the
# shared workbook, one sheet each. The context manager saves and closes the
# file on exit; ExcelWriter.save() was deprecated in pandas 1.5 and removed
# in pandas 2.0.
with pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter') as writer:
    stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
    metadata_df.to_excel(writer, sheet_name='notebook_metadata')