# Information regarding the data that this notebook refers to

In [1]:
# fill out the following fields. Check that input data and notebook name match.

# reference vehicle_fleet_data.xlsx notebook metadata to select notebook_id
notebook_id= '002'
name_string= '_s_GL_2005_2015_cl(pc)_oica'

nb_name= notebook_id + name_string +'.ipynb'
nb_input_workbook= 'in' + name_string + '.xlsx'
nb_output_workbook= notebook_id + '.xlsx'
nb_stock_or_flow= 'stock'
nb_geography= '001'
nb_start_time= '2005'
nb_stop_time= '2015'
nb_attribute_1= 'passenger cars'
nb_attribute_2= 'registered'
nb_attribute_3= ''
nb_attribute_4= ''
nb_data_source= 'OICA'
nb_data_source_url= 'http://www.oica.net/category/vehicles-in-use/'
nb_comment= 'original data believed to be in thousands'

In [2]:
# import libraries
import pickle
import numpy as np
import pandas as pd
import matplotlib as plt

In [3]:
# write the notbook metadata into a data frame
notebook_metadata = pd.DataFrame({'notebook_name':nb_name,
                                 'input_file': nb_input_workbook, 
                                 'output_file': nb_output_workbook, 
                                 'source': nb_data_source, 
                                 'geography': nb_geography,
                                 'start_time': nb_start_time,
                                 'stop_time': nb_stop_time, 
                                 'attribute1': nb_attribute_1, 
                                 'attribute2': nb_attribute_2,
                                 'attribute3': nb_attribute_3,
                                 'attribute4': nb_attribute_4,
                                 'source_url' : nb_data_source_url,
                                 'comment': nb_comment}, index=[notebook_id])

## Reading in the excel data and merging the sheets into one dataframe with category info attached to row data

In [4]:
# read in data,
xls = pd.ExcelFile(nb_input_workbook)

In [5]:
# read in data, set the header to be the desired column titles
df = pd.read_excel(xls, header = 5)
df.head(2) # view the data frame

Unnamed: 0,REGIONS/COUNTRIES,Unnamed: 1,Unnamed: 2,Unnamed: 3,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,Unnamed: 15,Variation 2015/2014
0,,,,,in thousand units,,,,,,,,,,,,
1,EUROPE,,,,276663,282958.387,286109.448,294524.057,298202.45,303360.968,309504.27,314674.338,321125.574,326843.229,333553.037,,0.020529


In [6]:
# use .describe() or .info() to gather information on dataset
# df.describe() 
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 17 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   REGIONS/COUNTRIES    152 non-null    object 
 1   Unnamed: 1           0 non-null      float64
 2   Unnamed: 2           1 non-null      float64
 3   Unnamed: 3           1 non-null      float64
 4   2005                 153 non-null    object 
 5   2006                 152 non-null    float64
 6   2007                 152 non-null    float64
 7   2008                 152 non-null    float64
 8   2009                 152 non-null    float64
 9   2010                 152 non-null    float64
 10  2011                 152 non-null    float64
 11  2012                 152 non-null    float64
 12  2013                 152 non-null    float64
 13  2014                 152 non-null    float64
 14  2015                 152 non-null    float64
 15  Unnamed: 15          0 non-null      flo

In [7]:
# use .dropna() to drop the rows and columns with no data. 
# thresh=2 drops columns that do contain up to 2 pieces of non NAN values
df.dropna(axis=1, thresh=2, inplace=True)
df.dropna(axis=0, thresh=2, inplace=True)
#drop columns or rows that will not be useful
df.drop(columns= 'Variation 2015/2014', inplace= True)

In [8]:
# rename column indexes for ease, consistency and clarity
df.rename(columns={'REGIONS/COUNTRIES':'geo'},inplace= True)

In [9]:
df.reset_index(drop=True, inplace=True)
df.head(2)

Unnamed: 0,geo,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,EUROPE,276663,282958.387,286109.448,294524.057,298202.45,303360.968,309504.27,314674.338,321125.574,326843.229,333553.037
1,EU 28 countries + EFTA,234823,239118.016,238743.582,243450.584,245232.556,248213.966,251245.927,253591.321,256135.079,258637.36,263399.404


In [11]:
xls = pd.ExcelFile('metadata_vehicle_fleet.xlsx')
geographyMetadata = xls.parse('geography_metadata', skiprows=1, index_col=None)
geographyMetadata.drop('id', axis=1, inplace=True)

missing = []
edited = []

for i in range(1, len(df['geo'])):
    for j in range(0, len(geographyMetadata['name'])): # we need to use different indexes here because the dataframes might have different lengths
        if df['geo'][i].lower() not in geographyMetadata['name'].str.lower().tolist(): # if the country is not in name we try checking in the other cols
            missing.append(df['geo'][i])

            if df['geo'][i].lower() == geographyMetadata['alternate name1'][j].lower():
                df['geo'][i] = geographyMetadata['name'][j]
                edited.append(geographyMetadata['name'][j])
            elif df['geo'][i].lower() == geographyMetadata['alternate name2'][j].lower():
                df['geo'][i] = geographyMetadata['name'][j]
                edited.append(geographyMetadata['name'][j])
            elif df['geo'][i].lower() == geographyMetadata['alternate name3'][j].lower():
                df['geo'][i] = geographyMetadata['name'][j]
                edited.append(geographyMetadata['name'][j])
            elif df['geo'][i].lower() == geographyMetadata['alternate name4'][j].lower():
                df['geo'][i] = geographyMetadata['name'][j]
                edited.append(geographyMetadata['name'][j]) 

print('missing:'+ str(set(missing)))    # This is the list of countries that are not in the names column of the metadata sheet
print('edited:' + str(set(edited)))    # This is the list of countries that were edited in the loop


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


missing:{'CZECH REPUBLIC', 'NAFTA', 'RUSSIA', 'VENEZUELA', 'AFRICA', 'HONG-KONG', 'CENTRAL & SOUTH AMERICA', 'MOLDAVIA', 'EU 28 countries + EFTA', 'IRAN', 'ASIA/OCEANIA/MIDDLE EAST', 'AMERICA', 'EUROPE NEW MEMBERS', 'BRUNEI', 'BOSNIA', 'PALESTINE', 'RUSSIA, TURKEY & OTHER EUROPE', 'CONGO KINSHASA', 'EU 15 countries + EFTA', 'MACEDONIA', 'ALL COUNTRIES', 'BOLIVIA'}
edited:{'Russian Federation', 'Iran (Islamic Republic of)', 'China, Hong Kong Special Administrative Region', 'State of Palestine', 'Bosnia and Herzegovina', 'United States of America', 'North Macedonia', 'Republic of Moldova', 'Bolivia (Plurinational State of)', 'Czechia', 'Venezuela (Bolivarian Republic of)', 'Brunei Darussalam', 'Democratic Republic of the Congo'}
unedited:{'CZECH REPUBLIC', 'NAFTA', 'RUSSIA', 'VENEZUELA', 'AFRICA', 'HONG-KONG', 'CENTRAL & SOUTH AMERICA', 'MOLDAVIA', 'EU 28 countries + EFTA', 'IRAN', 'ASIA/OCEANIA/MIDDLE EAST', 'AMERICA', 'EUROPE NEW MEMBERS', 'BRUNEI', 'BOSNIA', 'PALESTINE', 'RUSSIA, TURK

In [12]:
#find contries which did not match 
df.loc[~df['geo'].str.lower().isin(geographyMetadata.name.str.lower())]

Unnamed: 0,geo,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,EUROPE,276663.0,282958.387,286109.448,294524.057,298202.45,303360.968,309504.27,314674.338,321125.574,326843.229,333553.037
1,EU 28 countries + EFTA,234823.0,239118.016,238743.582,243450.584,245232.556,248213.966,251245.927,253591.321,256135.079,258637.36,263399.404
2,EU 15 countries + EFTA,202828.0,206373.2,203635.883,205545.628,206520.616,208740.752,210843.397,211824.866,213122.502,214930.009,218052.596
21,EUROPE NEW MEMBERS,31995.3,32744.816,35107.699,37904.956,38711.94,39473.214,40402.53,41766.455,43012.577,43707.351,45346.808
35,"RUSSIA, TURKEY & OTHER EUROPE",41839.5,43840.371,47365.866,51073.473,52969.894,55147.002,58258.343,61083.017,64990.495,68205.869,70153.633
47,NAFTA,165333.0,170198.0,172118.0,174916.0,172896.0,170474.0,170559.0,165123.628,166295.633,168257.315,171327.125
51,CENTRAL & SOUTH AMERICA,37271.0,39204.084395,42021.35603,45324.81766,48001.602645,51541.075905,55217.36471,58847.70194,62014.583185,65195.19783,66932.812
79,ASIA/OCEANIA/MIDDLE EAST,157369.0,167210.696,180771.942,190904.653667,206111.210333,226368.208,248126.654,269981.205667,292036.611333,317688.428,344456.315
118,AFRICA,17218.2,19582.67,20669.54,21735.419,22687.848,23828.572,24993.72,26288.775,27610.571,29077.611,30810.329
151,ALL COUNTRIES,653854.0,679153.837395,701690.28603,727404.947327,747899.110978,775572.823905,808401.00871,834915.648607,869082.972518,907061.78083,947079.618


In [13]:
# Drop codes that are not in code list (after checking for completeness)
df = df.loc[df['geo'].str.lower().isin(geographyMetadata.name.str.lower())]

In [25]:
df.geo.unique()

array(['Austria', 'Belgium', 'Denmark', 'Finland', 'France', 'Germany',
       'Greece', 'Iceland', 'Ireland', 'Italy', 'Luxembourg',
       'Netherlands', 'Norway', 'Portugal', 'Spain', 'Sweden',
       'Switzerland', 'United Kingdom', 'Bulgaria', 'Croatia', 'Cyprus',
       'Czechia', 'Estonia', 'Hungary', 'Latvia', 'Lithuania', 'Malta',
       'Poland', 'Romania', 'Slovakia', 'Slovenia', 'Albania', 'Belarus',
       'Bosnia And Herzegovina', 'Georgia', 'North Macedonia',
       'Republic Of Moldova', 'Russian Federation', 'Serbia', 'Turkey',
       'Ukraine', 'United States Of America', 'Canada', 'Mexico',
       'Argentina', 'Bahamas', 'Barbados', 'Belize', 'Bermuda',
       'Bolivia (Plurinational State Of)', 'Brazil', 'Chile', 'Colombia',
       'Costa Rica', 'Cuba', 'Dominican Republic', 'Ecuador',
       'El Salvador', 'Guatemala', 'Haiti', 'Honduras', 'Jamaica',
       'Nicaragua', 'Panama', 'Paraguay', 'Peru', 'Puerto Rico',
       'Suriname', 'Trinidad And Tobago', 'Uruguay'

## Arrange data into format with each data point as a row

In [15]:
#.melt() pivots the table bringing the column headers into a new attribute
melted = df.melt(id_vars=['geo'], var_name= 'year')
melted

Unnamed: 0,geo,year,value
0,AUSTRIA,2005,4157
1,BELGIUM,2005,4861
2,DENMARK,2005,1971
3,FINLAND,2005,2414.48
4,FRANCE,2005,30100
...,...,...,...
1557,TOGO,2015,140
1558,TUNISIA,2015,990
1559,UGANDA,2015,150
1560,ZAMBIA,2015,250


In [16]:
# data orginially in thousands, convert to nr
melted.value *= 1000

## structuring the data into format of datastructure


In [17]:
#rename the data frame to df
df=melted
# add in a column of indexes
df.index = notebook_id + df.index.astype(str).str.zfill(9)

In [18]:
#rename or add all necessary columns
df.loc[:,'year_of_measurement']= df.loc[:,'year']
# date of measurement is concatonated to year of measurement to achieve desired format
df.loc[:,'date_of_measurement']= '' #df['year_of_measurement'].astype(str) + '-03-31'
df.head(2)

Unnamed: 0,geo,year,value,year_of_measurement,date_of_measurement
2000000000,AUSTRIA,2005,4157000,2005,
2000000001,BELGIUM,2005,4861000,2005,


In [19]:
df.loc[:,'geo']= df.geo.str.title()
df.loc[:,'process']= 'r'
df.loc[:,'vehicle_class']= 'OIPC'
df.loc[:,'vehicle_segment']='all'
df.loc[:,'motor_energy']= 'all'
df.loc[:,'model_year']= 'all'
df.loc[:,'year_of_first_registraion']=''
df.loc[:,'value']= df['value']
df.loc[:,'unit']= 'nr'
df.loc[:,'source']= nb_data_source
df.loc[:,'accessed']= '2020-06-24'
df.loc[:,'notebook']= nb_name
df.loc[:,'footnote']= ''

In [20]:
heading_list= ['year_of_measurement','date_of_measurement','geo','process','vehicle_class','vehicle_segment','motor_energy','model_year','year_of_first_registraion','value','unit','source','accessed','notebook','footnote']
df_out= df[heading_list]

df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
002000000000,2005,,Austria,r,OIPC,all,all,all,,4157000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000000001,2005,,Belgium,r,OIPC,all,all,all,,4861000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000000002,2005,,Denmark,r,OIPC,all,all,all,,1971000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000000003,2005,,Finland,r,OIPC,all,all,all,,2.41448e+06,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000000004,2005,,France,r,OIPC,all,all,all,,30100000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
002000001557,2015,,Togo,r,OIPC,all,all,all,,140000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000001558,2015,,Tunisia,r,OIPC,all,all,all,,990000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000001559,2015,,Uganda,r,OIPC,all,all,all,,150000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000001560,2015,,Zambia,r,OIPC,all,all,all,,250000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,


In [21]:
writer = pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter')
df_out.to_excel(writer, sheet_name='data', merge_cells=False)
notebook_metadata.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [22]:
# add the data from the notebook to the stock dataframe and notebook metadata pickles

import pickle
# load in the stock DataFrame pickle
stock_pickle = pd.read_pickle('stock_df.pickle')

# concatenate the out_df to the stock_pickle and remove any duplicate rows
stock_df = pd.concat([stock_pickle,df_out], sort=False).drop_duplicates(subset=['geo','notebook','source','value','year_of_measurement'])

# write the updated stock dataframe to pickle
stock_df.to_pickle('stock_df.pickle')

# repeat the process for the metadata
metadata_pickle = pd.read_pickle('metadata_df.pickle')

metadata_df = pd.concat([metadata_pickle, notebook_metadata], sort=False).drop_duplicates()

metadata_df.to_pickle('metadata_df.pickle')

In [23]:
writer = pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter')
stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
metadata_df.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()