# Information regarding the data that this notebook refers to

In [1]:
# Fill out the fields below and check that the input data and the notebook name match.

# The notebook id is selected from the notebook metadata in vehicle_fleet_data.xlsx.
notebook_id = '002'
name_string = '_s_GL_2005_2015_cl(pc)_oica'

# File names derived from the id and the descriptive name string.
nb_name = '{}{}.ipynb'.format(notebook_id, name_string)
nb_input_workbook = 'in{}.xlsx'.format(name_string)
nb_output_workbook = '{}.xlsx'.format(notebook_id)

# Description of the dataset handled by this notebook.
nb_stock_or_flow = 'stock'
nb_geography = '001'
nb_start_time, nb_stop_time = '2005', '2015'
nb_attribute_1 = 'passenger cars'
nb_attribute_2 = 'registered'
nb_attribute_3 = nb_attribute_4 = ''

# Provenance of the data.
nb_data_source = 'OICA'
nb_data_source_url = 'http://www.oica.net/category/vehicles-in-use/'
nb_comment = 'original data believed to be in thousands'

In [2]:
# import libraries
import pickle
import numpy as np
import pandas as pd
import matplotlib as plt

In [3]:
# Collect the notebook metadata in a one-row data frame indexed by the notebook id.
metadata_fields = {
    'notebook_name': nb_name,
    'input_file': nb_input_workbook,
    'output_file': nb_output_workbook,
    'source': nb_data_source,
    'geography': nb_geography,
    'start_time': nb_start_time,
    'stop_time': nb_stop_time,
    'attribute1': nb_attribute_1,
    'attribute2': nb_attribute_2,
    'attribute3': nb_attribute_3,
    'attribute4': nb_attribute_4,
    'source_url': nb_data_source_url,
    'comment': nb_comment,
}
notebook_metadata = pd.DataFrame(metadata_fields, index=[notebook_id])

## Reading in the excel data and merging the sheets into one dataframe with category info attached to row data

In [4]:
# open the input workbook named in the configuration cell;
# the sheet contents are parsed into a data frame in the next cell
xls = pd.ExcelFile(nb_input_workbook)

In [5]:
# parse the workbook into a data frame (no sheet is named, so pandas' default sheet is read)
# header = 5 takes the column titles from the row at 0-based position 5 of the sheet
df = pd.read_excel(xls, header = 5)
df.head(2) # preview the first two rows

Unnamed: 0,REGIONS/COUNTRIES,Unnamed: 1,Unnamed: 2,Unnamed: 3,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,Unnamed: 15,Variation 2015/2014
0,,,,,in thousand units,,,,,,,,,,,,
1,EUROPE,,,,276663,282958.387,286109.448,294524.057,298202.45,303360.968,309504.27,314674.338,321125.574,326843.229,333553.037,,0.020529


In [6]:
# list the columns with their non-null counts and dtypes
# (df.describe() is the alternative that reports summary statistics instead)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 17 columns):
REGIONS/COUNTRIES      152 non-null object
Unnamed: 1             0 non-null float64
Unnamed: 2             1 non-null float64
Unnamed: 3             1 non-null float64
2005                   153 non-null object
2006                   152 non-null float64
2007                   152 non-null float64
2008                   152 non-null float64
2009                   152 non-null float64
2010                   152 non-null float64
2011                   152 non-null float64
2012                   152 non-null float64
2013                   152 non-null float64
2014                   152 non-null float64
2015                   152 non-null float64
Unnamed: 15            0 non-null float64
Variation 2015/2014    152 non-null float64
dtypes: float64(15), object(2)
memory usage: 21.0+ KB


In [7]:
# use .dropna() to drop the columns and rows that hold (almost) no data.
# thresh=2 keeps only the columns (axis=1) / rows (axis=0) that contain at least
# 2 non-NaN values; anything with fewer is dropped
df.dropna(axis=1, thresh=2, inplace=True)
df.dropna(axis=0, thresh=2, inplace=True)
# drop the year-on-year variation column, which is not carried into the output
# NOTE: this cell mutates df in place; running it a second time raises a KeyError
# because the column has already been removed
df.drop(columns= 'Variation 2015/2014', inplace= True)

In [8]:
# rename the region/country column to the short name 'geo' that the rest of
# the notebook uses (in place, on the same data frame)
df.rename(columns={'REGIONS/COUNTRIES':'geo'},inplace= True)

In [9]:
# preview the first two rows after dropping empty data and renaming the column
df.head(2)

Unnamed: 0,geo,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
1,EUROPE,276663,282958.387,286109.448,294524.057,298202.45,303360.968,309504.27,314674.338,321125.574,326843.229,333553.037
2,EU 28 countries + EFTA,234823,239118.016,238743.582,243450.584,245232.556,248213.966,251245.927,253591.321,256135.079,258637.36,263399.404


In [10]:
# Open the dictionary workbook that maps geography names to codes.
dictxls = pd.ExcelFile('vehicle_fleet_dictionary.xlsx')

# Load the 'geo_dictionary' sheet, using its first column as the index.
geo_dict = pd.read_excel(dictxls, sheet_name='geo_dictionary', index_col=0)

# Parallel lists in sheet row order: lower-cased names and the codes as strings.
geo_name = geo_dict['name'].str.lower().tolist()
country_code = geo_dict['country_code'].astype(str).tolist()
region_code = geo_dict['region_code'].astype(str).tolist()

geo_dict.head(2)

Unnamed: 0_level_0,name,country_code,region_code,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
282,World,1,1,,,,
314,all countries,1,1,,,,


In [11]:
# Lower-case the names in the geo column and swap every name found in the
# dictionary for its code (geo_name[i] -> country_code[i]).
# The replacement is applied to the 'geo' column only: a frame-wide replace would
# also scan (and could alter) the year columns. Names that are not in the
# dictionary stay as lower-cased text so that they can be inspected afterwards.
df['geo'] = df['geo'].str.lower().replace(to_replace=geo_name, value=country_code)
df

Unnamed: 0,geo,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
1,150,276663,282958.387000,286109.44800,294524.057000,298202.450000,303360.968000,309504.27000,314674.338000,321125.574000,326843.22900,333553.037
2,eu 28 countries + efta,234823,239118.016000,238743.58200,243450.584000,245232.556000,248213.966000,251245.92700,253591.321000,256135.079000,258637.36000,263399.404
3,eu 15 countries + efta,202828,206373.200000,203635.88300,205545.628000,206520.616000,208740.752000,210843.39700,211824.866000,213122.502000,214930.00900,218052.596
4,40,4157,4205.000000,4246.00000,4285.000000,4360.000000,4441.000000,4513.00000,4584.000000,4641.308000,4694.92100,4748.048
5,56,4861,4929.000000,5006.00000,5087.000000,5160.000000,5279.000000,5359.00000,5393.000000,5439.295000,5511.08000,5587.415
...,...,...,...,...,...,...,...,...,...,...,...,...
151,788,852,874.000000,830.00000,787.000000,820.000000,848.000000,855.00000,862.000000,891.100000,940.00000,990.000
152,800,57,59.000000,91.00000,94.000000,97.000000,101.000000,110.00000,120.000000,130.000000,140.00000,150.000
153,894,294,303.000000,200.00000,135.000000,190.000000,200.000000,210.00000,220.000000,230.000000,240.00000,250.000
154,716,590,599.000000,610.00000,617.000000,626.000000,650.000000,680.00000,710.000000,750.000000,790.00000,830.000


In [12]:
# show the rows whose geo entry was not translated into one of the known codes
df[~df['geo'].isin(country_code)]

Unnamed: 0,geo,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
2,eu 28 countries + efta,234823.0,239118.016,238743.582,243450.584,245232.556,248213.966,251245.927,253591.321,256135.079,258637.36,263399.404
3,eu 15 countries + efta,202828.0,206373.2,203635.883,205545.628,206520.616,208740.752,210843.397,211824.866,213122.502,214930.009,218052.596
22,europe new members,31995.3,32744.816,35107.699,37904.956,38711.94,39473.214,40402.53,41766.455,43012.577,43707.351,45346.808
36,"russia, turkey & other europe",41839.5,43840.371,47365.866,51073.473,52969.894,55147.002,58258.343,61083.017,64990.495,68205.869,70153.633
49,nafta,165333.0,170198.0,172118.0,174916.0,172896.0,170474.0,170559.0,165123.628,166295.633,168257.315,171327.125
53,central & south america,37271.0,39204.084395,42021.35603,45324.81766,48001.602645,51541.075905,55217.36471,58847.70194,62014.583185,65195.19783,66932.812
82,asia/oceania/middle east,157369.0,167210.696,180771.942,190904.653667,206111.210333,226368.208,248126.654,269981.205667,292036.611333,317688.428,344456.315


In [13]:
# Keep only the rows whose geo entry is one of the known codes; the unmatched
# rows listed above are discarded (after checking them for completeness)
df = df[df['geo'].isin(country_code)]

In [14]:
# display the frame after the unmatched rows have been removed
df

Unnamed: 0,geo,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
1,150,276663,282958.387000,286109.44800,294524.057000,298202.450000,303360.968000,309504.27000,314674.338000,321125.574000,326843.22900,333553.037
4,40,4157,4205.000000,4246.00000,4285.000000,4360.000000,4441.000000,4513.00000,4584.000000,4641.308000,4694.92100,4748.048
5,56,4861,4929.000000,5006.00000,5087.000000,5160.000000,5279.000000,5359.00000,5393.000000,5439.295000,5511.08000,5587.415
6,208,1971,2026.000000,2075.00000,2105.000000,2126.000000,2169.000000,2203.00000,2240.000000,2279.792000,2321.17200,2392.079
7,246,2414.48,2489.287000,2480.88000,2449.722000,2449.604000,2486.283000,2532.49600,2560.190000,2575.951000,2595.86700,2612.922
...,...,...,...,...,...,...,...,...,...,...,...,...
151,788,852,874.000000,830.00000,787.000000,820.000000,848.000000,855.00000,862.000000,891.100000,940.00000,990.000
152,800,57,59.000000,91.00000,94.000000,97.000000,101.000000,110.00000,120.000000,130.000000,140.00000,150.000
153,894,294,303.000000,200.00000,135.000000,190.000000,200.000000,210.00000,220.000000,230.000000,240.00000,250.000
154,716,590,599.000000,610.00000,617.000000,626.000000,650.000000,680.00000,710.000000,750.000000,790.00000,830.000


## Arrange data into format with each data point as a row

In [15]:
# Reshape from wide to long: 'geo' is kept as the identifier, every other column
# header (the years) moves into a 'year' column and its cells into 'value'
melted = pd.melt(df, id_vars=['geo'], var_name='year')
melted

Unnamed: 0,geo,year,value
0,150,2005,276663
1,40,2005,4157
2,56,2005,4861
3,208,2005,1971
4,246,2005,2414.48
...,...,...,...
1590,788,2015,990
1591,800,2015,150
1592,894,2015,250
1593,716,2015,830


In [16]:
# The source reports values in thousands; convert them to plain numbers.
# The value column is converted to a numeric dtype first: multiplying an object
# column works cell by cell, so a stray text cell would be repeated 1000 times
# instead of raising an error. pd.to_numeric fails loudly on such a cell.
# NOTE: re-running this cell multiplies the values by 1000 again.
melted['value'] = pd.to_numeric(melted['value']) * 1000

## Structuring the data into the format of the target data structure


In [17]:
# Continue under the name df; this is the same object as melted, not a copy.
df = melted
# Replace the index with string record ids: the notebook id followed by the
# row position zero-padded to 9 digits. The ids are built from the row position
# rather than from the current index, so re-running the cell gives the same ids
# instead of prefixing the notebook id a second time.
df.index = [notebook_id + str(position).zfill(9) for position in range(len(df))]

In [18]:
# add the measurement-time columns used in the output table
# year_of_measurement is a copy of the 'year' column created by the melt
df.loc[:,'year_of_measurement']= df.loc[:,'year']
# date_of_measurement is filled with an empty string; the trailing comment holds
# a disabled draft that would have built a date from the year
df.loc[:,'date_of_measurement']= '' #df['year_of_measurement'].astype(str) + '-03-31'
df.head(2)

Unnamed: 0,geo,year,value,year_of_measurement,date_of_measurement
2000000000,150,2005,276663000.0,2005,
2000000001,40,2005,4157000.0,2005,


In [19]:
# add the descriptor columns of the output table; every row gets the same constant
# 'geo' is not reassigned here - it already holds the codes set earlier in the notebook
df.loc[:,'process']= 'r'
df.loc[:,'vehicle_class']= 'OIPC'
df.loc[:,'vehicle_segment']='all'
df.loc[:,'motor_energy']= 'all'
df.loc[:,'model_year']= 'all'
# NOTE(review): the column name is spelled 'registraion'; heading_list in the next
# cell uses the same spelling, so a rename would have to be made in both places
df.loc[:,'year_of_first_registraion']=''
# assigns the value column to itself; no values change
df.loc[:,'value']= df['value']
df.loc[:,'unit']= 'nr'
df.loc[:,'source']= 'OICA'
df.loc[:,'accessed']= '2020-06-24'
df.loc[:,'notebook']= 'oica_st_2005_2015_iu'
df.loc[:,'footnote']= ''

In [20]:
# Columns of the output table, in the order in which they are written out.
heading_list = [
    'year_of_measurement',
    'date_of_measurement',
    'geo',
    'process',
    'vehicle_class',
    'vehicle_segment',
    'motor_energy',
    'model_year',
    'year_of_first_registraion',
    'value',
    'unit',
    'source',
    'accessed',
    'notebook',
    'footnote',
]
# Select exactly those columns, in that order.
df_out = df[heading_list]

df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
002000000000,2005,,150,r,OIPC,all,all,all,,2.76663e+08,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000000001,2005,,40,r,OIPC,all,all,all,,4157000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000000002,2005,,56,r,OIPC,all,all,all,,4861000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000000003,2005,,208,r,OIPC,all,all,all,,1971000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000000004,2005,,246,r,OIPC,all,all,all,,2.41448e+06,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
002000001590,2015,,788,r,OIPC,all,all,all,,990000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000001591,2015,,800,r,OIPC,all,all,all,,150000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000001592,2015,,894,r,OIPC,all,all,all,,250000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,
002000001593,2015,,716,r,OIPC,all,all,all,,830000,nr,OICA,2020-06-24,oica_st_2005_2015_iu,


In [21]:
# Write the output table and the notebook metadata to two sheets of the output workbook.
# The context manager saves and closes the file when the block ends, also when a
# write fails; ExcelWriter.save() is deprecated and no longer exists in pandas 2.x.
with pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter') as writer:
    df_out.to_excel(writer, sheet_name='data', merge_cells=False)
    notebook_metadata.to_excel(writer, sheet_name='notebook_metadata')

In [22]:
# Add the data from this notebook to the stock data frame and notebook metadata pickles.
# (pickle is already imported in the imports cell; the reads and writes below go
# through pandas, so no further import is needed here.)
# NOTE: unpickling can execute arbitrary code - only load pickle files that were
# produced by this project.

# load the stock data frame pickle
stock_pickle = pd.read_pickle('stock_df.pickle')

# append df_out to the stored data and remove duplicate rows
# (drop_duplicates compares the column values only; the index is not considered)
stock_df = pd.concat([stock_pickle,df_out], sort=False).drop_duplicates()

# write the updated stock data frame back to the pickle
stock_df.to_pickle('stock_df.pickle')

# repeat the process for the notebook metadata
metadata_pickle = pd.read_pickle('metadata_df.pickle')

metadata_df = pd.concat([metadata_pickle, notebook_metadata], sort=False).drop_duplicates()

metadata_df.to_pickle('metadata_df.pickle')

In [32]:
# Export the combined stock data and notebook metadata to the shared workbook.
# The context manager saves and closes the file when the block ends, also when a
# write fails; ExcelWriter.save() is deprecated and no longer exists in pandas 2.x.
with pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter') as writer:
    stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
    metadata_df.to_excel(writer, sheet_name='notebook_metadata')