# Information regarding the data that this notebook refers to

In [1]:
# fill out the following fields. Check that input data and notebook name match.

# reference vehicle_fleet_data.xlsx notebook metadata to select notebook_id
notebook_id= '006'
name_string= '_s_IN_2001_2016_cl_inmorth'

nb_name= notebook_id + name_string +'.ipynb'
nb_input_workbook= 'in' + name_string + '.xlsx'
nb_output_workbook= notebook_id + '.xlsx'
nb_stock_or_flow= 'stock'
nb_geography= 'India'
nb_start_time= '2001'
nb_stop_time= '2016'
nb_attribute_1= 'class'
nb_attribute_2= 'registered'
nb_attribute_3= ''
nb_attribute_4= ''
nb_data_source= 'MOSPI'
nb_data_source_url= 'http://mospi.nic.in/statistical-year-book-india/2017/189'
nb_data_accessed= '2020-07-01'
nb_comment= ''

In [2]:
# import libraries
import numpy as np
import pandas as pd
import matplotlib as plt

In [3]:
# write the notbook metadata into a data frame
notebook_metadata = pd.DataFrame({'notebook_name':nb_name,
                                 'input_file': nb_input_workbook, 
                                 'output_file': nb_output_workbook, 
                                 'source': nb_data_source, 
                                 'geography': nb_geography,
                                 'start_time': nb_start_time,
                                 'stop_time': nb_stop_time, 
                                 'attribute1': nb_attribute_1, 
                                 'attribute2': nb_attribute_2,
                                 'attribute3': nb_attribute_3,
                                 'attribute4': nb_attribute_4,
                                 'source_url' : nb_data_source_url,
                                 'comment': nb_comment}, index=[notebook_id])

In [4]:
notebook_metadata_headers = notebook_metadata.columns.to_list()

## Reading in the excel data and merging the sheets into one dataframe with category info attached to row data

In [5]:
# read in data,
xls = pd.ExcelFile(nb_input_workbook)

In [6]:
# read in data, set the header to be the desired column titles
df = pd.read_excel(xls, header=6 , nrows=19)
df # view the data frame

Unnamed: 0,Year,Buses,Taxis,Light Motor Vehicles(Passengers),M,Two-wheelers,Cars,Jeeps,Miscellaneous(b),Grand Total,...,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55
0,,,,,,,,,,,...,,,,,,,,,,
1,1.0,2,3.0,4.0,5.0,6.0,7.0,8.0,9.0,10.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,2001.0,633900(b),634357.0,1777130.0,2948300.0,38556026.0,5297219.0,1126148.0,4017946.0,54991026.0,...,,,,,,,,,,
4,2002.0,635006,688204.0,1878261.0,2973740.0,41581058.0,5748036.0,1177245.0,4242787.0,58924337.0,...,,,,,,,,,,
5,2003.0,720696,825416.0,2113781.0,3491637.0,47519489.0,6594166.0,1180057.0,4562042.0,67007284.0,...,,,,,,,,,,
6,2004.0,767593,901889.0,2167324.0,3748484.0,51921973.0,7267174.0,1282113.0,4661385.0,72717935.0,...,,,,,,,,,,
7,2005.0,678521,939738.0,2337264.0,3877622.0,58799702.0,8072650.0,1307926.0,5488296.0,81501719.0,...,,,,,,,,,,
8,2006.0,762341,1039845.0,2492726.0,4274984.0,64743126.0,9109855.0,1376744.0,5818646.0,89618267.0,...,,,,,,,,,,
9,2007.0,1098422,1042347.0,2697449.0,5118880.0,69128762.0,10146468.0,1460364.0,6014568.0,96707260.0,...,,,,,,,,,,


In [7]:
#drop columns with NaN
df.dropna(axis= 1, how= 'all', inplace= True)

# drop first 3 rows
df.drop(index=[0,1,2], inplace=True)

In [8]:
# remove the (b) from the single instance
df.iloc[0,1] = 633900

In [9]:
#change values to int
df.astype(int)

Unnamed: 0,Year,Buses,Taxis,Light Motor Vehicles(Passengers),M,Two-wheelers,Cars,Jeeps,Miscellaneous(b),Grand Total
3,2001,633900,634357,1777130,2948300,38556026,5297219,1126148,4017946,54991026
4,2002,635006,688204,1878261,2973740,41581058,5748036,1177245,4242787,58924337
5,2003,720696,825416,2113781,3491637,47519489,6594166,1180057,4562042,67007284
6,2004,767593,901889,2167324,3748484,51921973,7267174,1282113,4661385,72717935
7,2005,678521,939738,2337264,3877622,58799702,8072650,1307926,5488296,81501719
8,2006,762341,1039845,2492726,4274984,64743126,9109855,1376744,5818646,89618267
9,2007,1098422,1042347,2697449,5118880,69128762,10146468,1460364,6014568,96707260
10,2008,1156568,1201862,2903821,5600938,75336026,11200142,1547825,6405672,105353854
11,2009,1205793,1307805,3146619,6040924,82402105,12365806,1638975,6843006,114951033
12,2010,176642,3615086,3615086,6431926,91597791,13749406,1760428,7552876,127745972


In [10]:
# replacing column names with class codes
df.rename(columns={' Year':'year', 'Buses ':'INBUS', 'Taxis':'INTAX', 'Light Motor Vehicles(Passengers)':'INLVP',
       'Goods vehicles(a)':'INGDV', 'Two-wheelers':'INMCY', 'Cars':'INCAR', 'Jeeps':'INJEP',
       'Miscellaneous(b)':'INMSC', 'Grand Total ':'all'}, inplace=True)

In [11]:
#meltin the table so each row becomes a data point
melted= df.melt(id_vars='year')
melted

Unnamed: 0,year,variable,value
0,2001.0,INBUS,633900
1,2002.0,INBUS,635006
2,2003.0,INBUS,720696
3,2004.0,INBUS,767593
4,2005.0,INBUS,678521
...,...,...,...
139,2012.0,all,1.59491e+08
140,2013.0,all,1.82445e+08
141,2014.0,all,1.90704e+08
142,2015.0,all,2.10023e+08


## structuring the data into format of datastructure


In [12]:
#rename the data frame to df
df=melted
# add in a column of indexes
df.index = notebook_id + df.index.astype(str).str.zfill(9)

In [13]:
#rename or add all necessary columns
df.loc[:,'year_of_measurement']= df.loc[:,'year']
# date of measurement is concatonated to year of measurement to achieve desired format
df.loc[:,'date_of_measurement']=''#df['year_of_measurement'].astype(str) + '-03-31'
df.head(2)

Unnamed: 0,year,variable,value,year_of_measurement,date_of_measurement
6000000000,2001.0,INBUS,633900,2001.0,
6000000001,2002.0,INBUS,635006,2002.0,


In [14]:
df.loc[:,'geo']= nb_geography
df.loc[:,'process']= 'r'
df.loc[:,'vehicle_class']= df['variable']
df.loc[:,'vehicle_segment']='all'
df.loc[:,'motor_energy']= 'all'
df.loc[:,'model_year']= 'all'
df.loc[:,'year_of_first_registraion']=''
df.loc[:,'value']= df['value']
df.loc[:,'unit']= 'nr'
df.loc[:,'source']= nb_data_source
df.loc[:,'accessed']= '2020-06-30'
df.loc[:,'notebook']= nb_name
df.loc[:,'footnote']= ''

In [15]:
# create a finalized dataframe for output with columns in arranged order
heading_list= ['year_of_measurement','date_of_measurement','geo','process','vehicle_class','vehicle_segment','motor_energy','model_year','year_of_first_registraion','value','unit','source','accessed','notebook','footnote']
df_out= df[heading_list]

In [16]:
#check df_out before saving
df_out

Unnamed: 0,year_of_measurement,date_of_measurement,geo,process,vehicle_class,vehicle_segment,motor_energy,model_year,year_of_first_registraion,value,unit,source,accessed,notebook,footnote
006000000000,2001.0,,India,r,INBUS,all,all,all,,633900,nr,MOSPI,2020-06-30,006_s_IN_2001_2016_cl_inmorth.ipynb,
006000000001,2002.0,,India,r,INBUS,all,all,all,,635006,nr,MOSPI,2020-06-30,006_s_IN_2001_2016_cl_inmorth.ipynb,
006000000002,2003.0,,India,r,INBUS,all,all,all,,720696,nr,MOSPI,2020-06-30,006_s_IN_2001_2016_cl_inmorth.ipynb,
006000000003,2004.0,,India,r,INBUS,all,all,all,,767593,nr,MOSPI,2020-06-30,006_s_IN_2001_2016_cl_inmorth.ipynb,
006000000004,2005.0,,India,r,INBUS,all,all,all,,678521,nr,MOSPI,2020-06-30,006_s_IN_2001_2016_cl_inmorth.ipynb,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
006000000139,2012.0,,India,r,all,all,all,all,,1.59491e+08,nr,MOSPI,2020-06-30,006_s_IN_2001_2016_cl_inmorth.ipynb,
006000000140,2013.0,,India,r,all,all,all,all,,1.82445e+08,nr,MOSPI,2020-06-30,006_s_IN_2001_2016_cl_inmorth.ipynb,
006000000141,2014.0,,India,r,all,all,all,all,,1.90704e+08,nr,MOSPI,2020-06-30,006_s_IN_2001_2016_cl_inmorth.ipynb,
006000000142,2015.0,,India,r,all,all,all,all,,2.10023e+08,nr,MOSPI,2020-06-30,006_s_IN_2001_2016_cl_inmorth.ipynb,


## at this point restart kernel and run all cells

In [17]:
# write an output file named according to notebook id with relevant data and metadata
writer = pd.ExcelWriter(nb_output_workbook, engine='xlsxwriter')
df_out.to_excel(writer, sheet_name='data', merge_cells=False)
notebook_metadata.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()

In [19]:
# add the data from the notebook to the stock dataframe and notebook metadata pickles
import pickle
# load in the stock DataFrame pickle
stock_pickle = pd.read_pickle('stock_df.pickle')
# concatenate the out_df to the stock_pickle and remove any duplicate rows
stock_df = pd.concat([df_out,stock_pickle]).drop_duplicates(subset=['geo','notebook','source','value','year_of_measurement'])

# write the updated stock dataframe to pickle
stock_df.to_pickle('stock_df.pickle')

# repeat the process for the metadata
metadata_pickle = pd.read_pickle('metadata_df.pickle')

metadata_df = pd.concat([metadata_pickle, notebook_metadata]).drop_duplicates()

metadata_df.to_pickle('metadata_df.pickle')

In [20]:
# update the stock metadata 
writer = pd.ExcelWriter('vehicle_fleet_stock.xlsx', engine='xlsxwriter')
stock_df.to_excel(writer, sheet_name='data', merge_cells=False)
metadata_df.to_excel(writer, sheet_name= 'notebook_metadata')
writer.save()