In [63]:
# Dependencies
import os

import pandas as pd
from sqlalchemy import create_engine, inspect

In [64]:
# Relative paths of the two source Excel workbooks (HEV and PEV sales)
hev_xl = "Resources/10301_hev_sale.xlsx"
pev_xl = "Resources/10567_pev_sales.xlsx"

In [65]:
# Load the HEV workbook into a DataFrame. The column labels are taken from
# row index 2, only spreadsheet columns B through BP are read, and the last
# 19 rows of the sheet are skipped (the non-data content at the bottom of
# the file that the original note calls "static").
hev = pd.read_excel(
    hev_xl,
    header=2,
    usecols="B:BP",
    skipfooter=19,
)

In [66]:
# Reshape the HEV table from wide to long. 'Year' is the only identifier
# column; every other column header becomes a value in 'Vehicle' and the
# corresponding cell becomes 'Sales'.
hdf = pd.melt(hev, id_vars=['Year'], var_name='Vehicle', value_name='Sales')

In [67]:
# Load the PEV workbook into a DataFrame. The column labels are taken from
# row index 2, only spreadsheet columns B through J are read, and the last
# 11 rows of the sheet are skipped (the trailing non-data rows, including
# the totals).
pev = pd.read_excel(
    pev_xl,
    header=2,
    usecols="B:J",
    skipfooter=11,
)

In [68]:
# Reshape the PEV table from wide to long. 'Vehicle' and 'Type' are kept as
# identifier columns; the remaining column headers become 'Year' and their
# cells become 'Sales'.
pdf = pd.melt(pev, id_vars=['Vehicle', 'Type'], var_name='Year', value_name='Sales')

In [69]:
# The PEV frame carries a 'Type' column; give the HEV frame one as well
# (constant 'HEV') so both frames share the same set of columns.
hdf = hdf.assign(Type='HEV')

In [70]:
# Preview the first five rows of the melted PEV frame
pdf.head(n=5)

Unnamed: 0,Vehicle,Type,Year,Sales
0,Audi A3 Plug In,PHEV,2011,
1,BMW Active E,EV,2011,
2,BMW i3,EV,2011,
3,BMW i8,PHEV,2011,
4,BMW X5,PHEV,2011,


In [71]:
# Preview the first five rows of the melted HEV frame
hdf.head(n=5)

Unnamed: 0,Year,Vehicle,Sales,Type
0,1999,Volkswagen Touareg Hybrid,,HEV
1,2000,Volkswagen Touareg Hybrid,,HEV
2,2001,Volkswagen Touareg Hybrid,,HEV
3,2002,Volkswagen Touareg Hybrid,,HEV
4,2003,Volkswagen Touareg Hybrid,,HEV


In [72]:
# Remove every row that contains a null from both frames. Per the original
# analysis, 'Sales' is the only column with missing values, so in practice
# this drops exactly the rows that have no sales figure.
hdf, pdf = (frame.dropna() for frame in (hdf, pdf))

In [73]:
# Put the HEV columns in the same order as the PEV frame:
# Vehicle, Type, Year, Sales
hdf = hdf.loc[:, ['Vehicle', 'Type', 'Year', 'Sales']]

In [74]:
# Split the HEV 'Vehicle' string at its first space into two parts
# (column 0 = text before the space, column 1 = the remainder). The result
# is held in a separate helper frame that is fed back into the HEV frame
# afterwards; there may be a way to do this without the helper frame.
# `n` is passed by keyword: pandas 2.x accepts only `pat` positionally in
# Series.str.split, so the positional form raises a TypeError there.
makemod = hdf['Vehicle'].str.split(' ', n=1, expand=True)

In [75]:
# Preview the first five rows of the split result
makemod.head(n=5)

Unnamed: 0,0,1
12,Volkswagen,Touareg Hybrid
13,Volkswagen,Touareg Hybrid
14,Volkswagen,Touareg Hybrid
15,Volkswagen,Touareg Hybrid
16,Volkswagen,Touareg Hybrid


In [76]:
# Attach the two halves of the split to the HEV frame: column 0 of the
# helper frame becomes 'Make', column 1 becomes 'Model' (aligned on index).
hdf = hdf.assign(Make=makemod[0], Model=makemod[1])
hdf.head(n=5)

Unnamed: 0,Vehicle,Type,Year,Sales,Make,Model
12,Volkswagen Touareg Hybrid,HEV,2011,221.0,Volkswagen,Touareg Hybrid
13,Volkswagen Touareg Hybrid,HEV,2012,250.0,Volkswagen,Touareg Hybrid
14,Volkswagen Touareg Hybrid,HEV,2013,118.0,Volkswagen,Touareg Hybrid
15,Volkswagen Touareg Hybrid,HEV,2014,30.0,Volkswagen,Touareg Hybrid
16,Volkswagen Touareg Hybrid,HEV,2015,16.0,Volkswagen,Touareg Hybrid


In [77]:
# Apply the same first-space split to the PEV 'Vehicle' column
# (column 0 = text before the space, column 1 = the remainder).
# `n` is passed by keyword: pandas 2.x accepts only `pat` positionally in
# Series.str.split, so the positional form raises a TypeError there.
makemod = pdf['Vehicle'].str.split(' ', n=1, expand=True)

In [78]:
# Attach the two halves of the split to the PEV frame: column 0 of the
# helper frame becomes 'Make', column 1 becomes 'Model' (aligned on index).
pdf = pdf.assign(Make=makemod[0], Model=makemod[1])
pdf.head(n=5)

Unnamed: 0,Vehicle,Type,Year,Sales,Make,Model
12,Chevrolet Volt,PHEV,2011,7671.0,Chevrolet,Volt
31,Mitsubishi i-MiEV,EV,2011,76.0,Mitsubishi,i-MiEV
32,Nissan LEAF,EV,2011,9674.0,Nissan,LEAF
35,Smart ED,EV,2011,310.0,Smart,ED
47,BMW Active E,EV,2012,965.0,BMW,Active E


In [79]:
# Drop the original 'Vehicle' column from both frames now that it has been
# split into 'Make' and 'Model'. The final column order is set by the
# column selection in the following cell.
# A non-inplace drop with errors='ignore' keeps this cell safe to re-run:
# an inplace drop raises KeyError the second time because 'Vehicle' is
# already gone.
pdf = pdf.drop(columns=['Vehicle'], errors='ignore')
hdf = hdf.drop(columns=['Vehicle'], errors='ignore')

In [80]:
# Put both frames into the final column order:
# Make, Model, Type, Year, Sales
pdf = pdf.loc[:, ['Make', 'Model', 'Type', 'Year', 'Sales']]
hdf = hdf.loc[:, ['Make', 'Model', 'Type', 'Year', 'Sales']]

In [81]:
# Preview the first five rows of the reordered PEV frame
pdf.head(n=5)

Unnamed: 0,Make,Model,Type,Year,Sales
12,Chevrolet,Volt,PHEV,2011,7671.0
31,Mitsubishi,i-MiEV,EV,2011,76.0
32,Nissan,LEAF,EV,2011,9674.0
35,Smart,ED,EV,2011,310.0
47,BMW,Active E,EV,2012,965.0


In [82]:
# Preview the first five rows of the reordered HEV frame
hdf.head(n=5)

Unnamed: 0,Make,Model,Type,Year,Sales
12,Volkswagen,Touareg Hybrid,HEV,2011,221.0
13,Volkswagen,Touareg Hybrid,HEV,2012,250.0
14,Volkswagen,Touareg Hybrid,HEV,2013,118.0
15,Volkswagen,Touareg Hybrid,HEV,2014,30.0
16,Volkswagen,Touareg Hybrid,HEV,2015,16.0


In [83]:
# List the distinct 'Make' values in the HEV frame to spot misspellings.
# Finding: 'Acura' also appears misspelled as 'Acrua'.
hdf['Make'].unique()

array(['Volkswagen', 'Toyota', 'Subaru', 'Saturn', 'Porsche', 'Nissan',
       'Mercury', 'Mercedes', 'Mazda', 'Lincoln', 'Lexus', 'Kia',
       'Infiniti', 'Hyundai', 'Honda', 'GMC', 'Ford', 'Dodge', 'Chrysler',
       'Chevrolet', 'Cadillac', 'Buick', 'BMW', 'Audi', 'Acura', 'Acrua'],
      dtype=object)

In [84]:
# List the distinct 'Make' values in the PEV frame to spot misspellings.
# Finding: this frame uses 'VW' where the HEV frame uses 'Volkswagen'; it
# will be renamed to 'Volkswagen' so both frames agree.
pdf['Make'].unique()

array(['Chevrolet', 'Mitsubishi', 'Nissan', 'Smart', 'BMW', 'Ford',
       'Honda', 'Tesla', 'Toyota', 'Cadillac', 'Fiat', 'Porsche', 'Kia',
       'Mercedes', 'VW', 'Volvo', 'Audi', 'Hyundai', 'Chrysler', 'Mini'],
      dtype=object)

In [85]:
# Apply the corrections identified above: fix the 'Acrua' typo in the HEV
# frame and expand 'VW' to 'Volkswagen' in the PEV frame.
hdf['Make'] = hdf['Make'].replace({'Acrua': 'Acura'})
pdf['Make'] = pdf['Make'].replace({'VW': 'Volkswagen'})

In [86]:
# Connect to Postgres so we can create a SQL database in which to store both
# dataframes.
# The connection details ("user:password@host:port/database") are read from
# the ETL_DB_CONNECTION environment variable so real credentials do not have
# to be committed with the notebook. When the variable is unset, the local
# development default below is used.
rds_connection_string = os.environ.get(
    "ETL_DB_CONNECTION",
    "postgres:postgres@localhost:5432/ETL-EV_proj1",
)
engine = create_engine(f'postgresql://{rds_connection_string}')

In [87]:
# List the tables currently present in the database's default schema.
# The inspector API is used because Engine.table_names() was deprecated in
# SQLAlchemy 1.4 and removed in 2.0.
inspect(engine).get_table_names()

[]