What features and parameters dictate electric car demand?

In [27]:
import os
import warnings
warnings.filterwarnings('ignore')  # kept ahead of the third-party imports, as in the original cell
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2
import sqlalchemy
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [28]:
# # Loading Data
#
# Reads the whole `maclea` table from the team's PostgreSQL instance into `df`.
#
# The password is never stored in the notebook: it must be supplied through
# the TEAMEV_DB_PASSWORD environment variable (a missing variable raises
# KeyError naming it). The remaining settings can be overridden with
# TEAMEV_DB_HOST / TEAMEV_DB_PORT / TEAMEV_DB_USER / TEAMEV_DB_NAME and
# otherwise fall back to the values this notebook has always used.

connection = psycopg2.connect(
    host=os.environ.get('TEAMEV_DB_HOST', 'teamev.c5ap8dqhexya.us-east-1.rds.amazonaws.com'),
    port=int(os.environ.get('TEAMEV_DB_PORT', '5432')),
    user=os.environ.get('TEAMEV_DB_USER', 'postgres'),
    password=os.environ['TEAMEV_DB_PASSWORD'],
    database=os.environ.get('TEAMEV_DB_NAME', 'postgres'),
)

sql = """
SELECT * FROM maclea; 
"""

# The connection is only needed for this single read, so release it even if
# the query fails. Any later cell that needs the database should open its own
# connection.
try:
    df = pd.read_sql(sql, con=connection)
finally:
    connection.close()

df.head(10)

Unnamed: 0,month,total_ev,elec_price,gas_price,model_id_count,brand_id_count,ev_stations,elecv_totals,hybrid_totals
0,2012-01-01,1257.0,0.128,3.38,4,4,2057.0,652.0,605.0
1,2012-02-01,1566.0,0.128,3.579,4,4,2130.0,522.0,1044.0
2,2012-03-01,3815.0,0.127,3.852,4,4,2213.0,635.0,3180.0
3,2012-04-01,3565.0,0.127,3.9,4,4,2273.0,449.0,3116.0
4,2012-05-01,3367.0,0.129,3.732,5,5,2367.0,595.0,2772.0
5,2012-06-01,2524.0,0.135,3.539,6,6,2439.0,580.0,1944.0
6,2012-07-01,3029.0,0.133,3.439,7,7,2497.0,454.0,2575.0
7,2012-08-01,4686.0,0.133,3.722,7,7,2550.0,774.0,3912.0
8,2012-09-01,5745.0,0.133,3.849,8,7,2576.0,1183.0,4562.0
9,2012-10-01,7084.0,0.128,3.746,9,7,2771.0,1972.0,5112.0


In [29]:
# Build the frame that gets scaled: everything in `df` except the `month`,
# `elecv_totals` and `hybrid_totals` columns. `df` itself is left untouched.
df2 = df.drop(labels=['month', 'elecv_totals', 'hybrid_totals'], axis=1)
df2

Unnamed: 0,total_ev,elec_price,gas_price,model_id_count,brand_id_count,ev_stations
0,1257.0,0.128,3.380,4,4,2057.0
1,1566.0,0.128,3.579,4,4,2130.0
2,3815.0,0.127,3.852,4,4,2213.0
3,3565.0,0.127,3.900,4,4,2273.0
4,3367.0,0.129,3.732,5,5,2367.0
...,...,...,...,...,...,...
91,26455.0,0.139,2.621,40,22,15982.0
92,31833.0,0.139,2.592,39,22,16600.0
93,8665.0,0.136,2.627,32,16,16926.0
94,10180.0,0.133,2.598,30,15,17280.0


In [30]:
# Instantiate the standardiser; the arguments are the library defaults,
# spelled out so the centring/scaling behaviour is visible at a glance.
data_scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [31]:
# Fit the scaler on df2, then apply it to the same frame.
# The result is a plain NumPy array (column labels are re-attached later).
data_scaler.fit(df2)
scaled_array = data_scaler.transform(df2)
scaled_array

array([[-1.35233351e+00, -1.77305924e+00,  9.34723090e-01,
        -1.82124544e+00, -2.06186065e+00, -1.24223914e+00],
       [-1.31988635e+00, -1.77305924e+00,  1.28571275e+00,
        -1.82124544e+00, -2.06186065e+00, -1.22530363e+00],
       [-1.08372565e+00, -2.01587478e+00,  1.76722118e+00,
        -1.82124544e+00, -2.06186065e+00, -1.20604819e+00],
       [-1.10997740e+00, -2.01587478e+00,  1.85188200e+00,
        -1.82124544e+00, -2.06186065e+00, -1.19212860e+00],
       [-1.13076878e+00, -1.53024371e+00,  1.55556912e+00,
        -1.73285335e+00, -1.86799324e+00, -1.17032124e+00],
       [-1.21928966e+00, -7.33505251e-02,  1.21516206e+00,
        -1.64446125e+00, -1.67412584e+00, -1.15361772e+00],
       [-1.16626114e+00, -5.58981588e-01,  1.03878535e+00,
        -1.55606916e+00, -1.48025843e+00, -1.14016212e+00],
       [-9.92264569e-01, -5.58981588e-01,  1.53793145e+00,
        -1.55606916e+00, -1.48025843e+00, -1.12786648e+00],
       [-8.81062176e-01, -5.58981588e-01,  1.761

In [32]:
# Convert the array to a DataFrame with scaled values.
# Column labels and the row index are taken from df2 (the frame that was
# scaled) instead of being retyped by hand, so they cannot drift out of sync
# if the set of dropped columns changes.
df_scaled = pd.DataFrame(scaled_array, columns=df2.columns, index=df2.index)
df_scaled

Unnamed: 0,total_ev,elec_price,gas_price,model_id_count,brand_id_count,ev_stations
0,-1.352334,-1.773059,0.934723,-1.821245,-2.061861,-1.242239
1,-1.319886,-1.773059,1.285713,-1.821245,-2.061861,-1.225304
2,-1.083726,-2.015875,1.767221,-1.821245,-2.061861,-1.206048
3,-1.109977,-2.015875,1.851882,-1.821245,-2.061861,-1.192129
4,-1.130769,-1.530244,1.555569,-1.732853,-1.867993,-1.170321
...,...,...,...,...,...,...
91,1.293632,0.897912,-0.403976,1.360870,1.427753,1.988267
92,1.858360,0.897912,-0.455125,1.272478,1.427753,2.131638
93,-0.574442,0.169465,-0.393394,0.653733,0.264548,2.207268
94,-0.415356,-0.558982,-0.444543,0.476949,0.070681,2.289394


In [33]:
# Strength of the linear relationship between every numeric column and the
# target column `total_ev`.
#
# select_dtypes(include='number') picks up every numeric dtype, not only the
# exact 'int64'/'float64' pair, so columns stored as e.g. int32 or float32 are
# no longer silently skipped.
num_corr_col = df_scaled.select_dtypes(include='number').columns.tolist()

# Getting absolute values of correlation since we would need to inspect
# negative correlation too.
# Note: Pearson correlation is unchanged by per-column standardisation, so
# these values match what the unscaled columns would give (up to rounding).
corr_dict = {
    col: abs(df_scaled[col].corr(df_scaled['total_ev']))
    for col in num_corr_col
}

In [34]:
# Print each column with its absolute correlation, weakest first.
for name, strength in sorted(corr_dict.items(), key=lambda pair: pair[1]):
    print(name, strength)

gas_price 0.3456399598024233
elec_price 0.3570816055619479
ev_stations 0.711506989843572
brand_id_count 0.7489882500777195
model_id_count 0.7760864133115364
total_ev 1.0
