In [45]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from uszipcode import SearchEngine
from IPython.display import Image
from geopy.geocoders import Nominatim
from geopy.distance import geodesic

# Linear Regression w/ l2 norm (Ridge)
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge

# Import Data
Get data from different sources before combining
* Cleaned up EV data: TX_WA_CO_NY.csv
* Average EV price and new car data over time: Avg_EV_Price.csv
* Census data (pop, household income, zipcode): census.csv 

In [23]:
# Import data
df_reg = pd.read_csv('./Data/TX_WA_CO_NY.csv')
df_ev = pd.read_csv('./Data/Avg_EV_Price.csv')
df_c = pd.read_csv('./Data/Census Data/census.csv')

# Convert dates to datetime dtype
df_reg['Registration Date'] = pd.to_datetime(df_reg['Registration Date'])
df_ev['Month'] = pd.to_datetime(df_ev['Month'], format='%b-%y')


# Merge Data
## Aggregate Registration Data by County

In [4]:
#Aggregate by County
# create a SearchEngine object
search = SearchEngine()

# define a function to map zip codes to counties
def zipcode_to_county(zipcode):
    #This county does not get populated for some reason
    if zipcode == 75033:
        return "Collin County"
    
    zipcode_data = search.by_zipcode(zipcode)
    county = zipcode_data.county
    return county

# apply the function to create a new column "County"
df_reg['County'] = df_reg['ZIP Code'].apply(zipcode_to_county)

In [5]:
df_reg.shape

(264426, 6)

In [6]:
nan_rows = df_reg[df_reg.isna().any(axis=1)]

In [7]:
df_reg = df_reg.groupby(["State", "Registration Date", "Drivetrain Type", "County"]).agg('sum').drop(columns = ["ZIP Code"]).reset_index()

start_date = pd.to_datetime('2017-01-01')
end_date = pd.to_datetime('2021-12-31')
df_reg = df_reg[(df_reg['Registration Date'] >= start_date) & (df_reg['Registration Date'] <= end_date)]

In [8]:
df_reg[df_reg['County'] == ""]
df_reg.shape

(19398, 5)

## EV and New Car Prices

In [9]:
# merge ev data in main df
df_reg_ev = pd.merge(df_reg, df_ev, left_on='Registration Date', right_on='Month', how='left')
df_reg_ev = df_reg_ev.drop(['Month'], axis=1)

# Since we don't have ev price data for earlier dates, set all NaN to price for 2020-01-01
fill_val = {'Average EV Price' : df_ev['Average EV Price'][0], 'New Car Average' : df_ev['New Car Average'][0]}
df_reg_ev = df_reg_ev.fillna(value=fill_val)

In [10]:
# check if any nan values
nan_rows = df_reg_ev[df_reg_ev.isna().any(axis=1)]
nan_rows
df_reg_ev.shape

(19398, 7)

## Census Data

In [11]:
# merge census data
df_reg_ev_c = pd.merge(df_reg_ev, df_c, left_on=['County', "State"], right_on=['county', 'state'], how='left')

df_reg_ev_c = df_reg_ev_c.drop(['Unnamed: 0', 'county'], axis=1)

# check if any nan values
nan_rows = df_reg_ev_c[df_reg_ev_c.isna().any(axis=1)]

print(nan_rows)

# Extract info of missing census data
print('Missing census data in:')
print('counties = ',nan_rows['County'].unique())
print('states = ',nan_rows['State'].unique())
print('Total num of countires = ',len(nan_rows['County'].unique()))
print('Total entries w/ nan = ', len(nan_rows))
#print('Total entries in df = ', len(df))

# The number of missing data is < 1% of total data, just drop
df_reg_ev_c = df_reg_ev_c.dropna()

df_reg_ev_c[['population', 'household_income']] = df_reg_ev_c[['population', 'household_income']].astype(int)


Empty DataFrame
Columns: [State, Registration Date, Drivetrain Type, County, Vehicle Count, Average EV Price, New Car Average, Unnamed: 0.1, population, household_income, state]
Index: []
Missing census data in:
counties =  []
states =  []
Total num of countires =  0
Total entries w/ nan =  0


In [12]:
print(df_reg_ev_c.dtypes)

State                        object
Registration Date    datetime64[ns]
Drivetrain Type              object
County                       object
Vehicle Count                 int64
Average EV Price             object
New Car Average              object
Unnamed: 0.1                  int64
population                    int32
household_income              int32
state                        object
dtype: object


# Urban/Rural Divide
Source: https://www2.census.gov/geo/docs/reference/ua/2020_UA_COUNTY.xlsx 
Website: https://www.census.gov/programs-surveys/geography/guidance/geo-areas/urban-rural.html

In [14]:
county_pop_density = pd.read_excel("./Data/2020_UA_COUNTY.xlsx")#, sheet = "2020_UA_COUNTY")
county_pop_density['STATE_NAME']
def state_to_abbreviation(state):
    if state == 'Texas':
        return 'TX'
    elif state == 'New York':
        return 'NY'
    elif state == 'Colorado':
        return 'CO'
    elif state == 'Washington':
        return 'WA'
    else:
        return None # or whatever you want to return if the input is not a valid state name

county_pop_density["STATE_NAME"] = county_pop_density["STATE_NAME"].apply(state_to_abbreviation)

In [15]:
county_pop_density_df = county_pop_density[["STATE_NAME","COUNTY_NAME", "POPDEN_COU"]]
county_pop_density_df['urban_flag'] = [1 if x > 500 else 0 for x in county_pop_density_df['POPDEN_COU']]
county_pop_density_df['COUNTY_NAME'] = county_pop_density_df['COUNTY_NAME'].apply(lambda x: x + ' County')
county_pop_density_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  county_pop_density_df['urban_flag'] = [1 if x > 500 else 0 for x in county_pop_density_df['POPDEN_COU']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  county_pop_density_df['COUNTY_NAME'] = county_pop_density_df['COUNTY_NAME'].apply(lambda x: x + ' County')


Unnamed: 0,STATE_NAME,COUNTY_NAME,POPDEN_COU,urban_flag
0,,Autauga County,98.922916,0
1,,Baldwin County,145.781265,0
2,,Barbour County,28.500467,0
3,,Bibb County,35.814001,0
4,,Blount County,91.696680,0
...,...,...,...,...
3229,,Yabucoa County,551.052867,1
3230,,Yauco County,504.673516,1
3231,,St. Croix County,491.862425,0
3232,,St. John County,197.101643,0


In [16]:
df_gas_elec_prices = pd.read_csv("./Data/electricity_gas_prices_reformatted.csv")

# Define a dictionary to map month names to numerical values
month_dict = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 
              'May': '05', 'Jun': '06', 'Jul': '07', 'Aug': '08', 
              'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
urban_rural_dict = {"Urban" : 1.0, "Rural" : 0.0}

# Use the map() method to convert the "month" column to numerical values
df_gas_elec_prices['Month'] = df_gas_elec_prices['Month'].map(month_dict)

df_gas_elec_prices['urban_flag'] = df_gas_elec_prices['Population Type'].map(urban_rural_dict)


# Combine "year" and "month" columns into a new column in the format "YYYY-MM"
df_gas_elec_prices['year_month'] = df_gas_elec_prices['Year'].astype(str) + '-' + df_gas_elec_prices['Month']

# Convert the "year_month" column to a datetime object
df_gas_elec_prices['date'] = pd.to_datetime(df_gas_elec_prices['year_month'])


df_gas_elec_prices = df_gas_elec_prices.drop(columns = ["Unnamed: 0", "Population Type", "Year", 
                                            "Month", "year_month"])

df_gas_elec_prices


Unnamed: 0,State,Electricity Price,Gas Price,urban_flag,date
0,TX,10.490,2.139,0.0,2017-01-01
1,TX,10.490,2.083,0.0,2017-02-01
2,TX,10.490,2.089,0.0,2017-03-01
3,TX,10.490,2.195,0.0,2017-04-01
4,TX,10.490,2.187,0.0,2017-05-01
...,...,...,...,...,...
531,CO,0.157,3.691,1.0,2022-10-01
532,CO,0.152,3.429,1.0,2022-11-01
533,CO,0.152,2.979,1.0,2022-12-01
534,CO,0.153,3.479,1.0,2023-01-01


In [17]:
df = pd.merge(df_reg_ev_c, county_pop_density_df, left_on=['State','County'], right_on=['STATE_NAME','COUNTY_NAME'], how='left')
df = pd.merge(df, df_gas_elec_prices, left_on = ["State", "urban_flag" ,"Registration Date"], right_on = ["State", "urban_flag", "date"], how = "left")
df = df.drop(columns = ["Unnamed: 0.1", "state", "STATE_NAME", "COUNTY_NAME", "date", "urban_flag"])
nan_rows = df[df.isna().any(axis=1)]
print(nan_rows)

Empty DataFrame
Columns: [State, Registration Date, Drivetrain Type, County, Vehicle Count, Average EV Price, New Car Average, population, household_income, POPDEN_COU, Electricity Price, Gas Price]
Index: []


# Adjacency Matrix

In [33]:
state_order = ['CO', 'NY', 'TX', 'WA']

In [25]:
# drop counties that have nan in state
df_c.dropna(subset=['state'], inplace=True)
df_c

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,population,household_income,state,county
396,396,396,46304.0,18028.0,NY,Allegany County
397,397,397,76750.0,31999.0,NY,Cattaraugus County
398,398,398,84115.0,34328.0,NY,Chemung County
399,399,399,60016.0,25323.0,NY,Columbia County
400,400,400,293524.0,110095.0,NY,Dutchess County
...,...,...,...,...,...,...
2513,2513,2513,5187.0,2060.0,TX,Wheeler County
2514,2514,2514,21419.0,5882.0,TX,Willacy County
2515,2515,2515,7822.0,2618.0,TX,Winkler County
2516,2516,2516,8612.0,2601.0,TX,Yoakum County


In [26]:
df_c['state'].value_counts()['TX']

254

In [28]:
df_a = pd.read_csv('./Data/county_adjacency2010.csv')
df_a

Unnamed: 0,countyname,fipscounty,neighborname,fipsneighbor
0,"Autauga County, AL",1001,"Autauga County, AL",1001
1,"Autauga County, AL",1001,"Chilton County, AL",1021
2,"Autauga County, AL",1001,"Dallas County, AL",1047
3,"Autauga County, AL",1001,"Elmore County, AL",1051
4,"Autauga County, AL",1001,"Lowndes County, AL",1085
...,...,...,...,...
22195,"St. Croix Island, VI",78010,"St. Croix Island, VI",78010
22196,"St. John Island, VI",78020,"St. John Island, VI",78020
22197,"St. John Island, VI",78020,"St. Thomas Island, VI",78030
22198,"St. Thomas Island, VI",78030,"St. John Island, VI",78020


In [34]:
df_a[['county','state']] = df_a['countyname'].str.split(', ', expand=True)
df_a[['neighbor_county', 'neighbor_state']] = df_a['neighborname'].str.split(', ', expand=True)
df_a = df_a[df_a['state'].isin(state_order)]
df_a

Unnamed: 0,countyname,fipscounty,neighborname,fipsneighbor,county,state,neighbor_county,neighbor_state
1614,"Adams County, CO",8001,"Adams County, CO",8001,Adams County,CO,Adams County,CO
1615,"Adams County, CO",8001,"Arapahoe County, CO",8005,Adams County,CO,Arapahoe County,CO
1616,"Adams County, CO",8001,"Broomfield County, CO",8014,Adams County,CO,Broomfield County,CO
1617,"Adams County, CO",8001,"Denver County, CO",8031,Adams County,CO,Denver County,CO
1618,"Adams County, CO",8001,"Jefferson County, CO",8059,Adams County,CO,Jefferson County,CO
...,...,...,...,...,...,...,...,...
20654,"Yakima County, WA",53077,"Klickitat County, WA",53039,Yakima County,WA,Klickitat County,WA
20655,"Yakima County, WA",53077,"Lewis County, WA",53041,Yakima County,WA,Lewis County,WA
20656,"Yakima County, WA",53077,"Pierce County, WA",53053,Yakima County,WA,Pierce County,WA
20657,"Yakima County, WA",53077,"Skamania County, WA",53059,Yakima County,WA,Skamania County,WA


In [51]:
# Create a geolocator object
geolocator = Nominatim(user_agent='my_app')

# Define a function to calculate distance between two counties
def calculate_distance(row):
    # Look up latitude and longitude coordinates for county and neighbor
    county = geolocator.geocode(row['county'] + ', ' + row['state'] + ', USA')
    neighbor = geolocator.geocode(row['neighbor_county'] + ', ' + row['neighbor_state'] + ', USA')
    # Calculate distance using Haversine formula
    try:
        distance = geodesic((county.latitude, county.longitude), (neighbor.latitude, neighbor.longitude)).miles
    except:
        distance = 0
    return distance

# Apply the function to each row of the dataframe
df_a['distance_(mi)'] = df_a.apply(calculate_distance, axis=1)
df_a.to_csv('./Data/df_adjacency.csv', index = False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_a['distance_(mi)'] = df_a.apply(calculate_distance, axis=1)


In [52]:
df_a

Unnamed: 0,countyname,fipscounty,neighborname,fipsneighbor,county,state,neighbor_county,neighbor_state,count,distance_(mi)
1614,"Adams County, CO",8001,"Adams County, CO",8001,Adams County,CO,Adams County,CO,1,0.000000
1615,"Adams County, CO",8001,"Arapahoe County, CO",8005,Adams County,CO,Arapahoe County,CO,1,16.600485
1616,"Adams County, CO",8001,"Broomfield County, CO",8014,Adams County,CO,Broomfield County,CO,1,683.581802
1617,"Adams County, CO",8001,"Denver County, CO",8031,Adams County,CO,Denver County,CO,1,649.967579
1618,"Adams County, CO",8001,"Jefferson County, CO",8059,Adams County,CO,Jefferson County,CO,1,56.499723
...,...,...,...,...,...,...,...,...,...,...
20654,"Yakima County, WA",53077,"Klickitat County, WA",53039,Yakima County,WA,Klickitat County,WA,1,37.217994
20655,"Yakima County, WA",53077,"Lewis County, WA",53041,Yakima County,WA,Lewis County,WA,1,79.821283
20656,"Yakima County, WA",53077,"Pierce County, WA",53053,Yakima County,WA,Pierce County,WA,1,81.294253
20657,"Yakima County, WA",53077,"Skamania County, WA",53059,Yakima County,WA,Skamania County,WA,1,65.230716


In [54]:
df_zeros = df_a[df_a['distance_(mi)'] == 0]
df_zeros.to_csv('./Data/df_zeros.csv', index = False)

In [61]:
nm_rows = df_a[df_a['neighborname'].str.contains('NM', na=False)]
len(nm_rows)

34

In [59]:
# Create a dataframe that lists all neighboring county pairs
df_a['count'] = 1
neighbor_pairs = df_a[['fipscounty', 'fipsneighbor', 'count', 'distance_(mi)']]

# Create a pivot table with neighboring county names as row and column indices
adjacency_matrix = df_a.pivot_table(index='fipscounty', columns='fipsneighbor', values='count', fill_value=0)

# Convert the pivot table to a numpy array to get the adjacency matrix
distance_matrix = df_a.pivot_table(index='fipscounty', columns='fipsneighbor', values='distance_(mi)', fill_value=0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_a['count'] = 1


In [60]:
adjacency_matrix


fipsneighbor,4001,5081,5091,8001,8003,8005,8007,8009,8011,8013,...,53067,53069,53071,53073,53075,53077,56001,56007,56021,56037
fipscounty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8001,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8003,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8005,0,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8007,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8009,0,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53069,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
53071,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
53073,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
53075,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


In [64]:
fips_list = adjacency_matrix.index.tolist()
len(fips_list)

419

In [68]:
df_A2 = adjacency_matrix.loc[:, fips_list]
df_A2

fipsneighbor,8001,8003,8005,8007,8009,8011,8013,8014,8015,8017,...,53059,53061,53063,53065,53067,53069,53071,53073,53075,53077
fipscounty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8001,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8003,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8005,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8007,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8009,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
53071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
53073,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
53075,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [32]:
df

Unnamed: 0,State,Registration Date,Drivetrain Type,County,Vehicle Count,Average EV Price,New Car Average,population,household_income,POPDEN_COU,Electricity Price,Gas Price
0,CO,2017-01-01,BEV,Adams County,24,"$54,669","$38,747",509844,167290,445.323695,12.66,3.378
1,CO,2017-01-01,BEV,Arapahoe County,65,"$54,669","$38,747",649980,241889,821.038725,0.12,2.429
2,CO,2017-01-01,BEV,Boulder County,167,"$54,669","$38,747",324682,127365,455.351666,12.66,3.378
3,CO,2017-01-01,BEV,Broomfield County,14,"$54,669","$38,747",69444,27199,2248.011733,0.12,2.429
4,CO,2017-01-01,BEV,Denver County,94,"$54,669","$38,747",715878,287756,4674.337363,0.12,2.429
...,...,...,...,...,...,...,...,...,...,...,...,...
19393,WA,2021-12-01,PHEV,Wahkiakum County,6,"$63,821","$47,243",4318,1900,16.818489,9.72,3.908
19394,WA,2021-12-01,PHEV,Walla Walla County,81,"$63,821","$47,243",60785,22773,49.277735,9.72,3.908
19395,WA,2021-12-01,PHEV,Whatcom County,639,"$63,821","$47,243",224538,88978,107.617198,9.72,3.908
19396,WA,2021-12-01,PHEV,Whitman County,56,"$63,821","$47,243",49577,18485,22.217436,9.72,3.908


# Prediction Models
## Linear Regression
### All Data
Will run the first linear regression model with all data even though the trends do not necessarily make sense with a decrease of EV registration after 2021 for TX, NY, and CO

In [19]:
Image(filename='.\Figures\EV Purchases over Time for TX CO NY and WA.jpg')

FileNotFoundError: [Errno 2] No such file or directory: '.\\Figures\\EV Purchases over Time for TX CO NY and WA.jpg'

In [28]:
df

Unnamed: 0,State,Registration Date,Drivetrain Type,County,Vehicle Count,Average EV Price,New Car Average,population,household_income,POPDEN_COU,Electricity Price,Gas Price
0,CO,2017-01-01,BEV,Adams County,24,"$54,669","$38,747",509844,167290,445.323695,12.66,3.378
1,CO,2017-01-01,BEV,Arapahoe County,66,"$54,669","$38,747",649980,241889,821.038725,0.12,2.429
2,CO,2017-01-01,BEV,Boulder County,170,"$54,669","$38,747",324682,127365,455.351666,12.66,3.378
3,CO,2017-01-01,BEV,Broomfield County,15,"$54,669","$38,747",69444,27199,2248.011733,0.12,2.429
4,CO,2017-01-01,BEV,Denver County,95,"$54,669","$38,747",715878,287756,4674.337363,0.12,2.429
...,...,...,...,...,...,...,...,...,...,...,...,...
17001,WA,2019-01-01,PHEV,Thurston County,84,"$54,669","$38,747",284698,112323,408.022371,9.52,3.908
17002,WA,2019-01-01,PHEV,Walla Walla County,6,"$54,669","$38,747",60785,22773,49.277735,9.52,3.908
17003,WA,2019-01-01,PHEV,Whatcom County,35,"$54,669","$38,747",224538,88978,107.617198,9.52,3.908
17004,WA,2019-01-01,PHEV,Whitman County,5,"$54,669","$38,747",49577,18485,22.217436,9.52,3.908


In [33]:
# Desired prediction var
predict_label = 'Vehicle Count'

# Drop zip code since it would increase the number of features by ~4k
# also zip code is highly correlated to population and income
# Drop registration date since we will change to Unix timestamps
drop_col = ['County', 'Registration Date']

# Assemble Categorical Variables
cat_var = ['State', 'Drivetrain Type']
for cat in cat_var:
    # Get dummy variables for cat
    dummy_var = df[cat].unique()
    dummy_var = dummy_var[1:]

    # create df w/ one hot cat features
    df_cat = pd.get_dummies(df[cat], drop_first=True)

    # drop original
    df = df.drop([cat], axis=1)

    # concatenate
    df = pd.concat([df, df_cat], axis=1)

# Add Unix timestamp   
df['Unix Time'] = df['Registration Date'].apply(lambda x: x.timestamp())

# Convert $ price into integer
df['Average EV Price'] = df['Average EV Price'].str.replace('$','').str.replace(',','').astype(int)
df['New Car Average'] = df['New Car Average'].str.replace('$','').str.replace(',','').astype(int)

df

  df['Average EV Price'] = df['Average EV Price'].str.replace('$','').str.replace(',','').astype(int)
  df['New Car Average'] = df['New Car Average'].str.replace('$','').str.replace(',','').astype(int)


Unnamed: 0,Registration Date,County,Vehicle Count,Average EV Price,New Car Average,population,household_income,POPDEN_COU,Electricity Price,Gas Price,NY,TX,WA,PHEV,Unix Time
0,2017-01-01,Adams County,24,54669,38747,509844,167290,445.323695,12.66,3.378,0,0,0,0,1.483229e+09
1,2017-01-01,Arapahoe County,66,54669,38747,649980,241889,821.038725,0.12,2.429,0,0,0,0,1.483229e+09
2,2017-01-01,Boulder County,170,54669,38747,324682,127365,455.351666,12.66,3.378,0,0,0,0,1.483229e+09
3,2017-01-01,Broomfield County,15,54669,38747,69444,27199,2248.011733,0.12,2.429,0,0,0,0,1.483229e+09
4,2017-01-01,Denver County,95,54669,38747,715878,287756,4674.337363,0.12,2.429,0,0,0,0,1.483229e+09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17001,2019-01-01,Thurston County,84,54669,38747,284698,112323,408.022371,9.52,3.908,0,0,1,1,1.546301e+09
17002,2019-01-01,Walla Walla County,6,54669,38747,60785,22773,49.277735,9.52,3.908,0,0,1,1,1.546301e+09
17003,2019-01-01,Whatcom County,35,54669,38747,224538,88978,107.617198,9.52,3.908,0,0,1,1,1.546301e+09
17004,2019-01-01,Whitman County,5,54669,38747,49577,18485,22.217436,9.52,3.908,0,0,1,1,1.546301e+09


In [34]:
# Get labels of all features
features = [c for c in df.columns if c not in [predict_label] + drop_col]

# extract values to np
y = df[predict_label].to_numpy()
X = df[features].to_numpy()

df = df.reset_index(drop=True)
df.to_csv('./Data/df_all_features_county.csv', index = False)
df[features].to_csv('./Data/df_X_county.csv', index = False)
df[predict_label].to_csv('./Data/df_y_county.csv', index = False)
