In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from uszipcode import SearchEngine

# Import Data
Load the data from each source before combining them:
* Cleaned up EV data: TX_WA_CO_NY.csv
* Average EV price and new car data over time: Avg_EV_Price.csv
* Census data (population, household income, zip code): census.csv

In [54]:
# Import data
# Paths are built with os.path.join rather than hard-coded Windows backslash
# literals: the backslash form only resolves on Windows and depends on invalid
# string escape sequences that newer Python versions warn about.
# All paths are relative to the notebook's working directory.
DATA_DIR = 'Data'
df = pd.read_csv(os.path.join(DATA_DIR, 'TX_WA_CO_NY.csv'))
df_ev = pd.read_csv(os.path.join(DATA_DIR, 'Avg_EV_Price.csv'))
df_c = pd.read_csv(os.path.join(DATA_DIR, 'Census Data', 'census.csv'))

# Convert dates to datetime dtype
df['Registration Date'] = pd.to_datetime(df['Registration Date'])
# 'Month' is parsed as abbreviated month name + two-digit year (e.g. Jan-20)
df_ev['Month'] = pd.to_datetime(df_ev['Month'], format='%b-%y')


In [55]:
# Attach the price columns from df_ev to every registration row by matching
# 'Registration Date' against 'Month'; the join key from df_ev is dropped afterwards.
df = df.merge(df_ev, how='left', left_on='Registration Date', right_on='Month')
df = df.drop(columns=['Month'])

# Registrations with no matching price row come back as NaN. Fill them with the
# values from the first row of df_ev (the 2020-01-01 prices, per the original note).
price_columns = ('Average EV Price', 'New Car Average')
fill_val = {column: df_ev[column][0] for column in price_columns}
df = df.fillna(value=fill_val)

In [56]:
# Show every row that still contains at least one NaN (empty frame means none left)
nan_rows = df.loc[df.isna().any(axis='columns')]
nan_rows

Unnamed: 0,State,ZIP Code,Registration Date,Drivetrain Type,Vehicle Count,Average EV Price,New Car Average


In [52]:
# Inspect the census 'zipcode' column, which the census merge uses as its right-hand key
df_c['zipcode']

0        29590
1        93306
2        93660
3        93110
4        93212
         ...  
33115    16623
33116    16627
33117    16634
33118    16640
33119    17062
Name: zipcode, Length: 33120, dtype: int64

In [57]:
# Left-join the census columns onto the EV rows by zip code; EV rows whose zip
# has no census entry are kept and get NaN in the census columns.
df = df.merge(df_c, how='left', left_on='ZIP Code', right_on='zipcode')
# Remove the duplicate join key and the 'Unnamed: 0' column
# (presumably a saved index column from one of the CSVs; confirm).
df = df.drop(columns=['Unnamed: 0', 'zipcode'])
# Disabled: presumably because an int cast fails while unmatched zips still hold NaN
#df[['population', 'household_income']] = df[['population', 'household_income']].astype(int)
#df

In [67]:
# Gather the rows that have a NaN in any column after the census merge
has_missing = df.isna().any(axis='columns')
nan_rows = df.loc[has_missing]
missing_zips = nan_rows['ZIP Code'].unique()

# Summarize which zip codes / states are affected and how many rows that is
print('Missing census data in:')
print('zip codes = ',missing_zips)
print('states = ',nan_rows['State'].unique())
print('Total num of zips = ',len(missing_zips))
print('Total entries w/ nan = ', len(nan_rows))
print('Total entries in df = ', len(df))

# The affected rows are < 1% of the total, so they are simply dropped
df = df.dropna()

Missing census data in:
zip codes =  [75033 75036 75072 75242 75245 75260 76005 78206 78599 79918 80430 80912
 81228 10041 10105 10106 10107 10118 10120 10122 10123 10151 10155 10166
 10281 11243 11249 12016 12223 12226 12593 12854]
states =  ['TX' 'CO' 'NY']
Total num of zips =  32
Total entries w/ nan =  560
Total entries in df =  229934


In [59]:
# Display the combined frame (EV registrations + price columns + census columns)
df

Unnamed: 0,State,ZIP Code,Registration Date,Drivetrain Type,Vehicle Count,Average EV Price,New Car Average,population,household_income
0,TX,75001,2019-11-01,PHEV,2,"$54,669","$38,747",15418.0,8404.0
1,TX,75001,2020-01-01,PHEV,9,"$54,669","$38,747",15418.0,8404.0
2,TX,75001,2020-02-01,BEV,4,"$56,326","$38,550",15418.0,8404.0
3,TX,75001,2020-04-01,BEV,6,"$57,757","$39,904",15418.0,8404.0
4,TX,75001,2020-05-01,PHEV,16,"$58,863","$39,138",15418.0,8404.0
...,...,...,...,...,...,...,...,...,...
229929,NY,14905,2022-12-01,PHEV,6,"$61,448","$49,507",8330.0,3686.0
229930,NY,14905,2023-01-01,BEV,5,"$58,725","$49,388",8330.0,3686.0
229931,NY,14905,2023-01-01,PHEV,6,"$58,725","$49,388",8330.0,3686.0
229932,NY,14905,2023-02-01,BEV,1,"$58,385","$48,763",8330.0,3686.0


In [20]:
# Extract all registrations that fall on a single day (2021-01-01) for inspection
date_extract = pd.to_datetime('2021-01-01').date()
# Compare normalized timestamps instead of `.dt.date`: this keeps the comparison
# vectorized rather than building an object column of Python date objects,
# while selecting the same rows (any time-of-day component is ignored).
on_extract_date = df['Registration Date'].dt.normalize() == pd.Timestamp(date_extract)
dft = df[on_extract_date]
dft

Unnamed: 0,State,ZIP Code,Registration Date,Drivetrain Type,Vehicle Count,Month,Average EV Price,New Car Average
17,TX,75001,2021-01-01,BEV,47,2021-01-01,"$57,750","$41,248"
18,TX,75001,2021-01-01,PHEV,23,2021-01-01,"$57,750","$41,248"
91,TX,75002,2021-01-01,BEV,283,2021-01-01,"$57,750","$41,248"
92,TX,75002,2021-01-01,PHEV,117,2021-01-01,"$57,750","$41,248"
158,TX,75006,2021-01-01,BEV,108,2021-01-01,"$57,750","$41,248"
...,...,...,...,...,...,...,...,...
229568,NY,14892,2021-01-01,PHEV,25,2021-01-01,"$57,750","$41,248"
229685,NY,14901,2021-01-01,PHEV,23,2021-01-01,"$57,750","$41,248"
229757,NY,14903,2021-01-01,PHEV,25,2021-01-01,"$57,750","$41,248"
229893,NY,14905,2021-01-01,BEV,46,2021-01-01,"$57,750","$41,248"


In [23]:
# Toy example of the merge pattern: a frame with a date column ...
df1 = pd.DataFrame(
    {'date_col': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04']}
)

# ... and a lookup frame whose date column has a different name, plus values to bring over
df2 = pd.DataFrame(
    {
        'other_date_col': ['2023-01-01', '2023-01-02'],
        'new_col_values': [10, 20],
    }
)

# Left-join on the two date columns, copy the looked-up values into 'new_col',
# then discard the helper columns that came from df2. Dates missing from df2
# end up as NaN in 'new_col'.
df_merged = (
    df1.merge(df2, how='left', left_on='date_col', right_on='other_date_col')
    .assign(new_col=lambda joined: joined['new_col_values'])
    .drop(columns=['other_date_col', 'new_col_values'])
)

In [24]:
# Display the sample left-hand frame; the merge above did not modify it
df1

Unnamed: 0,date_col
0,2023-01-01
1,2023-01-02
2,2023-01-03
3,2023-01-04
