In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch  
import torch.nn as nn
from uszipcode import SearchEngine
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split




# Neural Net Class and Training Functions
Define the neural network class and the functions used to train and evaluate it.

In [8]:
def train_model(nn_model, X_train, y_train, X_eval, y_eval, k, max_iter=50, batch_size=32, print_n=10, verbose=False):
    '''
    Trains neural network model on X_train, y_train data with mini-batch SGD
    and records the MSE on held-out batches taken from X_eval, y_eval.

    Parameters
    ----------
    nn_model: torch.nn.Module
        model to train; its parameters are updated in place
    X_train: np.array
        matrix of training data features
    y_train: np.array
        vector of training data labels
    X_eval: np.array
        matrix of held-out data features
    y_eval: np.array
        vector of held-out data labels
    k: int
        size of hidden layer; unused here because the model is passed in
        already constructed (kept so existing callers keep working)
    max_iter: int
        number of passes (epochs) over the training data
    batch_size: int
        batch size to use when training w/ SGD
    print_n: int
        print training progress every print_n epochs
    verbose: bool
        if True, print the mean training MSE every print_n epochs

    Returns
    ----------
    nn_model: torch.nn.Module
        trained neural network model (same object that was passed in)
    losses_test: list of float
        held-out MSE, one entry per training batch, measured just before that
        batch's parameter update; empty if X_eval has no rows
    '''
    # convert to tensors (for Pytorch)
    X_train_tensor = torch.tensor(X_train)
    y_train_tensor = torch.tensor(y_train)
    X_test_tensor = torch.tensor(X_eval)
    y_test_tensor = torch.tensor(y_eval)
    n_samples = X_train_tensor.shape[0]
    n_eval = X_test_tensor.shape[0]
    nn_model.train()  # put model in train mode
    # initialize mse loss function
    mse_loss = torch.nn.MSELoss()
    # train with (mini-batch) SGD; initialize optimizer
    opt = torch.optim.SGD(nn_model.parameters(), lr=1e-4)
    losses_test = []
    for it in range(max_iter):
        # save training losses across all batches of this epoch
        losses = []
        # loop through data in batches
        for batch_start in range(0, n_samples, batch_size):
            # reset gradients to zero
            opt.zero_grad()
            # form training batch
            X_batch = X_train_tensor[batch_start:batch_start+batch_size]
            y_batch = y_train_tensor[batch_start:batch_start+batch_size]
            if n_eval > 0:
                # Wrap the offset so the held-out slice is never empty when the
                # eval set is shorter than the training set (an empty slice
                # would yield a NaN loss).
                eval_start = batch_start % n_eval
                X_batch_test = X_test_tensor[eval_start:eval_start+batch_size]
                y_batch_test = y_test_tensor[eval_start:eval_start+batch_size]
                # Score held-out data in eval mode and without autograd so it
                # cannot influence training, then switch back to train mode.
                nn_model.eval()
                with torch.no_grad():
                    y_pred_test = nn_model(X_batch_test.float())
                    loss_test = mse_loss(y_pred_test, y_batch_test[:, None].float())
                nn_model.train()
                losses_test.append(loss_test.item())
            # pass batch through neural net to get prediction
            y_pred = nn_model(X_batch.float())
            # compute MSE loss
            loss = mse_loss(y_pred, y_batch[:, None].float())
            # back-propagate loss
            loss.backward()
            # update model parameters based on backpropagated gradients
            opt.step()
            losses.append(loss.item())
        if verbose and it % print_n == 0:
            print(f"Mean Train MSE at step {it}: {np.mean(losses)}")
    return nn_model, losses_test

def evaluate_model(nn_model, X_eval, y_eval, batch_size=32):
    '''
    Computes the mean squared error of a trained model on X_eval, y_eval.

    Parameters
    ----------
    nn_model: torch.nn.Module
        trained neural network model; it is switched to eval mode
    X_eval: np.array
        matrix of evaluation data features
    y_eval: np.array
        vector of evaluation data labels
    batch_size: int
        number of rows passed through the model at a time

    Returns
    ----------
    mse: float
        MSE of the model's predictions over all rows of X_eval, y_eval
    '''
    # convert to tensors (for Pytorch)
    features = torch.tensor(X_eval)
    targets = torch.tensor(y_eval)
    criterion = torch.nn.MSELoss()
    total_rows = features.shape[0]
    nn_model.eval()  # put in eval mode
    # gradients are not needed for prediction, so disable autograd once
    with torch.no_grad():
        chunks = [
            nn_model(features[lo:lo + batch_size].float())
            for lo in range(0, total_rows, batch_size)
        ]
    # one MSE over every prediction, not an average of per-batch MSEs
    stacked = torch.cat(chunks)
    return criterion(stacked, targets[:, None].float()).item()

class NN1(nn.Module):
    '''
    Class for fully connected neural net with one hidden layer and a single,
    non-negative output per example.
    '''
    def __init__(self, input_dim, hidden_dim):
        '''
        Parameters
        ----------
        input_dim: int
            input dimension (i.e., # of features in each example passed to the network)
        hidden_dim: int
            number of nodes in hidden layer
        '''
        super().__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        # NOTE(review): ReLU applied right after ELU gives the same values as a
        # plain ReLU; the stack is left unchanged so state_dict keys
        # (layers.0 / layers.3) stay compatible with any saved weights.
        self.layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ELU(),
            nn.ReLU(),
            nn.Linear(hidden_dim, 1),
            # final ReLU clamps predictions to be >= 0
            nn.ReLU()
        )

    def forward(self, x):
        '''
        Runs a batch of examples through the network.

        Parameters
        ----------
        x: torch.Tensor
            float tensor whose last dimension is input_dim

        Returns
        ----------
        torch.Tensor
            predictions with last dimension 1
        '''
        # Without forward(), calling the module raises NotImplementedError.
        return self.layers(x)
    

# Import Data
Load the data from the separate sources below before combining them:
* Cleaned-up EV registration data: TX_WA_CO_NY.csv
* Average EV price and new-car price over time: Avg_EV_Price.csv
* Census data (population, household income, ZIP code): census.csv

In [26]:
def main():
    '''
    Loads the registration, EV-price and census CSVs, aggregates the
    registrations by county, restricts them to 2017-01-01..2021-12-31 and
    attaches the EV / new-car price columns by registration month.

    Returns
    ----------
    df_reg_ev: pd.DataFrame
        county-level registrations with the price columns merged in
    '''
    # Import data
    df_reg = pd.read_csv('./Data/TX_WA_CO_NY.csv')
    df_ev = pd.read_csv('./Data/Avg_EV_Price.csv')
    # NOTE(review): the census frame is loaded but not used anywhere in this
    # function yet.
    df_c = pd.read_csv('./Data/Census Data/census.csv')

    # Convert dates to datetime dtype
    df_reg['Registration Date'] = pd.to_datetime(df_reg['Registration Date'])
    df_ev['Month'] = pd.to_datetime(df_ev['Month'], format='%b-%y')

    # Aggregate by County
    # create a SearchEngine object
    search = SearchEngine()

    # define a function to map zip codes to counties
    def zipcode_to_county(zipcode):
        # This county does not get populated for some reason
        if zipcode == 75033:
            return "Collin County"

        zipcode_data = search.by_zipcode(zipcode)
        # presumably the lookup can come back empty for an unknown ZIP
        # (verify against the installed uszipcode version); report a missing
        # county instead of failing on `.county`
        if zipcode_data is None:
            return None
        return zipcode_data.county

    # Look each distinct ZIP code up once instead of once per row; .tolist()
    # hands plain Python values to the lookup, as Series.apply did.
    county_by_zip = {z: zipcode_to_county(z) for z in df_reg['ZIP Code'].unique().tolist()}
    df_reg['County'] = df_reg['ZIP Code'].map(county_by_zip)

    # ZIP Code is dropped before summing: a sum of ZIP codes is meaningless.
    # groupby drops rows whose County is missing (default dropna=True).
    df_reg = (
        df_reg
        .drop(columns=["ZIP Code"])
        .groupby(["State", "Registration Date", "Drivetrain Type", "County"])
        .agg('sum')
        .reset_index()
    )

    start_date = pd.to_datetime('2017-01-01')
    end_date = pd.to_datetime('2021-12-31')
    # Series.between is inclusive on both ends by default
    df_reg = df_reg[df_reg['Registration Date'].between(start_date, end_date)]

    # merge ev data in main df
    df_reg_ev = pd.merge(df_reg, df_ev, left_on='Registration Date', right_on='Month', how='left')
    df_reg_ev = df_reg_ev.drop(['Month'], axis=1)

    # Since we don't have ev price data for earlier dates, set all NaN to the
    # first row of the price table (presumably 2020-01-01 -- TODO confirm the
    # CSV is sorted by month). iloc[0] picks the first row by position.
    # NOTE(review): the captured cell output shows prices such as "$54,669",
    # i.e. strings; convert them to numbers before using them as features.
    fill_val = {
        'Average EV Price': df_ev['Average EV Price'].iloc[0],
        'New Car Average': df_ev['New Car Average'].iloc[0],
    }
    df_reg_ev = df_reg_ev.fillna(value=fill_val)

    # Hand the result back so later cells can use it.
    return df_reg_ev




In [27]:
# Run the data-preparation step when this code is executed as the top-level
# program (in a notebook kernel __name__ is '__main__', so the cell runs it).
if __name__ == '__main__':
    main()

<bound method NDFrame.head of        State  ZIP Code Registration Date Drivetrain Type  Vehicle Count  \
0         TX     75001        2019-11-01            PHEV              2   
1         TX     75001        2020-01-01            PHEV              9   
2         TX     75001        2020-02-01             BEV              4   
3         TX     75001        2020-04-01             BEV              6   
4         TX     75001        2020-05-01            PHEV             16   
...      ...       ...               ...             ...            ...   
229929    NY     14905        2022-12-01            PHEV              6   
229930    NY     14905        2023-01-01             BEV              5   
229931    NY     14905        2023-01-01            PHEV              6   
229932    NY     14905        2023-02-01             BEV              1   
229933    NY     14905        2023-02-01            PHEV              2   

       Average EV Price New Car Average  
0              $54,669     

KeyError: "Column(s) ['household_income', 'population'] do not exist"