# Evaluate Interpolation Methods

Let's try out different interpolation methods! 

In [2]:
import os, sys, pandas as pd, numpy as np
import pyarrow as pa
import pyarrow.parquet as pq
from dotenv import dotenv_values

if os.getlogin() == "WylieTimmerman":
    # Working Paths
    # the following needed to run in vscode jupyter interpreter
    os.environ["GDAL_DATA"] = os.environ["CONDA_PREFIX"] + "\Library\share\gdal"
    path_working = r"C:\Users\WylieTimmerman\Documents\projects_local\WMATA_AVL_datamart"
    os.chdir(os.path.join(path_working))
    sys.path.append(r"C:\Users\WylieTimmerman\Documents\projects_local\WMATA_AVL_datamart")
    path_sp = r"C:\OD\Foursquare ITP\Projects - WMATA Datamart\Task 3 - Bus Priority"
    path_source_data = os.path.join(path_sp,"data","00-Raw")
    path_processed_data = os.path.join(path_sp, "Data","02-Processed")
    # Server credentials
    config = dotenv_values(os.path.join(path_working, '.env'))
    # other things for wylie's dev environment            


We'll read in some already decomposed data, since it includes some flags for stop window changes that will be useful. The error here comes from a duplicated column that comes from a bad join we didn't address earlier.

In [3]:
rawnav = (
    pd.read_csv(
        os.path.join(path_sp,"data","01-interim","test_decomp_mov9_all.csv")
    )
    .drop(['index_run_end','lat_raw','long_raw','sat_cnt','odom_interp_fail','odom_low','odom_hi','blank'], axis = "columns")
)


  exec(code_obj, self.user_global_ns, self.user_ns)


In [4]:
rawnav.info(verbose = True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 840726 entries, 0 to 840725
Data columns (total 46 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Unnamed: 0            840726 non-null  int64  
 1   filename              840726 non-null  object 
 2   index_run_start       840726 non-null  float64
 3   route_pattern         840726 non-null  object 
 4   pattern               840726 non-null  int64  
 5   route                 840726 non-null  object 
 6   wday                  840726 non-null  object 
 7   start_date_time       840726 non-null  object 
 8   index_loc             840726 non-null  float64
 9   odom_ft               840724 non-null  float64
 10  sec_past_st           840726 non-null  float64
 11  heading               840726 non-null  float64
 12  door_state            840726 non-null  object 
 13  veh_state             840726 non-null  object 
 14  row_before_apc        840726 non-null  object 
 15  

Let's write some functions that evaluate size of miss

In [7]:
def calc_fail(rawnav, ft_threshold = 1):

    rawnav = (
        rawnav
        # calculate miss from each side of range
        .assign(
            odom_low = lambda x, ft = ft_threshold :
                (x.odom_ft_min - ft) - x.odom_ft,
            odom_hi  = lambda x, ft = ft_threshold :
                x.odom_ft - (x.odom_ft_max + ft),
        )
        # then keep only the ones where this is positive
        .assign(
            odom_low = lambda x: np.maximum.reduce(x.odom_low, 0),
            odom_hi = lambda x: np.maximum.reduce(x.odom_hi,0)
        )
        .assign(
            odom_miss = lambda x: np.maximum.reduce(x.odom_low,x.odom_hi)
        )
        .assign(
            odom_interp_fail = lambda x:
                x.odom_miss.gt(0)
        ) 
    )


let's double check on the existing files

In [8]:
rawnav_check = (
    rawnav
    .pipe(
        calc_fail,
        ft_threshold = 1
    )
)

TypeError: return arrays must be of ArrayType

In [None]:
rawnav_fails = rawnav_check.odom_interp_fail.describe()