In [206]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import string
import os
import itertools

import descartes
import geopandas as gpd
from shapely.geometry import Point, Polygon

%config InlineBackend.figure_format = 'retina'
%matplotlib inline
pd.options.display.max_rows = 5

In [207]:
# Location of the green-taxi sample CSV.
# The three pieces below spell out the original author's local directory and
# remain the default. Set the NYC_TAXI_GREEN_DIR environment variable to point
# the notebook at a different directory without editing this cell.
p1 = "/Users/sai/Documents/00 NEU/Semester 1/"
p2 = "1 DS 5110 - Introduction to Data Management and Processing/"
p3 = "Project/NYC-Taxi-Data-Analysis/data/green/"

# Joining with "" guarantees a trailing separator, which matters because the
# load cell builds the file path by plain concatenation (dpath + fname).
dpath = os.path.join(os.environ.get("NYC_TAXI_GREEN_DIR", p1 + p2 + p3), "")
fname = "green_samp_locid.csv"

In [208]:
# Read the CSV located at dpath + fname into the `green` DataFrame.
green = pd.read_csv(dpath+fname)

In [209]:
# Preview the first rows of the loaded frame.
green.head()

Unnamed: 0,vendorid,lpep_pickup_datetime,lpep_dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pulocationid,dolocationid,trip_distance,...,fare_amount,tip_amount,extra,tolls_amount,improvement_surcharge,ehail_fee,mta_tax,ratecodeid,passenger_count,store_and_fwd_flag
0,2,2015-12-12 21:56:27,2015-12-12 22:04:52,-73.964546,40.715477,-73.95871,40.713448,256.0,256.0,1.3,...,7.5,0.0,0.5,0.0,0.3,,0.5,1,6,N
1,1,2014-05-10 17:54:57,2014-05-10 18:10:46,-73.893402,40.851505,-73.857895,40.880135,78.0,254.0,6.0,...,19.5,0.0,0.0,0.0,,,0.5,1,1,N
2,2,2015-12-28 19:15:23,2015-12-28 19:22:08,-73.981316,40.689789,-73.966042,40.693432,97.0,49.0,1.11,...,6.5,1.66,1.0,0.0,0.3,,0.5,1,1,N
3,2,2014-08-22 06:47:18,2014-08-22 07:02:34,-73.945518,40.832947,-74.003052,40.756271,244.0,246.0,7.1,...,21.5,0.0,0.0,0.0,,,0.5,1,1,N
4,2,2015-10-19 10:15:08,2015-10-19 10:48:05,-73.961182,40.806767,-73.972717,40.756344,166.0,162.0,4.4,...,22.0,4.56,0.0,0.0,0.3,,0.5,1,5,N


In [210]:
# List every column name available in `green`.
green.columns

Index(['vendorid', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'pickup_longitude', 'pickup_latitude', 'dropoff_longitude',
       'dropoff_latitude', 'pulocationid', 'dolocationid', 'trip_distance',
       'trip_type', 'payment_type', 'total_amount', 'fare_amount',
       'tip_amount', 'extra', 'tolls_amount', 'improvement_surcharge',
       'ehail_fee', 'mta_tax', 'ratecodeid', 'passenger_count',
       'store_and_fwd_flag'],
      dtype='object')

In [211]:
# Narrow `green` down to the pickup time, the two location ids, and the
# distance / amount columns that the later cells work with.
gr = green.filter(
    items=[
        'lpep_pickup_datetime',
        'pulocationid',
        'dolocationid',
        'trip_distance',
        'total_amount',
    ]
)

In [212]:
# Preview the first rows of the reduced frame.
gr.head()

Unnamed: 0,lpep_pickup_datetime,pulocationid,dolocationid,trip_distance,total_amount
0,2015-12-12 21:56:27,256.0,256.0,1.3,8.8
1,2014-05-10 17:54:57,78.0,254.0,6.0,20.0
2,2015-12-28 19:15:23,97.0,49.0,1.11,9.96
3,2014-08-22 06:47:18,244.0,246.0,7.1,22.0
4,2015-10-19 10:15:08,166.0,162.0,4.4,27.36


In [214]:
class PriceGradients():
    """Mean price per unit distance ("gradient") for each pickup/drop-off pair.

    For every pair of location ids ``(s, d)`` with ``1 <= s, d <= n`` the
    gradient is the mean of ``total_amount / trip_distance`` over the rows of
    ``data`` whose ``pulocationid`` equals ``s`` and ``dolocationid`` equals
    ``d``. Rows whose ratio is not finite (zero distance, missing values) are
    left out of the mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns ``pulocationid``, ``dolocationid``,
        ``trip_distance`` and ``total_amount``.
    n : int, optional
        Number of location ids (1..n) covered on each axis of the result.
        When omitted, the class attribute ``n`` is used.
    """

    # Default number of location ids (1..n) on each axis of the result matrix.
    n = 10

    def __init__(self, data, n=None):
        self.data = data
        if n is not None:
            # Per-instance override; the class-level default stays untouched.
            self.n = n
        # Row labels of the pair currently being processed. main() sets this
        # for each pair; it starts as None so that compute_gradient() can be
        # called on a fresh instance without raising AttributeError.
        self.subset_ind = None

    def get_subset(self, comb):
        """Return the index labels of the rows matching one location pair.

        Parameters
        ----------
        comb : tuple
            ``(pickup_id, dropoff_id)``.

        Returns
        -------
        numpy.ndarray
            Index labels of ``self.data`` for rows with that pickup and
            drop-off id (empty when there are none).
        """
        s_id, d_id = comb

        indices = (
            self.data
            .query("(pulocationid == @s_id) & (dolocationid == @d_id)")
            .index
            .values)

        return indices

    def compute_gradient(self, subset_ind=None):
        """Return the mean finite ``total_amount / trip_distance`` of a subset.

        Parameters
        ----------
        subset_ind : array-like, optional
            Index labels of the rows to use. When omitted, the labels stored
            in ``self.subset_ind`` (as set by ``main``) are used.

        Returns
        -------
        float
            The mean ratio, or ``nan`` when no rows are selected or none of
            the selected rows has a finite ratio.
        """
        if subset_ind is None:
            # getattr keeps this safe even if __init__ was bypassed.
            subset_ind = getattr(self, "subset_ind", None)
        if subset_ind is None or len(subset_ind) == 0:
            return np.nan

        subset = self.data.loc[subset_ind, :]

        # Zero distances produce inf (x/0) or nan (0/0). Those values are
        # filtered out right below, so the accompanying RuntimeWarning is
        # noise and is silenced for this one division only.
        with np.errstate(divide="ignore", invalid="ignore"):
            gradients = subset.total_amount.values / subset.trip_distance.values

        # One pass drops +inf, -inf and nan alike.
        gradients = gradients[np.isfinite(gradients)]
        if len(gradients) == 0:
            return np.nan

        return np.mean(gradients)

    def main(self):
        """Compute the gradient for every location pair in ``1..n`` x ``1..n``.

        Returns
        -------
        numpy.ndarray
            ``(n, n)`` float array where element ``[s - 1, d - 1]`` is the
            gradient for pickup id ``s`` and drop-off id ``d``; ``nan`` where
            no finite ratio exists for the pair.
        """
        size = self.n
        # NaN is the "no data" value, so start from NaN rather than zeros.
        gradients = np.full((size, size), np.nan)

        for s_id in range(1, size + 1):
            for d_id in range(1, size + 1):
                # Kept on the instance so that a no-argument
                # compute_gradient() call continues to work as before.
                self.subset_ind = self.get_subset((s_id, d_id))
                gradients[s_id - 1, d_id - 1] = self.compute_gradient()

        return gradients

In [215]:
# Build the gradient matrix from the reduced frame; `grad[s - 1, d - 1]`
# holds the value for pickup id s and drop-off id d.
pg_gr = PriceGradients(data=gr)
grad = pg_gr.main()



In [216]:
# Display the full gradient matrix (nan marks pairs without a usable value).
grad

array([[2088.46622767,           nan,           nan,           nan,
                  nan,           nan,           nan,           nan,
                  nan,           nan],
       [          nan,           nan,           nan,           nan,
                  nan,           nan,           nan,           nan,
                  nan,           nan],
       [          nan,           nan,  103.95470253,           nan,
                  nan,           nan,    3.72865029,           nan,
                  nan,           nan],
       [          nan,           nan,           nan,    8.34126439,
                  nan,           nan,           nan,           nan,
                  nan,           nan],
       [          nan,           nan,           nan,           nan,
                  nan,           nan,           nan,           nan,
                  nan,           nan],
       [   4.9122807 ,           nan,           nan,           nan,
                  nan,  115.8407994 ,           nan,     

In [217]:
# Pull the trips with pickup id 1 and drop-off id 1, i.e. the rows behind
# grad[0, 0], to inspect them directly.
subset = gr.query("pulocationid==1 & dolocationid==1")
subset

Unnamed: 0,lpep_pickup_datetime,pulocationid,dolocationid,trip_distance,total_amount
6889,2014-05-14 04:59:26,1.0,1.0,3.20,97.50
69014,2015-01-15 15:23:31,1.0,1.0,0.00,25.33
...,...,...,...,...,...
7190297,2014-09-21 19:34:15,1.0,1.0,0.01,100.00
7192780,2014-05-04 16:57:56,1.0,1.0,5.70,94.00


In [194]:
# Scratch check of the gradient filtering on `subset`: per-trip amount divided
# by distance, keeping only the finite results.
# Zero distances yield inf / nan; they are removed right below, so the
# divide-by-zero RuntimeWarning is silenced for this division only.
with np.errstate(divide="ignore", invalid="ignore"):
    b = (subset.total_amount.values / subset.trip_distance.values)
# np.isfinite drops +inf, -inf and nan in a single pass.
b = b[np.isfinite(b)]
b
# np.nanmean(b)

  """Entry point for launching an IPython kernel.


array([], dtype=float64)

In [128]:
# Show positions 7190296-7190298 of the full frame, with every column.
green.iloc[[7190296,7190297,7190298],:]

Unnamed: 0,vendorid,lpep_pickup_datetime,lpep_dropoff_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,pulocationid,dolocationid,trip_distance,...,fare_amount,tip_amount,extra,tolls_amount,improvement_surcharge,ehail_fee,mta_tax,ratecodeid,passenger_count,store_and_fwd_flag
7190296,2,2015-01-11 23:14:43,2015-01-11 23:37:52,-73.929909,40.756729,-73.945061,40.757511,7.0,193.0,3.61,...,15.0,0.0,0.5,0.0,0.3,,0.5,1,2,N
7190297,2,2014-09-21 19:34:15,2014-09-22 18:08:03,-74.177094,40.694965,-74.177132,40.695004,1.0,1.0,0.01,...,100.0,0.0,0.0,0.0,,,0.0,5,1,N
7190298,2,2017-09-22 03:38:26,2017-09-22 03:42:18,,,,,49.0,17.0,0.9,...,5.0,1.89,0.5,0.0,0.3,,0.5,1,1,N


In [135]:
# Small array containing +inf and -inf, used to try out the inf filter below.
a = np.array([1, 2, 3, np.inf, -np.inf, 4])
a

array([  1.,   2.,   3.,  inf, -inf,   4.])

In [150]:
# Keep only the entries of `a` that are not +/-inf.
a[~np.isinf(a)]

array([1., 2., 3., 4.])