In [1]:
import pandas as pd
import numpy as np
import geopy as geopy
from geopy.distance import geodesic
import datetime
import time

In [2]:
# csv data source: https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page
trips = pd.read_csv('yellow_tripdata_2016-01.csv')

In [3]:
trips = trips[['tpep_pickup_datetime', 
               'tpep_dropoff_datetime', 
               'trip_distance', 
               'pickup_longitude', 
               'pickup_latitude', 
               'dropoff_longitude', 
               'dropoff_latitude', 
               'passenger_count']]

In [4]:
lga_long = -73.8702298524
lga_lat = 40.7730135746
lga_long_rad = lga_long*np.pi/180
lga_lat_rad = lga_lat*np.pi/180

In [5]:
def distanceToLGA(df):
    long_rad = df['pickup_longitude']*np.pi/180
    lat_rad = df['pickup_latitude']*np.pi/180
    drad = 2*np.arcsin(np.sqrt(np.sin((lat_rad-lga_lat_rad)/2)*np.sin((lat_rad-lga_lat_rad)/2)+np.cos(lat_rad)*np.cos(lga_lat_rad)*np.sin((long_rad-lga_long_rad)/2)*np.sin((long_rad-lga_long_rad)/2)))
    dmile = drad*180*60*1.15/np.pi
    return df.assign(pku_LGA=dmile)
trips = trips.pipe(distanceToLGA)

In [6]:
def distanceToLGA_dropoff(df):
    long_rad = df['dropoff_longitude']*np.pi/180
    lat_rad = df['dropoff_latitude']*np.pi/180
    drad = 2*np.arcsin(np.sqrt(np.sin((lat_rad-lga_lat_rad)/2)*np.sin((lat_rad-lga_lat_rad)/2)+np.cos(lat_rad)*np.cos(lga_lat_rad)*np.sin((long_rad-lga_long_rad)/2)*np.sin((long_rad-lga_long_rad)/2)))
    dmile = drad*180*60*1.15/np.pi
    return df.assign(dpo_LGA=dmile)
trips = trips.pipe(distanceToLGA_dropoff)

In [7]:
# filter out any trips that have 4 or more passengers since those are not shareable
trips = trips.loc[trips["passenger_count"] < 4]
# sort trips by pickup time, but first convert the filed to a datetime
trips['tpep_pickup_datetime'] =  pd.to_datetime(trips['tpep_pickup_datetime'], format='%Y-%m-%d %H:%M:%S')
trips.sort_values(by=["tpep_pickup_datetime"])

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pku_LGA,dpo_LGA
0,2016-01-01 00:00:00,2016-01-01 00:00:00,1.10,-73.990372,40.734695,-73.981842,40.732407,2,6.813587,6.471903
2,2016-01-01 00:00:00,2016-01-01 00:00:00,10.54,-73.984550,40.679565,-73.950272,40.788925,1,8.792662,4.323707
3,2016-01-01 00:00:00,2016-01-01 00:00:00,4.75,-73.993469,40.718990,-73.962242,40.657333,1,7.443047,9.320319
4,2016-01-01 00:00:00,2016-01-01 00:00:00,1.76,-73.960625,40.781330,-73.977264,40.758514,3,4.757918,5.682344
5,2016-01-01 00:00:00,2016-01-01 00:18:30,5.52,-73.980118,40.743050,-73.913490,40.763142,2,6.104168,2.361085
6,2016-01-01 00:00:00,2016-01-01 00:26:45,7.45,-73.994057,40.719990,-73.966362,40.789871,2,7.435427,5.155570
7,2016-01-01 00:00:01,2016-01-01 00:11:55,1.20,-73.979424,40.744614,-73.992035,40.753944,1,6.034070,6.500262
8,2016-01-01 00:00:02,2016-01-01 00:11:14,6.00,-73.947151,40.791046,-73.920769,40.865578,1,4.207095,6.910658
9,2016-01-01 00:00:02,2016-01-01 00:11:08,3.21,-73.998344,40.723896,-73.995850,40.688400,1,7.505685,8.787973
10,2016-01-01 00:00:03,2016-01-01 00:06:19,0.79,-74.006149,40.744919,-73.993797,40.741440,1,7.363571,6.815963


In [8]:
trips_fromLGA = trips.loc[trips['pku_LGA'] <= 0.5, :]
#trips from LGA: pickup location in 0.5mile range of LGA coordinates, and dropoff location is outside of 0.5mile range
trips_fromLGA = trips_fromLGA.loc[trips_fromLGA['dpo_LGA'] > 0.5, :]
trips_fromLGA

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pku_LGA,dpo_LGA
289,2016-01-29 09:18:37,2016-01-29 09:56:54,10.20,-73.872765,40.773754,-73.973061,40.761772,1,0.141964,5.429452
422,2016-01-29 09:19:02,2016-01-29 09:50:34,8.60,-73.870934,40.773758,-73.982758,40.742805,1,0.063166,6.239775
433,2016-01-29 09:19:03,2016-01-29 09:25:55,2.00,-73.872803,40.773834,-73.906647,40.770035,1,0.145881,1.914034
534,2016-01-29 09:19:19,2016-01-29 09:30:25,2.10,-73.874763,40.773937,-73.882248,40.754311,3,0.245322,1.435230
569,2016-01-29 09:19:26,2016-01-29 10:06:03,13.00,-73.862831,40.768871,-73.976311,40.760128,1,0.480801,5.614521
807,2016-01-29 09:20:09,2016-01-29 09:55:37,10.03,-73.872910,40.774151,-73.970413,40.761841,1,0.160513,5.291865
841,2016-01-29 09:20:14,2016-01-29 09:43:51,9.46,-73.862823,40.768822,-73.954689,40.765785,1,0.483163,4.441652
881,2016-01-29 09:20:23,2016-01-29 09:44:32,10.80,-73.873146,40.774136,-73.905655,40.678722,1,0.170921,6.764662
887,2016-01-29 09:20:24,2016-01-29 09:47:55,8.70,-73.870964,40.773941,-73.981499,40.739906,1,0.074614,6.248248
1015,2016-01-29 09:20:45,2016-01-29 09:48:23,9.20,-73.863518,40.769871,-73.971428,40.754177,1,0.412365,5.446104


In [9]:
trips_toLGA = trips.loc[trips['dpo_LGA'] <= 0.5, :]
# trips to LGA: pickup location outside the 0.5mile range, and dropoff is in 0.5mile range of LGA
trips_toLGA = trips_toLGA.loc[trips_toLGA['pku_LGA'] > 0.5, :]
trips_toLGA

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,trip_distance,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,pku_LGA,dpo_LGA
268,2016-01-29 09:18:33,2016-01-29 09:40:10,10.76,-73.977135,40.751945,-73.870529,40.773758,1,5.773101,0.053690
307,2016-01-29 09:18:39,2016-01-29 09:45:00,8.82,-73.975693,40.765018,-73.870613,40.773979,2,5.538719,0.069572
388,2016-01-29 09:18:54,2016-01-29 10:08:47,12.93,-74.000771,40.762310,-73.865028,40.770580,2,6.861671,0.319477
547,2016-01-29 09:19:22,2016-01-29 09:44:07,13.30,-74.015770,40.711155,-73.872437,40.774536,1,8.724008,0.155988
600,2016-01-29 09:19:32,2016-01-29 09:47:54,9.30,-73.984108,40.755341,-73.871193,40.774200,1,6.075012,0.096120
691,2016-01-29 09:19:48,2016-01-29 09:49:40,10.00,-73.975052,40.787968,-73.865036,40.770565,1,5.573094,0.319693
767,2016-01-29 09:20:02,2016-01-29 09:40:16,10.60,-73.973946,40.756927,-73.872459,40.774529,1,5.532686,0.156522
870,2016-01-29 09:20:20,2016-01-29 09:43:03,9.39,-73.955040,40.764954,-73.864525,40.770142,1,4.466684,0.357970
956,2016-01-29 09:20:37,2016-01-29 09:45:08,11.20,-73.972412,40.755341,-73.865280,40.770664,1,5.477592,0.305247
1044,2016-01-29 09:20:49,2016-01-29 09:42:37,9.93,-73.969032,40.756817,-73.872627,40.774307,1,5.283000,0.153822


In [10]:
trips_fromLGA.to_csv('tripsFromLGA_012016.csv', index=False)
trips_toLGA.to_csv('tripsToLGA_012016.csv', index=False)

In [11]:
# Calucluates the distances between all the points given.
# points: list of tuples, ex: [(lat, long)]
# returns: a list of lists to represent a matrix. 
# each entry [i][j] = the distrance from points[i] to points[j]
def distance(points):
    matrix = []
    for i in range(len(points)):
        matrix.append([])
        for j in range(len(points)):
            dist = geodesic(points[i], points[j]).miles
            matrix[i].append(dist)
            
    return matrix

In [12]:
print(distance([(10,10), (11, 11)]))

[[0.0, 96.69791029042952], [96.69791029042952, 0.0]]
