In [2]:
from google.cloud import bigquery
from google.oauth2 import service_account
import os
import time
import geopy.distance
from gmplot import gmplot
import math
import numpy as np

Credentials for downloading data from Google BigQuery

In [4]:
# Location of the service-account JSON key used to authenticate (placeholder value).
key_path = "xxx.json"

# Build credentials from the key file, requesting the cloud-platform OAuth scope.
credentials = service_account.Credentials.from_service_account_file(
    key_path, scopes=["https://www.googleapis.com/auth/cloud-platform"]
)

# BigQuery client bound to the project that the service account belongs to.
client = bigquery.Client(project=credentials.project_id, credentials=credentials)

Query NYC yellow 2015 taxi data, night pick-ups (12 am to 4 am), timeline based on https://www1.nyc.gov/assets/mome/pdf/ESI-NYCEDC-Nightlife-Report-2018.pdf

Note: We use 2015 data because it contains complete latitude and longitude information. More recent years are grouped into taxi zones.

In [17]:
# Full-year query over the 2015 yellow-taxi table.
# - Pickup coordinates are rounded to 4 decimal places and used as the grouping key.
# - num_pickups is SUM(passenger_count), i.e. it counts passengers rather than trips.
# - Rows are kept only when passenger_count and trip_distance are positive, the pickup
#   coordinates are non-zero, and the pickup time of day is at or before 04:00:00.
# - The inner SELECT also derives dropoff_time, pickup_date, dropoff_date, pickup_month
#   and dropoff_month; the outer query in this cell only reads pickup_time.
query = """
        SELECT ROUND(pickup_latitude, 4) as lat,
        ROUND(pickup_longitude, 4) as long,
        SUM(passenger_count) as num_pickups
        FROM (Select *,CAST(pickup_datetime as time) AS pickup_time,CAST(dropoff_datetime as time) AS dropoff_time, 
            CAST(pickup_datetime as date) AS pickup_date, CAST(dropoff_datetime as date) AS dropoff_date, 
            EXTRACT(month FROM pickup_datetime) AS pickup_month, EXTRACT(month FROM dropoff_datetime) AS dropoff_month 
            FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2015`)
        WHERE(passenger_count > 0 AND trip_distance > 0 AND pickup_longitude <> 0 AND pickup_latitude <> 0 AND
              pickup_time <= '04:00:00')
        GROUP BY lat, long    
    """
    
# Submit the query and load the aggregated (lat, long, num_pickups) rows into a DataFrame.
query_job = client.query(query)  
dfk = query_job.to_dataframe()

Subset Brooklyn Carroll Gardens - Cobble Hill data:

In [18]:
# Keep only the rows whose rounded pickup point lies inside the bounding box
# (both edges inclusive on each axis).
dfk2 = dfk[
    dfk["long"].between(-74.0011, -73.9688)
    & dfk["lat"].between(40.6640, 40.7043)
]

Create a Heatmap from the NYC yellow taxi pick-up locations in the Brooklyn Carroll Gardens Area:

In [19]:
# Heatmap of the subsetted night pick-up locations, centred on (40.6848, -73.9935) at zoom 17.
gmap = gmplot.GoogleMapPlotter(40.6848, -73.9935, 17)

# The Google Maps API key is read from the GOOGLE_MAPS_API_KEY environment variable
# instead of being committed in the notebook; an empty key is used when it is unset.
gmap.apikey = os.environ.get("GOOGLE_MAPS_API_KEY", "")

latitudes = dfk2.lat
longitudes = dfk2.long
gmap.heatmap(latitudes, longitudes)

# Build the output path portably (the previous "Images\my_map.html" literal relied on
# a Windows separator and an invalid "\m" escape) and make sure the folder exists.
os.makedirs("Images", exist_ok=True)
gmap.draw(os.path.join("Images", "my_map.html"))

The resulting image shows how messy the late pick-up data in the Brooklyn Carroll Gardens area is:

<div>    
<img src="Images/img1.png" width="500"/>
</div>

The objective is to process this data and see whether an exploratory analysis can help answer which bars are trending based on pick-ups. To reduce the influence of outliers, we are going to use monthly data:

In [21]:
months = ['1','2','3','4','5','6','7','8','9','10','11','12']
df_l = []

# One query text for all months; the month is supplied as the @month query parameter
# rather than being formatted into the SQL string.
# - Pickups are restricted to the Carroll Gardens bounding box and to a pickup time of
#   day at or before 04:00:00.
# - Rows are bucketed by the month of the PICKUP timestamp (the earlier version filtered
#   on the drop-off month, which is not the quantity being counted).
# - num_pickups is SUM(passenger_count) per coordinate pair rounded to 4 decimals.
query = """
        SELECT ROUND(pickup_latitude, 4) AS lat,
               ROUND(pickup_longitude, 4) AS long,
               SUM(passenger_count) AS num_pickups
        FROM (SELECT pickup_latitude, pickup_longitude, passenger_count, trip_distance,
                     CAST(pickup_datetime AS time) AS pickup_time,
                     EXTRACT(month FROM pickup_datetime) AS pickup_month
              FROM `bigquery-public-data.new_york_taxi_trips.tlc_yellow_trips_2015`)
        WHERE (passenger_count > 0 AND trip_distance > 0 AND
               pickup_longitude <> 0 AND pickup_latitude <> 0 AND
               pickup_longitude >= -73.999724 AND pickup_longitude <= -73.975729 AND
               pickup_latitude >= 40.674620 AND pickup_latitude <= 40.691091 AND
               pickup_time <= '04:00:00' AND pickup_month = @month)
        GROUP BY lat, long
    """

for m in months:
    # Bind the current month as an INT64 parameter for this run of the query.
    job_config = bigquery.QueryJobConfig(
        query_parameters=[bigquery.ScalarQueryParameter("month", "INT64", int(m))]
    )
    query_job = client.query(query, job_config=job_config)

    # df_l[k] holds the aggregated rows for month k + 1.
    df = query_job.to_dataframe()
    df_l.append(df)
    print("Month {} of 12".format(m))

Month 1 of 12
Month 2 of 12
Month 3 of 12
Month 4 of 12
Month 5 of 12
Month 6 of 12
Month 7 of 12
Month 8 of 12
Month 9 of 12
Month 10 of 12
Month 11 of 12
Month 12 of 12


Now let's suppose a user wants to go out in the Brooklyn Carroll Gardens area and wants to see which bars are trending, and so selects 8 bars from the area. In this example the selected bars are:

* Boat Bar
* Camp Bar
* Leyenda Bar 
* Angry Wades Bar 
* Barely Disfigured Bar  
* Zombie Hut Bar
* Bar Great Harry
* Brooklyn Social Bar

The program would use the coordinates of the selected bars and obtain the number of nightly pickups each month in a 40 meter radius:

Note: 80 meters is the average length of a NYC block, so the 40-meter radius corresponds to half a block.

In [24]:
radius = 80/2 #meters

# (latitude, longitude) of each selected bar.
Other_coords = (40.6859, -73.9909)  #Boat
Other_coords2 = (40.6858, -73.9910)  #Camp
Leyenda_coords = (40.6843, -73.9919)  #Leyenda
Other_coords3 = (40.6841, -73.9924)  #Angry Wades
Other_coords4 = (40.6831, -73.9927) #Barely Disfigured
Other_coords5 = (40.6826, -73.9931) #Zombie Hut
Other_coords6 = (40.6824, -73.9936)  #Bar Great Harry
Other_coords7 = (40.6805, -73.9945)  #Brooklyn Social

# One 12-slot accumulator per bar; slot k collects the pickups of month k + 1.
pickups = [0] * 12
pickupsO = [0] * 12
pickupsO2 = [0] * 12
pickupsO3 = [0] * 12
pickupsO4 = [0] * 12
pickupsO5 = [0] * 12
pickupsO6 = [0] * 12
pickupsO7 = [0] * 12

# Pair every bar with its accumulator so a single loop replaces eight copies of the
# same distance/threshold/accumulate logic. The lists are updated in place, so the
# module-level names above keep their results.
bar_series = [
    (Leyenda_coords, pickups),
    (Other_coords, pickupsO),
    (Other_coords2, pickupsO2),
    (Other_coords3, pickupsO3),
    (Other_coords4, pickupsO4),
    (Other_coords5, pickupsO5),
    (Other_coords6, pickupsO6),
    (Other_coords7, pickupsO7),
]

for i,df in enumerate(df_l):
    for j,row in enumerate(df.values):
        # Each row is (lat, long, num_pickups), in the column order of the query.
        lat = row[0] ; long = row[1]
        try:
            # Distance in meters from this pickup point to every bar.
            distances = [
                geopy.distance.distance(bar_coords, (lat,long)).m
                for bar_coords, _ in bar_series
            ]
        except ValueError:
            # Only coordinate errors are reported and skipped (geopy is expected to raise
            # ValueError for invalid points); anything else, including a
            # KeyboardInterrupt, now propagates instead of being silently swallowed.
            print("ERROR: lat {}, long {}".format(lat,long))
        else:
            # A pickup counts towards every bar whose radius contains it.
            for dist, (_, monthly_totals) in zip(distances, bar_series):
                if dist <= radius:
                    monthly_totals[i] += row[2]
        if j%1000 == 0:
            print("Line {} of {}, month {}".format(j,len(df),i+1))
    print("Finished month {} of 12".format(i+1))

Line 0 of 3298, month 1
Line 1000 of 3298, month 1
Line 2000 of 3298, month 1
Line 3000 of 3298, month 1
Finished month 1 of 12
Line 0 of 3287, month 2
Line 1000 of 3287, month 2
Line 2000 of 3287, month 2
Line 3000 of 3287, month 2
Finished month 2 of 12
Line 0 of 3545, month 3
Line 1000 of 3545, month 3
Line 2000 of 3545, month 3
Line 3000 of 3545, month 3
Finished month 3 of 12
Line 0 of 3383, month 4
Line 1000 of 3383, month 4
Line 2000 of 3383, month 4
Line 3000 of 3383, month 4
Finished month 4 of 12
Line 0 of 3529, month 5
Line 1000 of 3529, month 5
Line 2000 of 3529, month 5
Line 3000 of 3529, month 5
Finished month 5 of 12
Line 0 of 3154, month 6
Line 1000 of 3154, month 6
Line 2000 of 3154, month 6
Line 3000 of 3154, month 6
Finished month 6 of 12
Line 0 of 3088, month 7
Line 1000 of 3088, month 7
Line 2000 of 3088, month 7
Line 3000 of 3088, month 7
Finished month 7 of 12
Line 0 of 2897, month 8
Line 1000 of 2897, month 8
Line 2000 of 2897, month 8
Finished month 8 of 12
Lin

We then define a score based on the average pickups and adjusted for other bars that co-exist in the same radius. This is done by calculating the intersection area of two or more circles:

In [25]:
def intersection_area(A, B, r):
    """Return the overlap of two equal-radius circles as a fraction of one circle's area.

    Based on https://scipython.com/book/chapter-8-scipy/problems/p84/overlapping-circles/

    Parameters
    ----------
    A, B : the two circle centres, passed to ``geopy.distance.distance``.
    r    : common radius, in meters.

    Returns
    -------
    1 when the centres coincide, 0 when the centres are at least ``2*r`` apart,
    otherwise the lens-shaped intersection area divided by ``pi * r**2``.
    """
    separation = geopy.distance.distance(A, B).m

    if separation <= 0:
        # Same centre: the circles overlap completely.
        return 1
    elif separation >= 2*r:
        # Too far apart to touch.
        return 0

    r_squared = r**2
    separation_squared = separation**2
    # Half-angle subtended by the common chord at a circle centre.
    half_angle = np.arccos(separation_squared / (2*separation*r))
    # Lens area = two circular sectors minus the two triangles they contain.
    sectors = 2*r_squared * half_angle
    triangles = 0.5 * (2*r_squared * np.sin(2*half_angle))
    return (sectors - triangles)/(math.pi * r_squared)

def average(lst):
    """Return the arithmetic mean of the values in ``lst``."""
    total = sum(lst)
    count = len(lst)
    return total / count

def score(pickups, intsc):
    """Discount ``pickups`` by half of the overlap fraction ``intsc``.

    An overlap of 0 leaves the value unchanged; an overlap of 1 halves it.
    """
    penalty = 0.5*intsc
    weight = 1-penalty
    return pickups * weight

Based on the score we color code the circular radius for graphical purposes:

In [26]:
def set_color(number):
    """Map a score to the hex colour of the band it falls in.

    Bands are checked from the lowest upper bound upwards and each upper bound is
    inclusive; any value above the last bound (or one that compares false to every
    bound) is drawn red.
    """
    bands = (
        (50, '#00B216'),   # Dark green
        (100, '#66FF79'),  # Light green
        (200, '#FFEB00'),  # Yellow
        (210, '#CCC033'),  # Dark yellow
        (220, '#FF9F00'),  # Orange
        (230, '#E58F00'),  # Dark orange
    )
    for upper_bound, hex_color in bands:
        if number <= upper_bound:
            return hex_color
    return '#FF0500'  # Red

# (coordinates, monthly pickup totals, overlap fraction) for each bar, in drawing order.
# The overlap fractions are the precomputed constants passed to score() to discount a
# bar whose radius is shared with neighbouring bars.
bar_layers = [
    (Other_coords, pickupsO, 0.7790),
    (Other_coords2, pickupsO2, 0.7790),
    (Leyenda_coords, pickups, 0.3408),
    (Other_coords3, pickupsO3, 0.3408),
    (Other_coords4, pickupsO4, 0.0537),
    (Other_coords5, pickupsO5, 0.4384),
    (Other_coords6, pickupsO6, 0.3847),
    (Other_coords7, pickupsO7, 0),
]

# Map centred on the Leyenda coordinates at zoom level 18.
gmap = gmplot.GoogleMapPlotter(Leyenda_coords[0], Leyenda_coords[1], 18)

# The Google Maps API key is read from the GOOGLE_MAPS_API_KEY environment variable
# instead of being committed in the notebook; an empty key is used when it is unset.
gmap.apikey = os.environ.get("GOOGLE_MAPS_API_KEY", "")

# One circle per bar, coloured by its overlap-adjusted average monthly pickups.
for bar_coords, monthly_totals, overlap in bar_layers:
    bar_color = set_color(score(average(monthly_totals), overlap))
    gmap.circle(bar_coords[0], bar_coords[1], radius, bar_color)

# Markers are added after all circles, matching the original drawing order.
for bar_coords, _, _ in bar_layers:
    gmap.marker(bar_coords[0], bar_coords[1], 'cornflowerblue')

# Build the output path portably (the previous "Images\my_map2.html" literal relied on
# a Windows separator and an invalid "\m" escape) and make sure the folder exists.
os.makedirs("Images", exist_ok=True)
gmap.draw(os.path.join("Images", "my_map2.html"))

<div>    
<img src="Images/img3.png" width="200"/>
</div>

This result would be displayed to the user along with the following table:

<div>    
<img src="Images/img4.png" width="800"/>
</div>