In [28]:
import os
import folium
import pandas as pd
import numpy as np
import geojson

%matplotlib inline

In [29]:
FILTERED_DATA_DIR = '/datadrive/taxi_data/filtered'
DATA_DIR = '/datadrive/taxi_data'

In [30]:
data = pd.read_csv(os.path.join(FILTERED_DATA_DIR, 'yellow_tripdata_2016-05.csv'), usecols=['tpep_pickup_date',
                                                                                            'pickup_longitude',
                                                                                            'pickup_latitude',
                                                                                            'region'])

In [31]:
# done for aggregating purposes
data['id'] = 1

In [32]:
data.head()

Unnamed: 0,pickup_longitude,pickup_latitude,tpep_pickup_date,region,id
0,-73.985901,40.76804,2016-05-01 00:00:00,1233,1
1,-73.991577,40.744751,2016-05-01 00:00:00,1180,1
2,-73.993073,40.741573,2016-05-01 00:00:00,1180,1
3,-73.991943,40.684601,2016-05-01 00:00:00,1173,1
4,-74.00528,40.740192,2016-05-01 00:00:00,1130,1


In [33]:
regions = pd.read_csv(os.path.join(DATA_DIR, 'regions.csv'), delimiter=';')
regions.head()

Unnamed: 0,region,west,east,south,north
0,1,-74.25559,-74.244478,40.49612,40.504508
1,2,-74.25559,-74.244478,40.504508,40.512896
2,3,-74.25559,-74.244478,40.512896,40.521285
3,4,-74.25559,-74.244478,40.521285,40.529673
4,5,-74.25559,-74.244478,40.529673,40.538061


In [34]:
# compute region centers
regions['long'] = (regions.west + regions.east) / 2
regions['lat'] = (regions.south + regions.north) / 2

In [35]:
# compute mapping from region code to its coordinates
mapping = dict(list(map(lambda r: (int(r[0]), (r[1], r[2])),
              np.array(regions[['region', 'lat', 'long']]))))

In [36]:
def region_to_lat_long(row):
    return mapping[row['region']]

## 1

In [37]:
# AGGREGATE DATA BY REGION AND HOUR
agg_data = data[['region', 'tpep_pickup_date', 'id']].groupby(['tpep_pickup_date', 'region']).aggregate('count').reset_index()

In [38]:
# amount of cells with at least one ride
rides_per_cell = agg_data[['region', 'id']].groupby('region').aggregate('sum').reset_index()
rides_per_cell[rides_per_cell.id > 0].region.unique().shape

(1217,)

In [39]:
# So there are 2500 - 1217 regions with no rides
2500 - 1217

1283

## 2

In [46]:
map_osm = folium.Map(location=[40.7128, -74.0059])
folium.Marker([40.7484, -73.9857], popup='Empire state building').add_to(map_osm)

<folium.map.Marker at 0x7f7324894588>

##  3

In [50]:
data_aggregated_total = data[['region', 'id']].groupby('region').aggregate('count').reset_index()

In [53]:
lat_long = data_aggregated_total.apply(region_to_lat_long, axis=1)
lat_long_reshped = np.array(list(zip(*zip(*lat_long))))
data_aggregated_total['lat'] = lat_long_reshped[:, 0]
data_aggregated_total['long'] = lat_long_reshped[:, 1]

In [54]:
plugins.HeatMap(np.array(agg_filtered_data[['lat', 'long','id']]), radius=9).add_to(map_osm)
map_osm

## 4

In [55]:
map_osm = folium.Map(location=[40.7128, -74.0059])
folium.Marker([40.6892, -74.0445], popup='Statue of Liberty').add_to(map_osm)

<folium.map.Marker at 0x7f73248ca6d8>

## 5 and 6

In [57]:
# get regions with amount fo rider per hour less than 5
rare_regions_df = agg_data[['region', 'id']].groupby('region').aggregate('mean').reset_index()
rare_regions = rare_regions_df[rare_regions_df.id <= 5].region.unique()

In [75]:
# filter out rare regions. Sum all rides for whole month per each region
agg_filtered_data = agg_data[~agg_data.region.isin(rare_regions)][['region', 'id']] \
    .groupby('region') \
    .aggregate('mean') \
    .reset_index()

In [70]:
# add coordinates for each record in dataframe ( for visulization)
lat_long = agg_filtered_data.apply(region_to_lat_long, axis=1)
lat_long_reshped = np.array(list(zip(*zip(*lat_long))))
agg_filtered_data['lat'] = lat_long_reshped[:, 0]
agg_filtered_data['long'] = lat_long_reshped[:, 1]

In [71]:
agg_filtered_data[['region', 'id']].tail(10)

Unnamed: 0,region,id
94,1683,5.251724
95,1684,18.693676
96,1733,6.371248
97,1734,224.497191
98,1783,209.171946
99,2068,100.172805
100,2069,12.015314
101,2118,146.490591
102,2119,39.16692
103,2168,73.153515


In [73]:
# plot heatmap with color propotrional to amount of rides
from folium import plugins
plugins.HeatMap(np.array(agg_filtered_data[['lat', 'long','id']]), radius=9).add_to(map_osm)
map_osm