In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import time
import os
from IPython.display import clear_output


In [None]:
#from sklearn.cluster import KMeans
#from sklearn import preprocessing
#from sklearn.model_selection import train_test_split
#from sklearn.preprocessing import Imputer
#from sklearn import linear_model
#from sklearn.metrics import mean_squared_error
#from sklearn.ensemble import RandomForestRegressor
import folium
import tensorflow as tf
from bokeh.plotting import figure, output_notebook, show 

# Loading Data

In [None]:
# Load the green-taxi trip file, restricted to the columns used in this
# notebook (the fare amount can be added to `usecols` later if needed).
df = pd.read_csv(
    "green_tripdata_2016-05.csv",
    usecols=[
        'lpep_pickup_datetime',
        'Lpep_dropoff_datetime',
        'Pickup_longitude',
        'Pickup_latitude',
        'Dropoff_longitude',
        'Dropoff_latitude',
        'Passenger_count',
        'Trip_distance',
    ],
)

In [None]:
# Brief look at the first rows of the loaded green-taxi data
df.head()

In [None]:
# Print the coordinate range as (longitude, latitude) pairs: the minimum then
# the maximum for pickups, followed by the same two lines for drop-offs.
for prefix in ('Pickup', 'Dropoff'):
    lon, lat = df[prefix + '_longitude'], df[prefix + '_latitude']
    print(np.min(lon), np.min(lat))
    print(np.max(lon), np.max(lat))

In [None]:
# Inspect the column dtypes (needed to suppress scientific notation below)
df.dtypes

In [None]:
# Summary statistics rendered in fixed-point rather than scientific notation.
# `format(x, 'f')` cannot be applied to a whole column at once, so every value
# is formatted individually; the displayed frame therefore holds strings.
df.describe().apply(lambda col: col.map('{:f}'.format))

In [None]:
# Number of trips (rows) in the green-taxi frame
print('Size:', len(df))

# Visualization of Dataset

In [None]:
# Pickup coordinates as plain NumPy arrays (x = longitude, y = latitude)
points_x, points_y = (
    np.array(df[column]) for column in ('Pickup_longitude', 'Pickup_latitude')
)

In [None]:
# Basic scatter plot of every green-taxi pickup (x = longitude, y = latitude).
# Axis limits are set first, as [x_min, x_max, y_min, y_max].
plt.axis([-74.5, -73.4,40.5, 41.2])
plt.scatter(points_x,points_y)

In [None]:
%time
# random 10k samples
samples = df.sample(n=1000)
samples2=df.sample(n=10000)

In [None]:
# Interactive map with one small circle per sampled green-taxi pickup;
# each popup shows the point's "lat,lon" text.
m = folium.Map(location=[40.9, -74.05], zoom_start=12)
for _, trip in samples.iterrows():
    lat, lon = trip['Pickup_latitude'], trip['Pickup_longitude']
    marker = folium.CircleMarker(
        [lat, lon],
        radius=1,
        color='blue',
        popup=str(lat) + ',' + str(lon),
        fill_color='#FD8A6C',
    )
    marker.add_to(m)

In [None]:
# Display the folium map held in `m`
m

In [None]:
# Configure bokeh to render its plots inline in the notebook
output_notebook()

In [None]:
# Plot extent as (x_range, y_range); x is longitude and y is latitude.
x_range, y_range = (-74.05, -73.7), (40.6, 40.9)
NYC = (x_range, y_range)

# Canvas size in pixels; the height follows from a 1.2 width/height ratio.
plot_width = 750
plot_height = int(plot_width // 1.2)

def base_plot(tools='pan, wheel_zoom, reset', plot_width=plot_width, plot_height=plot_height, **plot_args):
    """Create a bare bokeh figure limited to the module-level x_range/y_range.

    The figure has no outline, no borders and no grid lines.

    Parameters
    ----------
    tools : str
        Comma-separated bokeh tool names passed to ``figure``.
    plot_width, plot_height : int
        Figure size; defaults are the module-level values at definition time.
    **plot_args
        Extra keyword arguments forwarded unchanged to ``figure``.

    Returns
    -------
    The configured bokeh figure.
    """
    border_names = ('min_border', 'min_border_left', 'min_border_right',
                    'min_border_top', 'min_border_bottom')
    zero_borders = dict.fromkeys(border_names, 0)

    fig = figure(tools=tools, plot_width=plot_width, plot_height=plot_height,
                 x_range=x_range, y_range=y_range, outline_line_color=None,
                 **zero_borders, **plot_args)

    # Hide the grid on both axes
    for grid in (fig.xgrid, fig.ygrid):
        grid.grid_line_color = None
    return fig

# Glyph styling shared by the bokeh scatter plots: borderless blue markers of size 5
options = {'line_color': None, 'fill_color': 'blue', 'size': 5}

In [None]:
# Scatter the `samples2` pickups on a fresh bare figure using the shared glyph options
p = base_plot()

p.circle(x=samples2['Pickup_longitude'], y=samples2['Pickup_latitude'], **options)
show(p)

In [None]:
import datashader as ds
from datashader import transfer_functions as tr_fns
from datashader.colors import Greys9
# First two entries of the reversed Greys9 palette
Greys9_r = list(Greys9)[::-1][:2]

In [None]:
%%time
# Rasterize every green-taxi drop-off onto a plot_width x plot_height canvas
# (aggregated with ds.count('Passenger_count')), then shade the result from
# white to dark blue on a linear scale.
cvs = ds.Canvas(plot_width=plot_width, plot_height=plot_height, x_range=x_range, y_range=y_range)
agg = cvs.points(df, 'Dropoff_longitude', 'Dropoff_latitude', ds.count('Passenger_count'))
img = tr_fns.shade(agg, cmap=["white", 'darkblue'], how='linear')

# Last expression: display the shaded image
img

In [None]:
from datashader.bokeh_ext import InteractiveImage
from functools import partial
from datashader.utils import export_image
from datashader.colors import colormap_select, Greys9, Hot, viridis, inferno
from IPython.core.display import HTML, display

# Shared settings for the exported datashader images
background = "black"
# `export` is export_image pre-bound to the "export" path and the chosen background
export = partial(export_image, export_path="export", background=background)
# `cm` is colormap_select with `reverse` switched on when the background is black
cm = partial(colormap_select, reverse=(background=="black"))

def create_image(x_range, y_range, w=plot_width, h=plot_height):
    """Render green-taxi pickups from `df` as a datashader image.

    Points are aggregated with ``ds.count('Passenger_count')`` on a ``w`` x ``h``
    canvas covering ``x_range``/``y_range``, shaded with the ``Hot`` colormap
    using histogram equalization, and finally spread with ``dynspread``.
    """
    canvas = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
    pickup_counts = canvas.points(
        df, 'Pickup_longitude', 'Pickup_latitude', ds.count('Passenger_count')
    )
    shaded = tr_fns.shade(pickup_counts, cmap=Hot, how='eq_hist')
    return tr_fns.dynspread(shaded, threshold=0.5, max_px=4)

# Export a static image of the NYC extent as "NYCT_hot_green", then show the
# same rendering through InteractiveImage on a black-background figure.
p = base_plot(background_fill_color=background)
export(create_image(*NYC), "NYCT_hot_green")
InteractiveImage(p, create_image)

In [None]:
from functools import partial

# Export helper and colormap selector for the images below.
background = "black"
export = partial(export_image, export_path="export", background=background)
# Bound to `cm`, as in the other cells; binding it to `m` would overwrite the
# folium map created earlier under that name.
cm = partial(colormap_select, reverse=(background=="black"))

def create_image90(x_range, y_range, w=plot_width, h=plot_height):
    """Render only the busiest green-taxi drop-off pixels from `df`.

    Drop-offs are aggregated with ``ds.count('Passenger_count')``; pixels whose
    count does not exceed the 90th percentile of the aggregate are masked out
    before shading with the ``inferno`` colormap (histogram equalization) and
    spreading with ``dynspread``.
    """
    canvas = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
    drop_counts = canvas.points(
        df, 'Dropoff_longitude', 'Dropoff_latitude', ds.count('Passenger_count')
    )
    cutoff = np.percentile(drop_counts, 90)
    busiest = drop_counts.where(drop_counts > cutoff)
    shaded = tr_fns.shade(busiest, cmap=inferno, how='eq_hist')
    return tr_fns.dynspread(shaded, threshold=0.3, max_px=4)
    
# Export and display the 90th-percentile drop-off image. The export uses
# create_image90 so that the saved "NYCT_90th" file matches the interactive
# view rather than duplicating the plain pickup heat map.
p = base_plot(background_fill_color=background)
export(create_image90(*NYC), "NYCT_90th")
InteractiveImage(p, create_image90)

In [None]:
def merged_images(x_range, y_range, w=plot_width, h=plot_height, how='log'):
    """Render an image contrasting green-taxi pickups and drop-offs from `df`.

    Pickups and drop-offs are aggregated separately on the same canvas with
    ``ds.count('Passenger_count')``. Pixels where drop-offs outnumber pickups
    are shaded in blues from the drop-off counts; pixels where pickups
    outnumber drop-offs are shaded in reds from the pickup counts.

    Parameters
    ----------
    x_range, y_range : tuple
        Extent of the canvas.
    w, h : int
        Canvas size in pixels.
    how : str
        Interpolation name passed to ``tr_fns.shade``.

    Returns
    -------
    The stacked image after ``dynspread``.
    """
    cvs = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
    picks = cvs.points(df, 'Pickup_longitude', 'Pickup_latitude', ds.count('Passenger_count'))
    drops = cvs.points(df, 'Dropoff_longitude', 'Dropoff_latitude', ds.count('Passenger_count'))
    more_drops = tr_fns.shade(drops.where(drops > picks), cmap=["darkblue", 'cornflowerblue'], how=how)
    # Shade the pickup counts here: the pickup-dominated layer must be coloured
    # by how many pickups there were, not by the (smaller) drop-off counts.
    more_picks = tr_fns.shade(picks.where(picks > drops), cmap=["darkred", 'orangered'], how=how)
    img = tr_fns.stack(more_picks, more_drops)
    return tr_fns.dynspread(img, threshold=0.3, max_px=4)

# Export the pickup-vs-drop-off comparison for the NYC extent; the interactive
# version is left disabled.
p = base_plot(background_fill_color=background)
export(merged_images(*NYC), "NYCT_pickups_vs_drops")
#InteractiveImage(p, merged_images)

# Feature Selection

In [None]:
# Keep exactly the three columns that are renamed below; this is the same
# Date/Lat/Long schema the yellow-taxi frame uses. `.copy()` makes `green` an
# independent frame so the assignments do not target a slice of `df`.
green = df[['lpep_pickup_datetime', 'Pickup_latitude', 'Pickup_longitude']].copy()
green.columns = ['Date', 'Lat', 'Long']
green["Type"] = "green" #used to identify taxi type later
# Truncate the timestamp to the hour and replace whitespace with "-".
green["Date"] = green["Date"].str[0:13].replace(r"\s+", "-", regex=True)


In [None]:
# Display the prepared green-taxi frame
green

In [None]:
# Load the yellow-taxi trips; index_col=False tells pandas not to use the first column as the index
yellow_raw=pd.read_csv("yellow_tripdata_2016-05.csv", index_col=False)

In [None]:
# Scatter every yellow-taxi pickup (x = longitude, y = latitude) inside fixed axis limits
yellow_temp = yellow_raw[['passenger_count', 'pickup_latitude', 'pickup_longitude']]

points_x_yel, points_y_yel = (
    np.array(yellow_temp[column]) for column in ('pickup_longitude', 'pickup_latitude')
)
plt.axis([-74.5, -73.4, 40.5, 41.2])
plt.scatter(points_x_yel, points_y_yel)

In [None]:
# Reduce the yellow-taxi data to the Date/Lat/Long schema. `.copy()` makes
# `yellow` an independent frame so the assignments below do not target a
# slice of `yellow_raw`.
yellow = yellow_raw[['tpep_pickup_datetime', 'pickup_latitude', 'pickup_longitude']].copy()
yellow.columns = ['Date', 'Lat', 'Long']
yellow["Type"] = "yellow" #used to identify taxi type later
# Truncate the timestamp to the hour and replace whitespace with "-".
yellow["Date"] = yellow["Date"].str[0:13].replace(r"\s+", "-", regex=True)

In [None]:
# Display the prepared yellow-taxi frame
yellow

In [None]:
# Random subsets of the yellow-taxi data: 1,000 prepared rows for the folium
# map and 10,000 raw rows.
sample_yellow = yellow.sample(n=1000)
sample_yellow_raw = yellow_raw.sample(n=10000)

# One small circle per sampled pickup; each popup shows the "lat,lon" text.
m_yellow = folium.Map(location=[40.9, -74.05], zoom_start=12)
for _, trip in sample_yellow.iterrows():
    lat, lon = trip['Lat'], trip['Long']
    marker = folium.CircleMarker(
        [float(lat), float(lon)],
        radius=1,
        color='blue',
        popup=str(lat) + ',' + str(lon),
        fill_color='#FD8A6C',
    )
    marker.add_to(m_yellow)
m_yellow

In [None]:

# Build a fresh figure (as the green-taxi scatter does) instead of drawing
# onto whichever `p` an earlier cell happened to leave behind.
p = base_plot()
p.circle(x=sample_yellow_raw['pickup_longitude'], y=sample_yellow_raw['pickup_latitude'], **options)
show(p)

In [None]:
# Load the weather file; index_col=False tells pandas not to use the first column as the index
weather=pd.read_csv("nyc_weather_data_2016_05_temps_only.csv", index_col=False)
#weather

In [None]:
# Keep the first row per DATE value and normalise the date to YYYY-MM-DD text.
weather = weather.drop_duplicates(subset='DATE', keep='first')
weather["DATE"] = pd.to_datetime(weather["DATE"]).dt.strftime('%Y-%m-%d')

# `.copy()` makes the two-column selection an independent frame, so the
# assignments below do not target a slice of the wider table.
weather = weather[['DATE', 'TAVG']].copy()
weather.columns = ['Date', 'temp'] #add rain
# NOTE(review): missing temperatures become 0 here - confirm that is intended.
weather = weather.fillna(0)

# Lower-case `date` is the join key for the trip data. The capitalised `Date`
# column is kept as well: the later merge depends on both frames having it
# (it is what produces the `Date_x` / `Date_y` columns).
weather["date"] = weather["Date"].str[0:10].replace(r"\s+", "-", regex=True)
weather

# Merging data

In [None]:
# Stack the yellow and green frames row-wise (axis=0) into one table
data=pd.concat([yellow, green], axis=0)
data

In [None]:
# Day-level key: the first 10 characters of the hourly `Date` string, used to
# join the weather table. The pattern is a raw string so that `\s` is not
# treated as an (invalid) string escape.
data["date"] = data["Date"].str[0:10].replace(r"\s+", "-", regex=True)

In [None]:
#data

In [None]:
# Left-join the weather rows onto every trip using the day-level `date` key.
# Both frames also carry a `Date` column, so the result holds `Date_x`
# (from the trips) and `Date_y` (from the weather table).
data=pd.merge(data, weather, how='left', on=['date']) #add for weather if I can

#data.to_csv('may_data_both.csv', sep='\t')

# We will use this combined dataset for our modeling

In [None]:
# Parse the hourly trip timestamp and split it into numeric calendar features
date = pd.to_datetime(data['Date_x'], format='%Y-%m-%d-%H')
for feature in ('Month', 'Day', 'Hour'):
    # e.g. 'Month' -> date.dt.month
    data[feature] = getattr(date.dt, feature.lower())
data

In [None]:
# Persist the merged frame as a tab-separated file. The row index is written
# too (index=False is not passed), so it comes back as an extra first column
# when the file is read again.
data.to_csv('may_data_combined_weather.csv', sep='\t')

# Starting point for part 2: reducing dimensionality

In [None]:
# Reload the combined trips-plus-weather table from the tab-separated file
data2=pd.read_csv("may_data_combined_weather.csv",sep='\t')
data2

In [None]:
# Check the column dtypes after the CSV round trip
data2.dtypes

In [None]:
# Two random subsets of the combined data: 1,000 rows and 10,000 rows
sample_data2 = data2.sample(n=1000)
sample_data2_big = data2.sample(n=10000)
sample_data2_big

In [None]:
# Map of the 1,000-row sample of the combined data. The loop iterates
# `sample_data2`; no frame named `sample_data` is defined in this notebook.
m2 = folium.Map(location=[40.9, -74.05],zoom_start=12)
for each in sample_data2.iterrows():
    # each is an (index, row) pair; the popup shows the point's "lat,lon" text
    folium.CircleMarker([float(each[1]['Lat']),float(each[1]['Long'])],
                        radius=1,
                        color='blue',
                        popup=str(each[1]['Lat'])+','+str(each[1]['Long']),
                        fill_color='#FD8A6C'
                        ).add_to(m2)
m2

In [None]:

# Shared settings for the exported datashader images
background = "black"
# `export` is export_image pre-bound to the "export" path and the chosen background
export = partial(export_image, export_path="export", background=background)
# `cm` is colormap_select with `reverse` switched on when the background is black
cm = partial(colormap_select, reverse=(background=="black"))

def create_image(x_range, y_range, w=plot_width, h=plot_height):
    """Render yellow-taxi pickups from `yellow_raw` as a datashader image.

    Note: this reuses the name of the green-taxi ``create_image`` defined in an
    earlier cell and replaces it once this cell has run.

    Points are aggregated with ``ds.count('passenger_count')`` on a ``w`` x ``h``
    canvas covering ``x_range``/``y_range``, shaded with the ``Hot`` colormap
    using histogram equalization, and finally spread with ``dynspread``.
    """
    canvas = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
    pickup_counts = canvas.points(
        yellow_raw, 'pickup_longitude', 'pickup_latitude', ds.count('passenger_count')
    )
    shaded = tr_fns.shade(pickup_counts, cmap=Hot, how='eq_hist')
    return tr_fns.dynspread(shaded, threshold=0.5, max_px=4)

# Export a static image of the NYC extent as "NYCT_hot_yellow", then show the
# same rendering through InteractiveImage on a black-background figure.
p = base_plot(background_fill_color=background)
export(create_image(*NYC), "NYCT_hot_yellow")
InteractiveImage(p, create_image)

# Deriving ZIP codes from coordinates (reverse geocoding)

In [None]:
# Build a "lat,lon" text column: cast both coordinates to strings, then
# concatenate them with a comma.
for coord_col in ("Lat", "Long"):
    sample_data2_big[coord_col] = sample_data2_big[coord_col].astype(str)
sample_data2_big["Coord"] = sample_data2_big["Lat"] + "," + sample_data2_big["Long"]
sample_data2_big

In [None]:
# Check the dtypes after the string conversion
sample_data2_big.dtypes

### testing coordinate entry with geopy

In [None]:
import folium
# Sanity check of coordinate entry: a single labelled marker on a new map
# (this rebinds `m`).
m = folium.Map(location=[40.9, -74.05], zoom_start=12)
flatiron_marker = folium.CircleMarker(
    location=[40.7410861, -73.9896298241625],
    radius=3,
    popup='Flatiron Building',
    fill_color='#3186cc',
)
flatiron_marker.add_to(m)
m

In [None]:
from geopy.geocoders import Nominatim
import re
import time
import csv

# Nominatim geocoder client; the user_agent string identifies this application
# to the service.
# NOTE(review): the user agent embeds a personal e-mail address - consider
# reading it from configuration before sharing the notebook.
geolocator = Nominatim(user_agent='ml_capstone_py/1.0 (stevenya97@gmail.com)')
#requests.get(link, headers = {'User-agent': 'my_app_ml'})

In [None]:
#### geopy test: forward geocoding (address -> coordinates)
# NOTE(review): presumably `geocode` returns None when nothing matches, which
# would make the attribute accesses below fail - confirm.
location = geolocator.geocode("175 5th Avenue NYC")
print(location.address)

print((location.latitude, location.longitude))

print(location.raw)


In [None]:
# Reverse geocoding (a "lat, lon" string -> address), using the same
# coordinates as the Flatiron marker above
location = geolocator.reverse("40.7410861, -73.9896298241625")
print(location.address)

print((location.latitude, location.longitude))

print(location.raw)

In [None]:
# Coordinate string of the second sampled row, looked up by column label
# rather than by a hard-coded column position.
sample_data2_big['Coord'].iat[1]

In [None]:
# Single-column frame holding only the "lat,lon" strings
coord_df = sample_data2_big['Coord'].to_frame()


In [None]:
# Coordinate string of the second row (position 1) in the only column
coord_df.iat[1,0]

In [None]:
# Reverse-geocode one sampled coordinate string, passing timeout=500
geolocator.reverse(str(coord_df.iat[1,0]), timeout=500)

In [None]:
# Number of rows in the large sample
len(sample_data2_big)