# Exploring Dataset and Feature Engineering

In [None]:
def latitude_to_mercator(coords):
    """
    Project latitude values onto the mercator y axis.

    `coords` is any iterable of latitudes in degrees. A new list is
    returned holding one projected value per input, in the same order.
    """
    # Scale factor applied to the projected value
    # (looks like the WGS84 equatorial radius in metres -- confirm).
    k = 6378137
    return [np.log(np.tan((90 + lat) * np.pi/360.0)) * k for lat in coords]

def longitude_to_mercator(coords):
    """
    Project longitude values onto the mercator x axis.

    `coords` is any iterable of longitudes in degrees. A new list is
    returned holding one projected value per input, in the same order.
    """
    k = 6378137
    # Units of mercator x per degree of longitude; constant for every input.
    per_degree = k * np.pi/180.0
    return [lon * per_degree for lon in coords]

In [None]:
# mcoords = the middle coordinates for the map: the median ("50%" row of
# describe()) of each pickup coordinate column, latitude first.
pickup_geo_data = ['pickup_latitude', 'pickup_longitude']
mcoords = df_tot[pickup_geo_data].describe().loc["50%"].values

# axis ranges: [min, max] of the raw pickup longitude (x) and latitude (y)
xRange = [
    df_tot['pickup_longitude'].min(),
    df_tot['pickup_longitude'].max(),
]
yRange = [
    df_tot['pickup_latitude'].min(),
    df_tot['pickup_latitude'].max(),
]



In [None]:
from bokeh.plotting import figure, show
from bokeh.tile_providers import get_provider, Vendors

# to display bokeh plots inside jupyter, we need to use output_notebook
from bokeh.io import reset_output, output_notebook

# Clear any output mode configured earlier in the session, then make
# subsequent show() calls render inline in the notebook.
reset_output()
output_notebook()
# On success the cell output reports the loaded BokehJS version,
# e.g. "BokehJS 1.4.0 successfully loaded."

In [None]:
# Background map tiles, reused by every map figure in this notebook.
TILE = get_provider("STAMEN_TERRAIN_RETINA")

# Empty map figure for the pickups: the axis ranges are the pickup
# longitude/latitude extents projected to mercator coordinates.
pickup_m = figure(
    x_range=longitude_to_mercator(xRange),
    y_range=latitude_to_mercator(yRange),
    x_axis_type="mercator",
    y_axis_type="mercator",
)
pickup_m.add_tile(TILE)
pickup_m.title.text = "Pickups in NYC"

In [None]:
# Convert the raw pickup coordinates to mercator.
# The converters take any iterable and return one value per element, so the
# whole column is passed in a single call.
df_tot['pickupX'] = longitude_to_mercator(df_tot['pickup_longitude'])
df_tot['pickupY'] = latitude_to_mercator(df_tot['pickup_latitude'])
df_tot[['pickupX','pickupY']]

In [None]:
# Draw one small, semi-transparent blue circle per pickup row, then render
# the finished map.
pickup_m.circle(
    source=df_tot[['pickupX','pickupY']],
    x='pickupX',
    y='pickupY',
    size=5,
    fill_color="blue",
    fill_alpha=0.5,
)
show(pickup_m)

In [None]:
# Same map as for the pickups, this time for the drop-offs.
# The axis ranges are still the pickup extents (xRange / yRange).
dropoff = figure(
    x_range=longitude_to_mercator(xRange),
    y_range=latitude_to_mercator(yRange),
    x_axis_type="mercator",
    y_axis_type="mercator",
)
dropoff.add_tile(TILE)
dropoff.title.text = "Dropoff in NYC"

# Convert the raw drop-off coordinates to mercator; the converters accept a
# whole column and return one value per row.
df_tot['dropoffX'] = longitude_to_mercator(df_tot['dropoff_longitude'])
df_tot['dropoffY'] = latitude_to_mercator(df_tot['dropoff_latitude'])

# One small circle per drop-off row (source = data source).
dropoff.circle(
    source=df_tot[['dropoffX','dropoffY']],
    x='dropoffX',
    y='dropoffY',
    size=5,
    color="pink",
    fill_color="red",
    fill_alpha=0.5,
)

show(dropoff)

In [None]:
# Render the drop-off map again; the previous cell already ends with the same call.
show(dropoff)

In [None]:
# Trip duration: drop-off timestamp minus pickup timestamp, stored as a new column.
# NOTE(review): assumes both columns are already datetime dtype, which would
# make the result a timedelta column -- confirm where df_tot is loaded.
df_tot['tpep_trip_totaltime']= df_tot['tpep_dropoff_datetime'] - df_tot['tpep_pickup_datetime']

In [None]:
# Summary statistics for df_tot, rounded to make the table easier to read.
df_tot.describe().round()

In [None]:
# Cluster the points in `data` into `num_clusters` groups with k-means.
# NOTE(review): `data` and `KMeans` are not defined in this part of the
# notebook. The cluster centres are drawn directly on mercator axes below, so
# `data` is presumably the (pickupX, pickupY) columns -- confirm.
num_clusters = 20
km = KMeans(n_clusters=num_clusters)
km.fit(data)

centers = km.cluster_centers_

# Map figure with the pickup extents (projected to mercator) as axis ranges.
km_loc_pickup = figure(
    x_range=longitude_to_mercator(xRange),
    y_range=latitude_to_mercator(yRange),
    x_axis_type="mercator",
    y_axis_type="mercator",
)
km_loc_pickup.add_tile(TILE)
km_loc_pickup.title.text = "Pickups in NYC"

# Centroid (cluster centre / group mean) coordinates from k-means:
# first value of each centre is used as x, second as y.
clus_xs = [entry[0] for entry in centers]
clus_ys = [entry[1] for entry in centers]

# the cluster center is marked by a circle, with a cross in it
km_loc_pickup.circle_cross(x=clus_xs, y=clus_ys, size=40, fill_alpha=0, line_width=2, color="red")

# plot circles (source = data source): one small circle per pickup row
km_loc_pickup.circle(
    x='pickupX',
    y='pickupY',
    size=5,
    color="pink",
    fill_color="red",
    fill_alpha=0.5,
    source=df_tot[['pickupX','pickupY']],
)

# show() needs the figure itself; `km_loc_pickup.circle` is only the
# glyph-drawing method and cannot be displayed.
show(km_loc_pickup)