## Dependencies

In [None]:
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import animation
import numpy as np
import dateutil
from sklearn.cluster import MiniBatchKMeans
from ipywidgets import interact,  FloatSlider, RadioButtons
import geopandas
import matplotlib.pyplot as plt
import seaborn as sn
%matplotlib inline

In [None]:
import warnings
# Silence every warning for the rest of the notebook (no category or module filter is given).
warnings.filterwarnings("ignore")

## Introduction

To start out, I wanted to use this project as an opportunity to brush up on my data visualization skills. This is a work in progress, so I'll continue to add more plots when I can. Still figuring out GIS!

Hopefully some of this is useful! Please provide as much feedback as possible. 

## Reading/Cleaning Dataset

In [None]:
# Load the training CSV (path is relative to the notebook's working directory).
df = pd.read_csv('../input/train.csv')

In [None]:
# Convert both timestamp columns to pandas datetimes so the `.dt` accessor
# can be used for the hour/weekday/month groupings further down.
for ts_col in ("dropoff_datetime", "pickup_datetime"):
    df[ts_col] = pd.to_datetime(df[ts_col])

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Filter dataset: keep only trips whose pickup AND dropoff both lie strictly
# inside the longitude/latitude bounding box, and whose duration is below 5000.
xlim = [-74.03, -73.77]
ylim = [40.63, 40.90]

inside_box = (
    (df.pickup_longitude > xlim[0]) & (df.pickup_longitude < xlim[1])
    & (df.dropoff_longitude > xlim[0]) & (df.dropoff_longitude < xlim[1])
    & (df.pickup_latitude > ylim[0]) & (df.pickup_latitude < ylim[1])
    & (df.dropoff_latitude > ylim[0]) & (df.dropoff_latitude < ylim[1])
)
short_enough = df.trip_duration < 5000
df = df[inside_box & short_enough]

In [None]:
# Mean trip duration grouped by pickup hour, month and weekday.
pickup_dt = df.pickup_datetime.dt
df_byhour = df.groupby(pickup_dt.hour)["trip_duration"].mean()
df_bymonth = df.groupby(pickup_dt.month)["trip_duration"].mean()
df_byday = df.groupby(pickup_dt.weekday)["trip_duration"].mean()

# Bin edges for the spatial grids: 0.01-wide latitude bins and 0.005-wide
# longitude bins spanning the bounding box defined by ylim / xlim.
lat_edges = np.arange(ylim[0], ylim[1], 0.01)
lon_edges = np.arange(xlim[0], xlim[1], 0.005)

# Trip counts per (latitude bin, longitude bin) for dropoffs and pickups.
df_heatmap_do = df.groupby(
    [pd.cut(df.dropoff_latitude, lat_edges), pd.cut(df.dropoff_longitude, lon_edges)]
)["id"].count()
df_heatmap_pu = df.groupby(
    [pd.cut(df.pickup_latitude, lat_edges), pd.cut(df.pickup_longitude, lon_edges)]
)["id"].count()

# Reshape each count series into a latitude-by-longitude grid, with the
# latitude index sorted in descending order.
plot_df_dropoff = (
    df_heatmap_do
    .reset_index()
    .pivot(index='dropoff_latitude', columns='dropoff_longitude')
    .sort_index(ascending=False)
)
plot_df_pickup = (
    df_heatmap_pu
    .reset_index()
    .pivot(index='pickup_latitude', columns='pickup_longitude')
    .sort_index(ascending=False)
)

### Compare plots of pick up and drop off locations

In [None]:
# 2x2 figure: pickups on the top row, dropoffs on the bottom row;
# raw point scatter on the left, binned trip-count heatmap on the right.
plt.figure(figsize=(20,20))

# Top-left: every pickup as a tiny, semi-transparent dot (longitude on x, latitude on y).
plt.subplot(2,2,1)
plt.title("Pick Up Plot")
plt.plot(df["pickup_longitude"],df["pickup_latitude"],'.',alpha = 0.4, markersize = 0.5)

# Top-right: pickup counts per grid cell; tick labels are turned off.
plt.subplot(2,2,2)
plt.title("Pick Up Heatmap")
sn.heatmap(plot_df_pickup,xticklabels=False, yticklabels=False)

# Bottom-left: every dropoff as a tiny, semi-transparent dot.
plt.subplot(2,2,3)
plt.title("Drop Off Plot")
plt.plot(df["dropoff_longitude"],df["dropoff_latitude"],'.',alpha = 0.4, markersize = 0.5)

# Bottom-right: dropoff counts per grid cell; tick labels are turned off.
plt.subplot(2,2,4)
plt.title("Drop Off Heatmap")
sn.heatmap(plot_df_dropoff,xticklabels=False, yticklabels=False)

plt.show()

At first glance, the added density of drop-offs outside of Manhattan suggests that people tend to use the subway system when they are going into the city but prefer to take a taxi home. Also, people seem to be getting picked up in Midtown and dispersing throughout the rest of Manhattan. This makes intuitive sense, as Port Authority and Penn Station are in that area, so commuters will typically arrive in the city there and take a taxi elsewhere.

In [None]:
# Mean trip duration per pickup hour (the same aggregation as df_byhour computed earlier).
df_hours = df.groupby(df.pickup_datetime.dt.hour)["trip_duration"].mean()

In [None]:
# Bar chart of mean trip duration for each pickup hour.
fig, ax = plt.subplots(1, figsize = (10,10))
plt.bar(df_hours.index.values, df_hours, align = "center")
# Semi-transparent rectangles spanning x = 7..9 and x = 16..18 highlight the
# morning and evening windows; height 1000 reaches the top of the y-range set below.
rect_morn = matplotlib.patches.Rectangle((7,0), 2, 1000, angle=0, fill = True, 
                                    linewidth = 2.0, edgecolor = 'r', alpha = 0.3, linestyle = '--')
rect_eve = matplotlib.patches.Rectangle((16,0), 2, 1000, angle=0, fill = True, 
                                    linewidth = 2.0, edgecolor = 'r', alpha = 0.3, linestyle = '--')
ax.add_patch(rect_morn)
ax.add_patch(rect_eve)
plt.title("Trip Duration by Hour")
plt.xlabel("Hour")
plt.ylabel("Average Trip Duration")
# The y-axis starts at 400, not 0, so differences between hours are visually exaggerated.
plt.ylim([400,1000])
plt.show()

As expected, we see a peak during morning rush hour. It is quite interesting that the trip duration continues to increase as the day goes on. Having been to NYC on a weekday, this does make some sense. Typically people are moving about throughout the day for meetings and tourists will be traveling as the day goes on. 

### Examine Hourly/Daily/Monthly Trends

In [None]:
# Ride counts per pickup hour, month and weekday (counting the "id" column).
count_byhour = df.groupby(df.pickup_datetime.dt.hour)["id"].count()
count_bymonth = df.groupby(df.pickup_datetime.dt.month)["id"].count()
count_byday = df.groupby(df.pickup_datetime.dt.weekday)["id"].count()

# 3x2 grid: mean duration on the left, ride volume on the right.
# Rows are hour (top), weekday (middle), month (bottom); note the month row
# (subplots 325/326) is drawn before the weekday row (323/324).
plt.figure(figsize=(20,17))
# Row 1: hourly.
plt.subplot(321)
plt.plot(df_byhour)
plt.title("Average Ride Duration per Hour (Seconds)")
plt.ylabel("Average Ride Duration")
plt.xlabel("Hour")
plt.subplot(322)
plt.plot(count_byhour)
plt.title("Ride Volume per Hour")
plt.ylabel("Ride Volume")
plt.xlabel("Hour")

# Row 3: monthly.
plt.subplot(325)
plt.plot(df_bymonth)
plt.title("Average Ride Duration per Month (Seconds)")
plt.ylabel("Average Ride Duration")
plt.xlabel("Month")
plt.subplot(326)
plt.plot(count_bymonth)
plt.title("Ride Volume per Month")
plt.ylabel("Ride Volume")
plt.xlabel("Month")

# Row 2: by weekday (x-axis values are the integer weekday codes from .dt.weekday).
plt.subplot(323)
plt.plot(df_byday)
plt.xlabel("Day")
plt.ylabel("Average Ride Duration (Seconds)")
plt.title("Average Ride Duration per Day")
plt.subplot(324)
plt.plot(count_byday)
plt.ylabel("Ride Volume")
plt.title("Ride Volume per Day")
plt.xlabel("Day")

This actually ended up being quite interesting. On an hourly scale, the volume and trip duration match up well, but they start to deviate when we look at a daily and monthly scale.

Daily, we see that the middle of the week tends to have the longest trip duration, but Thursday and Friday have the largest volume. My hunch is that when people are going out on Thursday and Friday, they typically call a cab home regardless of how short the distance is.

Monthly, we see almost no correlation between ride duration and volume. I plan on looking into why there is a constant increase in trip duration as we approach the summer. Granted, the average is only increasing by a minute or so, but the trend is still interesting.

In [None]:
# Mean trip duration for each distinct passenger count, shown as a bar chart.
by_passenger_count = df.groupby(df.passenger_count)["trip_duration"].mean()
plt.figure(figsize=(10,10))
plt.title("Passenger Count vs Average Duration")
plt.xlabel("Number of Passengers")  # fixed typo in the axis label ("Passagers")
plt.ylabel("Average Duration")
plt.bar(by_passenger_count.index.values, by_passenger_count, align = "center")
# Render explicitly so the cell does not end on the BarContainer repr.
plt.show()

As expected, we see that the number of passengers has no noticeable effect on `trip_duration`.

In [None]:
def haversine_array(lat1, lng1, lat2, lng2):
    """Great-circle (haversine) distance between two points, in kilometres.

    Inputs are latitudes/longitudes in degrees; scalars or NumPy arrays are
    accepted and processed element-wise. A mean Earth radius of 6371 km is used.
    """
    earth_radius_km = 6371
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lng1, lat2, lng2))
    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = np.sin(delta_phi * 0.5) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam * 0.5) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    """Approximate "Manhattan" distance in km between two lat/lng points.

    Sums two haversine legs that both start at (lat1, lng1): one changing only
    the longitude to lng2, the other changing only the latitude to lat2.
    """
    lng_leg = haversine_array(lat1, lng1, lat1, lng2)  # longitude changes, latitude fixed at lat1
    lat_leg = haversine_array(lat1, lng1, lat2, lng1)  # latitude changes, longitude fixed at lng1
    return lng_leg + lat_leg

def bearing_array(lat1, lng1, lat2, lng2):
    """Initial bearing from point 1 to point 2, in degrees.

    Inputs are latitudes/longitudes in degrees (scalars or NumPy arrays,
    processed element-wise). The result comes from ``arctan2`` and therefore
    lies in [-180, 180]: 0 is due north, 90 due east, -90 due west.
    """
    # Only the longitude *difference* enters the formula, so the individual
    # longitudes never need converting to radians; the unused Earth-radius
    # constant from the original has been dropped as well.
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

In [None]:
# Pull the four coordinate columns out once as NumPy arrays, then derive the
# three trip-geometry features (two distances in km and a bearing in degrees).
_trip_endpoints = tuple(
    df[col].values
    for col in ('pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude')
)
for _feature_name, _feature_fn in (
    ('distance_haversine', haversine_array),
    ('distance_dummy_manhattan', dummy_manhattan_distance),
    ('direction', bearing_array),
):
    df.loc[:, _feature_name] = _feature_fn(*_trip_endpoints)

Now that we've got distances, let's see what the average trip distance looks like for each drop-off location.

In [None]:
# Mean "dummy Manhattan" distance of the trips that END in each grid cell,
# using the same 0.01 latitude / 0.005 longitude bin widths as the count heatmaps.
df_heatmap_distance = df.groupby([pd.cut(df.dropoff_latitude, np.arange(ylim[0], ylim[1], 0.01)),
                            pd.cut(df.dropoff_longitude, np.arange(xlim[0], xlim[1], 0.005))])["distance_dummy_manhattan"].mean()
# Reshape to a latitude-by-longitude grid with the latitude index in descending order.
plot_df_distance = (df_heatmap_distance.reset_index()
              .pivot(index='dropoff_latitude', columns='dropoff_longitude')).sort_index(ascending=False)

In [None]:
# Heatmap of the mean trip distance per drop-off grid cell; tick labels are
# turned off and the axis labels are set by hand.
plt.figure(figsize=(10,10))
sn.heatmap(plot_df_distance ,xticklabels=False, yticklabels=False)
plt.title("Heatmap of Average Distance Traveled")
plt.xlabel("Longitude")
plt.ylabel("Latitude")

Not the prettiest of graphs, but it is interesting to see that a majority of the lengthier rides are coming from the Bronx. I don't live in Manhattan, but I could make the assumption that public transportation isn't as effective in that area.

## Clustering Coordinates

In [None]:
# Stack pickup and dropoff points into a single (n_points, 2) array of
# (latitude, longitude) pairs so both kinds of endpoint share one clustering.
coords = np.vstack((df[['pickup_latitude', 'pickup_longitude']].values,
                    df[['dropoff_latitude', 'dropoff_longitude']].values))

# Fixed seed: cluster ids are otherwise different on every run, which would
# change the "top pairs" and the annotated cluster numbers in the plots below.
CLUSTER_SEED = 42
_cluster_rng = np.random.RandomState(CLUSTER_SEED)

# Fit on a random subsample of at most 500,000 points to keep the fit fast.
sample_ind = _cluster_rng.permutation(len(coords))[:500000]
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000,
                         random_state=CLUSTER_SEED).fit(coords[sample_ind])

# Assign every pickup and dropoff to its nearest cluster. `.values` passes a
# plain ndarray, matching the input type used for `fit` (avoids scikit-learn's
# feature-name mismatch warning).
df.loc[:, 'pickup_cluster'] = kmeans.predict(df[['pickup_latitude', 'pickup_longitude']].values)
df.loc[:, 'dropoff_cluster'] = kmeans.predict(df[['dropoff_latitude', 'dropoff_longitude']].values)

In [None]:
# Scatter of all pickup points, coloured by the cluster each pickup was assigned to.
plt.figure(figsize=(10,10))
plt.scatter(df.pickup_longitude,df.pickup_latitude, 
            c = df.pickup_cluster, cmap = "nipy_spectral",marker = '.', alpha = 0.4)
plt.grid(False)
# NOTE(review): the title is deliberately (?) left empty — consider a descriptive one.
plt.title("")
plt.show()

I'm not quite sure why this is showing up like a child's drawing since it has a completely different aesthetic on my jupyter notebook. Looks like I still have some learning to do regarding the Jupyter->Kaggle transfer. 

Looking at this, it'd be interesting to see where people travel to and from most often. 

In [None]:
# Subset of trips whose pickup lies inside the zoom window used by `plotit`
# (longitude -74.0..-73.95, latitude 40.725..40.8).
# Both conditions are applied together: the original second statement filtered
# `df` again instead of `df_plot`, which silently discarded the longitude filter.
df_plot = df[(df.pickup_longitude > -74.0) & (df.pickup_longitude < -73.95)
             & (df.pickup_latitude > 40.725) & (df.pickup_latitude < 40.8)]

In [None]:
# Mean of every numeric column per pickup cluster; the mean pickup
# longitude/latitude serve as the cluster's centre in the plots below.
# numeric_only=True is required on pandas >= 2.0, where averaging the
# non-numeric columns raises instead of silently dropping them.
ave_cluster_loc = df.groupby("pickup_cluster").mean(numeric_only=True)

# Label each trip with its (pickup_cluster, dropoff_cluster) pair, count trips
# per pair, and keep the five most frequent pairs.
df["cluster_pair"] = list(zip(df.pickup_cluster, df.dropoff_cluster))
top_pairs_df = df.groupby("cluster_pair")["id"].count().sort_values(axis = 0, ascending = False)
top_pairs_index = top_pairs_df.index.values[:5]

In [None]:
top_pairs_df.head()

The `groupby()` is a godsend for visualizations. It really makes it easier to organize your dataset in clever ways. 

Below, I've plotted the top five "pairs" of trips in this dataset. By that, I mean we are looking at which trips (cluster->cluster) happen most often.

In [None]:
def plotit(ind = 0):
    """Draw the `ind`-th most frequent cluster-to-cluster trip as an arrow.

    The background is a faint scatter of the pickups in `df_plot`, coloured by
    pickup cluster, with every cluster centre from `ave_cluster_loc` marked and
    annotated with its cluster id. `ind` indexes into `top_pairs_index`.
    """
    fig, ax = plt.subplots(figsize=(11,11))

    # Faint background of individual pickups plus the cluster centres.
    ax.scatter(df_plot.pickup_longitude, df_plot.pickup_latitude,
               c = df_plot.pickup_cluster, cmap = "nipy_spectral", marker = '.', alpha = 0.01)
    ax.scatter(ave_cluster_loc["pickup_longitude"], ave_cluster_loc["pickup_latitude"], marker = '.')

    # Label each centre with its cluster id.
    for cluster_id, centre in ave_cluster_loc.iterrows():
        ax.annotate(cluster_id, (centre["pickup_longitude"], centre["pickup_latitude"]))

    # Look up the centres of the origin and destination clusters of the chosen pair.
    origin_id, dest_id = top_pairs_index[ind][0], top_pairs_index[ind][1]
    origin = ave_cluster_loc.loc[origin_id]
    dest = ave_cluster_loc.loc[dest_id]

    # A fixed 0.004 is subtracted from both arrow components (kept from the
    # original; presumably to leave room for the arrow head — TODO confirm).
    ax.arrow(origin.pickup_longitude,
             origin.pickup_latitude,
             dest.pickup_longitude - origin.pickup_longitude - 0.004,
             dest.pickup_latitude - origin.pickup_latitude - 0.004,
             head_width = 0.005, head_length = 0.005)

    # Zoom to the same window that was used to build df_plot.
    ax.set_xlim([-74.0,-73.95])
    ax.set_ylim([40.725, 40.8])

Using the `ipywidgets` package, I was able to add a slider that lets you step through each pairing and its direction, one by one. I'm still working on the aesthetic here, but I think you get the point. GIS is a little trickier than I expected, so that'll take a day or so to figure out.

In [None]:
# Interactive slider over ind = 0..4 (step 1), i.e. the five entries of top_pairs_index.
interact(plotit, ind=(0,4,1))


Above, I've used the `ipywidgets` package to create an interactive plot. Rather than have all the arrows lie on top of each other, it seemed more convenient for you to be able to go through the top five pairs using a slider.