In [118]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# 1. Characterize the data and comment about its quality
- When downloading this data, pipelines must be automated to handle the empty columns at the end of each row
- The coded values match the data dictionary
- There are some odd characteristics in this dataset; for example, some trips have a start time that is exactly the same as their end time
    - Some of these trips nevertheless have data suggesting that a trip with a duration > 0 actually happened (e.g. charges were made for the trip)

# 2. Explore and visualize the data e.g. a histogram of trip distance

In [2]:
# NYC TLC green-taxi trip records for December 2016 (remote CSV).
GREEN_TRIPS_2016_12_URL = 'https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2016-12.csv'

# index_col=False keeps pandas from promoting the first column to the index.
trip_data = pd.read_csv(
    filepath_or_buffer=GREEN_TRIPS_2016_12_URL,
    sep=',',
    index_col=False,
)
trip_data.head()

  trip_data = pd.read_csv('https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_2016-12.csv', sep=',', index_col=False)


Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type
0,2,2016-12-01 00:00:54,2016-12-01 00:06:54,N,1,92,192,1,1.29,6.5,0.5,0.5,0.0,0.0,,0.3,7.8,2,1
1,2,2016-12-01 00:52:41,2016-12-01 00:54:51,N,1,92,171,1,0.64,4.0,0.5,0.5,1.06,0.0,,0.3,6.36,1,1
2,2,2016-12-01 00:10:39,2016-12-01 00:14:47,N,1,75,238,2,0.89,5.5,0.5,0.5,1.7,0.0,,0.3,8.5,1,1
3,2,2016-12-01 00:12:16,2016-12-01 00:15:31,N,1,166,151,1,0.66,4.5,0.5,0.5,1.74,0.0,,0.3,7.54,1,1
4,2,2016-12-01 00:29:22,2016-12-01 00:39:51,N,1,166,42,1,2.15,9.5,0.5,0.5,2.16,0.0,,0.3,12.96,1,1


In [15]:
# Sample 1,000 trips instead of aggregating the full data set, to save time
# (notebook 01 contains an example of such an aggregation).
# A fixed random_state makes the sample, and every figure derived from it,
# identical on each Restart & Run All.
df = trip_data.sample(1000, random_state=42)

# Average trip distance per tip_amount bin, one overlaid series per trip_type,
# with a marginal box plot.
fig = px.histogram(df, x="tip_amount", y="trip_distance", color="trip_type",
                   marginal="box", # or violin, rug
                   histfunc = 'avg',
                   opacity=.8,
                   barmode = 'overlay',
                   hover_data=df.columns)
fig.show()

In [26]:
# tip_rate (tip divided by the trip total) is the metric suggested among the
# potential options for question 4, so add it to the sample and explore it here.
df['tip_rate'] = df['tip_amount'].div(df['total_amount'])

In [27]:
# Average trip distance per tip_rate bin, one overlaid series per trip_type,
# with a marginal box plot (marginal also accepts "violin" or "rug").
fig = px.histogram(
    data_frame=df,
    x="tip_rate",
    y="trip_distance",
    color="trip_type",
    histfunc='avg',
    barmode='overlay',
    marginal="box",
    opacity=0.8,
    hover_data=df.columns,
)
fig.show()

# By displaying this as a scatter we get more details about the tip types
This suggests that there might be a selectable automated system that allows users to pick the tip amount from 4 different options.


In [28]:
# One point per sampled trip: tip_rate against trip_distance, coloured by trip_type.
fig = px.scatter(
    data_frame=df,
    x="tip_rate",
    y="trip_distance",
    color="trip_type",
    opacity=0.8,
    hover_data=df.columns,
)
fig.show()

# 3. Find interesting trip statistics grouped by hour
- How many trips do we have over time for December?

In [40]:
# Parse the pickup timestamps once, then round each one down to the start of its hour.
pickup_timestamps = pd.to_datetime(trip_data['lpep_pickup_datetime'])
trip_data['lpep_pickup_datetime_hour'] = pickup_timestamps.dt.floor('h')

# Tip as a fraction of the total amount charged, for every trip in the full data set.
trip_data['tip_rate'] = trip_data['tip_amount'].div(trip_data['total_amount'])

### First truncate the dates to the hour

In [38]:
# Show the hour-floored pickup time next to the raw pickup time for the first three trips.
trip_data.head(3)[['lpep_pickup_datetime_hour', 'lpep_pickup_datetime']]

Unnamed: 0,lpep_pickup_datetime_hour,lpep_pickup_datetime
0,2016-12-01,2016-12-01 00:00:54
1,2016-12-01,2016-12-01 00:52:41
2,2016-12-01,2016-12-01 00:10:39


In [52]:
# Number of trips per pickup hour, built as its own frame with one row per hour.
# The earlier version assigned this result into the 1,000-row sample `df`;
# pandas aligns such an assignment on the index, and the sample's random row
# labels almost never match the 0..N-1 labels of the grouped result, so nearly
# every value came out as NaN.
hourly_trip_counts = (
    trip_data
    .groupby('lpep_pickup_datetime_hour')['lpep_pickup_datetime']
    .count()                          # non-null pickups in each hour
    .rename('trip_counts')
    .rename_axis('trip_start_hour')
    .reset_index()
)

# The line-plot cell that follows reads `trip_start_hour` / `trip_counts` from
# `df`, so point that name at the hourly frame.
df = hourly_trip_counts

# We can see daily seasonality to our data
This means that we will need to normalize by hour should we analyze data by hour.

In [55]:
# Trips per pickup hour, drawn as a single time-series line.
fig = px.line(
    data_frame=df,
    x="trip_start_hour",
    y="trip_counts",
)
fig.show()

# 4. The taxi drivers want to know what kind of trip yields better tips. Can you build a model for them and explain the model?
- This is a very vague request, but maybe after interacting with the data some obvious patterns may show up
- There are some leading questions that suggest they may be interested in "Time of day" as well as the length of the trips
- In the model explanation I would be sure to include that it may be beneficial to come to an agreement on what "better tips" mean.
    - Are we talking about just a higher tip?
    - Should we take into consideration cost of drivers time?
    - Longer trips may earn better tips, but a lower percentage of the entire trip cost.
    - Suggestion: use "tip rate" to define better tips. You can also think of this as Percent of Trip Cost.
        - tip/total cost of trip
            - i.e. {'trip_total_cost': '40.00', 'tip': '8.00'}
            - tip_rate = 8/40 = 0.20
    - We can view all these tips on a distribution and either use explicit classification (informed by project management) or ML to generate insights. A really simple approach would be to split the distribution of tip_rates into quartiles (low, med, high, very high). These types of tip ranges can inform a classification model, maybe KNN, of what future trips might yield. We have the option of harness testing these models to inform users of which trips yield better tips through simulation OR we can simply use statistics (A/B tests) on the data provided to inform decision makers of the features that are most likely to impact tips.
    - We can also take a PCA approach where we judge the types of trips based on the features with the most influence on the variance. After we have reduced it down to a few influential features we can simulate future outcomes with a few different methods
        - Monte Carlo
            - bootstrapped 
                - "Here is the current distribution of your data, but when you increase x (i.e trip length) by this much here is your likely outcome based on historic behavior."

# 5. Pick one of the options below
I will be visualizing data to find more patterns, but I did find some anomalies along the way

    - Trips where no time passes, but charges are > 0
    - From our features we can select some continuous values (and/or one-hot encode the discrete values) to identify anomalies via KNN

In [80]:
# Hour of day (0-23) of each pickup. The vectorised .dt accessor replaces a
# per-row Python lambda over the whole data set and matches the .dt.floor call
# used for the hourly truncation.
trip_data['lpep_pickup_hour_of_day'] = pd.to_datetime(trip_data['lpep_pickup_datetime']).dt.hour

In [82]:
# Keep only the two columns needed for the per-hour tip-rate summary.
tip_rate_columns = ['lpep_pickup_hour_of_day', 'tip_rate']
tip_rates = trip_data[tip_rate_columns]

In [94]:
# Summary statistics of tip_rate for each pickup hour of the day, with the
# hour moved out of the index into a regular column.
tip_rates_desc = (
    tip_rates
    .groupby(by='lpep_pickup_hour_of_day')
    .describe()
    .reset_index()
)

In [101]:
# Display the per-hour tip_rate summary table.
tip_rates_desc

Unnamed: 0_level_0,lpep_pickup_hour_of_day,tip_rate,tip_rate,tip_rate,tip_rate,tip_rate,tip_rate,tip_rate,tip_rate
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max
0,0,48704.0,0.072284,0.090501,0.0,0.0,0.0,0.166667,1.0
1,1,37990.0,0.069845,0.091574,0.0,0.0,0.0,0.166667,1.0
2,2,27800.0,0.068947,0.090523,0.0,0.0,0.0,0.166667,0.954401
3,3,21690.0,0.066541,0.091949,0.0,0.0,0.0,0.166667,0.967118
4,4,17823.0,0.058454,0.08962,0.0,0.0,0.0,0.165919,1.0
5,5,11347.0,0.061277,0.088784,0.0,0.0,0.0,0.165775,1.0
6,6,15137.0,0.068653,0.088693,0.0,0.0,0.0,0.166144,1.0
7,7,30812.0,0.067898,0.08596,0.0,0.0,0.0,0.166276,1.0
8,8,45567.0,0.074327,0.088117,0.0,0.0,0.0,0.166667,1.0
9,9,47646.0,0.073975,0.08875,0.0,0.0,0.0,0.166667,1.0


# Our frequency of trips reaches a global minimum at 5am and max at 7pm
We also have a local max at around 9am. These are not super surprising as this reflects a work schedule. I am a little surprised to see the relative decrease isn't as high as I expected after 9am

In [114]:
# Trip count for each pickup hour of the day, taken from the describe() summary.
x_val = tip_rates_desc['lpep_pickup_hour_of_day']
y_count = tip_rates_desc['tip_rate']['count']

# Build the figure with its single line trace, then title the figure and x-axis.
fig = (
    go.Figure(data=[go.Scatter(x=x_val, y=y_count, mode='lines', name='Trip Count')])
    .update_layout(title_text="All Trip Counts")
    .update_xaxes(title_text="Hour of Day")
)
fig.show()

In [120]:
# Trip counts (left axis) and mean tip rate (right axis) per pickup hour of day,
# on one figure with two y-axes so the two scales can be compared.
fig = make_subplots(specs=[[{"secondary_y": True}]])

hour_of_day = tip_rates_desc['lpep_pickup_hour_of_day']

# Add traces
fig.add_trace(
    go.Scatter(x=hour_of_day, y=tip_rates_desc['tip_rate']['count'], name="Trip Counts"),
    secondary_y=False,
)

fig.add_trace(
    go.Scatter(x=hour_of_day, y=tip_rates_desc['tip_rate']['mean'], name="Mean of Tip Rates"),
    secondary_y=True,
)

# Add figure title (the second series is the mean tip *rate*, not the mean tip amount)
fig.update_layout(
    title_text="Total Trip Counts vs. Mean of Tip Rates"
)

# Set x-axis title
fig.update_xaxes(title_text="Hour of Day")

# Set y-axes titles (the leftover "yaxis title" template text has been removed)
fig.update_yaxes(title_text="<b>Trip Counts</b>", secondary_y=False)
fig.update_yaxes(title_text="<b>Mean of Tip Rates</b>", secondary_y=True)

fig.show()

# Although this is an interesting pattern I would approach this with skepticism.

Before we make a claim that people give better tips in the morning compared to 