In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
import haversine as hs
import mpu

In [2]:
df = pd.read_csv('taxi1.csv')

In [3]:
df.head(5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435


In [4]:
#let's see outliers
print( df['trip_duration'].nlargest(10))

978383     3526282
924150     2227612
680594     2049578
355003     1939736
1234291      86392
295382       86391
73816        86390
59891        86387
1360439      86385
753765       86379
Name: trip_duration, dtype: int64


In [5]:
df=df[df.trip_duration!=df.trip_duration.max()]

In [6]:
print( df['trip_duration'].nlargest(10))

924150     2227612
680594     2049578
355003     1939736
1234291      86392
295382       86391
73816        86390
59891        86387
1360439      86385
753765       86379
91717        86378
Name: trip_duration, dtype: int64


In [7]:
df=df[df.trip_duration!=df.trip_duration.max()]

In [8]:
print( df['trip_duration'].nlargest(10))

680594     2049578
355003     1939736
1234291      86392
295382       86391
73816        86390
59891        86387
1360439      86385
753765       86379
91717        86378
1221666      86378
Name: trip_duration, dtype: int64


In [9]:
df=df[df.trip_duration!=df.trip_duration.max()]

In [10]:
print( df['trip_duration'].nlargest(10))

355003     1939736
1234291      86392
295382       86391
73816        86390
59891        86387
1360439      86385
753765       86379
91717        86378
1221666      86378
66346        86377
Name: trip_duration, dtype: int64


In [11]:
df=df[df.trip_duration!=df.trip_duration.max()]

In [12]:
print( df['trip_duration'].nlargest(5))

1234291    86392
295382     86391
73816      86390
59891      86387
1360439    86385
Name: trip_duration, dtype: int64


In [48]:
df.head(3)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,dropoff_time,pickup_longlat,dropoff_longlat,distance,pickup_dayofweek,pickup_weekday_name,pickup_hour,pickup_month,pickup_day,shift
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.98215484619139,40.76793670654297,-73.96463012695312,40.765602111816406,N,...,17:32:30,"(40.76793670654297, -73.98215484619139)","(40.765602111816406, -73.96463012695312)",1.4985207796462985,0,Monday,17,3,14,PM
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.97902679443358,40.763938903808594,-74.00533294677734,40.710086822509766,N,...,12:10:48,"(40.763938903808594, -73.97902679443358)","(40.71008682250977, -74.00533294677734)",6.385098495252615,1,Tuesday,11,1,19,AM
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.97305297851561,40.79320907592773,-73.9729232788086,40.782520294189446,N,...,13:38:10,"(40.79320907592773, -73.97305297851562)","(40.78252029418945, -73.9729232788086)",1.188588459333431,5,Saturday,13,3,26,AM


In [14]:
#removing this many randomized rows to have 1 million
remove_n = 458644
drop_indices = np.random.choice(df.index, remove_n, replace=False)
df = df.drop(drop_indices)

In [15]:
df.shape

(999996, 11)

In [16]:
#new columns pick up date and pick up times
df['pickup_date'] = pd.to_datetime(df['pickup_datetime']).dt.date
df['pickup_time'] = pd.to_datetime(df['pickup_datetime']).dt.time

In [17]:
#new columns drop off and pick up
df['dropoff_date'] = pd.to_datetime(df['dropoff_datetime']).dt.date
df['dropoff_time'] = pd.to_datetime(df['dropoff_datetime']).dt.time

In [18]:
#combine pickup latitude and longitude
df['pickup_longlat'] = list(zip(df.pickup_latitude, df.pickup_longitude))

In [19]:
#combine dropoff latitude and longitude
df['dropoff_longlat'] = list(zip(df.dropoff_latitude, df.dropoff_longitude))

In [20]:
#calculate distance from given coordinates in miles
import mpu
df['distance'] = df.apply(lambda x: mpu.haversine_distance(x['pickup_longlat'], x['dropoff_longlat']), 1)

In [21]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])

In [22]:
#new columns parsing day of week month etc
df["pickup_dayofweek"] = df.pickup_datetime.dt.dayofweek
df["pickup_weekday_name"] = df.pickup_datetime.dt.day_name()
df["pickup_hour"] = df.pickup_datetime.dt.hour
df["pickup_month"] = df.pickup_datetime.dt.month
df["pickup_day"] = df['pickup_datetime'].dt.day

In [23]:
#check if everything is fine
df.head(5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,dropoff_date,dropoff_time,pickup_longlat,dropoff_longlat,distance,pickup_dayofweek,pickup_weekday_name,pickup_hour,pickup_month,pickup_day
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,2016-03-14,17:32:30,"(40.76793670654297, -73.98215484619139)","(40.765602111816406, -73.96463012695312)",1.498521,0,Monday,17,3,14
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,2016-01-19,12:10:48,"(40.763938903808594, -73.97902679443358)","(40.71008682250977, -74.00533294677734)",6.385098,1,Tuesday,11,1,19
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,2016-03-26,13:38:10,"(40.79320907592773, -73.97305297851562)","(40.78252029418945, -73.9729232788086)",1.188588,5,Saturday,13,3,26
5,id0801584,2,2016-01-30 22:01:40,2016-01-30 22:09:03,6,-73.982857,40.742195,-73.992081,40.749184,N,...,2016-01-30,22:09:03,"(40.74219512939453, -73.9828567504883)","(40.749183654785156, -73.99208068847656)",1.098942,5,Saturday,22,1,30
7,id1324603,2,2016-05-21 07:54:58,2016-05-21 08:20:49,1,-73.969276,40.797779,-73.92247,40.760559,N,...,2016-05-21,08:20:49,"(40.79777908325195, -73.96927642822266)","(40.76055908203125, -73.92247009277344)",5.714981,5,Saturday,7,5,21


In [24]:
#passengers count
passcount = df.groupby('passenger_count')['distance'].count()
passcount

passenger_count
0        48
1    708647
2    143768
3     41233
4     19505
5     53534
6     33258
7         2
9         1
Name: distance, dtype: int64

In [25]:
byhour = df.groupby('pickup_hour')
byhour = byhour['id'].count()
byhour

pickup_hour
0     36424
1     26498
2     19169
3     14248
4     10783
5     10288
6     22895
7     38054
8     46125
9     46442
10    44835
11    46903
12    49323
13    49188
14    50823
15    49097
16    44352
17    52242
18    61971
19    61769
20    57608
21    57789
22    55256
23    47914
Name: id, dtype: int64

In [26]:
tripweek = df.groupby('pickup_hour')
tripweek = tripweek['id'].count()
tripweek

pickup_hour
0     36424
1     26498
2     19169
3     14248
4     10783
5     10288
6     22895
7     38054
8     46125
9     46442
10    44835
11    46903
12    49323
13    49188
14    50823
15    49097
16    44352
17    52242
18    61971
19    61769
20    57608
21    57789
22    55256
23    47914
Name: id, dtype: int64

In [27]:
trip_week = df.groupby(['pickup_month','pickup_day','pickup_weekday_name'])
trip_week = trip_week['id'].count()
trip_week

pickup_month  pickup_day  pickup_weekday_name
1             1           Friday                 4945
              2           Saturday               4497
              3           Sunday                 4302
              4           Monday                 4614
              5           Tuesday                4946
                                                 ... 
6             26          Sunday                 4881
              27          Monday                 4995
              28          Tuesday                4958
              29          Wednesday              5156
              30          Thursday               5146
Name: id, Length: 182, dtype: int64

In [30]:
#shifts
#5am-4pm
am = [5,6,7,8,9,10,11,12,13,14,15,16]
#5pm-4am
pm = [17,18,19,20,21,22,23,24,0,1,2,4]
print(len(am))
print(len(pm))

12
12


In [31]:
df['shift'] = df['pickup_hour'].apply(lambda x: 'AM' if x in range(5,16) else 'PM')

In [32]:
#check if everything worked
df.head(10)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,...,dropoff_time,pickup_longlat,dropoff_longlat,distance,pickup_dayofweek,pickup_weekday_name,pickup_hour,pickup_month,pickup_day,shift
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,...,17:32:30,"(40.76793670654297, -73.98215484619139)","(40.765602111816406, -73.96463012695312)",1.498521,0,Monday,17,3,14,PM
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,...,12:10:48,"(40.763938903808594, -73.97902679443358)","(40.71008682250977, -74.00533294677734)",6.385098,1,Tuesday,11,1,19,AM
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,...,13:38:10,"(40.79320907592773, -73.97305297851562)","(40.78252029418945, -73.9729232788086)",1.188588,5,Saturday,13,3,26,AM
5,id0801584,2,2016-01-30 22:01:40,2016-01-30 22:09:03,6,-73.982857,40.742195,-73.992081,40.749184,N,...,22:09:03,"(40.74219512939453, -73.9828567504883)","(40.749183654785156, -73.99208068847656)",1.098942,5,Saturday,22,1,30,PM
7,id1324603,2,2016-05-21 07:54:58,2016-05-21 08:20:49,1,-73.969276,40.797779,-73.92247,40.760559,N,...,08:20:49,"(40.79777908325195, -73.96927642822266)","(40.76055908203125, -73.92247009277344)",5.714981,5,Saturday,7,5,21,AM
9,id0012891,2,2016-03-10 21:45:01,2016-03-10 22:05:26,1,-73.981049,40.744339,-73.973,40.789989,N,...,22:05:26,"(40.74433898925781, -73.98104858398438)","(40.78998947143555, -73.9729995727539)",5.121162,3,Thursday,21,3,10,PM
11,id1299289,2,2016-05-15 11:16:11,2016-05-15 11:34:59,4,-73.991531,40.749439,-73.956543,40.77063,N,...,11:34:59,"(40.74943923950195, -73.9915313720703)","(40.7706298828125, -73.95654296875)",3.773096,6,Sunday,11,5,15,AM
12,id1187965,2,2016-02-19 09:52:46,2016-02-19 10:11:20,2,-73.962982,40.75668,-73.984406,40.760719,N,...,10:11:20,"(40.7566795349121, -73.96298217773438)","(40.760719299316406, -73.98440551757811)",1.859483,4,Friday,9,2,19,AM
16,id3379579,2,2016-04-11 17:29:50,2016-04-11 18:08:26,1,-73.991165,40.755562,-73.99929,40.725353,N,...,18:08:26,"(40.75556182861328, -73.99116516113281)","(40.7253532409668, -73.9992904663086)",3.428086,0,Monday,17,4,11,PM
17,id1154431,1,2016-04-14 08:48:26,2016-04-14 09:00:37,1,-73.994255,40.745804,-73.999657,40.723343,N,...,09:00:37,"(40.74580383300781, -73.99425506591797)","(40.72334289550781, -73.9996566772461)",2.538672,3,Thursday,8,4,14,AM


In [33]:
#compare the sums for each shift
pd.options.display.float_format = '{:.20f}'.format
shiftdis = df.groupby('shift')
shiftdis = shiftdis['distance'].sum()
print(shiftdis)

shift
AM   1482736.13491473183967173100
PM   1956311.19213259010575711727
Name: distance, dtype: float64


## My Hypotheses
## h0 - there is no difference in the distance traveled between AM and PM shifts
## h1 - there is a difference in the distance traveled between AM and PM shifts

In [34]:
#separting types by creating masks
am_shift = df['shift'] == 'AM'
am_shift

0          False
2           True
4           True
5          False
7           True
           ...  
1458634     True
1458637    False
1458640     True
1458641     True
1458642     True
Name: shift, Length: 999996, dtype: bool

In [35]:
#size of amshift and distance
n_am = len(df[am_shift]['distance'])
n_am

453973

In [46]:
df[am_shift]['distance'].mean()

3.2661328645420142

In [37]:
df[am_shift]['distance'].std()

4.047063421116454

In [38]:
#pm mask
pm_shift = df['shift'] == 'PM'
pm_shift

0           True
2          False
4          False
5           True
7          False
           ...  
1458634    False
1458637     True
1458640    False
1458641    False
1458642    False
Name: shift, Length: 999996, dtype: bool

In [39]:
n_pm = len(df[pm_shift]['distance'])
n_pm

546023

In [40]:
df[pm_shift]['distance'].mean()

3.5828366060268344

In [41]:
df[pm_shift]['distance'].std()

4.718676377811621

In [47]:
#conduct the t-test
stats.ttest_ind(df[am_shift]['distance'], df[pm_shift]['distance'], equal_var=False)

Ttest_indResult(statistic=-36.12531137640677, pvalue=1.3918252161198493e-285)

In [45]:
pvalue=1.3918252161198493e-285
# interpret
alpha = 0.05
if pvalue > alpha:
	print('Sample looks Gaussian (fail to reject H0)')
else:
	print('Sample does not look Gaussian (reject H0)')

Sample does not look Gaussian (reject H0)


In [44]:
# df.to_csv('taxi_831.csv')