In [None]:
%matplotlib inline
import pandas as pd
from datetime import datetime
import seaborn as sns
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
# Default figure size (width, height) for plots that do not pass their own figsize.
plt.rcParams['figure.figsize'] = [16, 10]

**Loading the Data**

Load the data using the Pandas `read_csv` function:

In [None]:
# Read the training and test sets from ../input (relative to the notebook's working directory).
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [None]:
# Preview the first rows of the training set.
train.head()

In [None]:
# Preview the first rows of the test set.
test.head()

**Data Exploration**

Let's start off by exploring the files we just imported to understand what the data looks like! Data analysis is a very important step in any Machine Learning project.


*Getting a statistical summary of the data is also quite easy.*

In [None]:
# Show floats with three fixed decimals instead of scientific notation,
# then summarise every numeric column of the training set.
pd.set_option('display.float_format', '{:.3f}'.format)
train.describe()

This looks alright? But wait! Check out the `trip_duration` column. The minimum is 1 second and the maximum is approximately `3526282 / (60*60) ≈ 980` hours! Possible?? Aww, I wish I had that time and money. So there are definitely outliers in the dataset. Let's clean them up in a bit.

In [None]:
# Print column names, non-null counts and dtypes of the training set.
train.info()

info() provides the type of data in the dataset. Now we know the data types. Cool!

**Check for NA or any outlier values**

In [None]:
print(train.isnull().sum())
# Number of missing values per column of the training set.

In [None]:
# Count rows whose trip id repeats an earlier row (the author expects none).
duplicate_trip_ids = train.duplicated(subset='id').sum()
print('No of Duplicates, Trip IDs: {}'.format(duplicate_trip_ids))


In [None]:
# Check latitude bounds across pickups and dropoffs (valid latitudes lie in -90..+90).
# The lower bound is the smaller of the two minima; the upper bound is the larger
# of the two maxima, so the printed range covers every coordinate in the data.
print('Latitude bounds: {} to {}'.format(
    min(train.pickup_latitude.min(), train.dropoff_latitude.min()),
    max(train.pickup_latitude.max(), train.dropoff_latitude.max())
))

In [None]:
# Check longitude bounds across pickups and dropoffs (valid longitudes lie in -180..+180).
# The lower bound is the smaller of the two minima; the upper bound is the larger
# of the two maxima, so the printed range covers every coordinate in the data.
print('Longitude bounds: {} to {}'.format(
    min(train.pickup_longitude.min(), train.dropoff_longitude.min()),
    max(train.pickup_longitude.max(), train.dropoff_longitude.max())
))


In [None]:
# Number of distinct vendor ids present in the training set.
vendor_count = train.vendor_id.unique().size
print('Vendors cnt: {}'.format(vendor_count))

In [None]:
# Earliest pickup and latest dropoff in the training set (about 6 months of data per the author).
# NOTE(review): the columns have not been passed through pd.to_datetime at this point,
# so min()/max() compare the raw values as loaded - presumably sortable timestamp strings; confirm.
print('Datetime range: {} to {}'.format(train.pickup_datetime.min(), 
                                        train.dropoff_datetime.max()))


In [None]:
# Smallest and largest passenger count per trip.
# (Author's note: the maximum is 10 - must be a big ride or a big car!)
print('Passengers: {} to {}'.format(train.passenger_count.min(), 
                                        train.passenger_count.max()))

**2.1 Data Cleaning**

***2.1.1 Trip Duration Clean-up***

As noted above, there are some outliers associated with the `trip_duration`, specifically a 980 hour maximum trip duration and a minimum of 1 second trip duration. Let's exclude the trips whose duration lies more than two standard deviations away from the mean.


In [None]:
# Keep only trips whose duration lies within two standard deviations of the
# mean (both bounds inclusive); the statistics are computed once, before filtering.
duration_mean = np.mean(train['trip_duration'])
duration_std = np.std(train['trip_duration'])
within_two_std = train['trip_duration'].between(
    duration_mean - 2*duration_std,
    duration_mean + 2*duration_std,
)
train = train[within_two_std]

***2.1.2 Latitude and Longitude Clean-up***

The borders of New York City come out to be the coordinates below:

city_long_border = (-74.03, -73.75)
city_lat_border = (40.63, 40.85) 

Comparing this to our `train.describe()` output we see that there are some coordinate points (pick ups/drop offs) that fall outside these borders. So let's limit our area of investigation to within the NY City borders.



In [None]:
# Keep only trips whose pickup AND dropoff both fall inside the city bounding
# box (bounds inclusive).
city_long_border = (-74.03, -73.75)
city_lat_border = (40.63, 40.85)

inside_city = (
    train['pickup_longitude'].between(*city_long_border)
    & train['pickup_latitude'].between(*city_lat_border)
    & train['dropoff_longitude'].between(*city_long_border)
    & train['dropoff_latitude'].between(*city_lat_border)
)
train = train[inside_city]

Let's convert `pickup_datetime` and `dropoff_datetime` to `datetime`, which might be helpful for feature extraction (maybe, I don't know!).

In [None]:
# `train` is the result of the boolean filters above. Take an explicit copy so
# the column assignments below act on an independent frame and do not raise
# pandas' SettingWithCopyWarning.
train = train.copy()

# Parse the timestamp columns into datetime64 so the .dt accessor can be used.
# The test set has no dropoff_datetime column handled here, only the pickup time.
train['pickup_datetime'] = pd.to_datetime(train.pickup_datetime)
test['pickup_datetime'] = pd.to_datetime(test.pickup_datetime)
train['dropoff_datetime'] = pd.to_datetime(train.dropoff_datetime)

# Calendar date of each pickup (time of day stripped).
train.loc[:, 'pickup_date'] = train['pickup_datetime'].dt.date
test.loc[:, 'pickup_date'] = test['pickup_datetime'].dt.date


**3. Data Visualization**

I think this is my favorite part!

***a. Trip Duration***



In [None]:
# Histogram of trip durations in the cleaned training set (71 bins).
fig, ax = plt.subplots()
ax.hist(train['trip_duration'].values, bins=71)
ax.set_xlabel('trip_duration')
ax.set_ylabel('number of train records')
plt.show()

Pickups and drop-offs in New York

In [None]:
# Overlay every pickup and dropoff point as a tiny dot; dense areas show up as
# the street grid. The coordinates are concatenated as NumPy arrays to avoid
# materialising millions of Python float objects in intermediate lists.
longitude = np.concatenate([train.pickup_longitude.values, train.dropoff_longitude.values])
latitude = np.concatenate([train.pickup_latitude.values, train.dropoff_latitude.values])

fig, ax = plt.subplots(figsize=(10, 10))
ax.plot(longitude, latitude, '.', alpha=0.4, markersize=0.05)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_title('Pickup and dropoff locations')
plt.show()

In [None]:
# Display names; the order of dayNames matches Series.dt.weekday (0 = Monday).
dayNames = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
monthNames = ['January', 'February', 'March', 'April', 'May', 'June', 
    'July', 'August', 'September', 'October', 'November', 'December']

# Datetime accessors reused for every derived column below.
train_pickup = train.pickup_datetime.dt
train_dropoff = train.dropoff_datetime.dt
test_pickup = test.pickup_datetime.dt

# --- training set: month, weekday and hour of both pickup and dropoff ---
train["monthPickUp"] = train_pickup.month.astype(np.uint8)
train["monthDropOff"] = train_dropoff.month.astype(np.uint8)
train["dayPickUp"] = train_pickup.weekday.astype(np.uint8)
train["dayDropOff"] = train_dropoff.weekday.astype(np.uint8)
train["hourPickUp"] = train_pickup.hour.astype(np.uint8)
train["hourDropOff"] = train_dropoff.hour.astype(np.uint8)

# --- test set: only pickup-based features are derived ---
test["monthPickUp"] = test_pickup.month.astype(np.uint8)
test["dayPickUp"] = test_pickup.weekday.astype(np.uint8)
test["hourPickUp"] = test_pickup.hour.astype(np.uint8)

In [None]:
# Preview the test set with the newly added pickup month/day/hour columns.
test.head()

Let me plot the distribution based on hour.

In [None]:
# Number of trips started in each hour of the day.
fig, ax = plt.subplots(figsize=(12, 2))
data = train.groupby("hourPickUp")["id"].count().reset_index()
sns.barplot(x='hourPickUp', y='id', data=data, ax=ax)
ax.set_title('Pick-ups Hour Distribution')
ax.set_xlabel('Hour of Day, 0-23')
ax.set_ylabel('No of Trips made')

So from the above graph, we can infer that the maximum number of rides is requested in the evening. And to a California/Texas guy, New York seems to be a busy city!

Now, distribution based on days of the week

In [None]:
# Number of trips started on each day of the week.
fig, ax = plt.subplots(figsize=(12, 2))
data = train.groupby("dayPickUp").aggregate({"id": "count"}).reset_index()
sns.barplot(x="dayPickUp", y="id", data=data, ax=ax)
ax.set_title('Pick-ups based on days of the week')
# dayPickUp is derived from Series.dt.weekday, which numbers Monday as 0 and
# Sunday as 6, so the axis runs Monday..Sunday (not Sunday..Saturday).
ax.set_xlabel("Day of the week (0 = Monday ... 6 = Sunday)")
ax.set_ylabel("No of trips made")
plt.show()

In [None]:
# Number of trips started in each calendar month.
fig, ax = plt.subplots(figsize=(12, 2))
data = train.groupby("monthPickUp")["id"].count().reset_index()
sns.barplot(x="monthPickUp", y="id", data=data, ax=ax)
ax.set_title('Pick-ups based on months')
ax.set_xlabel("Months")
ax.set_ylabel("No of trips made")

Heatmap of pickups: month vs. hour of day.

In [None]:
# Share of each month's pickups that fall in each hour of the day.
# normalize='index' makes every row (month) sum to 1.
fig, ax = plt.subplots(figsize=(12, 2))
pickup_share = pd.crosstab(train.monthPickUp,
                           train.hourPickUp,
                           values=train.vendor_id,
                           aggfunc='count',
                           normalize='index')
sns.heatmap(data=pickup_share, ax=ax)
ax.set_title('Pickup heatmap, Month vs. Day Hour')
ax.set_ylabel('Month')
ax.set_xlabel('Day Hour, 0-23')


What can you infer from the above heatmap?

Similarly, you can try other pairings, such as day of week vs. hour or month vs. day of week. (Assignment! Try it) haha

In [None]:
# Mean trip duration (seconds) for each passenger count.
passengerCount = train.groupby('passenger_count')['trip_duration'].mean()

fig, ax = plt.subplots(1, 1, figsize=(17, 10))
# x/y are passed by keyword: current seaborn releases no longer accept them
# positionally. Labels are set after plotting so seaborn cannot overwrite them.
sns.barplot(x=passengerCount.index, y=passengerCount.values, ax=ax)
ax.set_xlabel('Passenger count')
ax.set_ylabel('Time in Seconds')
ax.set_title('Mean trip duration by passenger count')
plt.show()

In [None]:
# Number of training rows per pickup month.
train.groupby('monthPickUp').size()

**Distance and Directionality**

Thanks to Beluga's post, we can determine the distance and direction of a specific trip based on the pickup and dropoff coordinates. I didn't write these functions myself; credit goes to the original author.

In [None]:
def haversine_array(lat1, lng1, lat2, lng2):
    """Great-circle distance in kilometres between two sets of points.

    All four arguments are coordinates in degrees and may be scalars or
    NumPy arrays (they are combined element-wise). Uses the haversine
    formula with a mean Earth radius of 6371 km.
    """
    AVG_EARTH_RADIUS = 6371  # in km
    phi1, lam1, phi2, lam2 = (np.radians(v) for v in (lat1, lng1, lat2, lng2))
    d_phi = phi2 - phi1
    d_lam = lam2 - lam1
    # Haversine of the central angle between the two points.
    hav = np.sin(d_phi * 0.5) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(d_lam * 0.5) ** 2
    return 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(hav))

def dummy_manhattan_distance(lat1, lng1, lat2, lng2):
    """Approximate "city block" distance in kilometres between two points.

    Sums the haversine distance of the longitude-only leg (latitude held at
    lat1) and of the latitude-only leg (longitude held at lng1).
    """
    along_longitude = haversine_array(lat1, lng1, lat1, lng2)
    along_latitude = haversine_array(lat1, lng1, lat2, lng1)
    return along_longitude + along_latitude

def bearing_array(lat1, lng1, lat2, lng2):
    """Initial compass bearing, in degrees, from point 1 to point 2.

    Coordinates are given in degrees and may be scalars or NumPy arrays.
    The result comes from ``arctan2`` and so lies in [-180, 180]:
    0 is due north, 90 due east, -90 due west.
    """
    lng_delta_rad = np.radians(lng2 - lng1)
    # Only the latitudes are needed in radians; the longitudes enter the
    # formula solely through their difference computed above.
    lat1_rad = np.radians(lat1)
    lat2_rad = np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2_rad)
    x = (np.cos(lat1_rad) * np.sin(lat2_rad)
         - np.sin(lat1_rad) * np.cos(lat2_rad) * np.cos(lng_delta_rad))
    return np.degrees(np.arctan2(y, x))

In [None]:
# Great-circle pickup-to-dropoff distance (km), added to both frames.
for frame in (train, test):
    frame.loc[:, 'distance_haversine'] = haversine_array(
        frame['pickup_latitude'].values,
        frame['pickup_longitude'].values,
        frame['dropoff_latitude'].values,
        frame['dropoff_longitude'].values,
    )

In [None]:
# Approximate city-block pickup-to-dropoff distance (km), added to both frames.
for frame in (train, test):
    frame.loc[:, 'distance_dummy_manhattan'] = dummy_manhattan_distance(
        frame['pickup_latitude'].values,
        frame['pickup_longitude'].values,
        frame['dropoff_latitude'].values,
        frame['dropoff_longitude'].values,
    )

In [None]:
# Bearing (degrees) from pickup to dropoff, added to both frames.
for frame in (train, test):
    frame.loc[:, 'direction'] = bearing_array(
        frame['pickup_latitude'].values,
        frame['pickup_longitude'].values,
        frame['dropoff_latitude'].values,
        frame['dropoff_longitude'].values,
    )

**KMeans to create Clusters**

Three steps to preparing the data: 
1. create the coordinates
2. configure the KMeans clustering parameters, and 
3. create the actual clusters:

In [None]:
# Stack all pickup points on top of all dropoff points: one (lat, lng) row per
# location, so clusters are learned from both ends of every trip.
pickup_points = train[['pickup_latitude', 'pickup_longitude']].values
dropoff_points = train[['dropoff_latitude', 'dropoff_longitude']].values
coords = np.vstack((pickup_points, dropoff_points))


In [None]:
from sklearn.cluster import MiniBatchKMeans

# Fit 100 location clusters on a random sample of at most 500,000 points.
# Both the sampling and the KMeans initialisation are seeded so the cluster
# ids (and everything derived from them) are identical on every
# Restart & Run All.
KMEANS_SEED = 42
sample_rng = np.random.RandomState(KMEANS_SEED)
sample_ind = sample_rng.permutation(len(coords))[:500000]
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000,
                         random_state=KMEANS_SEED).fit(coords[sample_ind])

In [None]:
# Assign every pickup and dropoff to its nearest cluster centre.
# The model was fitted on a plain NumPy array of (lat, lng) rows, so plain
# arrays (.values) are passed here too; handing scikit-learn a DataFrame would
# raise its "X has feature names, but ... was fitted without feature names" warning.
train.loc[:, 'pickup_cluster'] = kmeans.predict(train[['pickup_latitude', 'pickup_longitude']].values)
train.loc[:, 'dropoff_cluster'] = kmeans.predict(train[['dropoff_latitude', 'dropoff_longitude']].values)
test.loc[:, 'pickup_cluster'] = kmeans.predict(test[['pickup_latitude', 'pickup_longitude']].values)
test.loc[:, 'dropoff_cluster'] = kmeans.predict(test[['dropoff_latitude', 'dropoff_longitude']].values)

In [None]:
# Scatter the first 500,000 pickups, coloured by their KMeans cluster id and
# cropped to the city bounding box.
city_long_border = (-74.03, -73.75)
city_lat_border = (40.63, 40.85)

shown = train.iloc[:500000]

fig, ax = plt.subplots(ncols=1, nrows=1)
ax.scatter(
    shown.pickup_longitude.values,
    shown.pickup_latitude.values,
    s=10,
    lw=0,
    c=shown.pickup_cluster.values,
    cmap='autumn',
    alpha=0.2,
)
ax.set_xlim(city_long_border)
ax.set_ylim(city_lat_border)
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
plt.show()

This shows a nice visual representation of the KMeans clustering algorithm at work (we used 100 clusters, but there is freedom to play around with this parameter to see how it changes the results). The clustering effectively created the different neighbourhoods in Manhattan, as displayed by the borders between the different colors. This should be intuitive to some degree, as a trip from point A to point B differs by nature across the various parts of New York.

**Creating Dummy Variables**

A simple function that changes categorical data into dummy/indicator variables. (https://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html)

In [None]:
# One-hot encode the categorical columns; the column names come out as
# "<prefix>_<value>" ('_' is get_dummies' default separator).
# NOTE(review): train and test are encoded independently, so a value present in
# only one of them yields different column sets - confirm before joining these
# frames onto the feature matrices.
vendor_train = pd.get_dummies(train['vendor_id'], prefix='vi')
vendor_test = pd.get_dummies(test['vendor_id'], prefix='vi')

passenger_count_train = pd.get_dummies(train['passenger_count'], prefix='pc')
passenger_count_test = pd.get_dummies(test['passenger_count'], prefix='pc')

store_and_fwd_flag_train = pd.get_dummies(train['store_and_fwd_flag'], prefix='sf')
store_and_fwd_flag_test = pd.get_dummies(test['store_and_fwd_flag'], prefix='sf')

cluster_pickup_train = pd.get_dummies(train['pickup_cluster'], prefix='p')
cluster_pickup_test = pd.get_dummies(test['pickup_cluster'], prefix='p')

cluster_dropoff_train = pd.get_dummies(train['dropoff_cluster'], prefix='d')
cluster_dropoff_test = pd.get_dummies(test['dropoff_cluster'], prefix='d')



In [None]:
# One-hot encode the pickup calendar features ('_' is get_dummies' default separator).
month_train = pd.get_dummies(train['monthPickUp'], prefix='m')
month_test = pd.get_dummies(test['monthPickUp'], prefix='m')

# dayPickUp holds the weekday (0 = Monday), despite the 'dom' prefix.
dom_train = pd.get_dummies(train['dayPickUp'], prefix='dom')
dom_test = pd.get_dummies(test['dayPickUp'], prefix='dom')

hour_train = pd.get_dummies(train['hourPickUp'], prefix='h')
hour_test = pd.get_dummies(test['hourPickUp'], prefix='h')


In [None]:
# Preview the training set with all engineered columns added so far.
train.head()

The number of features is really small. :(    (Just 7)
I'm scared to run a Machine Learning model on this. Let me spend some time to get more meaningful data. Till then, I have documented steps on how to run XGBoost Algorithm and cross validation.

In [None]:
# Preview only: show the first rows of `train` without the drop-off time features.
# The result is deliberately NOT assigned back - `train` is unchanged here, and
# the next cell still expects these columns to exist when it drops them.
train.drop(['monthDropOff','dayDropOff','hourDropOff'], axis=1).head()

In [None]:
# Remove the identifier, the raw categorical/calendar columns and the raw
# coordinates from the training frame.
train_columns_to_drop = [
    'id', 'vendor_id', 'passenger_count', 'store_and_fwd_flag',
    'monthPickUp', 'monthDropOff', 'dayPickUp', 'dayDropOff', 'hourDropOff', 'hourPickUp',
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
]
train = train.drop(columns=train_columns_to_drop)

In [None]:
# Preview the test set before its raw columns are dropped.
test.head()

In [None]:
# Keep the trip ids aside before removing the column, then drop the same kinds
# of raw columns from the test frame.
Test_id = test['id']
test_columns_to_drop = [
    'id', 'vendor_id', 'passenger_count', 'store_and_fwd_flag',
    'monthPickUp', 'dayPickUp', 'hourPickUp',
    'pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
]
test = test.drop(columns=test_columns_to_drop)

In [None]:
# Drop the two training-only columns.
# NOTE(review): 'trip_duration' is the column the modelling steps written out
# below take the log of; it is no longer available in `train` after this cell.
train = train.drop(columns=['dropoff_datetime', 'trip_duration'])

In [None]:
# (rows, columns) of the final training and test frames.
train.shape,test.shape

`Train, Test = train_test_split(train[0:100000], test_size = 0.2)`

`train['log_trip_duration'] = np.log(train['trip_duration'].values + 1)`


`X_train = Train.drop(['log_trip_duration'], axis=1)`
`Y_train = Train["log_trip_duration"] `
`X_test = Test.drop(['log_trip_duration'], axis=1)`
`Y_test = Test["log_trip_duration"]`



`Y_test = Y_test.reset_index().drop('index',axis = 1)`
`Y_train = Y_train.reset_index().drop('index',axis = 1)`



`dtrain = xgb.DMatrix(X_train, label=Y_train)`
`dvalid = xgb.DMatrix(X_test, label=Y_test)`
`dtest = xgb.DMatrix(Test_master)`
`watchlist = [(dtrain, 'train'), (dvalid, 'valid')]`



`xgb_pars = {'min_child_weight': 1, 'eta': 0.5, 'colsample_bytree': 0.9, 'max_depth': 6,`
`'subsample': 0.9, 'lambda': 1., 'nthread': -1, 'booster' : 'gbtree', 'silent': 1,`
`'eval_metric': 'rmse', 'objective': 'reg:linear'}`
`model = xgb.train(xgb_pars, dtrain, 10, watchlist, early_stopping_rounds=2, maximize=False, verbose_eval=1)`
`print('Modeling RMSLE %.5f' % model.best_score)`