In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import datetime as dt

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor

In [None]:
# Load the raw trip records; the path is relative to the notebook's working directory.
csv_path = '../input/nyc-taxi-trip-duration-dataset/nyc_taxi_trip_duration.csv'
data = pd.read_csv(csv_path)

In [None]:
# Summary statistics for `data`; as the cell's last expression it is rendered as the output.
data.describe()

In [None]:
# Parse both timestamp columns so the `.dt` accessor can be used on them afterwards.
for ts_col in ('pickup_datetime', 'dropoff_datetime'):
    data[ts_col] = pd.to_datetime(data[ts_col])

In [None]:
# Target: natural log of the trip duration (a duration of 0 would map to -inf).
df_y = np.log(data['trip_duration'])

# Calendar / clock features derived from the pickup timestamp.
pickup_ts = data['pickup_datetime']
data['pickup_weekday'] = pickup_ts.dt.weekday
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
# `isocalendar().week` is its replacement. It returns a nullable UInt32 column, so it is
# cast to plain int64 for the estimators.
# NOTE(review): the cast assumes there are no missing pickup timestamps -- confirm.
data['pickup_weekofyear'] = pickup_ts.dt.isocalendar().week.astype('int64')
data['pickup_hour'] = pickup_ts.dt.hour
data['pickup_minute'] = pickup_ts.dt.minute
# Seconds elapsed since the earliest pickup in the dataset.
data['pickup_dt'] = (pickup_ts - pickup_ts.min()).dt.total_seconds()
# Hour-of-week index: weekday * 24 + hour.
data['pickup_week_hour'] = data['pickup_weekday'] * 24 + data['pickup_hour']

# Euclidean distance

In [None]:
# Straight-line distance between pickup and dropoff, measured directly on the raw
# longitude/latitude values. Despite its name, `dist_sq` ends up holding the distance
# itself (the square root), not the squared distance.
y_dist = data['pickup_longitude'].sub(data['dropoff_longitude'])
x_dist = data['pickup_latitude'].sub(data['dropoff_latitude'])
data['dist_sq'] = (y_dist.pow(2) + x_dist.pow(2)).pow(0.5)

# Haversine distance and direction (bearing)

In [None]:
def harvesine_array(lat1, lng1, lat2, lng2):
    """Great-circle (haversine) distance between two sets of points.

    Parameters
    ----------
    lat1, lng1 : scalar or array-like
        Latitude / longitude of the start points, in degrees.
    lat2, lng2 : scalar or array-like
        Latitude / longitude of the end points, in degrees.

    Returns
    -------
    numpy scalar or ndarray
        Distance along the sphere, in the unit of the radius constant
        (6371, i.e. kilometres). Inputs are broadcast by NumPy.
    """
    AVG_EARTH_RADIUS = 6371
    lat1, lng1, lat2, lng2 = map(np.radians, (lat1, lng1, lat2, lng2))
    lat = lat2 - lat1
    lng = lng2 - lng1
    d = np.sin(lat * 0.5) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(lng * 0.5) ** 2
    # Rounding can push the haversine term marginally outside [0, 1], which would make
    # arcsin return NaN; clamp it first.
    d = np.clip(d, 0.0, 1.0)
    # distance = 2 * R * arcsin(sqrt(d)). The radius must multiply the central angle;
    # adding it (as the previous version did) yields ~12742 for every pair of points.
    h = 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(d))
    return h

def direction_array(lat1, lng1, lat2, lng2):
    """Initial bearing from the start points to the end points.

    Parameters
    ----------
    lat1, lng1 : scalar or array-like
        Latitude / longitude of the start points, in degrees.
    lat2, lng2 : scalar or array-like
        Latitude / longitude of the end points, in degrees.

    Returns
    -------
    numpy scalar or ndarray
        Bearing in degrees in [-180, 180]: 0 is due north, 90 due east,
        -90 due west, 180 due south.
    """
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    # arctan2 resolves the quadrant from the signs of both components. The previous
    # np.arctan(y, x) treated `x` as the output buffer and returned arctan(y), so the
    # result was never a real bearing.
    return np.degrees(np.arctan2(y, x))

# Pull the four coordinate columns out as NumPy arrays once, then derive the
# distance and direction feature columns from them.
pickup_lat = data['pickup_latitude'].values
pickup_lng = data['pickup_longitude'].values
dropoff_lat = data['dropoff_latitude'].values
dropoff_lng = data['dropoff_longitude'].values

data['harvesine_distance'] = harvesine_array(pickup_lat, pickup_lng, dropoff_lat, dropoff_lng)
data['direction'] = direction_array(pickup_lat, pickup_lng, dropoff_lat, dropoff_lng)

In [None]:
# Binning: round every coordinate to 3 decimal places and store it in a `<column>_round3`
# column, so nearby points share the same value.
for coord_col in ('pickup_latitude', 'pickup_longitude',
                  'dropoff_latitude', 'dropoff_longitude'):
    data[coord_col + '_round3'] = data[coord_col].round(3)

In [None]:
# Shift vendor_id down by one (presumably to make the ids zero-based -- verify against the data).
# Caution: this cell is not idempotent; re-running it subtracts one again.
data['vendor_id'] = data['vendor_id'].sub(1)

In [None]:
# Number of missing values per column. The DataFrame method is used instead of
# np.sum(DataFrame): the latter goes through DataFrame.sum(axis=None), whose per-column
# behaviour is deprecated in recent pandas in favour of a scalar reduction over both axes.
data.isnull().sum()

In [None]:
# Replace every remaining missing value with 0.
data = data.fillna(0)

In [None]:
# Drop the identifier, the raw timestamps, the column the target was derived from and
# `store_and_fwd_flag`, so that `data` holds only the model inputs.
unused_columns = ['id', 'pickup_datetime', 'dropoff_datetime', 'trip_duration', 'store_and_fwd_flag']
data = data.drop(columns=unused_columns)

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
# Hold out one third of the rows for testing; the fixed seed keeps the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(
    data,
    df_y,
    test_size=1/3,
    random_state=0,
)

In [None]:
# Baseline: predict the training-set mean for every held-out row.
# The RMSE of that constant prediction is shown as the cell output.
mean_pred = np.full(len(ytest), ytrain.mean())
sqrt(mean_squared_error(ytest, mean_pred))

In [None]:
def cv_score(ml_model, rstate=11, cols=None):
    """Validation RMSE of `ml_model` under shuffled 5-fold cross-validation.

    The features come from the notebook-level frame `data` and the target from `df_y`.

    Parameters
    ----------
    ml_model : estimator
        Object exposing `fit(X, y)` and `predict(X)`; it is re-fitted on every fold.
    rstate : int, default 11
        Seed passed to `KFold` so the shuffled folds are reproducible.
    cols : sequence of column labels, optional
        Feature columns to use. `None` (the default) means every column of `data`
        as it exists when the function is called, not when it was defined.

    Returns
    -------
    list of float
        Validation RMSE of each fold, in fold order.
    """
    features = data if cols is None else data[cols]

    kf = KFold(n_splits=5, random_state=rstate, shuffle=True)
    cv_scores = []

    for fold, (train_index, test_index) in enumerate(kf.split(features, df_y), start=1):
        print('\n {} of Kfold {}'.format(fold, kf.n_splits))
        # KFold yields row positions, not index labels, so select with .iloc; this stays
        # correct even when the frame's index is not a clean 0..n-1 range.
        xtr, xvl = features.iloc[train_index], features.iloc[test_index]
        ytr, yvl = df_y.iloc[train_index], df_y.iloc[test_index]

        model = ml_model
        model.fit(xtr, ytr)
        pred_val = model.predict(xvl)

        rmse_score = sqrt(mean_squared_error(yvl, pred_val))
        print("Valid RMSE: {:5f}".format(rmse_score))

        cv_scores.append(rmse_score)
    return cv_scores

In [None]:
# Cross-validated RMSE per fold for an ordinary least-squares model on all features.
linreg_model = LinearRegression()
linreg_score = cv_score(linreg_model)

In [None]:
# Cross-validated RMSE per fold for a decision tree regularised through minimum
# leaf / split sizes.
dtree_model = DecisionTreeRegressor(
    min_samples_leaf=25,
    min_samples_split=25,
)
dtree_score = cv_score(dtree_model)

In [None]:
# One row per fold, one column per model.
results_df = pd.DataFrame({
    'Linear_regression': linreg_score,
    'decision_tree': dtree_score,
})

In [None]:
# Grouped bar chart of the per-fold RMSE of both models; the legend is drawn
# outside the plotting area, to the right of the axes.
model_columns = ['Linear_regression', 'decision_tree']
ax = results_df.plot(y=model_columns, kind='bar', legend=False)
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
plt.show()

In [None]:
from sklearn import tree

In [None]:
# Fit one decision tree on the training split, with the same settings as the
# cross-validated tree.
tree_params = dict(min_samples_leaf=25, min_samples_split=25)
dtree = DecisionTreeRegressor(**tree_params)
dtree.fit(xtrain, ytrain)

In [None]:
decision_tree = tree.export_graphviz(dtree,out_file='tree.dot',feature_names=xtrain.columns,max_depth=2,filled=True)
!dot -Tpng tree.dot -o tree.png