In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate every file below the Kaggle input directory and print its full path.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# Any results you write to the current directory are saved as output.

In [None]:
import zipfile

def _read_zipped_csv(zip_path, member):
    """Read one CSV member of a zip archive into a DataFrame.

    Parameters
    ----------
    zip_path : str
        Path to the ``.zip`` archive.
    member : str
        Name of the CSV file inside the archive.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``member``.

    Both the archive and the member stream are opened in ``with`` blocks so
    their file handles are closed as soon as parsing finishes (or fails).
    """
    with zipfile.ZipFile(zip_path) as archive, archive.open(member) as stream:
        return pd.read_csv(stream)


sample_submission = _read_zipped_csv(
    '../input/nyc-taxi-trip-duration/sample_submission.zip', 'sample_submission.csv'
)
test = _read_zipped_csv('../input/nyc-taxi-trip-duration/test.zip', 'test.csv')
train = _read_zipped_csv('../input/nyc-taxi-trip-duration/train.zip', 'train.csv')


In [None]:
# Preview the first 10 rows of the training data.
train.head(10)

In [None]:
# Calculate the haversine (great-circle) distance between pickup and drop-off locations.

from math import sin, cos, sqrt, atan2, radians

# Approximate radius of the Earth in km (also used by the test-set distance cell).
R = 6373.0

# Latitudes in radians, computed once and reused below.
train_pickup_lat = np.radians(train['pickup_latitude'])
train_dropoff_lat = np.radians(train['dropoff_latitude'])

# Coordinate differences in radians (pickup minus drop-off).
train['lat'] = train_pickup_lat - train_dropoff_lat
train['lon'] = np.radians(train['pickup_longitude']) - np.radians(train['dropoff_longitude'])

# Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2).
train['dist'] = (
    np.sin(train['lat'] / 2) ** 2
    + np.cos(train_pickup_lat) * np.cos(train_dropoff_lat) * np.sin(train['lon'] / 2) ** 2
)

# Central angle between the two points, in radians.
train['dist2'] = 2 * np.arctan2(np.sqrt(train['dist']), np.sqrt(1 - train['dist']))

# Distance in km: radius times central angle.
train['dist_final'] = R * train['dist2']
                         
                         
                         

In [None]:
# (rows, columns) of the training frame, including the distance columns added above.
train.shape

In [None]:
# Column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Summary statistics for the numeric training columns.
train.describe()

In [None]:
# Keep only the final distance; discard the intermediate haversine columns.
haversine_intermediates = ['lat', 'lon', 'dist', 'dist2']
train = train.drop(columns=haversine_intermediates)

In [None]:
# Preview the first rows of the test data.
test.head()

In [None]:
# Haversine distance for the test set; relies on R (Earth radius in km) defined earlier.
test_pickup_lat = np.radians(test['pickup_latitude'])
test_dropoff_lat = np.radians(test['dropoff_latitude'])

# Coordinate differences in radians (pickup minus drop-off).
test['lat'] = test_pickup_lat - test_dropoff_lat
test['lon'] = np.radians(test['pickup_longitude']) - np.radians(test['dropoff_longitude'])

# Haversine term: sin^2(dlat/2) + cos(lat1) * cos(lat2) * sin^2(dlon/2).
test['dist'] = (
    np.sin(test['lat'] / 2) ** 2
    + np.cos(test_pickup_lat) * np.cos(test_dropoff_lat) * np.sin(test['lon'] / 2) ** 2
)

# Central angle in radians, then distance in km.
test['dist2'] = 2 * np.arctan2(np.sqrt(test['dist']), np.sqrt(1 - test['dist']))
test['dist_final'] = R * test['dist2']

test['dist_final'].head()

In [None]:
# Keep only the final distance; discard the intermediate haversine columns.
haversine_intermediates = ['lat', 'lon', 'dist', 'dist2']
test = test.drop(columns=haversine_intermediates)

In [None]:
# Column dtypes and non-null counts for the test frame.
test.info()

In [None]:
# Summary statistics for the numeric test columns.
test.describe()

In [None]:
# Missing values per column in train.
# Use the DataFrame method rather than np.sum(df): NumPy forwards axis=None to
# DataFrame.sum, which pandas has deprecated and will eventually reduce over both
# axes (one scalar) instead of returning per-column counts.
train.isnull().sum()


In [None]:
# Missing values per column in test.
# Use the DataFrame method rather than np.sum(df): NumPy forwards axis=None to
# DataFrame.sum, which pandas has deprecated and will eventually reduce over both
# axes (one scalar) instead of returning per-column counts.
test.isnull().sum()


In [None]:
# Preview the training frame again before deriving time features.
train.head()

In [None]:
# Parse the pickup timestamp once, then derive hour-of-day and minute features from it.
train_pickup_ts = pd.to_datetime(train['pickup_datetime'])
train['pickup_date'] = train_pickup_ts
train['pickup_hour'] = train_pickup_ts.dt.hour
train['pickup_min'] = train_pickup_ts.dt.minute


In [None]:
# Weekday name of the pickup timestamp, stored as a string column.
train['day_name'] = train.pickup_date.dt.day_name()

In [None]:
# Same time features as for train: parsed timestamp, hour, minute and weekday name.
test_pickup_ts = pd.to_datetime(test['pickup_datetime'])
test['pickup_date'] = test_pickup_ts
test['pickup_hour'] = test_pickup_ts.dt.hour
test['pickup_min'] = test_pickup_ts.dt.minute
test['day_name'] = test_pickup_ts.dt.day_name()

In [None]:
# Remove the timestamp columns that the derived hour/minute/day features replace.
# Only train carries dropoff_datetime in this list.
train_timestamp_cols = ['pickup_datetime', 'dropoff_datetime', 'pickup_date']
test_timestamp_cols = ['pickup_datetime', 'pickup_date']

train = train.drop(columns=train_timestamp_cols)
test = test.drop(columns=test_timestamp_cols)


In [None]:
# Drop the identifier, vendor and store-and-forward flag columns from both frames.
unused_cols = ['id', 'vendor_id', 'store_and_fwd_flag']
train = train.drop(columns=unused_cols)
test = test.drop(columns=unused_cols)


In [None]:
# Check the number of duplicated rows, then remove them.
# The count is printed explicitly: a bare expression that is not the last
# statement of a cell produces no output.
n_train_duplicates = int(train.duplicated().sum())
print("Duplicated rows in train: %d" % n_train_duplicates)

train.drop_duplicates(inplace = True)

In [None]:
# Report duplicated test rows but do NOT drop them.
# The id column has already been removed from `test`, so row position is the only
# remaining link back to the submission rows; deleting rows here would leave
# predictions that can no longer be matched to them.
# NOTE(review): assumes test rows line up one-to-one with sample_submission - confirm.
n_test_duplicates = int(test.duplicated().sum())
print("Duplicated rows in test (kept): %d" % n_test_duplicates)

In [None]:
# Change categorical variables to numeric ones.
# Encoding train and test independently lets each frame derive its own category
# set, so a category missing from one frame would yield different dummy columns
# (and a different drop_first baseline). Pin the categories from train so both
# frames always get the same dummy columns in the same order.
shared_text_cols = [
    col for col in train.columns
    if col in test.columns
    and (pd.api.types.is_object_dtype(train[col]) or pd.api.types.is_string_dtype(train[col]))
]
for col in shared_text_cols:
    # Sorted categories reproduce the column order get_dummies derives on its own.
    shared_dtype = pd.CategoricalDtype(categories=sorted(train[col].dropna().unique()))
    train[col] = train[col].astype(shared_dtype)
    # Values unseen in train become NaN and therefore encode as all-zero dummies.
    test[col] = test[col].astype(shared_dtype)

train = pd.get_dummies(data = train, drop_first = True)
test =  pd.get_dummies(data = test, drop_first = True)


In [None]:
#Scale variables
from sklearn.preprocessing import StandardScaler

# Standardise every feature column; the target is kept out of the feature matrix.
feature_frame = train.drop(columns=['trip_duration'])
scaler = StandardScaler()
scaler.fit(feature_frame)
X_train_scaled = scaler.transform(feature_frame)

# The target is log1p-transformed (not standardised), so the model is trained on log durations.
Y_scaled = np.log1p(train['trip_duration'])

In [None]:
#Model with XGBoost
import xgboost as xgb
from sklearn.metrics import mean_squared_error



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_train_scaled, Y_scaled, test_size=0.2, random_state=42)

In [None]:
# Hyper-parameters for the gradient-boosted regressor, gathered in one place.
regressor_kwargs = {
    'objective': 'reg:squarederror',
    'colsample_bytree': 0.3,
    'learning_rate': 0.1,
    'max_depth': 5,
    'alpha': 10,
    'n_estimators': 70,
    'verbosity': 0,
}
xg_reg = xgb.XGBRegressor(**regressor_kwargs)

In [None]:
# Fit on the training split, then predict the held-out split.
xg_reg.fit(X_train,y_train)

# Predictions for the held-out rows, compared against y_test below.
preds = xg_reg.predict(X_test)


In [None]:
# Root-mean-squared error between held-out targets and predictions.
holdout_mse = mean_squared_error(y_test, preds)
rmse = np.sqrt(holdout_mse)
print("RMSE: %f" % rmse)

# k-fold Cross Validation using XGBoost
# 


In [None]:
# Wrap the full scaled feature matrix and log-target in XGBoost's DMatrix container.
data_dmatrix = xgb.DMatrix(data=X_train_scaled, label=Y_scaled)

# Booster parameters for the cross-validation run.
params = dict(
    objective="reg:squarederror",
    colsample_bytree=0.3,
    learning_rate=0.1,
    max_depth=5,
    alpha=10,
    verbosity=0,
)

# 3-fold CV, up to 50 boosting rounds, with early stopping after 10 rounds.
cv_results = xgb.cv(
    dtrain=data_dmatrix,
    params=params,
    nfold=3,
    num_boost_round=50,
    early_stopping_rounds=10,
    metrics="rmse",
    as_pandas=True,
    seed=42,
)



In [None]:
# First rows of the cross-validation results.
cv_results.head()
