In [None]:
import numpy as np
import pandas as pd 
import datetime as dt
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locations of the zipped competition CSVs (pandas reads the archives directly).
TRAIN_PATH = '/kaggle/input/nyc-taxi-trip-duration/train.zip'
TEST_PATH = '/kaggle/input/nyc-taxi-trip-duration/test.zip'
SAMPLE_SUBMISSION_PATH = '/kaggle/input/nyc-taxi-trip-duration/sample_submission.zip'

train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
sample_submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)

In [None]:
# Preview the first three rows of the training frame.
train.head(3)

In [None]:
# Preview the first three rows of the test frame.
test.head(3)

In [None]:
# Print column dtypes and non-null counts for the training frame.
train.info()

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

## II. EDA & Feature Engineering <a id="two"></a>

In [None]:
from scipy import stats
from scipy.stats import norm

In [None]:
# Plot the trip durations in ascending order against their rank to reveal extreme values.
sorted_durations = np.sort(train['trip_duration'])
plt.scatter(range(len(sorted_durations)), sorted_durations)

In [None]:
# Distribution of the raw trip duration with a fitted normal curve overlaid.
# NOTE(review): seaborn has deprecated `distplot`; migrating to `histplot`/`displot`
# would need the normal fit to be drawn separately.
durations = train['trip_duration'].values
sns.distplot(durations, fit=norm)

In [None]:
# Same distribution plot on the log1p scale, again with a fitted normal curve.
# NOTE(review): seaborn has deprecated `distplot`; see `histplot`/`displot`.
log_durations = np.log1p(train['trip_duration'].values)
sns.distplot(log_durations, fit=norm)

In [None]:
# Replace the target with its natural log; the model predictions are mapped
# back with np.exp further down.
# NOTE(review): this overwrites the column in place, so re-running the cell
# applies the log a second time — restart the kernel before re-running.
# NOTE(review): the plot above uses log1p while this uses log — confirm which
# transform is intended.
train['trip_duration'] = np.log(train['trip_duration'].values)

In [None]:
# Keep only the columns that also exist in the test set, then stack the
# training rows on top of the test rows so features are engineered once.
feature_names = test.columns.tolist()
df_train = train.loc[:, feature_names]
df = pd.concat([df_train, test], axis=0)

In [None]:
# Shapes of the train, test and combined frames, printed on one line.
print(*(frame.shape for frame in (train, test, df)))

In [None]:
# Preview the combined train+test frame.
df.head(3)

In [None]:
# Convert pickup_datetime to pandas datetimes so the `.dt` accessor can be used on it.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])

In [None]:
# Calendar features derived from the pickup timestamp.
pickup = df['pickup_datetime'].dt
df['month'] = pickup.month
df['day'] = pickup.day
df['hour'] = pickup.hour
# Day of week, Monday=0 ... Sunday=6. `.dt.weekday` is the same attribute as
# `.dt.dayofweek`, so the former `weekday` column was an exact duplicate
# (a perfectly collinear feature); only `dayofweek` is kept.
df['dayofweek'] = pickup.dayofweek

In [None]:
# The raw timestamp is no longer needed once the calendar features exist.
df.drop(columns=['pickup_datetime'], inplace=True)

In [None]:
# Number of trips per pickup hour. The column is passed as `x=` because
# seaborn >= 0.12 treats the first positional argument as `data`, which
# does not produce the per-value counts intended here.
sns.countplot(x=df['hour'])

In [None]:
# Number of trips per day of week. The column is passed as `x=` because
# seaborn >= 0.12 treats the first positional argument as `data`, which
# does not produce the per-value counts intended here.
sns.countplot(x=df['dayofweek'])

In [None]:
# Signed pickup-minus-dropoff differences along each coordinate.
df['dist_long'] = df['pickup_longitude'].sub(df['dropoff_longitude'])
df['dist_lat'] = df['pickup_latitude'].sub(df['dropoff_latitude'])

In [None]:
# Straight-line (Euclidean) length of the coordinate-difference vector.
delta_long, delta_lat = df['dist_long'], df['dist_lat']
df['dist'] = np.sqrt(delta_long * delta_long + delta_lat * delta_lat)

In [None]:
def ft_haversine_distance(lat1, lng1, lat2, lng2):
    """Return the haversine (great-circle) distance between two points.

    All four arguments are angles in degrees and may be scalars or NumPy
    arrays; they are combined element-wise. The result uses a mean Earth
    radius of 6371, so it is expressed in kilometres.
    """
    earth_radius_km = 6371
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))
    half_dphi = (phi2 - phi1) * 0.5
    half_dlam = (lam2 - lam1) * 0.5
    # Haversine term: squared half-chord length between the two points.
    chord = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlam) ** 2
    return 2 * earth_radius_km * np.arcsin(np.sqrt(chord))

# Haversine distance between pickup and dropoff, computed on the raw arrays.
pickup_lat = df['pickup_latitude'].values
pickup_lng = df['pickup_longitude'].values
dropoff_lat = df['dropoff_latitude'].values
dropoff_lng = df['dropoff_longitude'].values
df['distance'] = ft_haversine_distance(pickup_lat, pickup_lng, dropoff_lat, dropoff_lng)

In [None]:
# Box plot of the haversine `distance` column to inspect its spread and outliers.
df.boxplot(column='distance')

In [None]:
#df = df[(df.distance < 200)]

In [None]:
# Mean target per vendor (the target has already been log-transformed above
# when the cells are run in order).
g_vendor = train.groupby('vendor_id')['trip_duration'].mean()
# x/y must be passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError on two positional arguments.
sns.barplot(x=g_vendor.index, y=g_vendor.values)

In [None]:
# Mean target per store_and_fwd_flag value (the target has already been
# log-transformed above when the cells are run in order).
sfflag = train.groupby('store_and_fwd_flag')['trip_duration'].mean()
# x/y must be passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError on two positional arguments.
sns.barplot(x=sfflag.index, y=sfflag.values)

In [None]:
# Mean target per passenger count (the target has already been
# log-transformed above when the cells are run in order).
pc = train.groupby('passenger_count')['trip_duration'].mean()
# x/y must be passed by keyword: seaborn >= 0.12 accepts only `data`
# positionally and raises TypeError on two positional arguments.
sns.barplot(x=pc.index, y=pc.values)

In [None]:
# One-hot encode the two categorical columns: append the indicator columns
# (named "<prefix>_<value>") and drop the source column, one column at a time.
for column, prefix in (('store_and_fwd_flag', 'store'), ('vendor_id', 'vendor')):
    indicators = pd.get_dummies(df[column], prefix=prefix)
    df = pd.concat([df, indicators], axis=1).drop(columns=[column])

In [None]:
# Preview the combined frame after one-hot encoding.
df.head(3)

In [None]:
# Pairwise correlations of the numeric/boolean features only. `df` still
# contains `id` at this point (presumably a string key), and a bare
# `df.corr()` raises on non-numeric columns in pandas >= 2.0.
cor = df.select_dtypes(include=['number', 'bool']).corr()
# Boolean mask, True = hidden: hide the strict upper triangle so each pair is
# drawn once and the diagonal stays visible. The previous mask reused the
# correlation values themselves, so any exactly-zero cell was left unmasked.
mask = np.triu(np.ones(cor.shape, dtype=bool), k=1)
fig, ax = plt.subplots(figsize=(20, 10))
sns.heatmap(cor, mask=mask, square=True, annot=True, ax=ax)

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_log_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import LinearRegression

### a. train_test_split <a id="three-a"></a>

In [None]:
# Preview the engineered frame before the `id` column is dropped.
df.head(3)

In [None]:
# `id` is a row identifier, not a feature; remove it before modelling.
# NOTE(review): in-place drop — re-running this cell raises KeyError.
df.drop(columns=["id"], inplace=True)

In [None]:
# `df` was built as train rows followed by test rows, so a positional cut at
# the number of training rows recovers the two feature matrices.
n_train = train.shape[0]
new_train = df.iloc[:n_train]
new_test = df.iloc[n_train:]

In [None]:
# Regression target; this column was overwritten with its natural log earlier
# in the notebook (assuming the cells ran in order).
target = train['trip_duration']

In [None]:
# Hold out 20% of the training rows for validation. A fixed random_state makes
# the shuffled split reproducible across kernel restarts (it was unseeded).
X_train, X_val, y_train, y_val = train_test_split(
    new_train, target, test_size=0.2, shuffle=True, random_state=42
)

In [None]:
def rmsle_score(preds, true):
    """Root mean squared logarithmic error between `preds` and `true`.

    Both inputs are passed through ``log1p`` before differencing; the squared
    differences are summed, divided by ``len(true)`` and square-rooted.
    """
    log_gap = np.log1p(preds) - np.log1p(true)
    mean_squared_gap = np.sum(log_gap ** 2) / len(true)
    return mean_squared_gap ** 0.5

In [None]:
# `sklearn.metrics.scorer` was a private module (deprecated in scikit-learn
# 0.22, removed in 0.24); `make_scorer` is public in `sklearn.metrics`.
from sklearn.metrics import make_scorer

# RMSLE is an error (lower is better), so it must be registered with
# greater_is_better=False; otherwise model selection would favour the worst
# model. scikit-learn calls the function as (y_true, y_pred) while
# `rmsle_score` is declared as (preds, true) — harmless, the metric is symmetric.
RMSLE = make_scorer(rmsle_score, greater_is_better=False)

In [None]:
import statsmodels.api as sm

In [None]:
# Ordinary least squares of the target on every engineered feature, with the
# feature frame cast to float.
# NOTE(review): no intercept column is added (sm.add_constant is not called) —
# confirm that is intended.
model = sm.OLS(target.values, new_train.astype(float))

In [None]:
# Fit the OLS model and display its summary table.
# NOTE(review): the name `re` would shadow the standard-library `re` module if
# that module were ever imported in this notebook.
re = model.fit()
re.summary()

### d. lightgbm <a id="three-d"></a>

In [None]:
import lightgbm as lgbm

In [None]:
# LightGBM training configuration, grouped by purpose.
lgb_params = {
    # Task and evaluation metric.
    'objective': 'regression',
    'metric': 'rmse',
    # Optimisation step size.
    'learning_rate': 0.1,
    # Tree capacity.
    'max_depth': 25,
    'num_leaves': 1000,
    'max_bin': 1000,
    # Column / row subsampling.
    # NOTE(review): LightGBM documents that bagging_fraction only takes effect
    # when bagging_freq is non-zero, and bagging_freq is not set here — confirm.
    'feature_fraction': 0.9,
    'bagging_fraction': 0.5,
}


In [None]:
# Wrap the full training matrix and target in a LightGBM dataset.
lgb_df = lgbm.Dataset(data=new_train, label=target)

In [None]:
# Train the LightGBM booster for a fixed 1500 rounds (no validation set is passed).
lgb_model = lgbm.train(
    params=lgb_params,
    train_set=lgb_df,
    num_boost_round=1500,
)

In [None]:
# LightGBM predictions for the test rows, on the same (log) scale as the target.
pred = lgb_model.predict(new_test)

In [None]:
# Undo the np.log applied to the target to get predictions on the original scale.
pred_lgb = np.exp(pred)

### e. xgboost <a id="three-e"></a>

In [None]:
import xgboost as xgb

In [None]:
# XGBoost booster configuration passed to xgb.train.
params = {
    'booster':            'gbtree',
    # 'reg:linear' is the deprecated name of squared-error regression;
    # 'reg:squarederror' is the supported spelling of the same objective.
    'objective':          'reg:squarederror',
    'learning_rate':      0.1,
    'max_depth':          14,
    'subsample':          0.8,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    # Replaces the deprecated 'silent': 1 flag; verbosity 0 means silent.
    'verbosity':          0
}

In [None]:
# Wrap the full training matrix and target in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=new_train, label=target)

In [None]:
# Train the XGBoost booster for a fixed 200 rounds (no evaluation set is passed).
gbm = xgb.train(params=params, dtrain=dtrain, num_boost_round=200)

In [None]:
# Predict on the test rows, then undo the np.log applied to the target.
dtest = xgb.DMatrix(new_test)
pred_test = np.exp(gbm.predict(dtest))

In [None]:
# Weighted blend of the LightGBM and XGBoost predictions.
# Weights tried so far, with the number the author recorded next to each
# (presumably the resulting competition score — confirm):
#   0.80 lgb + 0.40 xgb -> 0.42295   (weights sum to 1.2)
#   0.70 lgb + 0.30 xgb -> 0.38148
#   0.60 lgb + 0.40 xgb -> 0.38124   (in use)
#   0.55 lgb + 0.45 xgb -> 0.38126
lgb_weight, xgb_weight = 0.6, 0.4
ensemble = lgb_weight * pred_lgb + xgb_weight * pred_test

In [None]:
# Assemble the submission frame: test ids paired with the blended predictions.
sub = pd.DataFrame({'id': test.id, 'trip_duration': ensemble})
sub.head(3)

In [None]:
# Rebuilds the same submission frame as the preceding cell (test ids plus the
# blended predictions) and previews it.
sub = test[['id']].copy()
sub['trip_duration'] = ensemble
sub.head(3)

In [None]:
# Write the submission file to the working directory without the index column.
sub.to_csv('submission.csv', index=False)