In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found.
input_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import zipfile
import featuretools as ft

In [None]:
# Load the training split (pandas infers the zip compression from the file extension).
trainset = pd.read_csv('/kaggle/input/nyc-taxi-trip-duration/train.zip')

# Report the dimensions of the loaded frame.
shape = trainset.shape
n_rows, n_cols = shape
print('total rows:', n_rows, '\ntotal columns:', n_cols)

In [None]:
trainset.head()

In [None]:
# Load the test split (pandas infers the zip compression from the file extension).
testset = pd.read_csv('/kaggle/input/nyc-taxi-trip-duration/test.zip')

# Report the dimensions of the loaded frame.
shape = testset.shape
n_rows, n_cols = shape
print('total rows:', n_rows, '\ntotal columns:', n_cols)

In [None]:
testset.head()

In [None]:
# Work on a copy of the training data without the drop-off timestamp.
# NOTE(review): presumably removed because it would leak the target
# (trip_duration) and is unavailable at prediction time - confirm.
data_train = trainset.drop(columns=['dropoff_datetime'])

#### No nulls, so moving on to encoding

In [None]:
# Encode the 'Y'/'N' store-and-forward flag as booleans in both splits.
#
# The column is replaced outright rather than written through `.loc[:, col]`:
# on pandas >= 2.0 a `.loc[:, col] = ...` assignment writes into the existing
# object column, so the result would stay dtype object instead of bool.
#
# The dtype guard makes the cell safe to re-run: mapping an already-boolean
# column through the 'Y'/'N' dictionary would turn every value into NaN.
FLAG_TO_BOOL = {'Y': True, 'N': False}
for frame in (data_train, testset):
    if frame['store_and_fwd_flag'].dtype != bool:
        frame['store_and_fwd_flag'] = frame['store_and_fwd_flag'].map(FLAG_TO_BOOL)

In [None]:
# Tag every row with the split it came from so the combined frame can be
# separated again after feature engineering.
testset['test_data'] = True
data_train['test_data'] = False

# Stack both splits into one frame; sort=True orders the columns alphabetically.
split_frames = [data_train, testset]
data = pd.concat(split_frames, sort=True)

In [None]:
# Start an empty EntitySet that will hold the trips table and the tables derived from it.
es = ft.EntitySet(id="taxi")

In [None]:
from woodwork.logical_types import Categorical, Ordinal
# Register the combined train+test frame as the "trips" dataframe, keyed by
# "id" and time-indexed by the pickup timestamp.
trips_spec = {
    "dataframe_name": "trips",
    "dataframe": data,
    "index": "id",
    "time_index": "pickup_datetime",
}
es = es.add_dataframe(**trips_spec)
es


In [None]:
es['trips']

In [None]:
# Split two lookup tables out of "trips": one row per vendor_id and one row
# per passenger_count value.
derived_tables = (
    ("vendors", "vendor_id"),
    ("passenger_cnt", "passenger_count"),
)
for new_name, key_column in derived_tables:
    es.normalize_dataframe(base_dataframe_name="trips",
                           new_dataframe_name=new_name,
                           index=key_column)

# normalize_dataframe returns the EntitySet itself, so showing `es` here
# displays the same summary the last call produced.
es

In [None]:
# One cutoff per trip: the trip id paired with its own pickup timestamp.
# This frame is later passed to ft.dfs as `cutoff_time`.
trips_frame = es['trips']
cutoff_time = trips_frame[['id', 'pickup_datetime']]

In [None]:
es.add_interesting_values()

In [None]:
es.plot()

In [None]:
# Datetime transform primitives applied during deep feature synthesis.
trans_primitives = ['Minute', 'Hour', 'Day', 'Week', 'Month', 'Weekday', 'Is_weekend']

# drop_contains discards every generated feature whose name contains
# 'trips.test_data'; the plain `test_data` column is still used further down
# to split the matrix.
dfs_options = dict(
    entityset=es,
    target_dataframe_name="trips",
    trans_primitives=trans_primitives,
    drop_contains=['trips.test_data'],
    verbose=True,
    cutoff_time=cutoff_time,
    approximate='36d',
)
feature_matrix, features = ft.dfs(**dfs_options)

In [None]:
features[:25]

### Time to fit the features into XGBoost

In [None]:
def get_train_test_fm(feature_matrix):
    """Split the combined feature matrix back into training and test parts.

    Rows are routed by the ``test_data`` column. Both returned feature frames
    have the ``test_data`` and ``trip_duration`` columns removed.

    Parameters
    ----------
    feature_matrix : pandas.DataFrame
        Must contain the ``test_data`` and ``trip_duration`` columns.

    Returns
    -------
    tuple
        ``(X_train, labels, X_test)`` where ``labels`` is the
        ``trip_duration`` column of the training rows.
    """
    split_flag = feature_matrix['test_data']
    helper_columns = ['test_data', 'trip_duration']

    # Element-wise comparisons (not `is`/`not`) so they broadcast over the Series.
    train_rows = feature_matrix[split_flag == False]  # noqa: E712
    test_rows = feature_matrix[split_flag == True]  # noqa: E712

    labels = train_rows['trip_duration']
    X_train = train_rows.drop(columns=helper_columns)
    X_test = test_rows.drop(columns=helper_columns)
    return (X_train, labels, X_test)

In [None]:
X_train, labels, X_test = get_train_test_fm(feature_matrix)
# Train on log(1 + duration) for a more linear relationship. np.log1p is the
# numerically stable form of np.log(x + 1); predictions are mapped back to
# seconds in predict_xgb.
labels = np.log1p(labels.values)

In [None]:
from sklearn.model_selection import train_test_split
import xgboost as xgb

def train_xgb(X_train, labels):
    """Train an XGBoost regressor, holding out 20% of the rows for validation.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features. Only ``X_train.values`` is handed to XGBoost, so
        column names are not carried into the booster.
    labels : array-like
        Regression targets aligned row-for-row with ``X_train``.

    Returns
    -------
    The booster returned by ``xgb.train``.
    """
    # Fixed random_state keeps the train/validation split reproducible.
    Xtr, Xv, ytr, yv = train_test_split(X_train.values,
                                        labels,
                                        test_size=0.2,
                                        random_state=0)

    dtrain = xgb.DMatrix(Xtr, label=ytr)
    dvalid = xgb.DMatrix(Xv, label=yv)

    # The validation set is listed last so early stopping monitors it.
    evals = [(dtrain, 'train'), (dvalid, 'valid')]

    params = {
        'min_child_weight': 1, 'eta': 0.166,
        'colsample_bytree': 0.4, 'max_depth': 9,
        'subsample': 1.0, 'lambda': 57.93,
        'booster': 'gbtree', 'gamma': 0.5,
        # 'verbosity': 0 replaces the removed 'silent': 1 flag.
        'verbosity': 0, 'eval_metric': 'rmse',
        # 'reg:squarederror' is the current name of the deprecated 'reg:linear'.
        'objective': 'reg:squarederror',
    }

    model = xgb.train(params=params, dtrain=dtrain, num_boost_round=227,
                      evals=evals, early_stopping_rounds=60, maximize=False,
                      verbose_eval=10)

    print('Modeling RMSE %.5f' % model.best_score)
    return model

In [None]:
model = train_xgb(X_train, labels)

In [None]:
def predict_xgb(model, X_test):
    """Predict trip durations for the test features.

    The model is trained on ``log(duration + 1)``, so the raw predictions are
    mapped back with ``expm1``.

    ``X_test`` is left untouched. Writing the predictions into it would add a
    ``trip_duration`` column to the caller's frame, and a second call would
    then feed that column to the model as an extra feature.

    Parameters
    ----------
    model
        Trained booster exposing ``predict``.
    X_test : pandas.DataFrame
        Test features, in the same column order used for training.

    Returns
    -------
    pandas.DataFrame
        Single ``trip_duration`` column indexed like ``X_test``.
    """
    dtest = xgb.DMatrix(X_test.values)
    log_predictions = model.predict(dtest)
    return pd.DataFrame({'trip_duration': np.expm1(log_predictions)},
                        index=X_test.index)

In [None]:
submission = predict_xgb(model, X_test)
submission.head(5)

In [None]:
def feature_importances(model, feature_names):
    """Rank features by the importance scores reported by the model.

    Parameters
    ----------
    model
        Object exposing ``get_fscore()``, which returns a mapping from
        feature key to score.
    feature_names : sequence
        Human-readable names, in the column order used for training.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature_name`` and ``importance``, sorted by descending
        importance. Features missing from the score mapping get importance 0.
    """
    scores = model.get_fscore()
    scored = pd.DataFrame({'f': list(scores.keys()),
                           'importance': list(scores.values())})

    # Assumes the score keys are positional ('f0', 'f1', ...), matching the
    # order of `feature_names`.
    positional_keys = ['f%i' % position for position in range(len(feature_names))]
    lookup = pd.DataFrame({'f': positional_keys, 'feature_name': feature_names})

    # The right join keeps every known feature; unscored ones become NaN, then 0.
    ranked = scored.merge(lookup, how='right', on='f').fillna(0)
    ranked = ranked[['feature_name', 'importance']]
    return ranked.sort_values(by='importance', ascending=False)

In [None]:
feature_names = X_train.columns.values
ft_importances = feature_importances(model, feature_names)
ft_importances

In [None]:
ft_importances.head(30)

In [None]:
submission.to_csv('trip_duration_ft_simple.csv', index=True, index_label='id')

#### Previous visualization experiments (commented out)

In [None]:
# import matplotlib.pyplot as plt
# data = trainset.iloc[0:30,:]

# fig = plt.figure(figsize=(40,10))
# ax1 = fig.add_subplot(221)

# ax1.scatter(data['pickup_longitude'],data['pickup_latitude'], s=data['passenger_count']*5, c='b', marker="s", label='first')
# ax1.scatter(data['dropoff_longitude'],data['dropoff_latitude'], s=data['passenger_count']*5, c='r', marker="o", label='second')
# plt.legend(loc='upper right');
# plt.show()

In [None]:
# import geopy.distance
# temp = []
# for i in range(0,30):
#     temp.append((data['pickup_longitude'][i],data['pickup_latitude'][i]))
# data['pick'] = temp
# data.head()
# coords_1 = (52.2296756, 21.0122287)
# coords_2 = (52.406374, 16.9251681)

# print geopy.distance.geodesic(coords_1, coords_2).km

In [None]:
# for i in range(0,30):
#     print(geopy.distance.geodesic(data['pickup_longitude'][i],data['pickup_latitude'][i]).km)