In [None]:
import pandas as pd
import numpy as np
from datetime import timedelta
import datetime as dt

def remove_outliers(df):
    """Drop implausible training rows and log-transform the target.

    A row is kept only when ``trip_duration < 5900``, ``passenger_count > 0``
    and ``-100 < pickup_latitude < 50`` (thresholds chosen in the EDA
    notebook). ``trip_duration`` is then replaced by its natural logarithm, so
    everything downstream sees the target in log space.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame with ``trip_duration``, ``passenger_count`` and
        ``pickup_latitude`` columns.

    Returns
    -------
    pandas.DataFrame
        A new, filtered frame; the frame passed in is not modified.
    """
    # NOTE(review): valid latitudes never go below -90, so the ``> -100`` bound
    # looks like it can only catch corrupt values -- confirm whether a
    # longitude filter was intended.
    keep = (
        (df['trip_duration'] < 5900)
        & (df['passenger_count'] > 0)
        & (df['pickup_latitude'] > -100)
        & (df['pickup_latitude'] < 50)
    )
    filtered = df.loc[keep]

    # ``assign`` returns a fresh frame, so the transformed column is never
    # written into a filtered slice (the in-place assignment used before
    # triggered pandas' SettingWithCopyWarning).
    return filtered.assign(trip_duration=np.log(filtered['trip_duration'].values))

def encode_categorical_data(df, test):
    """One-hot encode ``store_and_fwd_flag`` and ``vendor_id`` in both frames.

    For each of the two columns the indicator columns produced by
    ``pd.get_dummies`` are appended to the frame and the source column is
    removed. Indicator columns are named after the category values
    themselves, exactly as ``pd.get_dummies`` returns them.

    The test frame is encoded against the *training* categories: a category
    that is absent from ``test`` still gets an all-zero column, and a category
    seen only in ``test`` is discarded. This keeps the two frames'
    indicator columns identical, so selecting the training feature columns
    from the test frame cannot fail on a missing indicator.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame.
    test : pandas.DataFrame
        Test frame.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df, test)`` as new frames; the inputs are not modified.
    """
    for column in ('store_and_fwd_flag', 'vendor_id'):
        train_dummies = pd.get_dummies(df[column])
        # Align on the training categories so both frames expose the same
        # indicator columns regardless of which values occur in ``test``.
        test_dummies = pd.get_dummies(test[column]).reindex(
            columns=train_dummies.columns, fill_value=0
        )
        # Drop the raw categorical from BOTH frames (it used to linger in
        # ``test`` only).
        df = pd.concat([df.drop(columns=[column]), train_dummies], axis=1)
        test = pd.concat([test.drop(columns=[column]), test_dummies], axis=1)

    return df, test

def convert_obj_to_ts(df, test):
    """Parse ``pickup_datetime`` into pandas timestamps in both frames.

    The parsed column is written back into the frames that were passed in.
    ``dropoff_datetime`` is then removed from the training frame only; the
    test frame is returned as the same object that came in.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df without dropoff_datetime, test)``.
    """
    for frame in (df, test):
        frame['pickup_datetime'] = pd.to_datetime(frame['pickup_datetime'])

    train_without_dropoff = df.drop(columns=['dropoff_datetime'])
    return train_without_dropoff, test

def create_date_features(df):
    """Add calendar features derived from ``pickup_datetime``.

    Columns added (in this order): ``month``, ``week`` (ISO week number),
    ``weekday``, ``hour`` and ``minute_oftheday`` (``hour * 60 + minute``).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ``pickup_datetime`` column already has a datetime dtype.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in, with the new columns added.
    """
    pickup = df['pickup_datetime']

    df['month'] = pickup.dt.month

    # ``Series.dt.week`` was removed in pandas 2.0; ``isocalendar().week`` is
    # its replacement. It returns a nullable UInt32 column, so cast back to a
    # plain NumPy dtype: int64 normally, float64 (NaN) when NaT rows are present.
    iso_week = pickup.dt.isocalendar().week
    df['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')

    df['weekday'] = pickup.dt.weekday
    df['hour'] = pickup.dt.hour
    # Computed directly from the accessor; no temporary ``minute`` column is
    # created and dropped any more.
    df['minute_oftheday'] = df['hour'] * 60 + pickup.dt.minute

    return df

def ft_haversine_distance(lat1, lng1, lat2, lng2):
    """Great-circle distance between two points using the haversine formula.

    All four coordinates are given in degrees and may be scalars or NumPy
    arrays (the computation is element-wise). The sphere radius used is 6371,
    so the result is expressed in the same unit as that radius (kilometres
    for the mean Earth radius).
    """
    earth_radius = 6371

    phi1 = np.radians(lat1)
    lam1 = np.radians(lng1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(lng2)

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    # Haversine of the central angle between the two points.
    hav = (np.sin(delta_phi * 0.5) ** 2
           + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lam * 0.5) ** 2)
    return 2 * earth_radius * np.arcsin(np.sqrt(hav))

def create_distance_features(df):
    """Add a ``distance`` column: haversine distance from pickup to dropoff.

    The column is written into ``df`` itself, and that same frame is returned.
    """
    pickup_lat, pickup_lng, dropoff_lat, dropoff_lng = (
        df[name].values
        for name in ('pickup_latitude', 'pickup_longitude',
                     'dropoff_latitude', 'dropoff_longitude')
    )
    df['distance'] = ft_haversine_distance(pickup_lat, pickup_lng, dropoff_lat, dropoff_lng)
    return df

def ft_degree(lat1, lng1, lat2, lng2):
    """Initial bearing from point 1 to point 2, in degrees.

    Inputs are latitudes/longitudes in degrees (scalars or NumPy arrays,
    evaluated element-wise). The result comes from ``arctan2`` and therefore
    lies in ``[-180, 180]``: 0 points north, 90 east, -90 west.
    """
    # Only the longitude *difference* and the two latitudes enter the formula,
    # so the unused Earth-radius constant and the radian conversions of the
    # individual longitudes have been dropped.
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1_rad, lat2_rad = np.radians(lat1), np.radians(lat2)

    y = np.sin(lng_delta_rad) * np.cos(lat2_rad)
    x = (np.cos(lat1_rad) * np.sin(lat2_rad)
         - np.sin(lat1_rad) * np.cos(lat2_rad) * np.cos(lng_delta_rad))
    return np.degrees(np.arctan2(y, x))

def create_direction_features(df):
    """Add a ``direction`` column: bearing (degrees) from pickup to dropoff.

    The column is written into ``df`` itself, and that same frame is returned.
    """
    pickup_lat, pickup_lng, dropoff_lat, dropoff_lng = (
        df[name].values
        for name in ('pickup_latitude', 'pickup_longitude',
                     'dropoff_latitude', 'dropoff_longitude')
    )
    df['direction'] = ft_degree(pickup_lat, pickup_lng, dropoff_lat, dropoff_lng)
    return df

def data_pre_feat_engg(df):
    """Apply the final distance/speed filters and split features from target.

    Rows are kept when ``distance < 200`` and
    ``distance / trip_duration < 30``. Rows whose ratio is NaN or infinite
    (e.g. a zero ``trip_duration``) fail the comparison and are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least ``id``, ``distance`` and ``trip_duration``.

    Returns
    -------
    X : pandas.DataFrame
        Remaining rows without the ``trip_duration`` and ``id`` columns.
    y : pandas.Series
        ``trip_duration`` of the remaining rows.
    """
    # NOTE(review): in this notebook ``main`` calls ``remove_outliers`` first,
    # which replaces ``trip_duration`` by its logarithm, so this ratio is
    # distance / log(duration) rather than a physical speed -- confirm that the
    # threshold of 30 was meant for that quantity.
    speed = df['distance'] / df['trip_duration']

    # A boolean mask replaces the temporary ``speed`` column that used to be
    # written into a filtered slice (SettingWithCopyWarning) and then dropped.
    keep = (df['distance'] < 200) & (speed < 30)
    kept = df.loc[keep]

    y = kept['trip_duration']
    X = kept.drop(columns=['trip_duration', 'id'])

    return X, y


def main():
    """Load the raw data and run the complete preprocessing pipeline.

    Reads the train/test archives and the OSRM fastest-route files from the
    fixed ``../input/...`` paths, applies the cleaning and feature steps
    defined above, left-joins the route information on ``id`` and finally
    splits the training frame into features and target.

    Returns
    -------
    X : pandas.DataFrame
        Training features.
    y : pandas.Series
        Training target (``trip_duration`` as transformed by
        ``remove_outliers``).
    test : pandas.DataFrame
        Test frame with the engineered and route columns added.
    """
    route_columns = ['id', 'total_distance', 'total_travel_time', 'number_of_steps']

    train = pd.read_csv('../input/nyc-taxi-trip-duration/train.zip')
    test = pd.read_csv('../input/nyc-taxi-trip-duration/test.zip')

    # Cleaning and encoding.
    train = remove_outliers(train)
    train, test = encode_categorical_data(train, test)
    train, test = convert_obj_to_ts(train, test)

    # Calendar features; the raw timestamp is then removed from train only.
    train = create_date_features(train)
    test = create_date_features(test)
    train = train.drop(columns=['pickup_datetime'])

    # Geometry features.
    train = create_direction_features(create_distance_features(train))
    test = create_direction_features(create_distance_features(test))

    # OSRM fastest-route information (train is shipped in two parts).
    train_street_info = pd.concat([
        pd.read_csv(path, usecols=route_columns)
        for path in (
            '../input/new-york-city-taxi-with-osrm/fastest_routes_train_part_1.csv',
            '../input/new-york-city-taxi-with-osrm/fastest_routes_train_part_2.csv',
        )
    ])
    test_street_info = pd.read_csv(
        '../input/new-york-city-taxi-with-osrm/fastest_routes_test.csv',
        usecols=route_columns,
    )

    train = train.merge(train_street_info, how='left', on='id')
    test = test.merge(test_street_info, how='left', on='id')

    X, y = data_pre_feat_engg(train)
    return X, y, test

In [None]:
import json
import time
import pandas as pd
import numpy as np
import lightgbm as lgb
import xgboost as xgb
# import catboost as ctb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
# from data_preprocessing import main

# with open('lightgbm_config.json') as json_file:
#     lgb_config = json.load(json_file)

# LightGBM parameters, passed to ``lgb.train`` by ``lgb_train_loop``.
# NOTE(review): LightGBM's parameter docs state that ``bagging_fraction`` only
# takes effect when ``bagging_freq`` is > 0; no ``bagging_freq`` is set here,
# so the 0.5 row subsampling is presumably inactive -- confirm the intent.
lgb_config = dict(
    learning_rate=0.1,
    max_depth=30,
    num_leaves=1000,
    objective="regression",
    feature_fraction=0.8,
    bagging_fraction=0.5,
    max_bin=2000,
    verbose=2,
)

# XGBoost parameters, passed to ``xgb.train`` by ``xgb_train_loop``.
xgb_config = dict(
    booster="gbtree",
    objective="reg:squarederror",
    learning_rate=0.07,
    max_depth=16,
    subsample=0.9,
    colsample_bytree=0.7,
    colsample_bylevel=0.7,
    verbosity=2,
)

# with open('xgboost_config.json') as json_file:
#     xgb_config = json.load(json_file)

def get_test_train_split(X, y):
    """Hold out 20% of the rows with a fixed seed (``random_state=42``).

    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    return tuple(train_test_split(X, y, test_size=0.2, random_state=42))

def lgb_train_loop(X, y):
    """Train a LightGBM booster on ``(X, y)`` and return it.

    Uses the module-level ``lgb_config`` parameters and 1500 boosting rounds.
    """
    train_set = lgb.Dataset(X, label=y)
    return lgb.train(lgb_config, train_set, num_boost_round=1500)

def xgb_train_loop(X, y):
    """Train an XGBoost booster on ``(X, y)`` and return it.

    Uses the module-level ``xgb_config`` parameters and 200 boosting rounds.
    """
    train_matrix = xgb.DMatrix(X, label=y)
    return xgb.train(xgb_config, train_matrix, num_boost_round=200)

def get_ensemble_predictions(lgb_model, xgb_model, test, test_columns, lgb_weight=0.6):
    """Blend LightGBM and XGBoost predictions on the original target scale.

    Both models are expected to predict the natural log of the target, so each
    prediction is passed through ``np.exp`` before the weighted average is
    taken.

    NOTE(review): this function is defined a second time in a later cell of
    the notebook; whichever cell runs last wins. Keep the two in sync or
    delete one of them.

    Parameters
    ----------
    lgb_model, xgb_model
        Trained boosters exposing ``predict``.
    test : pandas.DataFrame
        Frame to score.
    test_columns : sequence
        Feature columns to select from ``test``, in training order.
    lgb_weight : float, default 0.6
        Weight of the LightGBM prediction; XGBoost gets ``1 - lgb_weight``.
        The default reproduces the previous fixed 0.6 / 0.4 blend.

    Returns
    -------
    numpy.ndarray
        Blended predictions, one per row of ``test``.

    Raises
    ------
    ValueError
        If ``lgb_weight`` lies outside ``[0, 1]``.
    """
    if not 0.0 <= lgb_weight <= 1.0:
        raise ValueError('lgb_weight must be between 0 and 1, got {!r}'.format(lgb_weight))

    features = test[test_columns]
    xgb_preds = xgb_model.predict(xgb.DMatrix(features))
    lgb_preds = lgb_model.predict(features)

    # Undo the log transform applied to the training target.
    pred_xgb = np.exp(xgb_preds)
    pred_lgb = np.exp(lgb_preds)

    return lgb_weight * pred_lgb + (1.0 - lgb_weight) * pred_xgb

def create_submission_df(test, ensemble_preds):
    """Build the two-column submission frame (``id``, ``trip_duration``).

    ``id`` (and the row index) come from ``test``; ``ensemble_preds`` supplies
    the ``trip_duration`` values. ``test`` itself is not modified.
    """
    submission = test[['id']].copy()
    submission['trip_duration'] = ensemble_preds
    return submission

if __name__ == '__main__':

    # Run the preprocessing pipeline: training features, target and test frame.
    X, y, test = main()

    # Feature columns in training order; used later to select the same
    # columns from the test frame.
    test_columns = X.columns

    # 80/20 hold-out split of the training data.
    X_train, X_test, y_train, y_test = get_test_train_split(X, y)

In [None]:
# Fit both boosters on the full training set (X, y), LightGBM first.
lgb_model, xgb_model = lgb_train_loop(X, y), xgb_train_loop(X, y)

In [None]:
def get_ensemble_predictions(lgb_model, xgb_model, test, test_columns, lgb_weight=0.6):
    """Blend LightGBM and XGBoost predictions on the original target scale.

    Both models are expected to predict the natural log of the target, so each
    prediction is passed through ``np.exp`` before the weighted average is
    taken.

    NOTE(review): this function is also defined in an earlier cell of the
    notebook; this later definition shadows it. Keep the two in sync or delete
    one of them.

    Parameters
    ----------
    lgb_model, xgb_model
        Trained boosters exposing ``predict``.
    test : pandas.DataFrame
        Frame to score.
    test_columns : sequence
        Feature columns to select from ``test``, in training order.
    lgb_weight : float, default 0.6
        Weight of the LightGBM prediction; XGBoost gets ``1 - lgb_weight``.
        The default reproduces the previous fixed 0.6 / 0.4 blend, and
        ``lgb_weight=1.0`` gives the LightGBM-only prediction.

    Returns
    -------
    numpy.ndarray
        Blended predictions, one per row of ``test``.

    Raises
    ------
    ValueError
        If ``lgb_weight`` lies outside ``[0, 1]``.
    """
    if not 0.0 <= lgb_weight <= 1.0:
        raise ValueError('lgb_weight must be between 0 and 1, got {!r}'.format(lgb_weight))

    features = test[test_columns]
    xgb_preds = xgb_model.predict(xgb.DMatrix(features))
    lgb_preds = lgb_model.predict(features)

    # Undo the log transform applied to the training target.
    pred_xgb = np.exp(xgb_preds)
    pred_lgb = np.exp(lgb_preds)

    return lgb_weight * pred_lgb + (1.0 - lgb_weight) * pred_xgb

In [None]:
# Blend the two boosters' predictions on the test set and write a
# timestamped submission file.
ensemble_preds = get_ensemble_predictions(lgb_model, xgb_model, test, test_columns)
sub_df = create_submission_df(test, ensemble_preds)
sub_df.to_csv(
    'submission_final_ensemble_wa_{}.csv'.format(time.strftime("%Y%m%d%H%M")),
    index=False,
)

In [None]:
# SHAP summary plot for the LightGBM booster trained above.
#
# On a top-to-bottom run neither `shap` nor `model` is bound yet at this point
# (both only appear in later cells), so the original cell raised NameError.
# `shap` is therefore imported here and the explainer is pointed at
# `lgb_model`, one of the two models that already exist at this point.
# NOTE(review): confirm which model this plot was meant to explain.
import shap

explainer = shap.TreeExplainer(lgb_model)
shap_values = explainer.shap_values(test[test_columns])
shap.summary_plot(shap_values, test[test_columns], feature_names = X.columns)

In [None]:
import catboost as ctb

In [None]:
from catboost import CatBoostRegressor

In [None]:
# CatBoost regressor with default settings and a fixed seed.
# NOTE(review): the following cell rebinds `model` before anything is fitted,
# so on a sequential run this instance is never used.
model = CatBoostRegressor(random_seed=42)

In [None]:
# CatBoost regressor with RMSE as evaluation metric and a fixed seed; this is
# the instance that gets fitted in the next cell.
model = CatBoostRegressor(eval_metric='RMSE',random_seed=42)

In [None]:
# Fit on the complete training set (X, y); the hold-out split is not used here.
model.fit(X, y, verbose=2)

In [None]:
# CatBoost predicts the log target; map back with exp, then write a
# timestamped submission via the shared helper.
preds = np.exp(model.predict(test[test_columns]))
sub_df = create_submission_df(test, preds)
sub_df.to_csv(
    'submission_final_catboost_{}.csv'.format(time.strftime("%Y%m%d%H%M")),
    index=False,
)

In [None]:
import catboost

In [None]:
# Wrap the train / hold-out parts of the split in CatBoost Pool objects.
train_dataset = catboost.Pool(data=X_train, label=y_train)
test_dataset = catboost.Pool(data=X_test, label=y_test)

In [None]:
# Fresh, unfitted CatBoost regressor with RMSE loss for the grid search below.
# NOTE(review): this rebinds `model`, so the regressor fitted on the full data
# in an earlier cell is no longer reachable under that name.
model = catboost.CatBoostRegressor(loss_function='RMSE')

In [None]:
# Hyper-parameter grid: 2 depths x 3 L2 regularisation values, with the
# iteration count and learning rate held fixed.
grid = dict(
    iterations=[1000],
    learning_rate=[0.03],
    depth=[8, 10],
    l2_leaf_reg=[0.2, 0.5, 1],
)
model.grid_search(grid, train_dataset)

In [None]:
# Score the grid-searched model on the hold-out split. `y_test` is the
# log-transformed trip duration, so this RMSE is measured in log space.
pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, pred))
# Show the metric: it used to be computed but never displayed.
rmse

In [None]:
import shap


In [None]:
# SHAP summary plot for `model` (the CatBoost regressor from the cells above),
# computed on the test features.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(test[test_columns])
shap.summary_plot(
    shap_values,
    test[test_columns],
    feature_names=X.columns,
)

In [None]:
# XGBoost predictions on the original (non-log) scale.
# The previous version exponentiated `preds`, which at this point holds the
# CatBoost predictions that were ALREADY passed through np.exp in an earlier
# cell -- i.e. a double exponentiation of the wrong model's output.
# NOTE(review): confirm that the XGBoost model was the intended source.
pred_xgb = np.exp(xgb_model.predict(xgb.DMatrix(test[test_columns])))

In [None]:
# Display the array computed in the previous cell.
pred_xgb

In [None]:
# Inspect the column dtypes of the training feature frame.
X.dtypes

In [None]:
# Inspect the dtype of the training target.
y.dtypes

In [None]:
# Inspect the column dtypes of the test frame.
test.dtypes

In [None]:
# Re-blend the two boosters with a 0.7 / 0.3 weighting (LightGBM / XGBoost).
# Both models predict the log target, hence the exp before averaging.
lgb_preds = lgb_model.predict(test[test_columns])
xgb_preds = xgb_model.predict(xgb.DMatrix(test[test_columns]))

pred_lgb, pred_xgb = np.exp(lgb_preds), np.exp(xgb_preds)

ensemble_preds = 0.7 * pred_lgb + 0.3 * pred_xgb

In [None]:
# Submission frame (`id`, `trip_duration`) for the re-blended predictions,
# built with the shared helper instead of repeating its three lines.
sub_df = create_submission_df(test, ensemble_preds)

In [None]:
# Write the submission with a minute-resolution timestamp in the file name.
sub_df.to_csv(
    'submission_{}.csv'.format(time.strftime("%Y%m%d%H%M")),
    index=False,
)