In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the available datasets are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import zipfile
import time 

from sklearn.cluster import KMeans ,AgglomerativeClustering
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import classification_report, confusion_matrix 
from sklearn.metrics import mean_squared_error as MSE

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Zipped competition files (train, test, sample submission) under ../input.
files = ["../input/nyc-taxi-trip-duration/train.zip", "../input/nyc-taxi-trip-duration/test.zip", "../input/nyc-taxi-trip-duration/sample_submission.zip"]
# Extract every archive into one local folder; read_and_clean() reads the
# train/test CSVs from this same "./nyc-taxi-trip-duration/" directory.
for file in files:
    with zipfile.ZipFile(file,"r") as zip_ref:
        zip_ref.extractall("./nyc-taxi-trip-duration/")

In [None]:
def remove_outliers(df):
    """Return only the rows of ``df`` that pass the outlier filters.

    A row is kept when ``trip_duration < 5900``, ``passenger_count > 0`` and
    ``-100 < pickup_latitude < 50``. The thresholds come from the EDA notebook
    mentioned by the original author. The input frame is not modified.
    """
    duration_ok = df['trip_duration'] < 5900
    has_passengers = df['passenger_count'] > 0
    # NOTE(review): a latitude can never be below -90, so the lower bound of
    # -100 removes nothing. It may have been meant for pickup_longitude;
    # confirm against the EDA notebook before changing it.
    latitude_ok = (df['pickup_latitude'] > -100) & (df['pickup_latitude'] < 50)

    return df[duration_ok & has_passengers & latitude_ok]

In [None]:
def encode_categorical_data(df, test):
    """One-hot encode ``store_and_fwd_flag`` and ``vendor_id`` in both frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Training frame. The two raw categorical columns are replaced by their
        indicator columns.
    test : pandas.DataFrame
        Test frame. Indicator columns are appended; the raw categorical
        columns are kept, exactly as before.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        The encoded training and test frames.

    Notes
    -----
    * ``vendor_id`` indicators are named ``vendor_id_<value>`` so that every
      column label is a string. The previous integer labels (1, 2) mixed with
      string labels are rejected by recent scikit-learn versions when a
      DataFrame is passed to ``fit``.
    * The test indicators are re-indexed to the training categories: a
      category missing from ``test`` becomes an all-zero column and a
      category seen only in ``test`` is discarded, so selecting the training
      columns from ``test`` always works.
    """
    flag_train = pd.get_dummies(df['store_and_fwd_flag'])
    flag_test = pd.get_dummies(test['store_and_fwd_flag']).reindex(
        columns=flag_train.columns, fill_value=0)
    df = pd.concat([df, flag_train], axis=1).drop(['store_and_fwd_flag'], axis=1)
    test = pd.concat([test, flag_test], axis=1)

    vendor_train = pd.get_dummies(df['vendor_id'], prefix='vendor_id')
    vendor_test = pd.get_dummies(test['vendor_id'], prefix='vendor_id').reindex(
        columns=vendor_train.columns, fill_value=0)
    df = pd.concat([df, vendor_train], axis=1).drop(['vendor_id'], axis=1)
    test = pd.concat([test, vendor_test], axis=1)

    return df, test

In [None]:
def convert_obj_to_ts(df, test):
    """Parse ``pickup_datetime`` into timestamps in both frames.

    The ``pickup_datetime`` column of ``df`` and of ``test`` is overwritten in
    place with the result of ``pd.to_datetime``. ``dropoff_datetime`` is then
    removed from the training frame only, in a new frame.

    Returns the training frame without ``dropoff_datetime`` and the same
    ``test`` object that was passed in.
    """
    for frame in (df, test):
        frame['pickup_datetime'] = pd.to_datetime(frame.pickup_datetime)

    return df.drop(columns=['dropoff_datetime']), test

In [None]:
def create_date_features(df):
    """Add calendar and time-of-day features derived from ``pickup_datetime``.

    ``df.pickup_datetime`` must already be a datetime column. The columns
    ``month``, ``week``, ``weekday``, ``hour`` and ``minute_oftheday`` are
    added, in that order. The frame is modified in place and also returned.

    ``week`` is the ISO week number. It is read through
    ``dt.isocalendar().week`` because ``Series.dt.week`` was deprecated and
    then removed from pandas.
    """
    pickup = df.pickup_datetime.dt

    df['month'] = pickup.month
    iso_week = pickup.isocalendar().week
    # isocalendar() yields a nullable UInt32 column; convert back to a plain
    # NumPy dtype (float only when missing timestamps force NaN).
    df['week'] = iso_week.astype('float64' if iso_week.isna().any() else 'int64')
    df['weekday'] = pickup.weekday
    df['hour'] = pickup.hour
    # Minutes elapsed since midnight; no temporary 'minute' column is needed.
    df['minute_oftheday'] = df['hour'] * 60 + pickup.minute

    return df

In [None]:
def ft_haversine_distance(lat1, lng1, lat2, lng2):
    """Great-circle distance between two points using the haversine formula.

    All four arguments are in degrees and may be scalars or NumPy arrays.
    The result is scaled by an Earth radius of 6371 (kilometres).
    """
    AVG_EARTH_RADIUS = 6371
    phi1, lam1, phi2, lam2 = (np.radians(angle) for angle in (lat1, lng1, lat2, lng2))

    half_dlat = (phi2 - phi1) * 0.5
    half_dlng = (lam2 - lam1) * 0.5
    hav = np.sin(half_dlat) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlng) ** 2

    return 2 * AVG_EARTH_RADIUS * np.arcsin(np.sqrt(hav))

In [None]:
def create_distance_features(df):
    """Add a ``distance`` column: haversine distance from pickup to dropoff.

    The column is computed by ``ft_haversine_distance`` from the four
    coordinate columns. ``df`` is modified in place and also returned.
    """
    coordinate_columns = (
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    )
    coordinates = [df[column].values for column in coordinate_columns]
    df['distance'] = ft_haversine_distance(*coordinates)
    return df

In [None]:
def ft_degree(lat1, lng1, lat2, lng2):
    """Initial bearing from point 1 to point 2, in degrees.

    All four arguments are in degrees and may be scalars or NumPy arrays.
    The result comes from ``arctan2`` and therefore lies in [-180, 180]:
    0 is due north, 90 due east, -90 due west.

    The unused Earth-radius constant and the unused radian conversion of the
    individual longitudes were removed; only the longitude difference and the
    two latitudes enter the formula.
    """
    lng_delta_rad = np.radians(lng2 - lng1)
    lat1, lat2 = np.radians(lat1), np.radians(lat2)
    y = np.sin(lng_delta_rad) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(lng_delta_rad)
    return np.degrees(np.arctan2(y, x))

In [None]:
def create_direction_features(df):
    """Add a ``direction`` column: bearing from pickup to dropoff in degrees.

    The column is computed by ``ft_degree`` from the four coordinate columns.
    ``df`` is modified in place and also returned.
    """
    coordinate_columns = (
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    )
    coordinates = [df[column].values for column in coordinate_columns]
    df['direction'] = ft_degree(*coordinates)
    return df

In [None]:
def data_pre_feat_engg(df):
    """Apply the distance/speed filters and split into features and target.

    Rows are kept when ``distance < 200`` and ``distance / trip_duration < 30``.
    Rows with a zero ``trip_duration`` give an infinite or NaN ratio and are
    therefore dropped by the second filter.

    Returns
    -------
    X : pandas.DataFrame
        The kept rows without the ``trip_duration`` and ``id`` columns.
    y : pandas.Series
        Natural log of ``trip_duration`` for the kept rows, named
        ``trip_duration`` and sharing ``X``'s index.

    The input frame is not modified. The filters use boolean masks rather
    than assigning a temporary ``speed`` column onto a ``query()`` result,
    which avoids pandas' chained-assignment warnings.
    """
    kept = df[df['distance'] < 200]

    # NOTE(review): the distance comes from a 6371 (km) Earth radius and
    # trip_duration is presumably in seconds, which would make this km/s and
    # the < 30 threshold almost never binding. Confirm the intended unit.
    speed = kept['distance'] / kept['trip_duration']
    kept = kept[speed < 30]

    y = pd.Series(np.log(kept['trip_duration'].values),
                  index=kept.index, name='trip_duration')
    X = kept.drop(columns=['trip_duration', 'id'])

    return X, y

In [None]:
def read_and_clean():
    """Load the extracted CSVs and run the full preprocessing pipeline.

    Steps, in order: read train/test, remove outliers from train, one-hot
    encode the categoricals, parse timestamps, add date features, drop
    ``pickup_datetime`` from train, add distance and direction features, then
    filter train and split it into features and log target.

    Returns ``(X, y, test)``: training features, training target and the
    feature-engineered test frame.
    """
    train_frame = pd.read_csv("./nyc-taxi-trip-duration/train.csv")
    test_frame = pd.read_csv("./nyc-taxi-trip-duration/test.csv")

    # Outlier removal applies to the training rows only.
    train_frame = remove_outliers(train_frame)
    train_frame, test_frame = encode_categorical_data(train_frame, test_frame)
    train_frame, test_frame = convert_obj_to_ts(train_frame, test_frame)

    train_frame = create_date_features(train_frame)
    test_frame = create_date_features(test_frame)

    # The raw timestamp is removed from the training frame only.
    train_frame = train_frame.drop(columns=['pickup_datetime'])

    for add_feature in (create_distance_features, create_direction_features):
        train_frame = add_feature(train_frame)
        test_frame = add_feature(test_frame)

    X, y = data_pre_feat_engg(train_frame)

    return X, y, test_frame

In [None]:
def get_test_train_split(X, y):
    """Split features and target into an 80/20 train/hold-out partition.

    Uses ``train_test_split`` with ``test_size=0.2`` and ``random_state=42``.
    Returns the tuple ``(X_train, X_test, y_train, y_test)``.
    """
    return tuple(train_test_split(X, y, test_size=0.2, random_state=42))

In [None]:
# Run the whole preprocessing pipeline once: training features, log target
# and the feature-engineered test frame.
X, y, test = read_and_clean()
# Training feature columns (and their order); used later to select the same
# columns from `test` before predicting.
test_columns = X.columns
# Hold-out split used by every model section below.
X_train, X_test, y_train, y_test = get_test_train_split(X, y)

In [None]:
def create_submission_df(test, preds):
    """Build the submission frame with columns ``id`` and ``trip_duration``.

    ``id`` (and the index) comes from ``test``; ``trip_duration`` is filled
    with ``preds``. ``test`` itself is left untouched.
    """
    submission = test[['id']].copy()
    submission['trip_duration'] = preds
    return submission

## KNN

In [None]:
from sklearn.neighbors import KNeighborsRegressor
# k-nearest-neighbours regressor: 20 neighbours, weighted by distance.
# Fitted on the training split; the target is log(trip_duration).
neigh = KNeighborsRegressor(n_neighbors=20, weights='distance')
neigh.fit(X_train, y_train)

In [None]:
# The model predicts log(trip_duration) (see data_pre_feat_engg), so
# exponentiate to get predictions back on the original scale.
preds = np.exp(neigh.predict(test[test_columns]))

In [None]:
# Write the KNN submission; the file name carries a minute-resolution
# timestamp so successive runs do not overwrite each other.
sub_df = create_submission_df(test, preds)
sub_df.to_csv('submission_KNN_{}.csv'.format(time.strftime("%Y%m%d%H%M")), index=False)

## Stacking

In [None]:
from sklearn.svm import LinearSVR
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
# StackingRegressor was used below without being imported (NameError on a
# fresh kernel).
from sklearn.ensemble import StackingRegressor

# Base learners: cross-validated ridge and a linear SVR. LinearSVR is seeded
# so the stack is reproducible, matching the seeded final estimator.
estimators = [('lr', RidgeCV()), ('svr', LinearSVR(random_state=42))]
# A small random forest combines the base learners' predictions.
sReg = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(n_estimators=10,random_state=42))
sReg.fit(X_train, y_train)

In [None]:
# score() on the training and hold-out splits (R^2 for scikit-learn regressors).
print(sReg.score(X_train, y_train), sReg.score(X_test, y_test))
# Hold-out RMSE, measured on log(trip_duration).
print(np.sqrt(MSE(y_test, sReg.predict(X_test))))

In [None]:
# Predictions are in log space; exponentiate back to the original scale.
preds = np.exp(sReg.predict(test[test_columns]))

# Timestamped submission file for the stacking model.
sub_df = create_submission_df(test, preds)
sub_df.to_csv('submission_sReg_{}.csv'.format(time.strftime("%Y%m%d%H%M")), index=False)

## Bagging Regressor

In [None]:
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor

# The wrapped estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn, and the old keyword has
# since been removed, while the first positional slot works in every
# version. random_state makes the bootstrap samples reproducible.
# NOTE(review): 100 kernel SVRs on the full training split is likely very
# slow; consider a subsample or LinearSVR if this cell does not finish.
bagR = BaggingRegressor(SVR(), n_estimators=100, random_state=42)
bagR.fit(X_train, y_train)

In [None]:
# score() on the training and hold-out splits (R^2 for scikit-learn regressors).
print(bagR.score(X_train, y_train), bagR.score(X_test, y_test))
# Hold-out RMSE, measured on log(trip_duration).
print(np.sqrt(MSE(y_test, bagR.predict(X_test))))

In [None]:
# Predictions are in log space; exponentiate back to the original scale.
preds = np.exp(bagR.predict(test[test_columns]))

# Timestamped submission file for the bagging model.
sub_df = create_submission_df(test, preds)
sub_df.to_csv('submission_bagR_{}.csv'.format(time.strftime("%Y%m%d%H%M")), index=False)

## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost with 100 boosting rounds and the default base learner.
# random_state is fixed so the result is reproducible, consistent with the
# seeded train/hold-out split.
ab = AdaBoostRegressor(n_estimators=100, random_state=42)
ab.fit(X_train, y_train)

In [None]:
# score() on the training and hold-out splits (R^2 for scikit-learn regressors).
print(ab.score(X_train, y_train), ab.score(X_test, y_test))
# Hold-out RMSE, measured on log(trip_duration).
print(np.sqrt(MSE(y_test, ab.predict(X_test))))

In [None]:
# Predictions are in log space; exponentiate back to the original scale.
preds = np.exp(ab.predict(test[test_columns]))

# Timestamped submission file for the AdaBoost model.
sub_df = create_submission_df(test, preds)
sub_df.to_csv('submission_ab_{}.csv'.format(time.strftime("%Y%m%d%H%M")), index=False)

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Single regression tree capped at depth 13. random_state fixes the feature
# permutation used to break ties between equally good splits, so repeated
# runs build the same tree.
dt = DecisionTreeRegressor(max_depth=13, random_state=42)
dt.fit(X_train, y_train)

In [None]:
# score() on the training and hold-out splits (R^2 for scikit-learn regressors).
print(dt.score(X_train, y_train), dt.score(X_test, y_test))
# Hold-out RMSE, measured on log(trip_duration).
print(np.sqrt(MSE(y_test, dt.predict(X_test))))

In [None]:
# Predictions are in log space; exponentiate back to the original scale.
preds = np.exp(dt.predict(test[test_columns]))

# Timestamped submission file for the decision tree.
sub_df = create_submission_df(test, preds)
sub_df.to_csv('submission_dt_max_{}.csv'.format(time.strftime("%Y%m%d%H%M")), index=False)

In [None]:
from sklearn import tree
# The fitted tree may be up to 13 levels deep (thousands of nodes), which is
# slow to render and unreadable. Draw only the top levels, which hold the
# most influential splits.
tree.plot_tree(dt, max_depth=3, filled=True)
plt.show()

## Random Forest

In [None]:
%%time
# Try RandomForest
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(verbose= 100, max_depth=10, max_features = 0.7, n_estimators=10, n_jobs=-1)
rf.fit(X_train, y_train)



In [None]:
# score() on the training and hold-out splits (R^2 for scikit-learn regressors).
print(rf.score(X_train, y_train), rf.score(X_test, y_test))
# Hold-out RMSE, measured on log(trip_duration).
print(np.sqrt(MSE(y_test, rf.predict(X_test))))

In [None]:
# Random-forest feature importances labelled by feature name, largest first.
feature_imp = pd.Series(rf.feature_importances_,index=X.columns).sort_values(ascending=False)
feature_imp

In [None]:
%matplotlib inline
# Creating a bar plot
sns.barplot(x=feature_imp, y=feature_imp.index)
# Add labels to your graph
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Visualizing Important Features")
plt.legend()
plt.show()

In [None]:
# Predictions are in log space; exponentiate back to the original scale.
preds = np.exp(rf.predict(test[test_columns]))

# Timestamped submission file for the random forest.
sub_df = create_submission_df(test, preds)
sub_df.to_csv('submission_random_max_{}.csv'.format(time.strftime("%Y%m%d%H%M")), index=False)

## Multi-layer Perceptron

In [None]:
# Three hidden layers of 64 units with early stopping. random_state fixes the
# weight initialisation and the internal validation split used by early
# stopping, so the run is reproducible.
mlp = MLPRegressor(hidden_layer_sizes=(64, 64, 64), max_iter=1000, verbose=100, early_stopping=True, random_state=42)
mlp.fit(X_train, y_train)

In [None]:
# score() on the training and hold-out splits (R^2 for scikit-learn regressors).
print(mlp.score(X_train, y_train), mlp.score(X_test, y_test))
# Hold-out RMSE, measured on log(trip_duration).
print(np.sqrt(MSE(y_test, mlp.predict(X_test))))

In [None]:
# Predictions are in log space; exponentiate back to the original scale.
preds = np.exp(mlp.predict(test[test_columns]))

# Timestamped submission file for the multi-layer perceptron.
sub_df = create_submission_df(test, preds)
sub_df.to_csv('submission_mlp_{}.csv'.format(time.strftime("%Y%m%d%H%M")), index=False)

## Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares baseline fitted on the training split
# (target: log(trip_duration)).
lp = LinearRegression()
lp.fit(X_train, y_train)

In [None]:
# score() on the training and hold-out splits (R^2 for scikit-learn regressors).
print(lp.score(X_train, y_train), lp.score(X_test, y_test))
# Hold-out RMSE, measured on log(trip_duration).
print(np.sqrt(MSE(y_test, lp.predict(X_test))))

In [None]:
# Predictions are in log space; exponentiate back to the original scale.
preds = np.exp(lp.predict(test[test_columns]))

# Timestamped submission file for the linear regression.
sub_df = create_submission_df(test, preds)
sub_df.to_csv('submission_lp_{}.csv'.format(time.strftime("%Y%m%d%H%M")), index=False)

## Linear Ridge

In [None]:
from sklearn.linear_model import Ridge
# Ridge regression with regularisation strength alpha=0.1, fitted on the
# training split (target: log(trip_duration)).
r = Ridge(alpha=0.1)
r.fit(X_train, y_train)

In [None]:
# score() on the training and hold-out splits (R^2 for scikit-learn regressors).
print(r.score(X_train, y_train), r.score(X_test, y_test))
# Hold-out RMSE, measured on log(trip_duration).
print(np.sqrt(MSE(y_test, r.predict(X_test))))

In [None]:
# Predictions are in log space; exponentiate back to the original scale.
preds = np.exp(r.predict(test[test_columns]))

# Timestamped submission file for the ridge model.
sub_df = create_submission_df(test, preds)
sub_df.to_csv('submission_r_{}.csv'.format(time.strftime("%Y%m%d%H%M")), index=False)

## Bayesian Linear Ridge

In [None]:
from sklearn.linear_model import BayesianRidge
# Bayesian ridge regression with default settings, fitted on the training
# split (target: log(trip_duration)).
br = BayesianRidge()
br.fit(X_train, y_train)

In [None]:
# score() on the training and hold-out splits (R^2 for scikit-learn regressors).
print(br.score(X_train, y_train), br.score(X_test, y_test))
# Hold-out RMSE, measured on log(trip_duration).
print(np.sqrt(MSE(y_test, br.predict(X_test))))

In [None]:
# Predictions are in log space; exponentiate back to the original scale.
preds = np.exp(br.predict(test[test_columns]))

# Timestamped submission file for the Bayesian ridge model.
sub_df = create_submission_df(test, preds)
sub_df.to_csv('submission_br_{}.csv'.format(time.strftime("%Y%m%d%H%M")), index=False)

## Deep Neural Network

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

In [None]:
# Fully connected regression network trained on log(trip_duration).
model = Sequential()
# Size the input layer from the data instead of hard-coding 16, so the cell
# keeps working if the engineered feature set changes.
model.add(Dense(16, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(1, activation='linear'))
# compile the keras model: MSE loss, MAE tracked as an extra metric
model.compile(loss='mse',  optimizer='adam', metrics=['mae'])
# fit the keras model on the dataset
model.fit(X_train, y_train, epochs=10, batch_size=32)
# evaluate the keras model on the hold-out split; with the compile settings
# above this returns [loss, mae]. The value was previously printed as
# "Accuracy" (times 100), which is not what a regression MAE is.
val_loss, mae = model.evaluate(X_test, y_test)
print('MAE (log trip_duration): %.4f' % mae)

In [None]:
# Hold-out RMSE of the network, measured on log(trip_duration).
print(np.sqrt(MSE(y_test, model.predict(X_test))))

In [None]:
# Predictions are in log space; exponentiate back to the original scale.
# The network has a single output unit, so predict() returns an (n, 1)
# array; flatten it so the submission column receives a plain 1-D vector.
preds = np.exp(model.predict(test[test_columns])).ravel()

# Timestamped submission file for the neural network.
sub_df = create_submission_df(test, preds)
sub_df.to_csv('submission_dnn_{}.csv'.format(time.strftime("%Y%m%d%H%M")), index=False)

## DNN + Random Forest

In [None]:
# Network whose 'my_dense' layer is later used as a learned feature extractor
# for a random forest. Note that this rebinds `model` from the previous
# section.
model = Sequential()
# Size the input layer from the data instead of hard-coding 16, so the cell
# keeps working if the engineered feature set changes.
model.add(Dense(16, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(12, activation='relu'))
model.add(Dense(12, activation='relu', name='my_dense'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='linear'))
# compile the keras model: MSE loss, MAE tracked as an extra metric
model.compile(loss='mse',  optimizer='adam', metrics=['mae'])
# fit the keras model on the dataset
model.fit(X_train, y_train, epochs=10, batch_size=32)
# evaluate the keras model on the hold-out split; with the compile settings
# above this returns [loss, mae]. The value was previously printed as
# "Accuracy" (times 100), which is not what a regression MAE is.
val_loss, mae = model.evaluate(X_test, y_test)
print('MAE (log trip_duration): %.4f' % mae)

In [None]:
from keras.models import Model
# Name given to the hidden layer whose activations are reused as features.
layer_name='my_dense'
# Sub-model sharing the trained network's input and ending at the output of
# the 'my_dense' layer.
intermediate_layer_model = Model(inputs=model.input,
                                 outputs=model.get_layer(layer_name).output)

intermediate_layer_model.summary()

In [None]:
# 'my_dense' activations for the training split, wrapped in a DataFrame.
intermediate_output = intermediate_layer_model.predict(X_train) 
intermediate_output = pd.DataFrame(data=intermediate_output)

# Same activations for the hold-out split.
intermediate_test_output = intermediate_layer_model.predict(X_test) 
intermediate_test_output = pd.DataFrame(data=intermediate_test_output)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forest trained on the network's 'my_dense' activations instead of
# the raw features. Note that this rebinds `rf` from the Random Forest
# section. random_state makes the forest reproducible, consistent with the
# seeded split.
rf = RandomForestRegressor(verbose=10, max_depth=10, max_features=1.0, n_estimators=10, n_jobs=-1, random_state=42)
rf.fit(intermediate_output, y_train)

In [None]:
# score() on the training and hold-out activations (R^2 for scikit-learn regressors).
print(rf.score(intermediate_output, y_train), rf.score(intermediate_test_output, y_test))
# Hold-out RMSE, measured on log(trip_duration).
print(np.sqrt(MSE(y_test, rf.predict(intermediate_test_output))))

In [None]:
# 'my_dense' activations for the competition test set, wrapped in a DataFrame.
intermediate_test = intermediate_layer_model.predict(test[test_columns]) 
intermediate_test = pd.DataFrame(data=intermediate_test)

# The forest predicts log(trip_duration); exponentiate back to the original scale.
preds = np.exp(rf.predict(intermediate_test))

# Timestamped submission file for the DNN + random forest pipeline.
sub_df = create_submission_df(test, preds)
sub_df.to_csv('submission_dnn_rf_{}.csv'.format(time.strftime("%Y%m%d%H%M")), index=False)