In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# train_df = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/train.csv')
# train_df.head()

In [None]:
# test_df = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/test.csv')
# test_df.head()

In [None]:
selected_cols = ['fare_amount',
                'pickup_datetime',
                'pickup_longitude',
                'pickup_latitude',
                'dropoff_longitude',
                'dropoff_latitude',
                'passenger_count']

In [None]:
dtypes = {'fare_amount': 'float32',
                'pickup_longitude': 'float32',
                'pickup_latitude': 'float32',
                'dropoff_longitude': 'float32',
                'dropoff_latitude': 'float32',
                'passenger_count': 'uint8'
}

In [None]:
train_df_part = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/train.csv', 
                           usecols=selected_cols, dtype=dtypes,
                           parse_dates=['pickup_datetime'],
                           nrows=100000)
train_df_part.head()

Load the test dataset

In [None]:
test_df = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/test.csv', 
                           dtype=dtypes,
                           parse_dates=['pickup_datetime'])
test_df.head()

# explore the dataset

In [None]:
train_df_part.info()

In [None]:
train_df_part.describe()

In [None]:
df = train_df_part
df['pickup_datetime'].min(), df['pickup_datetime'].max()

## We could limit the value ranges to those found in the test dataset

### split to train and validation

In [None]:
from sklearn.model_selection import train_test_split
# Fixed seed so the train/validation split (and every metric computed from it)
# is identical on each Restart & Run All.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

In [None]:
train_df = train_df.dropna()
val_df = val_df.dropna()

In [None]:
X_cols = ['pickup_longitude',
            'pickup_latitude',
            'dropoff_longitude',
            'dropoff_latitude',
            'passenger_count']
y_col = 'fare_amount'

In [None]:
X_train = train_df[X_cols]
y_train = train_df[y_col]

In [None]:
X_train.head()

In [None]:
y_train.head()

In [None]:
X_val = val_df[X_cols]
y_val = val_df[y_col]

In [None]:
test_df = test_df[X_cols]
test_df.head()

Train hard-coded and baseline models

In [None]:
class MeanRegressor:
    """Hard-coded baseline that always predicts the mean of the training targets."""

    def fit(self, inputs, targets):
        """Store the mean of ``targets``; ``inputs`` is accepted but not used.

        Returns ``self`` so calls can be chained like the scikit-learn
        estimators used elsewhere in this notebook
        (``MeanRegressor().fit(X, y).predict(X)``).
        """
        self.mean = targets.mean()
        return self

    def predict(self, inputs):
        """Return a 1-D array with one copy of the stored mean per input row.

        ``len`` is used instead of ``.shape[0]`` so plain lists work as well as
        DataFrames and arrays (for those, both give the row count).
        """
        return np.full(len(inputs), self.mean)

In [None]:
mean_model = MeanRegressor()

In [None]:
mean_model.fit(X_train, y_train)
mean_model.mean

In [None]:
train_pred = mean_model.predict(X_train)

In [None]:
val_pred = mean_model.predict(X_val)

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
def rmse(targets, preds):
    """Return the root mean squared error between ``targets`` and ``preds``.

    Args:
        targets: True values.
        preds: Predicted values, same length as ``targets``.

    Returns:
        The square root of scikit-learn's ``mean_squared_error``.
    """
    # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so
    # take the square root explicitly; for the 1-D targets used in this
    # notebook the result is the same.
    return np.sqrt(mean_squared_error(targets, preds))

In [None]:
train_rmse = rmse(y_train, train_pred)
train_rmse

In [None]:
val_rmse = rmse(y_val, val_pred)
val_rmse

## train and evaluate baseline model

In [None]:
from sklearn.linear_model import LinearRegression

reg = LinearRegression()
reg.fit(X_train, y_train)
train_pred = reg.predict(X_train)
val_pred = reg.predict(X_val)

In [None]:
train_rmse = rmse(y_train, train_pred)
train_rmse

In [None]:
val_rmse = rmse(y_val, val_pred)
val_rmse

In [None]:
# to submit 
test_pred = reg.predict(test_df)
sub_df = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/sample_submission.csv')
sub_df

In [None]:
sub_df['fare_amount'] = test_pred
sub_df

In [None]:
# `index` is documented as a bool; pass False explicitly to omit the row index.
sub_df.to_csv('linearReg.csv', index=False)

## Reusable functions

In [None]:
def predict_and_submit(model, test_imputs, fname):
    """Predict fares for the test inputs and write them as a submission CSV.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_imputs: Test-set features passed to ``model.predict``. (The
            parameter name is kept as-is for backward compatibility.)
        fname: Path of the CSV file to write.

    Returns:
        The submission DataFrame that was written: the sample submission with
        its ``fare_amount`` column replaced by the predictions.
    """
    test_pred = model.predict(test_imputs)
    # NOTE(review): assumes sample_submission.csv rows are in the same order as
    # the test inputs - confirm against the competition files.
    sub_df = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/sample_submission.csv')
    sub_df['fare_amount'] = test_pred
    # `index` is documented as a bool; pass False explicitly to omit the row index.
    sub_df.to_csv(fname, index=False)
    return sub_df

##  Feature Engineering

In [None]:
def add_dateparts(df, col):
    """Add calendar-part columns derived from the datetime column ``col``.

    Mutates ``df`` in place, appending ``<col>_year``, ``<col>_month``,
    ``<col>_day``, ``<col>_weekday`` and ``<col>_hour`` (in that order).
    Returns ``None``.
    """
    timestamps = df[col].dt
    for part in ("year", "month", "day", "weekday", "hour"):
        df[col + "_" + part] = getattr(timestamps, part)

In [None]:
col = 'pickup_datetime'
add_dateparts(df, col)

In [None]:
df.head()

In [None]:
test_df = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/test.csv', 
                           dtype=dtypes,
                           parse_dates=['pickup_datetime'])

In [None]:
add_dateparts(test_df, col)
test_df.head()

In [None]:
# Rebind instead of dropping in place, and ignore a missing column, so this
# cell can be re-run without raising KeyError once 'key' is already gone.
test_df = test_df.drop(columns='key', errors='ignore')

## Add distance between pickup and dropoff location

#### Use the haversine formula

In [None]:
def haversine(lon1, lat1, lon2, lat2, radius_km=6367):
    """
    Calculate the great-circle distance between two points on the earth
    (coordinates given in decimal degrees).

    Arguments may be scalars, lists, NumPy arrays or pandas Series; they are
    combined element-wise, so array-like arguments must have equal length.

    radius_km is the sphere radius the central angle is scaled by; the
    default of 6367 keeps the original results.

    Returns the distance in kilometres (scalar or array-like, matching the
    inputs).
    """
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
    # Floating-point rounding can push `a` marginally above 1 for
    # near-antipodal points, which would make arcsin return NaN; cap it.
    a = np.minimum(a, 1.0)
    c = 2 * np.arcsin(np.sqrt(a))
    km = radius_km * c
    return km

In [None]:
# Pickup and dropoff coordinate columns of the training sample.
lon1 = df['pickup_longitude']
lat1 = df['pickup_latitude']
lon2 = df['dropoff_longitude']
lat2 = df['dropoff_latitude']

# Great-circle pickup-to-dropoff distance; haversine() returns kilometres.
df['trip_distance'] = haversine(lon1, lat1, lon2, lat2)

In [None]:
# Same feature for the test set. Note this rebinds the module-level
# lon1/lat1/lon2/lat2 names to the test-set columns.
lon1 = test_df['pickup_longitude']
lat1 = test_df['pickup_latitude']
lon2 = test_df['dropoff_longitude']
lat2 = test_df['dropoff_latitude']

# Great-circle pickup-to-dropoff distance; haversine() returns kilometres.
test_df['trip_distance'] = haversine(lon1, lat1, lon2, lat2)
test_df.head()

In [None]:
# Fixed seed so the split (and every metric below) is reproducible.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)
# Explicit copies: later cells add columns to these frames in place, which on
# frames derived from `df` can trigger pandas' SettingWithCopyWarning.
train_df, val_df = train_df.copy(), val_df.copy()

In [None]:
train_df.head(4)

## add distance from popular landmarks
#### add distance from drop location to
1- JFK, LGA, EWR airports
2- Times Square
3- Met Museum
4- World Trade Center


In [None]:
# Landmark coordinates as (longitude, latitude) tuples in decimal degrees.
jfk_lonlat = -73.7781, 40.6413  # JFK airport
lga_lonlat = -73.8740, 40.7769  # LGA airport
ewr_lonlat = -74.1745, 40.6895  # EWR airport
met_lonlat = -73.9632, 40.7794  # Met Museum
wtc_lonlat = -74.0099, 40.7126  # World Trade Center

In [None]:
def add_landmark_dropoff_distance(df, landmark_name, landmark_lonlat):
    """Add ``<landmark_name>_drop_distance`` to ``df`` in place.

    The new column is the haversine() distance from the landmark, given as a
    (longitude, latitude) pair, to each row's dropoff location.
    """
    landmark_lon, landmark_lat = landmark_lonlat
    column = landmark_name + '_drop_distance'
    dropoff_lon = df['dropoff_longitude']
    dropoff_lat = df['dropoff_latitude']
    df[column] = haversine(landmark_lon, landmark_lat, dropoff_lon, dropoff_lat)

In [None]:
# Add one "<name>_drop_distance" column per landmark to each of the three
# frames (mutated in place by add_landmark_dropoff_distance).
for a_df in [train_df, val_df, test_df]:
    for name, lonlat in [('jfk', jfk_lonlat), ('lga', lga_lonlat), ('ewr', ewr_lonlat), ('met', met_lonlat), ('wtc', wtc_lonlat)]:
        add_landmark_dropoff_distance(a_df, name, lonlat)

We'll use the following ranges:

    fare_amount: 1 to 500
    longitudes: -75 to -72
    latitudes: 40 to 42
    passenger_count: 1 to 6

In [None]:
df.describe()

In [None]:
def remove_outliers(df):
    """Return the rows of ``df`` whose values all lie inside the accepted ranges.

    Bounds are inclusive at both ends; a row with a missing value in any
    checked column fails that column's comparison and is dropped.
    """
    bounds = {
        'fare_amount': (1., 500.),
        'pickup_longitude': (-75, -72),
        'dropoff_longitude': (-75, -72),
        'pickup_latitude': (40, 42),
        'dropoff_latitude': (40, 42),
        'passenger_count': (1, 6),
    }
    keep = None
    for column, (low, high) in bounds.items():
        in_range = df[column].between(low, high)
        keep = in_range if keep is None else keep & in_range
    return df[keep]

In [None]:
train_df = remove_outliers(train_df)

In [None]:
val_df = remove_outliers(val_df)

### we could scale data later

## train models

#### Ridge Regression

In [None]:
X_train = train_df.drop(columns=['fare_amount', 'pickup_datetime'])
y_train = train_df['fare_amount']

In [None]:
X_val = val_df.drop(columns=['fare_amount', 'pickup_datetime'])
y_val = val_df['fare_amount']

In [None]:
# errors='ignore' keeps this cell re-runnable: without it a second run raises
# KeyError because 'pickup_datetime' has already been dropped.
test_df = test_df.drop(columns='pickup_datetime', errors='ignore')

In [None]:
test_df.head(3)

In [None]:
def evaluate(model, train_inputs, train_targets, val_inputs, val_targets):
    """Score a fitted model on the training and validation sets.

    Returns a 4-tuple ``(train_rmse, val_rmse, train_pred, val_pred)``.
    """
    def _predict_and_score(inputs, targets):
        # Predict once, then reuse the predictions for both the score and the return value.
        preds = model.predict(inputs)
        return rmse(targets, preds), preds

    train_rmse, train_pred = _predict_and_score(train_inputs, train_targets)
    val_rmse, val_pred = _predict_and_score(val_inputs, val_targets)
    return train_rmse, val_rmse, train_pred, val_pred

In [None]:
from sklearn.linear_model import Ridge

In [None]:
model1 = Ridge(alpha=0.9)
model1.fit(X_train, y_train)

In [None]:
evaluate(model1, X_train, y_train, X_val, y_val)

In [None]:
predict_and_submit(model1, test_df, 'Ridge.csv')

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
model2 = RandomForestRegressor(max_depth=10, n_jobs=-1, random_state=42, n_estimators=50)

In [None]:
model2.fit(X_train, y_train)

In [None]:
evaluate(model2, X_train, y_train, X_val, y_val)

In [None]:
predict_and_submit(model2, test_df, 'rf_submission.csv')

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
model2

In [None]:
# Hyper-parameter grid searched for the random forest.
parameters = {'n_estimators': [50, 80], 'max_depth': [10, 20]}
# Seed the forest so the search is reproducible; n_jobs=-1 matches model2.
model_2 = RandomForestRegressor(random_state=42, n_jobs=-1)
# Select by RMSE, the metric this notebook reports, rather than the default
# regressor score (R^2). The scorer is negated so that larger is better.
model2_GS = GridSearchCV(model_2, parameters, scoring='neg_root_mean_squared_error')

In [None]:
model2_GS.fit(X_train, y_train)

In [None]:
model2_GS.best_params_

In [None]:
evaluate(model2_GS, X_train, y_train, X_val, y_val)

In [None]:
# Submit the tuned grid-search model; passing `model2` here would just rewrite
# the earlier random-forest predictions under a new file name.
predict_and_submit(model2_GS, test_df, 'rf_gridsearch.csv')