In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# List only the read-only input directory. Walking all of '/kaggle' would also
# print files from other directories under it, which is not what the comment
# above describes.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
%%time

path = '/kaggle/input/new-york-city-taxi-fare-prediction/'

# Explicit narrow dtypes (float32 / uint8) for every column that is loaded;
# the timestamp is read as a plain string and parsed in a later cell.
traintypes = dict(
    fare_amount='float32',
    pickup_datetime='str',
    pickup_longitude='float32',
    pickup_latitude='float32',
    dropoff_longitude='float32',
    dropoff_latitude='float32',
    passenger_count='uint8',
)

# Only the columns that have a declared dtype are read from the CSV.
cols = list(traintypes)

# For a quicker experiment, pass e.g. nrows=6 * (10 ** 6) to read_csv.
df = pd.read_csv(f'{path}train.csv', usecols=cols, dtype=traintypes)

In [None]:
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only adds a stray "None" line.
df.info()
print(df.describe())


In [None]:
def clean_df(df):
    """Return only the rows of ``df`` that describe a plausible taxi ride.

    A row is kept when all of the following hold:

    * ``fare_amount`` is strictly positive,
    * both the pickup and the dropoff point lie strictly inside the box
      -80 < longitude < -70 and 35 < latitude < 45 (a generous box around
      New York),
    * ``passenger_count`` is between 1 and 9 inclusive.

    Rows with NaN in any of these columns are dropped as well, because every
    comparison against NaN evaluates to False.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``fare_amount``, ``pickup_longitude``,
        ``pickup_latitude``, ``dropoff_longitude``, ``dropoff_latitude`` and
        ``passenger_count``.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with their original index labels.
    """
    lon_min, lon_max = -80, -70   # NY longitude
    lat_min, lat_max = 35, 45     # NY latitude

    # The dropoff point is held to the same box as the pickup point; the
    # previous dropoff limits (-380..-10 and 10..450) let invalid coordinates
    # through.
    keep = (
        (df.fare_amount > 0)
        & (df.pickup_longitude > lon_min) & (df.pickup_longitude < lon_max)
        & (df.pickup_latitude > lat_min) & (df.pickup_latitude < lat_max)
        & (df.dropoff_longitude > lon_min) & (df.dropoff_longitude < lon_max)
        & (df.dropoff_latitude > lat_min) & (df.dropoff_latitude < lat_max)
        & (df.passenger_count > 0) & (df.passenger_count <= 9)
    )
    return df[keep]


In [None]:
# Replace the raw frame with its filtered version (see clean_df for the rules).
df = df.pipe(clean_df)

In [None]:
def get_distance(lat1, lon1, lat2, lon2):
    """Great-circle (haversine) distance in kilometres between two points.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point(s), in degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point(s), in degrees.
        Arrays are combined element-wise with normal NumPy broadcasting.

    Returns
    -------
    float or numpy.ndarray
        Distance(s) in kilometres, using an Earth radius of 6373 km.
    """
    # approximate radius of earth in km
    R = 6373.0

    lat1 = np.radians(lat1)
    lon1 = np.radians(lon1)
    lat2 = np.radians(lat2)
    lon2 = np.radians(lon2)

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2)**2
    # Rounding (notably with float32 inputs) can push `a` marginally outside
    # [0, 1], which would make the square roots below return NaN.
    a = np.clip(a, 0.0, 1.0)
    # arctan2 is the two-argument arctangent the haversine formula needs.
    # np.arctan is unary: a second positional argument is taken as the `out`
    # buffer, which silently produced 2*arctan(sqrt(a)) instead.
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c
    
def dist(pickup_lat, pickup_long, dropoff_lat, dropoff_long):
    """Sum of the absolute latitude gap and the absolute longitude gap.

    This is an L1 (Manhattan-style) separation expressed in the same units as
    the inputs; no conversion to kilometres is performed. Scalars and arrays
    are both accepted and combined element-wise.
    """
    lat_gap = np.abs(dropoff_lat - pickup_lat)
    long_gap = np.abs(dropoff_long - pickup_long)
    return lat_gap + long_gap

def airport_feats(train):
    """Add landmark-distance columns to ``train`` in place and return it.

    Every new column is the ``dist`` value (absolute latitude difference plus
    absolute longitude difference) between a fixed landmark and either the
    pickup or the dropoff coordinates of each row:

    * ``distance_to_center`` - pickup point vs. the ``nyc`` reference point
    * ``pickup_distance_to_<code>`` / ``dropoff_distance_to_<code>`` for
      ``<code>`` in ``jfk``, ``ewr``, ``lgr``

    The frame must provide ``pickup_latitude``, ``pickup_longitude``,
    ``dropoff_latitude`` and ``dropoff_longitude``.
    """
    # Landmarks are stored as (longitude, latitude).
    nyc = (-74.0063889, 40.7141667)
    airports = {
        'jfk': (-73.7822222222, 40.6441666667),
        'ewr': (-74.175, 40.69),
        'lgr': (-73.87, 40.77),
    }

    pickup_lat, pickup_lon = train['pickup_latitude'], train['pickup_longitude']
    dropoff_lat, dropoff_lon = train['dropoff_latitude'], train['dropoff_longitude']

    train['distance_to_center'] = dist(nyc[1], nyc[0], pickup_lat, pickup_lon)
    # Dict order fixes the column order: pickup then dropoff for each airport.
    for code, (lon, lat) in airports.items():
        train[f'pickup_distance_to_{code}'] = dist(lat, lon, pickup_lat, pickup_lon)
        train[f'dropoff_distance_to_{code}'] = dist(lat, lon, dropoff_lat, dropoff_lon)
    return train


In [None]:
# Keep only the first 16 characters ('YYYY-MM-DD HH:MM') of the timestamp
# string so that it can be parsed with a fixed format.
pickup_text = df['pickup_datetime'].str.slice(0, 16)
df['pickup_datetime'] = pd.to_datetime(pickup_text, utc=True, format='%Y-%m-%d %H:%M')

# Calendar features derived from the parsed pickup time.
pickup_when = df['pickup_datetime'].dt
df['bsns_hour_morning'] = (pickup_when.hour >= 4) & (pickup_when.hour <= 7)  # 04:00-07:59
df['weekday'] = pickup_when.dayofweek < 5  # Monday-Friday
df['year'] = pickup_when.year

In [None]:
# (airport_feats is intentionally not applied to the training frame.)
# Pull the coordinates out as plain arrays before their columns are removed.
a_lat1 = df['pickup_latitude'].to_numpy()
a_lon1 = df['pickup_longitude'].to_numpy()
a_lat2 = df['dropoff_latitude'].to_numpy()
a_lon2 = df['dropoff_longitude'].to_numpy()

# The raw timestamp and the four coordinate columns are no longer needed once
# the derived features exist.
df = df.drop(columns=[
    'pickup_datetime',
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
])

# distance between pickup and dropoff
df['distance'] = get_distance(a_lat1, a_lon1, a_lat2, a_lon2)



In [None]:
# Feature matrix and target for the model.
# (Unused alternative: 'distance_to_center', 'dropoff_distance_to_jfk',
#  'dropoff_distance_to_ewr', 'dropoff_distance_to_lgr'.)
same_feat = [
    'bsns_hour_morning',
    'weekday',
    'year',
    'passenger_count',
    'distance',
]
X, y = df[same_feat], df['fare_amount']

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.impute import SimpleImputer

# Hold out 30% of the rows for evaluation; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Fit the imputer on the training rows only, so that statistics of the
# held-out rows do not leak into the fitted pipeline, then apply it to both.
imputer = SimpleImputer()
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# random_state fixes the bootstrap samples and feature sub-sampling so that the
# scores below are reproducible from run to run.
rf = RandomForestRegressor(max_features='sqrt', n_estimators=400, max_depth=8, random_state=42)
rf.fit(X_train, y_train)
pred_rf = rf.predict(X_train)

# Evaluation (mean squared error on the training and the held-out split)
print('- RF train:',mean_squared_error(y_train, pred_rf))
print('- RF test:',mean_squared_error(y_test, rf.predict(X_test)))



In [None]:
test = pd.read_csv(path+'test.csv')

# Same calendar features as for the training frame.
test['pickup_datetime'] = pd.to_datetime(test.pickup_datetime)
test['bsns_hour_morning'] = np.logical_and(test.pickup_datetime.dt.hour >= 4,test.pickup_datetime.dt.hour <= 7) # 04:00-07:59
test['weekday'] = test.pickup_datetime.dt.dayofweek < 5
test['year'] = test.pickup_datetime.dt.year

# Keep the row identifiers for the submission file; they are not a feature.
test_keys = test.key
X = test.drop(['key'],axis=1)

a_lat1 = np.array(X.pickup_latitude)
a_lon1 = np.array(X.pickup_longitude)
a_lat2 = np.array(X.dropoff_latitude)
a_lon2= np.array(X.dropoff_longitude)
X['distance'] = get_distance( a_lat1, a_lon1, a_lat2, a_lon2)

# Only the columns the model was trained on are used. The airport features
# were previously computed here and then immediately discarded by this
# selection, so that call has been removed.
X = X[same_feat]

X = imputer.transform(X)
rf_pred = np.round(rf.predict(X),2)

# Save outputs
rf_output = pd.DataFrame({'key': test_keys, 'fare_amount': rf_pred})
rf_output.to_csv('taxi_fare_submission.csv', index=False)