# Kaggle: New York City Taxi Fare Prediction
> the competition's link can be found __[here](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction)__

### Helpful articles: 
- __[modeling](https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6)__
- __[markdown](https://medium.com/ibm-data-science-experience/markdown-for-jupyter-notebooks-cheatsheet-386c05aeebed)__

In [29]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2, f_regression

In [89]:
train = pd.read_csv("train.csv")

In [90]:
train = train[:1000000]

In [118]:
test = pd.read_csv("test.csv")

In [119]:
test.shape

(9914, 7)

In [6]:
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
key                  1000000 non-null object
fare_amount          1000000 non-null float64
pickup_datetime      1000000 non-null object
pickup_longitude     1000000 non-null float64
pickup_latitude      1000000 non-null float64
dropoff_longitude    999990 non-null float64
dropoff_latitude     999990 non-null float64
passenger_count      1000000 non-null int64
dtypes: float64(5), int64(1), object(2)
memory usage: 61.0+ MB


In [91]:
train.drop(columns=['key'], inplace=True)

In [92]:
train['pickup_datetime'].fillna("0000-00-00 00:00:00 UTC", inplace=True)
#train.fillna(-999, inplace=True)
train.dropna(inplace=True)

In [10]:
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [11]:
train.dtypes

fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 7 columns):
fare_amount          1000000 non-null float64
pickup_datetime      1000000 non-null object
pickup_longitude     1000000 non-null float64
pickup_latitude      1000000 non-null float64
dropoff_longitude    1000000 non-null float64
dropoff_latitude     1000000 non-null float64
passenger_count      1000000 non-null int64
dtypes: float64(5), int64(1), object(1)
memory usage: 53.4+ MB


In [93]:
train['pickup_datetime'] = train['pickup_datetime'].map(lambda x: str(x)[:-4])

In [14]:
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1
1,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2
3,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1
4,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1


In [94]:
train['date'] = train['pickup_datetime'].map(lambda x: str(x)[:10])
train['time'] = train['pickup_datetime'].map(lambda x: str(x)[-8:])
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,date,time
0,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1,2009-06-15,17:26:21
1,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1,2010-01-05,16:52:16
2,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2,2011-08-18,00:35:00
3,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1,2012-04-21,04:30:42
4,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1,2010-03-09,07:51:00


In [95]:
train['year'] = pd.DatetimeIndex(train['date']).year
train['month'] = pd.DatetimeIndex(train['date']).month
train['day'] = pd.DatetimeIndex(train['date']).day
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,date,time,year,month,day
0,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1,2009-06-15,17:26:21,2009,6,15
1,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1,2010-01-05,16:52:16,2010,1,5
2,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2,2011-08-18,00:35:00,2011,8,18
3,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1,2012-04-21,04:30:42,2012,4,21
4,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1,2010-03-09,07:51:00,2010,3,9


In [96]:
train['date'] = pd.to_datetime(train['date'])
train['day_of_week'] = train['date'].dt.weekday_name
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,date,time,year,month,day,day_of_week
0,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1,2009-06-15,17:26:21,2009,6,15,Monday
1,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1,2010-01-05,16:52:16,2010,1,5,Tuesday
2,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2,2011-08-18,00:35:00,2011,8,18,Thursday
3,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1,2012-04-21,04:30:42,2012,4,21,Saturday
4,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1,2010-03-09,07:51:00,2010,3,9,Tuesday


In [97]:
train['hour'] = train['time'].map(lambda x: str(x)[:2])
train.drop(columns=['pickup_datetime', 'date', 'time'], inplace=True)
train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,day_of_week,hour
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1,2009,6,15,Monday,17
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,Tuesday,16
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2,2011,8,18,Thursday,0
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1,2012,4,21,Saturday,4
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,Tuesday,7


In [98]:
train['day_of_week'] = pd.factorize(train['day_of_week'])[0] + 1
train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,day_of_week,hour
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1,2009,6,15,1,17
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,2,16
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2,2011,8,18,3,0
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1,2012,4,21,4,4
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,2,7


### calculating distance

In [99]:
import math

def distance(df):
    p = 0.017453292519943295     
    a = 0.5 - np.cos((df.dropoff_latitude - df.pickup_latitude) * p)/2 + np.cos(df.pickup_latitude * p) * np.cos(df.dropoff_latitude * p) * (1 - np.cos((df.dropoff_longitude - df.pickup_longitude) * p)) / 2
    
    df['distance'] = 0.6213712 * 12742 * np.arcsin(np.sqrt(a))

In [100]:
distance(train)
train.head()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,year,month,day,day_of_week,hour,distance
0,4.5,-73.844311,40.721319,-73.84161,40.712278,1,2009,6,15,1,17,0.640487
1,16.9,-74.016048,40.711303,-73.979268,40.782004,1,2010,1,5,2,16,5.25067
2,5.7,-73.982738,40.76127,-73.991242,40.750562,2,2011,8,18,3,0,0.863411
3,7.7,-73.98713,40.733143,-73.991567,40.758092,1,2012,4,21,4,4,1.739386
4,5.3,-73.968095,40.768008,-73.956655,40.783762,1,2010,3,9,2,7,1.242218


In [101]:
mask = train['pickup_longitude'].between(-75, -73)
mask &= train['dropoff_longitude'].between(-75, -73)
mask &= train['pickup_latitude'].between(40, 42)
mask &= train['dropoff_latitude'].between(40, 42)
mask &= train['passenger_count'].between(0, 8)
mask &= train['fare_amount'].between(0, 250)

train = train[mask]

### model

In [102]:
y = train['fare_amount']

In [103]:
data = train.drop(columns=['fare_amount'])

In [104]:
# feature extraction
test = SelectKBest(score_func=f_regression, k=4)
fit = test.fit(data, y)
# summarize scores
np.set_printoptions(precision=3)
print(fit.scores_)

[1.619e+05 3.523e+04 8.827e+04 2.419e+04 2.012e+02 1.358e+04 6.291e+02
 2.012e+00 1.764e+01 3.650e+02 1.994e+06]


In [105]:
data = train.drop(columns=['passenger_count', 'month', 'day', 'day_of_week', 'hour', 'fare_amount'])

In [106]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2)

In [107]:
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

In [108]:
predictions

array([ 9.649,  7.08 , 11.577, ...,  9.168,  7.461, 17.595])

In [109]:
np.sqrt(mean_squared_error(y_test, predictions))

5.331852705363492

### Preparing test data to produce submission

In [75]:
test.head()

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,year,distance
0,-73.97332,40.763805,-73.98143,40.743835,2015,1.443607
1,-73.986862,40.719383,-73.998886,40.739201,2015,1.507044
2,-73.982524,40.75126,-73.979654,40.746139,2011,0.384398
3,-73.98116,40.767807,-73.990448,40.751635,2012,1.218529
4,-73.966046,40.789775,-73.988565,40.744427,2012,3.347514


In [120]:
test_key_col = test['key']
test['pickup_datetime'].fillna("0000-00-00 00:00:00 UTC", inplace=True)
test['pickup_datetime'] = test['pickup_datetime'].map(lambda x: str(x)[:-4])
test['date'] = test['pickup_datetime'].map(lambda x: str(x)[:10])
test['time'] = test['pickup_datetime'].map(lambda x: str(x)[-8:])
test['year'] = pd.DatetimeIndex(test['date']).year
test['month'] = pd.DatetimeIndex(test['date']).month
test['day'] = pd.DatetimeIndex(test['date']).day
test['date'] = pd.to_datetime(test['date'])
test['day_of_week'] = test['date'].dt.weekday_name
test['hour'] = test['time'].map(lambda x: str(x)[:2])
test.drop(columns=['pickup_datetime', 'date', 'time'], inplace=True)
test['day_of_week'] = pd.factorize(test['day_of_week'])[0] + 1
distance(test)

# mask = test['pickup_longitude'].between(-75, -73)
# mask &= test['dropoff_longitude'].between(-75, -73)
# mask &= test['pickup_latitude'].between(40, 42)
# mask &= test['dropoff_latitude'].between(40, 42)
# mask &= test['passenger_count'].between(0, 8)
# test = test[mask]

test.drop(columns=['passenger_count', 'month', 'day', 'day_of_week', 'hour', 'key'], inplace=True)

In [121]:
test.shape

(9914, 6)

In [122]:
test_pred = lm.predict(test)
test_pred

array([10.835, 11.668,  5.551, ..., 47.352, 22.746,  8.253])

In [130]:
submission = pd.DataFrame(
    {'key': test_key_col
    })
submission['fare_amount'] = test_pred
submission.head()

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.834996
1,2015-01-27 13:08:24.0000003,11.667643
2,2011-10-08 11:53:44.0000002,5.551013
3,2012-12-01 21:12:12.0000002,8.212259
4,2012-12-01 21:12:12.0000003,14.76663


In [132]:
submission.to_csv("submission.csv", index=False)