# Kaggle: New York City Taxi Fare Prediction
> the competition's link can be found __[here](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction)__

### Helpful articles: 
- __[modeling](https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6)__
- __[markdown](https://medium.com/ibm-data-science-experience/markdown-for-jupyter-notebooks-cheatsheet-386c05aeebed)__

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import datetime

In [31]:
train = pd.read_csv("train.csv")

In [29]:
test = pd.read_csv("test.csv")

In [3]:
train.head()

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2009-06-15 17:26:21.0000001,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,2010-01-05 16:52:16.0000002,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,2011-08-18 00:35:00.00000049,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,2012-04-21 04:30:42.0000001,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,2010-03-09 07:51:00.000000135,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [None]:
train.info()

In [32]:
train.drop(columns=['key'], inplace=True)

In [34]:
train['pickup_datetime'].fillna("0000-00-00 00:00:00 UTC", inplace=True)
train.fillna(-9999, inplace=True)

In [35]:
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21 UTC,-73.844311,40.721319,-73.84161,40.712278,1
1,16.9,2010-01-05 16:52:16 UTC,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,2011-08-18 00:35:00 UTC,-73.982738,40.76127,-73.991242,40.750562,2
3,7.7,2012-04-21 04:30:42 UTC,-73.98713,40.733143,-73.991567,40.758092,1
4,5.3,2010-03-09 07:51:00 UTC,-73.968095,40.768008,-73.956655,40.783762,1


In [7]:
train.dtypes

fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55423480 entries, 0 to 55423855
Data columns (total 7 columns):
fare_amount          float64
pickup_datetime      object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count      int64
dtypes: float64(5), int64(1), object(1)
memory usage: 3.3+ GB


In [36]:
train['pickup_datetime'] = train['pickup_datetime'].map(lambda x: str(x)[:-4])

In [37]:
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1
1,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1
2,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2
3,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1
4,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1


In [38]:
train['date'] = train['pickup_datetime'].map(lambda x: str(x)[:10])
train['time'] = train['pickup_datetime'].map(lambda x: str(x)[-8:])
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,date,time
0,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1,2009-06-15,17:26:21
1,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1,2010-01-05,16:52:16
2,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2,2011-08-18,00:35:00
3,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1,2012-04-21,04:30:42
4,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1,2010-03-09,07:51:00


In [39]:
train['year'] = pd.DatetimeIndex(train['date']).year
train['month'] = pd.DatetimeIndex(train['date']).month
train['day'] = pd.DatetimeIndex(train['date']).day
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,date,time,year,month,day
0,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1,2009-06-15,17:26:21,2009,6,15
1,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1,2010-01-05,16:52:16,2010,1,5
2,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2,2011-08-18,00:35:00,2011,8,18
3,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1,2012-04-21,04:30:42,2012,4,21
4,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1,2010-03-09,07:51:00,2010,3,9


In [None]:
train['date'] = pd.to_datetime(train['date'])
train['day_of_week'] = train['date'].dt.weekday_name
train.head()

Unnamed: 0,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,date,time,year,month,day,day_of_week
0,4.5,2009-06-15 17:26:21,-73.844311,40.721319,-73.84161,40.712278,1,2009-06-15,17:26:21,2009,6,15,Monday
1,16.9,2010-01-05 16:52:16,-74.016048,40.711303,-73.979268,40.782004,1,2010-01-05,16:52:16,2010,1,5,Tuesday
2,5.7,2011-08-18 00:35:00,-73.982738,40.76127,-73.991242,40.750562,2,2011-08-18,00:35:00,2011,8,18,Thursday
3,7.7,2012-04-21 04:30:42,-73.98713,40.733143,-73.991567,40.758092,1,2012-04-21,04:30:42,2012,4,21,Saturday
4,5.3,2010-03-09 07:51:00,-73.968095,40.768008,-73.956655,40.783762,1,2010-03-09,07:51:00,2010,3,9,Tuesday


In [None]:
train['hour'] = train['time'].map(lambda x: str(x)[:2])
train.drop(columns=['pickup_datetime', 'date', 'time'], inplace=True)
train.head()

In [None]:
train['day_of_week'] = pd.factorize(train['day_of_week'])[0] + 1
train.head()

In [23]:
sample = train[:10000000]

In [24]:
y = sample['fare_amount']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(sample, y, test_size=0.2)

In [26]:
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

In [27]:
predictions

array([10.9, 16.5,  7.7, ...,  6.5, 18.5,  9.3])

In [28]:
mean_squared_error(y_test, predictions)

1.0556349813525356e-23

### Preparing test data to produce submission

In [30]:
test_key_col = test['key']
test['pickup_datetime'].fillna("0000-00-00 00:00:00 UTC", inplace=True)
test.fillna(-9999, inplace=True)
test.drop(columns=['key'], inplace=True)
test['pickup_datetime'] = test['pickup_datetime'].map(lambda x: str(x)[:-4])
test['date'] = test['pickup_datetime'].map(lambda x: str(x)[:10])
test['time'] = test['pickup_datetime'].map(lambda x: str(x)[-8:])
test['year'] = pd.DatetimeIndex(test['date']).year
test['month'] = pd.DatetimeIndex(test['date']).month
test['day'] = pd.DatetimeIndex(test['date']).day
test['date'] = pd.to_datetime(test['date'])
test['day_of_week'] = test['date'].dt.weekday_name
test['hour'] = test['time'].map(lambda x: str(x)[:2])
test.drop(columns=['pickup_datetime', 'date', 'time'], inplace=True)
test['day_of_week'] = pd.factorize(test['day_of_week'])[0] + 1

KeyboardInterrupt: 

In [None]:
test_pred