# Kaggle: New York City Taxi Fare Prediction
> the competition's link can be found __[here](https://www.kaggle.com/c/new-york-city-taxi-fare-prediction)__

### Helpful articles: 
- __[modeling](https://towardsdatascience.com/train-test-split-and-cross-validation-in-python-80b61beca4b6)__
- __[markdown](https://medium.com/ibm-data-science-experience/markdown-for-jupyter-notebooks-cheatsheet-386c05aeebed)__

In [None]:
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
import datetime

In [None]:
train = pd.read_csv("train.csv")

In [None]:
test = pd.read_csv("test.csv")

In [None]:
train.head()

In [None]:
train.info()

In [None]:
train.drop(columns=['key'], inplace=True)

In [None]:
train['pickup_datetime'].fillna("0000-00-00 00:00:00 UTC", inplace=True)
train.fillna(-999, inplace=True)

In [None]:
train.head()

In [None]:
train.dtypes

In [None]:
train.info()

In [None]:
train['pickup_datetime'] = train['pickup_datetime'].map(lambda x: str(x)[:-4])

In [None]:
train.head()

In [None]:
train['date'] = train['pickup_datetime'].map(lambda x: str(x)[:10])
train['time'] = train['pickup_datetime'].map(lambda x: str(x)[-8:])
train.head()

In [None]:
train['year'] = pd.DatetimeIndex(train['date']).year
train['month'] = pd.DatetimeIndex(train['date']).month
train['day'] = pd.DatetimeIndex(train['date']).day
train.head()

In [None]:
train['date'] = pd.to_datetime(train['date'])
train['day_of_week'] = train['date'].dt.weekday_name
train.head()

In [None]:
train['hour'] = train['time'].map(lambda x: str(x)[:2])
train.drop(columns=['pickup_datetime', 'date', 'time'], inplace=True)
train.head()

In [None]:
train['day_of_week'] = pd.factorize(train['day_of_week'])[0] + 1
train.head()

#### calculating distance

In [None]:
import math

def distance(df):
    p = 0.017453292519943295     
    a = 0.5 - np.cos((df.dropoff_latitude - df.pickup_latitude) * p)/2 + np.cos(df.pickup_latitude * p) * np.cos(df.dropoff_latitude * p) * (1 - np.cos((df.dropoff_longitude - df.pickup_longitude) * p)) / 2
    
    df['distance'] = 0.6213712 * 12742 * np.arcsin(np.sqrt(a))

In [None]:
distance(train)
train.head()

In [None]:
sample = train[:]

In [None]:
y = sample['fare_amount']

In [None]:
data = sample.drop(columns=['fare_amount'])

In [None]:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2)

In [None]:
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
predictions = lm.predict(X_test)

In [None]:
predictions

In [None]:
mean_squared_error(y_test, predictions)

### Preparing test data to produce submission

In [25]:
test_key_col = test['key']
test['pickup_datetime'].fillna("0000-00-00 00:00:00 UTC", inplace=True)
test.fillna(-999, inplace=True)
test['pickup_datetime'] = test['pickup_datetime'].map(lambda x: str(x)[:-4])
test['date'] = test['pickup_datetime'].map(lambda x: str(x)[:10])
test['time'] = test['pickup_datetime'].map(lambda x: str(x)[-8:])
test['year'] = pd.DatetimeIndex(test['date']).year
test['month'] = pd.DatetimeIndex(test['date']).month
test['day'] = pd.DatetimeIndex(test['date']).day
test['date'] = pd.to_datetime(test['date'])
test['day_of_week'] = test['date'].dt.weekday_name
test['hour'] = test['time'].map(lambda x: str(x)[:2])
test.drop(columns=['pickup_datetime', 'date', 'time'], inplace=True)
test['day_of_week'] = pd.factorize(test['day_of_week'])[0] + 1

In [26]:
test_pred = lm.predict(test)
test_pred

array([12.73283492, 12.73278992, 11.19466956, ..., 13.41349568,
       13.53930996, 13.23950247])