# Importing the libraries

In [None]:
import pandas as pd
import numpy as np
import random
import zipfile
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
# Load the complete training and test CSV files from the competition input
# directory. No parse_dates is requested here, so the date-like columns are
# read with whatever dtype read_csv infers.
train = pd.read_csv('../input/new-york-city-taxi-fare-prediction/train.csv')
test = pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv')

In [None]:
# (rows, columns) of the raw training frame.
train.shape

In [None]:
# Preview the first ten records.
train.head(10)

In [None]:
# Column dtypes as inferred on load.
train.dtypes

Since the number of records is very large, I'll take a sample of the file so that the data is easier to work with.

In [None]:
train = train.sample(n = 250000)

In [None]:
train.shape

The 'key' and 'pickup_datetime' attributes contain dates, so let's convert them to datetimes. I will write a new file with only the 250000 sampled rows and do the conversion while reading it back.

In [None]:
train.to_csv('train_sample.csv', index = False)
train = pd.read_csv('./train_sample.csv', parse_dates = ['key', 'pickup_datetime'])
train.shape
train.head()

In [None]:
train.dtypes

# **We will now clean and manipulate the data**

Let's check for missing values:

In [None]:
# Number of missing values in each column.
train.isnull().sum()

We have no missing values.

Let's check the 'key' and 'pickup_datetime':

In [None]:
# Keep a handle on the 'key' column; it is compared with the pickup
# timestamps a few cells further down.
key = train['key']
# Distribution of the key timestamps, using 100 bins.
plt.hist(key, bins = 100)

In [None]:
# Handle on the pickup timestamps.
pdt = train['pickup_datetime']

In [None]:
# Summary statistics of the key column.
key.describe()

In [None]:
# Summary statistics of the pickup timestamps, for comparison with the above.
pdt.describe()

As we can see, the 'key' and 'pickup_datetime' columns have similar first and last values. We can take this as an indication that the columns hold the same values.

In [None]:
# Remove the timezone information so the pickup timestamps become
# timezone-naive (this does not touch the seconds). The .dt accessor handles
# the whole column at once instead of calling a Python lambda for every row.
pdt = pdt.dt.tz_localize(None)
pdt

In [None]:
# Cut `key` down to whole seconds, i.e. the precision that a
# '%Y-%m-%d %H:%M:%S' rendering keeps, so it can be compared with pdt.
key = key.dt.floor('s')
key

Let's perform a logical command to find out if the columns actually have the same values:

In [None]:
(key == pdt)

In [None]:
(key == pdt).value_counts()

The key column is a primary key and is not used in machine learning models. As it has the same values as pickup_datetime, we will exclude it.

In [None]:
train.drop(['key'], axis = 1, inplace = True)
train.head()

In [None]:
#formatting:
train['pickup_datetime'] = pdt
train.head()

Now let's look at the coordinate attributes.

In [None]:
# Pickup longitude column and its summary statistics.
plon = train['pickup_longitude']
plon.describe()

The max and min values are quite different from the mean and quartiles.

In [None]:
# Histogram of every pickup longitude.
plt.hist(plon, bins = 150)

In [None]:
# The median is used as the centre of the windows below.
plon.median()

In [None]:
# Zoom: only the values strictly within 2.5 of the median.
plt.hist(plon[(plon > plon.median() - 2.5) & (plon < plon.median() + 2.5)], bins = 100)

In [None]:
# Boolean mask of "valid" pickup longitudes: strictly within 0.5 of the
# median. Nothing is removed here; the mask is combined with the other
# masks before rows are dropped.
plon_val = (plon > plon.median() - 0.5) & (plon < plon.median() + 0.5)
plon_val.value_counts()

In [None]:
# Absolute counts followed by the fractions of valid / invalid values.
print(plon_val.value_counts(), '\n', plon_val.value_counts(normalize = True))

In [None]:
# Pickup latitude column and its summary statistics.
plat = train['pickup_latitude']
plat.describe()

In [None]:
# Histogram of every pickup latitude.
plt.hist(plat, bins = 100)

In [None]:
# The median is used as the centre of the windows below.
plat.median()

In [None]:
# Zoom: only the values strictly within 2.5 of the median.
plt.hist(plat[(plat > plat.median() - 2.5) & (plat < plat.median() + 2.5)], bins = 100)

In [None]:
# Boolean mask of "valid" pickup latitudes: strictly within 0.5 of the median.
plat_val = (plat > plat.median() - 0.5) & (plat < plat.median() + 0.5)

In [None]:
# Absolute counts followed by the fractions of valid / invalid values.
print(plat_val.value_counts(), '\n', plat_val.value_counts(normalize = True))

In [None]:
# Dropoff longitude column and its summary statistics.
dlon = train['dropoff_longitude']
dlon.describe()

In [None]:
# Left: all values. Right: only the values strictly within 2.5 of the median.
fig, ax = plt.subplots(1, 2, figsize = (15, 5))
ax[0].hist(dlon, bins = 100)
ax[1].hist(dlon[(dlon > dlon.median() - 2.5) & (dlon < dlon.median() + 2.5)], bins = 100)

In [None]:
# Boolean mask of "valid" dropoff longitudes: strictly within 0.5 of the median.
dlon_val = (dlon > dlon.median() - 0.5) & (dlon < dlon.median() + 0.5)

In [None]:
# Absolute counts followed by the fractions of valid / invalid values.
print(dlon_val.value_counts(), '\n', dlon_val.value_counts(normalize = True))

In [None]:
# Dropoff latitude column and its summary statistics.
dlat = train['dropoff_latitude']
dlat.describe()

In [None]:
# Left: all values. Right: only the values strictly within 2.5 of the median.
fig, ax = plt.subplots(1, 2, figsize = (15, 5))
ax[0].hist(dlat, bins = 100)
ax[1].hist(dlat[(dlat > dlat.median() - 2.5) & (dlat < dlat.median() + 2.5)], bins = 100)

In [None]:
# Boolean mask of "valid" dropoff latitudes: strictly within 0.5 of the median.
dlat_val = (dlat > dlat.median() - 0.5) & (dlat < dlat.median() + 0.5)

In [None]:
# Absolute counts followed by the fractions of valid / invalid values.
print(dlat_val.value_counts(), '\n', dlat_val.value_counts(normalize = True))

Let's do an analysis of the passenger_count column

In [None]:
# Passenger count column and its summary statistics.
pcount = train['passenger_count']
pcount.describe()

It is very strange that a taxi takes 0 people, just as it is impossible to take more than 200 people in a single car. Let's see how many numbers 0 we have and the mean:

In [None]:
print((pcount == 0).sum(), (pcount == 0).mean())

In [None]:
plt.hist(pcount, bins = 100)

In [None]:
plt.hist(pcount[pcount < 15], bins = 100)

In [None]:
pcount_val = (pcount >= 1) & (pcount <= 6)
print(pcount_val.value_counts(), '\n', pcount_val.value_counts(normalize = True))

Analyzing the fare_amount

In [None]:
# Fare column (the prediction target) and its summary statistics.
fare = train['fare_amount']
fare.describe()

In [None]:
# Histogram of every fare.
plt.hist(fare, bins = 100)

In [None]:
# Low end of the distribution: fares below 10.
plt.hist(fare[fare < 10], bins = 100)

In [None]:
# High end of the distribution: fares above 50.
plt.hist(fare[fare > 50], bins = 100)

In the graphs we can see that low values close to zero are very rare, as are values above $150. We will therefore filter within that range.

In [None]:
# Valid fares lie strictly between 2 and 150.
fare_val = fare.gt(2) & fare.lt(150)
print(fare_val.value_counts(), '\n', fare_val.value_counts(normalize = True))

I will exclude the records outside the established ranges and form the dataset for training.

In [None]:
#concatenating the values:
val_entries = fare_val & plon_val & plat_val & dlon_val & dlat_val & pcount_val
print(val_entries.value_counts(), '\n', val_entries.value_counts(normalize = True))

In [None]:
train = train.drop(val_entries[val_entries == False].index)
train.head()

In [None]:
train.shape

#  **Create new features**
We can see that distance and duration are important for determining the tariff. But we also see that there are no such characteristics. Something that influences the duration of the trip is the traffic condition.

In [None]:
# Calendar features derived from the pickup timestamp. The vectorised .dt
# accessors replace a per-row Python lambda that built a full timetuple for
# every record; the values are the same (dayofweek, like tm_wday, counts
# Monday as 0 and dayofyear, like tm_yday, starts at 1).
pickup_times = train['pickup_datetime']
train['hour_of_day'] = pickup_times.dt.hour
train['day_of_week'] = pickup_times.dt.dayofweek
train['day_of_year'] = pickup_times.dt.dayofyear
train['year'] = pickup_times.dt.year
train.head()

We are working with the day of the week and the day of the year, and considering holidays and breaks. The data runs from 2009 to 2015 and therefore includes a leap year (2012), which gets in the way: in a leap year every date after February 29th is shifted by one, so a holiday that normally falls on day 185 becomes day 186. First let's see how many records we have on the 29th of February.

In [None]:
# Number of rides picked up on a 29th of February.
is_leap_day = train['pickup_datetime'].dt.strftime('%m-%d') == '02-29'
len(train.loc[is_leap_day, 'pickup_datetime'])

As there are just 108 of them, a small number compared to the whole database, we will exclude them.

In [None]:
train.drop(train['pickup_datetime'][train['pickup_datetime'].dt.strftime('%m-%d') == '02-29'].index, inplace = True)

In [None]:
train.shape

As we excluded the 29th, we will have to fix the year 2012 because otherwise we will have the holidays on the wrong days. The condition will be 59 days because 31 days (January) + 28 days (February) = 59. We then correct the extra day left over because of the leap year: for example, what was the 61st day goes back to being the 60th.

In [None]:
# In 2012 every day after the 59th sits one position later than in a
# non-leap year; shift those days back by one.
condition = (train['year'] == 2012) & (train['day_of_year'] > 59)
# A single .loc assignment writes into `train` itself. The chained form
# train['day_of_year'][condition] = ... may write to a temporary copy and is
# silently ignored under pandas Copy-on-Write.
train.loc[condition, 'day_of_year'] -= 1

In [None]:
fig, ax = plt.subplots(1, 4, figsize = (15,3))
train_not_2015 = train[train['year'] < 2015] # we won't use 2015 because we only have the dates until half of the year
ax[0].hist(train_not_2015['hour_of_day'], bins = 24) # 24 hours in a day
ax[0].set_title('Hour of day')
ax[1].hist(train_not_2015['day_of_week'], bins = 7) # 7 days in a week
ax[1].set_title('Days of week')
ax[2].hist(train_not_2015['day_of_year'], bins = 365) # 365 days
ax[2].set_title('Day of year')
ax[3].hist(train_not_2015['year'], bins = 6) # we have 6 years
ax[3].set_title('Year')

In [None]:
plt.figure(figsize = (15, 5))
plt.scatter(train['pickup_datetime'], train['fare_amount'], s = 1, alpha = 0.2)

We can see that we have a pattern forming straight horizontal lines, that means fixed rate, since a path that has a preset value. Let's zoom in on these records for a more detailed view:

In [None]:
from collections import Counter
fare_zoom = train['fare_amount'][(train['fare_amount'] > 40) & (train['fare_amount'] < 60)]

In [None]:
common_fares_zoom = Counter(fare_zoom)

It will count the tariffs, for example: $ 40 there are 7 records. Soon after we will see which are the most common 

In [None]:
common_fares_zoom

In [None]:
most_common_fares_zoom = common_fares_zoom.most_common(10)
most_common_fares_zoom

In [None]:
plt.bar([x[0] for x in most_common_fares_zoom], [x[1] for x in most_common_fares_zoom])

In [None]:
from mpl_toolkits.basemap import Basemap
# Bounding box of the map, given as latitude and longitude limits.
# NOTE: lat1/lat2/lon1/lon2 are rebound to radian Series in the distance
# cells further down, so this cell must be run before those.
lat1, lat2 = 40.55, 40.95
lon1, lon2 = -74.10, -73.70

plt.figure(figsize = (10, 10))
m = Basemap(projection = 'cyl', resolution = 'h',
            llcrnrlat = lat1, urcrnrlat = lat2,
            llcrnrlon = lon1, urcrnrlon = lon2)
m.drawcoastlines()
m.fillcontinents(color = 'palegoldenrod', lake_color = 'lightskyblue')
m.drawmapboundary(fill_color = 'lightskyblue')
# Grid lines every 0.1, labelled on the left (parallels) and bottom (meridians).
m.drawparallels(np.arange(lat1, lat2 + 0.05, 0.1), labels = [1, 0, 0, 0])
m.drawmeridians(np.arange(lon1, lon2 + 0.05, 0.1), labels = [0, 0, 0, 1])

# Pickup locations of all rides (green)
m.scatter(train['pickup_longitude'], train['pickup_latitude'], s = 1, c = 'green',
          alpha = 0.1, zorder = 5)
# Dropoff locations of all rides (yellow)
m.scatter(train['dropoff_longitude'], train['dropoff_latitude'], s = 1, c='yellow',
         alpha = 0.1, zorder = 5)
# Overlay the rides charged one of four of the most common fares.
# NOTE(review): rank 3 is skipped on purpose or by accident? The ranking
# depends on the sample that was drawn -- confirm these indices still pick
# the intended fares.
for i in [0, 1, 2, 4]:
  this_fare = most_common_fares_zoom[i][0]
  this_df = train[train['fare_amount'] == this_fare]
  # pickup locations of these rides - red
  m.scatter(this_df['pickup_longitude'], this_df['pickup_latitude'], s = 2, c = 'red',
           alpha = 0.2, zorder = 5)
  # dropoff locations of these rides - blue
  m.scatter(this_df['dropoff_longitude'], this_df['dropoff_latitude'], s = 2, c = 'blue',
           alpha = 0.2, zorder = 5)

Let's create a new variable called 'coordinates' to be able to zoom in on the points located on the map where the fixed rates are:

In [None]:
#Arrival point coordinates
coords = train[['dropoff_latitude',
                'dropoff_longitude']][(train['fare_amount'] > 40) &
                                       (train['fare_amount'] < 60) &
                                       (train['dropoff_latitude'] < 40.7) &
                                       (train['dropoff_latitude'] > 40.6) &
                                       (train['dropoff_longitude'] < -73.7) &
                                       (train['dropoff_longitude'] > -73.9)]

In [None]:
coords.shape

In [None]:
coords.head()

If we take the median latitude and longitude, we can look them up on Google Maps and find the real location. Let's see:

In [None]:
# Median dropoff latitude, then median dropoff longitude, of the selection.
print(*coords[['dropoff_latitude', 'dropoff_longitude']].median())

![IMAGE](https://drive.google.com/file/d/1Gv-ttT5bv6h57LoCI8FYJpZzCgp1h9t1/view?usp=sharing)

In [None]:
# Second latitude / longitude box, again for rides with a fare strictly
# between 40 and 60.
# NOTE(review): the original heading called these "starting point
# coordinates", but the selection and the printed medians use the *dropoff*
# columns -- confirm whether the pickup columns were intended.
coords = train[['dropoff_latitude',
                'dropoff_longitude']][(train['fare_amount'] > 40) &
                                       (train['fare_amount'] < 60) &
                                       (train['dropoff_latitude'] < 40.85) &
                                       (train['dropoff_latitude'] > 40.7) &
                                       (train['dropoff_longitude'] < -73.9) &
                                       (train['dropoff_longitude'] > -74.1)]
# Median dropoff latitude and longitude of this selection.
print(coords['dropoff_latitude'].median(), coords['dropoff_longitude'].median())

![IMAGE](https://drive.google.com/file/d/18Xtflx7mvncHnW3XfcSSU7NV2XcrCt3E/view?usp=sharing)

We can see that the fixed rates have their standard going from the city center to the international airport. Now let's see if there is a relationship between number of passengers and fares:

In [None]:
# Fares ranked 0, 1, 2 and 4 among the most common ones.
fixed_fares = [most_common_fares_zoom[rank][0] for rank in (0, 1, 2, 4)]
# Rides charged one of those fares (and below 60) whose dropoff lies inside
# the latitude / longitude box.
selected = (train['fare_amount'].isin(fixed_fares)
            & train['fare_amount'].lt(60)
            & train['dropoff_latitude'].lt(40.7)
            & train['dropoff_latitude'].gt(40.6)
            & train['dropoff_longitude'].lt(-73.7)
            & train['dropoff_longitude'].gt(-73.9))
filtered = train.loc[selected, ['fare_amount', 'passenger_count']]
# Fare against number of passengers for the selected rides.
plt.scatter(filtered['passenger_count'], filtered['fare_amount'])

As we can see, the number of passengers does not change the amount to be paid in the fare.
Let's see if the tariff has anything to do with the day of the week, year and time:

In [None]:
# Same selection as for the passenger plot: rides charged one of the fares
# ranked 0, 1, 2 and 4 (and below 60) that end inside the box.
fixed_fares = [most_common_fares_zoom[rank][0] for rank in (0, 1, 2, 4)]
selected = (train['fare_amount'].isin(fixed_fares)
            & train['fare_amount'].lt(60)
            & train['dropoff_latitude'].lt(40.7)
            & train['dropoff_latitude'].gt(40.6)
            & train['dropoff_longitude'].lt(-73.7)
            & train['dropoff_longitude'].gt(-73.9))
filtered2 = train.loc[selected,
                      ['fare_amount', 'hour_of_day', 'day_of_week', 'day_of_year']]

# One scatter panel per time feature, fare on the y axis.
fig, ax = plt.subplots(1, 3, figsize = (15, 5))
time_panels = [('hour_of_day', 'Hour of day'),
               ('day_of_week', 'Day of week'),
               ('day_of_year', 'Day of year')]
for axis, (column, title) in zip(ax, time_panels):
    axis.scatter(filtered2[column], filtered2['fare_amount'])
    axis.set_title(title)

We can see that these variables do not change the rate.

In [None]:
# The raw timestamp column is removed now that the calendar features
# derived from it are in place.
train.drop(columns = 'pickup_datetime', inplace = True)
train.head()

We know that distance influences the tariff, so let's see what is the best way to calculate it and use it in the model:

In [None]:
# Pickup (1) and dropoff (2) coordinates converted to radians.
lat1 = np.radians(train['pickup_latitude'])
lon1 = np.radians(train['pickup_longitude'])
lat2 = np.radians(train['dropoff_latitude'])
lon2 = np.radians(train['dropoff_longitude'])
# Differences, dropoff minus pickup.
dlat = lat2 - lat1
dlon = lon2 - lon1

In [None]:
# Great-circle distance between pickup and dropoff, computed with the
# haversine formula. It is stored under the column name 'eucl_distance',
# but it is not a straight-line Euclidean distance.
a = np.sin(dlat / 2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
# Central angle between the two points, in radians.
c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
# 6373 acts as the sphere radius, so the result is in km if that is the
# Earth radius in km.
train['eucl_distance'] = 6373 * c

train.head()

In [None]:
# "Manhattan" distance: the sum of two arcs, one computed from the longitude
# difference alone and one from the latitude difference alone, scaled by the
# same 6373 radius as the great-circle distance.
# NOTE(review): the longitude arc has no cos(latitude) factor, unlike the
# haversine term in the previous cell -- confirm this is intended. The test
# set uses the same formula, so any change must be made in both places.
a1 = np.sin(dlon / 2)**2
c1 = 2 * np.arctan2(np.sqrt(a1), np.sqrt(1 - a1))
a2 = np.sin(dlat / 2)**2
c2 = 2 * np.arctan2(np.sqrt(a2), np.sqrt(1 - a2))
train['manh_distance'] = 6373 * (c1 + c2)

train.head()

In [None]:
# Side-by-side histograms of the two distance features.
fig, ax = plt.subplots(1, 2, figsize = (10,5))
distance_panels = [('eucl_distance', 'Euclidian Distance'),
                   ('manh_distance', 'Manhattan Distance')]
for axis, (column, title) in zip(ax, distance_panels):
    axis.hist(train[column])
    axis.set_title(title)

# **Now creating the predictive attributes and the response attribute**

In [None]:
# Target: the fare. Predictors: every other remaining column.
Y_train = train['fare_amount']
X_train = train.drop(columns = ['fare_amount'])

Let's normalize the data as they are on different scales:

In [None]:
from sklearn.preprocessing import MinMaxScaler
# Fit the min-max scaler on the predictors and rescale them in one step;
# the fitted `scaler` is kept for transforming the test set later.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)

In [None]:
from sklearn.model_selection import cross_val_score
# Settings shared by every model comparison below: 3-fold cross-validation
# scored with the negated mean squared error (the sign is flipped again
# before taking the square root in each model cell).
cv = 3
scoring = 'neg_mean_squared_error'
import multiprocessing
# Use all cores but one for cross-validation, and never fewer than one:
# on a single-core machine cpu_count() - 1 would be 0, which joblib rejects
# as an invalid number of jobs.
n_jobs = max(1, multiprocessing.cpu_count() - 1)

# **Comparing the models:**

In [None]:
#Linear Regression:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
scores = cross_val_score(model, X_train_scaled, Y_train, cv = cv,
                         scoring = scoring, n_jobs = n_jobs)
np.sqrt(-scores.mean())

In [None]:
#Ridge Regression:
from sklearn.linear_model import Ridge
model = Ridge()
scores = cross_val_score(model, X_train_scaled, Y_train, cv = cv,
                         scoring = scoring, n_jobs = n_jobs)
np.sqrt(-scores.mean())

In [None]:
#Lasso Regression:
from sklearn.linear_model import Lasso
model = Lasso()
scores = cross_val_score(model, X_train_scaled, Y_train, cv = cv,
                         scoring = scoring, n_jobs = n_jobs)
np.sqrt(-scores.mean())

In [None]:
#Nearest Neighbors:
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor()
scores = cross_val_score(model, X_train_scaled, Y_train, cv = cv,
                         scoring = scoring, n_jobs = n_jobs)
np.sqrt(-scores.mean())

In [None]:
#Decision Tree:
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
scores = cross_val_score(model, X_train_scaled, Y_train, cv = cv,
                         scoring = scoring, n_jobs = n_jobs)
np.sqrt(-scores.mean())

In [None]:
#Random Forest:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
scores = cross_val_score(model, X_train_scaled, Y_train, cv = cv,
                         scoring = scoring, n_jobs = n_jobs)
np.sqrt(-scores.mean())

In [None]:
#Deep Learning:
from sklearn.neural_network import MLPRegressor
model = MLPRegressor()
scores = cross_val_score(model, X_train_scaled, Y_train, cv = cv,
                         scoring = scoring, n_jobs = n_jobs)
np.sqrt(-scores.mean())

In [None]:
#Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor()
scores = cross_val_score(model, X_train_scaled, Y_train, cv = cv,
                         scoring = scoring, n_jobs = n_jobs)
np.sqrt(-scores.mean())

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the scaled training data for evaluation, with a fixed seed.
split = train_test_split(X_train_scaled, Y_train, test_size = 0.2, random_state = 24)
X_train1, X_test, Y_train1, Y_test = split

In [None]:
model = RandomForestRegressor(n_estimators = 150)
model.fit(X_train1, Y_train1)

features_importances = model.feature_importances_
argsort = np.argsort(features_importances) #making a ordering of importances
features_importances_sorted = features_importances[argsort]

In [None]:
feature_names = X_train.columns
features_sorted = feature_names[argsort]

In [None]:
plt.barh(features_sorted, features_importances_sorted)

In [None]:
# Predict fares for the held-out part of the split (X_test here is the
# scaled hold-out matrix, not the competition test set).
Y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_squared_error
# Mean squared error on the hold-out set and its square root.
mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)

In [None]:
mse

In [None]:
rmse

In [None]:
# Plot only every 250th hold-out example so the bars stay readable.
print_every = 250000 // 1000
true_subset = Y_test.values[::print_every]
pred_subset = Y_pred[::print_every]

fig = plt.figure(figsize=(20,5))
# True values in solid red, predictions overlaid in semi-transparent blue.
plt.bar(list(range(len(true_subset))), true_subset,
        alpha = 1, color = 'red', width = 1, label = 'true values')
plt.bar(list(range(len(pred_subset))), pred_subset,
        alpha = 0.5, color = 'blue', width = 1, label = 'predicted values')
plt.legend()

In [None]:
# Reload the competition test set, this time parsing the pickup timestamps
# on read.
test = pd.read_csv('../input/new-york-city-taxi-fare-prediction/test.csv',parse_dates = ['pickup_datetime'])

In [None]:
# Keep the row identifiers; they are needed for the submission file after
# the 'key' column has been dropped from the frame.
key = test['key']

In [None]:
#Data
# Strip the timezone from the pickup timestamps (vectorised rather than a
# per-row lambda) and drop the identifier column, as was done for training.
pdt = test['pickup_datetime'].dt.tz_localize(None)
test.drop(['key'], axis = 1, inplace=True)
test['pickup_datetime'] = pdt

#New attributes
# Created in the same order as in the training frame (hour, weekday, day of
# year, year) so the feature columns line up with the matrix the scaler and
# the model were fitted on.
test['hour_of_day'] = pdt.dt.hour
test['day_of_week'] = pdt.dt.dayofweek
test['day_of_year'] = pdt.dt.dayofyear
test['year'] = pdt.dt.year
# Training shifted every 2012 day after the 59th back by one to undo the
# leap-year offset; apply the same correction here so 'day_of_year' means
# the same thing in both sets. Test rows cannot be dropped, so a 29th of
# February ends up on day 59.
leap_shift = (test['year'] == 2012) & (test['day_of_year'] > 59)
test.loc[leap_shift, 'day_of_year'] -= 1
test.drop('pickup_datetime', axis = 1, inplace = True)

#Distance
# Pickup (1) and dropoff (2) coordinates in radians, and their differences.
lon1, lon2 = np.radians(test['pickup_longitude']), np.radians(test['dropoff_longitude'])
lat1, lat2 = np.radians(test['pickup_latitude']), np.radians(test['dropoff_latitude'])
dlon = lon2 - lon1
dlat = lat2 - lat1

# Haversine great-circle distance, stored under the same 'eucl_distance'
# name used in training.
a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
test['eucl_distance'] = 6373 * c

# Sum of the longitude-only and latitude-only arcs, same formula as in training.
a1 = np.sin(dlon/2)**2
c1 = 2 * np.arctan2(np.sqrt(a1), np.sqrt(1-a1))
a2 = np.sin(dlat/2)**2
c2 = 2 * np.arctan2(np.sqrt(a2), np.sqrt(1-a2))
test['manh_distance'] = 6373 * (c1+c2)

In [None]:
# Select the predictors in exactly the column order the scaler was fitted
# with. The min-max parameters are applied by position, so a different
# order would rescale columns with the wrong statistics (or be rejected
# outright by scikit-learn versions that check feature names).
X_test = test[X_train.columns]
# Rescaled with the training min/max; values may fall slightly outside [0, 1].
X_test_scaled = scaler.transform(X_test)

In [None]:
# The forest was trained on min-max-scaled features, so it must be given the
# scaled test matrix; the raw frame is on a completely different scale.
Y_pred = model.predict(X_test_scaled)

In [None]:
# Pair each saved test key with its predicted fare and write the result to
# 'submission.csv' without the index column.
sub = pd.DataFrame({'key': key, 'fare_amount': Y_pred})
sub.head()
sub.to_csv('submission.csv', index = False)