## IMPORT

In [None]:
# import 

# libraries

from scipy import stats
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# Global plotting defaults, applied once for the whole notebook.
plt.style.use('fivethirtyeight')
plt.rcParams['font.size'] = 14
# Set the default figure size through rcParams so that every figure created
# later picks it up. Calling plt.figure(figsize=...) here would only create a
# single empty figure in the import cell and leave the default size unchanged.
plt.rcParams['figure.figsize'] = (12, 5)
palette = sns.color_palette('Paired', 10)

# map

import folium
from folium.plugins import HeatMap
from folium.plugins import HeatMapWithTime

# sci-kit learn

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn import set_config; set_config(display='diagram')

In [None]:
data = pd.read_csv("/kaggle/input/new-york-city-taxi-fare-prediction/train.csv", nrows=1000000)
data.head()

# DATA ANALYSIS

In [None]:
data = data.drop(["key"],axis=1)

In [None]:
# Remove exact duplicate rows and report how many were dropped.
size_before = data.shape[0]
data = data.drop_duplicates()
size_after = data.shape[0]
print(f"{size_before - size_after} duplicates were removed.")

In [None]:
100 * data.isnull().sum().sort_values(ascending=False)/len(data)

In [None]:
#exploring data

def plot_dist(series=None, title="Fare Distribution"):
    """Plot a density histogram with a KDE overlay for a numeric series.

    Parameters
    ----------
    series : pandas.Series, optional
        Values to plot. When omitted, the *current* ``data["fare_amount"]``
        column is used. It is looked up at call time rather than frozen into
        the default argument at definition time, so the plot follows later
        filtering of ``data``.
    title : str
        Title drawn above the plot.
    """
    if series is None:
        series = data["fare_amount"]
    sns.histplot(series, kde=True, stat='density', discrete=True)
    sns.despine()
    plt.title(title)
    plt.show()
plot_dist()

In [None]:
#dropping absurd values

data = data[data.fare_amount.between(0,60)]
plot_dist(data.fare_amount)

In [None]:
# Bin fares into 5-unit intervals covering 0..45. Fares above 45 fall outside
# every interval and come back from pd.cut as missing values.
fare_intervals = pd.cut(data['fare_amount'], bins=list(range(0, 50, 5)), include_lowest=True)

# Uppermost bin: decide on the missing intervals themselves, not on their
# string form. Casting to str can turn a missing value into the literal text
# 'nan', which a later replace(np.nan, ...) never matches.
data['fare-bin'] = fare_intervals.astype('str').where(fare_intervals.notna(), '[45+]')

# Clean up the first bin's label (include_lowest widens it to start at -0.001).
data['fare-bin'] = data['fare-bin'].str.replace('-0.001', '0', regex=False)

# Sort by fare so the bins appear in ascending order in the chart.
data = data.sort_values(by='fare_amount')

In [None]:
sns.catplot(x="fare-bin", kind="count", palette= "icefire", data=data, height=5, aspect=3);
sns.despine()
plt.show()

**ANALYSING FEATURES**

In [None]:
# passenger_count feature

data.passenger_count.describe()

In [None]:
sns.catplot(x="passenger_count", kind="count", palette="icefire", data=data, height=5, aspect=3);
sns.despine()
plt.title('Passenger Count');
plt.show()

In [None]:
# pickup_datetime feature

def extract_time_features(df):
    """Return a copy of ``df`` with local-time calendar features added.

    The ``pickup_datetime`` column is parsed as UTC, converted to the
    America/New_York time zone, and used to derive four new columns:
    ``dow`` (weekday, Monday=0), ``hour``, ``month`` and ``year``.

    The input frame is left untouched, and the result has a fresh
    0..n-1 index. Timestamps without an explicit offset are interpreted
    as UTC instead of raising.
    """
    timezone_name = 'America/New_York'
    time_column = "pickup_datetime"

    # Work on a copy so the caller's index and columns are not modified.
    features = df.copy()
    local_times = pd.DatetimeIndex(
        pd.to_datetime(features[time_column], utc=True)
    ).tz_convert(timezone_name)

    # An Index is assigned positionally, so no index alignment is involved.
    features["dow"] = local_times.weekday
    features["hour"] = local_times.hour
    features["month"] = local_times.month
    features["year"] = local_times.year
    return features.reset_index(drop=True)

In [None]:
data = extract_time_features(data.drop(["fare-bin"], axis=1))
data.head()

In [None]:
# taxi trip distribution by hour of the day

sns.catplot(x="hour", kind="count", palette="icefire", data=data, height=5, aspect=3);
sns.despine()
plt.title('Hour of Day');
plt.show()

In [None]:
# taxi trip distribution by day of the week

sns.catplot(x="dow", kind="count", palette="icefire", data=data, height=5, aspect=3);
sns.despine()
plt.title('Day of Week');
plt.show()

### Finding boundaries from test set and removing outliers from training set

In [None]:
data_test = pd.read_csv("/kaggle/input/new-york-city-taxi-fare-prediction/test.csv")

In [None]:
# Print the coordinate range covered by the test set, one column per line.
coordinate_columns = ("pickup_latitude", "pickup_longitude", "dropoff_latitude", "dropoff_longitude")
for col in coordinate_columns:
    MIN, MAX = data_test[col].min(), data_test[col].max()
    print(col, MIN, MAX)

In [None]:
# Keep only trips whose pickup AND dropoff both lie inside the same bounding
# box. The dropoff longitude previously used -74 as its lower bound while the
# pickup used -74.3, which silently discarded every dropoff between the two.
in_bounding_box = (
    data["pickup_latitude"].between(left=40, right=42)
    & data["pickup_longitude"].between(left=-74.3, right=-72.9)
    & data["dropoff_latitude"].between(left=40, right=42)
    & data["dropoff_longitude"].between(left=-74.3, right=-72.9)
)
data = data[in_bounding_box]

In [None]:
center_location = [40.758896, -73.985130]
m = folium.Map(location=center_location, control_scale=True, zoom_start=11)

In [None]:
data["count"] =1
heatmap_data = data.head(10000)[['pickup_latitude', 'pickup_longitude', 'count']].groupby(['pickup_latitude', 'pickup_longitude']).sum().reset_index().values.tolist()
gradient = {0.2: 'blue', 0.4: 'lime', 0.6: 'orange', 1: 'red'}
HeatMap(data=heatmap_data, radius=5, gradient=gradient, max_zoom=13).add_to(m)
m

In [None]:
# Build one [lat, lon, weight] list per hour of the day for HeatMapWithTime.
# Only the first 10000 rows are used, but the hours iterated over come from the
# full frame, so an hour absent from the sample yields an empty list.
sample_trips = data.head(10000)
heatmap_data_by_hour = [
    sample_trips[sample_trips.hour == hour][['pickup_latitude', 'pickup_longitude', 'count']]
    .groupby(['pickup_latitude', 'pickup_longitude'])
    .sum()
    .reset_index()
    .values.tolist()
    for hour in data.hour.sort_values().unique()
]

In [None]:
m2 = folium.Map(location=center_location, control_scale=True, zoom_start=11)
HeatMapWithTime(heatmap_data_by_hour, radius=5, 
                gradient=gradient, 
                min_opacity=0.5, max_opacity=0.8, 
                use_local_extrema=False).add_to(m2)
m2

In [None]:
#Distance feature

def haversine_distance(df,
                       start_lat="start_lat",
                       start_lon="start_lon",
                       end_lat="end_lat",
                       end_lon="end_lon"):
    """Vectorized great-circle (haversine) distance between two points per row.

    Coordinates are read from the named columns of ``df``, cast to float and
    taken to be decimal degrees. The result is in kilometres, using an Earth
    radius of 6371 km.
    """
    earth_radius_km = 6371

    phi_start = np.radians(df[start_lat].astype(float))
    lam_start = np.radians(df[start_lon].astype(float))
    phi_end = np.radians(df[end_lat].astype(float))
    lam_end = np.radians(df[end_lon].astype(float))

    delta_lam = lam_end - lam_start
    delta_phi = phi_end - phi_start

    # Haversine term, then the central angle between the two points.
    hav = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi_start) * np.cos(phi_end) * np.sin(delta_lam / 2.0) ** 2
    central_angle = 2 * np.arcsin(np.sqrt(hav))
    return earth_radius_km * central_angle

data["distance"] = haversine_distance(data, 
                                      start_lat="pickup_latitude", start_lon="pickup_longitude",
                                      end_lat="dropoff_latitude", end_lon="dropoff_longitude"
                                     )

In [None]:
data.distance.describe()

In [None]:
%matplotlib inline
plot_dist(series=data[data.distance<50].distance, title = "Distance distribution")

In [None]:
#passenger count feature

sns.catplot(x="passenger_count", y="fare_amount", palette="icefire", data=data, kind="bar", aspect=3)
sns.despine()
plt.show()

In [None]:
#fare amount by hour

sns.catplot(x="hour", y="fare_amount", palette="icefire", data=data, kind="bar", aspect=3)
sns.despine()
plt.show()

### CORRELATION BETWEEN FEATURES

In [None]:
# Correlation between fare_amount and distance

# Plot a random subset of trips shorter than 80 km. The sample size is capped
# at the number of available rows (sampling more rows than exist raises), and
# the seed is fixed so the figure is reproducible. `palette` is omitted because
# it has no effect without a `hue` variable.
scatter_pool = data[data.distance < 80]
scatter_sample = scatter_pool.sample(n=min(100000, len(scatter_pool)), random_state=0)
sns.scatterplot(x="distance", y="fare_amount", data=scatter_sample)
plt.show()

In [None]:
data.head()

# TRAINING

In [None]:
#starting on a fresh dataset to prepare for training

# Reload the first 1000 raw rows so training starts from unmodified data.
data_train = pd.read_csv(
    "/kaggle/input/new-york-city-taxi-fare-prediction/train.csv",
    nrows=1000,
)

### CLEANING DATASET

In [None]:
#cleaning dataset

def clean_data(df, test=False, predict=False):
    """Return a filtered copy of the raw taxi frame.

    Steps, in order:
      * drop the ``key`` column and any row containing a missing value;
      * drop rows whose pickup or dropoff is exactly at (0, 0);
      * if ``fare_amount`` is present, keep fares in [0, 4000];
      * keep ``passenger_count`` in [0, 8);
      * keep trips whose pickup and dropoff both lie inside the bounding box
        latitude [40, 42], longitude [-74.3, -72.9].

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data; must contain ``key``, ``passenger_count`` and the four
        pickup/dropoff coordinate columns.
    test, predict : bool
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, with their original index labels.
    """
    # One bounding box for both ends of the trip. The dropoff longitude
    # previously used -74 as lower bound (vs -74.3 for the pickup), which
    # discarded every dropoff between the two values.
    lat_min, lat_max = 40, 42
    lon_min, lon_max = -74.3, -72.9

    df = df.drop(["key"], axis=1)
    df = df.dropna(how='any', axis='rows')
    df = df[(df.dropoff_latitude != 0) | (df.dropoff_longitude != 0)]
    df = df[(df.pickup_latitude != 0) | (df.pickup_longitude != 0)]
    if "fare_amount" in df.columns:
        df = df[df.fare_amount.between(0, 4000)]
    df = df[(df.passenger_count >= 0) & (df.passenger_count < 8)]
    for lat_col, lon_col in (("pickup_latitude", "pickup_longitude"),
                             ("dropoff_latitude", "dropoff_longitude")):
        df = df[df[lat_col].between(left=lat_min, right=lat_max)]
        df = df[df[lon_col].between(left=lon_min, right=lon_max)]
    return df
data_train = clean_data(data_train)
data_train.head()

### PREPARING MODEL INPUTS

In [None]:
X_train = data_train.drop(["fare_amount"], axis=1)
y_train = data_train["fare_amount"]

### **PIPELINES**

#### TIME FEATURES

In [None]:
#class for time features encoding

class TimeFeaturesEncoder(BaseEstimator, TransformerMixin):
    """Derive day of week, hour, month and year from a timestamp column.

    The timestamps in ``time_column`` are parsed with ``pd.to_datetime`` and
    converted to ``time_zone_name`` before the features are read.
    """

    def __init__(self, time_column, time_zone_name='America/New_York'):
        self.time_column = time_column
        self.time_zone_name = time_zone_name

    def extract_time_features(self, X):
        """Return a copy of ``X`` indexed by local pickup time with the columns
        'dow', 'hour', 'month' and 'year' added."""
        enriched = X.copy()
        enriched.index = pd.to_datetime(enriched[self.time_column])
        enriched.index = enriched.index.tz_convert(self.time_zone_name)
        local_times = enriched.index
        for feature_name in ("dow", "hour", "month", "year"):
            # The 'dow' feature is exposed by pandas as `weekday` (Monday=0).
            source_attr = "weekday" if feature_name == "dow" else feature_name
            enriched[feature_name] = getattr(local_times, source_attr)
        return enriched

    def fit(self, X, y=None):
        # Nothing to learn; present to satisfy the transformer interface.
        return self

    def transform(self, X, y=None):
        """Return only the four time features, with a fresh 0..n-1 index."""
        feature_columns = ['dow', 'hour', 'month', 'year']
        with_time = self.extract_time_features(X)
        return with_time[feature_columns].reset_index(drop=True)

In [None]:
#test of the TimeFeaturesEncoder

time_enc = TimeFeaturesEncoder('pickup_datetime')
time_features = time_enc.fit_transform(X_train, y_train)
time_features.head()

#### PIPELINE FOR TIME FEATURES

In [None]:
# TIME PIPELINE
# The encoder must return a dense array. Its dense-output flag is called
# `sparse_output` from scikit-learn 1.2 onward; the older `sparse` spelling was
# deprecated and then removed, so try the new name first and fall back for
# older installations.
try:
    time_ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    time_ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

pipe_time = Pipeline([
    ('time_features_create', TimeFeaturesEncoder('pickup_datetime')),
    ('time_features_ohe', time_ohe)
])
pipe_time

#### DISTANCES

In [None]:
# Helper function for distance encoding (haversine distance in km)

def haversine_vectorized(df,
         start_lat="pickup_latitude",
         start_lon="pickup_longitude",
         end_lat="dropoff_latitude",
         end_lon="dropoff_longitude"):
    """Row-wise haversine distance in kilometres (Earth radius 6371 km).

    The four arguments name the columns of ``df`` that hold the start and end
    coordinates; values are cast to float and taken to be decimal degrees.
    """
    def column_in_radians(column):
        return np.radians(df[column].astype(float))

    lat_from, lon_from = column_in_radians(start_lat), column_in_radians(start_lon)
    lat_to, lon_to = column_in_radians(end_lat), column_in_radians(end_lon)

    lon_gap = lon_to - lon_from
    lat_gap = lat_to - lat_from

    hav = np.sin(lat_gap / 2.0) ** 2 + np.cos(lat_from) * np.cos(lat_to) * np.sin(lon_gap / 2.0) ** 2
    angle = 2 * np.arcsin(np.sqrt(hav))
    return 6371 * angle

In [None]:
class DistanceTransformer(BaseEstimator, TransformerMixin):
    """Turn start/end coordinate columns into a single haversine distance column.

    Parameters
    ----------
    start_lat, start_lon, end_lat, end_lon : str
        Names of the columns in ``X`` holding the start and end coordinates.
    """

    def __init__(self,
                 start_lat="pickup_latitude",
                 start_lon="pickup_longitude",
                 end_lat="dropoff_latitude",
                 end_lon="dropoff_longitude"):
        self.start_lat = start_lat
        self.start_lon = start_lon
        self.end_lat = end_lat
        self.end_lon = end_lon

    def fit(self, X, y=None):
        # Nothing to learn; present to satisfy the transformer interface.
        return self

    def transform(self, X, y=None):
        """Return a one-column DataFrame, "course distance [km]", indexed like ``X``."""
        # Forward the configured column names. Calling the helper with only X
        # would always use its default column names and ignore __init__.
        distances = haversine_vectorized(
            X,
            start_lat=self.start_lat,
            start_lon=self.start_lon,
            end_lat=self.end_lat,
            end_lon=self.end_lon,
        )
        return pd.DataFrame({"course distance [km]": distances})

In [None]:
# Test of DistanceTransformer

dist_trans = DistanceTransformer()
distance = dist_trans.fit_transform(X_train, y_train)
distance.head()

### COMBINATION OF DISTANCE AND TIME FEATURES PIPELINE

In [None]:
preprocessor = ColumnTransformer([
    ('distance', DistanceTransformer(), ['pickup_longitude', 'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude']),
    ('time', pipe_time, ['pickup_datetime'])
], remainder='passthrough')
preprocessor

### FULL PREPROCESSOR PIPELINE

In [None]:
pipe_prepro = Pipeline([
    ('dist_and_time', preprocessor),
    ('scaler', MinMaxScaler())
])
pipe_prepro

### FULL PIPELINE (Random Forest Regressor)

In [None]:
final_pipe = Pipeline([
    ('preprocessor', pipe_prepro),
    ('model', RandomForestRegressor())
])
final_pipe

## Definition of custom score for RMSE

In [None]:
def custom_rmse(y_true, y_pred):
    """Root mean squared error between the true and the predicted values."""
    residuals = y_true - y_pred
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

In [None]:
rmse = make_scorer(custom_rmse, greater_is_better=False)

### Baseline RMSE for RandomForestRegressor

In [None]:
baseline = cross_validate(final_pipe,
                          X_train,
                          y_train,
                          scoring=rmse,
                          cv=5)
baseline_rmse = -round(baseline["test_score"].mean(), 3)
baseline_rmse

### RandomizedSearchCV for an Optimised Model

In [None]:
# Search space: forest size, tree depth, rows drawn per tree, and the scaler
# used at the end of the preprocessing pipeline.
grid_RFR = {'model__n_estimators': stats.randint(1, 300),
            'model__max_depth': stats.randint(1, 300),
            'model__max_samples': stats.randint(1, 300),
            "preprocessor__scaler": [StandardScaler(), RobustScaler(), MinMaxScaler()]
            }

# random_state fixes which 100 candidates are sampled, so the same
# configurations are tried on every run. The forest itself is still unseeded,
# so the scores can vary slightly between runs.
search_RFR = RandomizedSearchCV(final_pipe,
                                grid_RFR,
                                scoring=rmse,
                                n_iter=100,
                                cv=5,
                                n_jobs=-1,
                                verbose=True,
                                random_state=42)
search_RFR.fit(X_train, y_train);

In [None]:
search_RFR.best_params_

In [None]:
print("Tuned RandomForestRegressor model rmse: " + str(-round(search_RFR.best_score_, 2)))