Library imports
----------

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# Tools
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_theme(style="darkgrid")
import matplotlib
import matplotlib.pyplot as plt
import math
import shap
import folium
from folium.plugins import MarkerCluster
import plotly.express as px
import json
import geopandas as gpd
from shapely.geometry import Polygon, Point

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error,mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor
from sklearn import neighbors
from sklearn.metrics import make_scorer


# Optimizing models
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,train_test_split

# Read the zipped competition CSVs straight from the Kaggle input directory.
df_train = pd.read_csv(
    "/kaggle/input/nyc-taxi-trip-duration/train.zip",
    compression="zip",
)
# `data` is a second name bound to the very same training frame object.
data = df_train
df_test = pd.read_csv(
    "/kaggle/input/nyc-taxi-trip-duration/test.zip",
    compression="zip",
)

# Data visualization/Analysis

Let's inspect the datasets: general information, NaN values, column types, etc.

## Basic analysis

In [None]:
# Preview the first rows of the test set to see which columns it provides.
df_test.head()

In [None]:
# Summary statistics (count, mean, std, quantiles) of the numeric training columns.
df_train.describe()

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly: wrapping it in print() would append a stray "None" to the output.
df_train.info()
print()
# Number of missing values per column of the training frame.
print("Dataframe train NaN values : \n {}".format(df_train.isnull().sum()))

In [None]:
# One histogram per plottable column, with the `id` column left out.
# `columns=` is used because DataFrame.drop no longer accepts `axis` as a
# positional argument (removed in pandas 2.0).
df_train.drop(columns=['id']).hist(bins=50, figsize=(20, 15))
plt.show()

## Drop outliers

In [None]:
# Longitude / latitude bounding box (degrees) that a trip must stay inside.
LON_MIN, LON_MAX = -74.05, -73.75
LAT_MIN, LAT_MAX = 40.550, 40.95


def _inside_bbox(df):
    """Return a boolean mask of the rows whose pickup AND dropoff are inside the box."""
    return (
        df['pickup_longitude'].between(LON_MIN, LON_MAX)
        & df['pickup_latitude'].between(LAT_MIN, LAT_MAX)
        & df['dropoff_longitude'].between(LON_MIN, LON_MAX)
        & df['dropoff_latitude'].between(LAT_MIN, LAT_MAX)
    )


# Training rows only: keep trips of at most 5400 s and trips inside the box.
# .copy() detaches the result from the original frame, so the feature columns
# added in later cells do not trigger SettingWithCopyWarning.
df_train = df_train[df_train['trip_duration'].between(-1, 5400)]
df_train = df_train[_inside_bbox(df_train)].copy()

# The test set is deliberately NOT filtered: the submission is built from
# df_test['id'], so any test row removed here would be missing from the
# submitted file.

df_train.describe()


## Pickup/Dropoff localization analysis

In [None]:
# Pickup and dropoff coordinates, first side by side, then overlaid.
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
fig.suptitle('Pickup-Dropoff repartition')

ax_pickup, ax_dropoff = axes
point_style = dict(s=2, data=df_train)
sns.scatterplot(ax=ax_pickup, x="pickup_longitude", y="pickup_latitude", label="pickup", **point_style)
sns.scatterplot(ax=ax_dropoff, x="dropoff_longitude", y="dropoff_latitude", label="dropoff", **point_style)

# Single large figure with both point clouds (dropoffs are drawn first).
plt.figure(figsize=(20,20))
markers = {"Dropoff": "X","Pickup": "o"}
overlay_style = dict(markers=markers, **point_style)
sns.scatterplot(x="dropoff_longitude", y="dropoff_latitude", label="dropoff", **overlay_style)
sns.scatterplot(x="pickup_longitude", y="pickup_latitude", label="pickup", **overlay_style)



## Pickup/Dropoff Date time analysis

In [None]:
# Parse the pickup timestamp of both frames so the `.dt` accessor can be used on it.
for frame in (df_train, df_test):
    frame['pickup_datetime'] = pd.to_datetime(frame['pickup_datetime'])

df_train['pickup_datetime'].head()

Use the `.dt` accessor (`dt.time`, `dt.hour`, `dt.year`, etc.) to extract the individual components of the dates (see the pandas documentation).

In [None]:
## Analyzing date pickup-informations

def _add_pickup_date_parts(df):
    """Add `pu_month`, `pu_day`, `pu_dayweek` and `pu_hour`, all read from `pickup_datetime`."""
    pickup = df['pickup_datetime'].dt
    df['pu_month'] = pickup.month
    df['pu_day'] = pickup.day
    df['pu_dayweek'] = pickup.dayofweek  # 0 -> Monday, 6 -> Sunday
    df['pu_hour'] = pickup.hour


# The same derived columns are created on both frames.
for frame in (df_train, df_test):
    _add_pickup_date_parts(frame)

# Year and week number are only plotted, so they live in separate one-column
# frames; that column keeps the name 'pickup_datetime', which the plots use.
date_year = pd.DataFrame(df_train['pickup_datetime'].dt.year)
# `Series.dt.week` was removed in pandas 2.0; `dt.isocalendar().week` is its
# replacement. It is cast back to int64 and renamed so the frame looks exactly
# like the one the old accessor produced.
date_week = pd.DataFrame(
    df_train['pickup_datetime'].dt.isocalendar().week.astype('int64').rename('pickup_datetime')
)

# Plot: one count plot per date component, filled row by row.
fig, axes = plt.subplots(2, 3, figsize=(20, 10))
fig.suptitle('Date pickup-information analysis')

panels = [
    ('pickup_datetime', date_year),
    ('pu_month', df_train),
    ('pu_day', df_train),
    ('pu_dayweek', df_train),
    ('pu_hour', df_train),
    ('pickup_datetime', date_week),
]
for ax, (column, frame) in zip(axes.flat, panels):
    sns.countplot(ax=ax, x=column, data=frame)

print(date_week.describe())




In [None]:
# Number of pickups per date component, split by vendor.
fig, axes = plt.subplots(2, 2, figsize=(20, 10))
fig.suptitle('Correlation analytics')

(ax_month, ax_day), (ax_hour, ax_dayweek) = axes
by_vendor = dict(hue='vendor_id', data=df_train)
sns.countplot(ax=ax_month, x='pu_month', **by_vendor)
sns.countplot(ax=ax_day, x='pu_day', **by_vendor)
sns.countplot(ax=ax_hour, x='pu_hour', **by_vendor)
sns.countplot(ax=ax_dayweek, x='pu_dayweek', **by_vendor)

## Trip duration relations

Let's cap the trip duration at 5400 seconds (1 h 30 min).

In [None]:
# The box and violin plots only use trips shorter than 2000 s; the subset is
# built once and shared by those four panels.
short_trips = df_train[df_train['trip_duration'] < 2000]

fig, axes = plt.subplots(3, 2, figsize=(20, 20))
fig.suptitle('Correlation analytics')

# Row 1: full training frame.
sns.countplot(ax=axes[0, 0], x='passenger_count', hue='vendor_id', data=df_train)
sns.histplot(ax=axes[0, 1], x='trip_duration', bins=100, kde=True, data=df_train)
# Rows 2-3: short trips only.
sns.boxplot(ax=axes[1, 0], x='pu_dayweek', y='trip_duration', data=short_trips)
sns.boxplot(ax=axes[1, 1], x='pu_hour', y='trip_duration', data=short_trips)
sns.violinplot(ax=axes[2, 0], x='vendor_id', y='trip_duration', data=short_trips)
sns.violinplot(ax=axes[2, 1], x='pu_dayweek', y='trip_duration', hue='vendor_id', data=short_trips)

# Creating features

**Data which will be used**
* Distance between pickup and dropoff, computed from their coordinates
* Trip speed, derived from that distance and the trip duration
* Neighborhood location: Manhattan, Queens, Brooklyn
* Peak hours: evening, morning, etc., using the pickup hour
* Peak days: start/middle/end of the week
* Trip duration vs. vendor id: is there a relation?
* passenger_count vs. distance/speed/duration: is there a relation?

**Data which will not be used**
* Year -> 2016
* Month -> approximately no difference
* `store_and_fwd_flag`


## Great-circle distance using the Haversine formula

In [None]:
def haversine(row):
    """Haversine (great-circle) distance between a row's pickup and dropoff points.

    Parameters
    ----------
    row : mapping
        Must provide 'pickup_latitude', 'pickup_longitude', 'dropoff_latitude'
        and 'dropoff_longitude', expressed in degrees.

    Returns
    -------
    Distance in the unit of the sphere radius used below (6371).
    """
    sphere_radius = 6371

    def to_radians(degrees):
        # Degrees -> radians.
        return degrees * math.pi / 180

    phi_start = to_radians(row['pickup_latitude'])
    phi_end = to_radians(row['dropoff_latitude'])
    lam_start = to_radians(row['pickup_longitude'])
    lam_end = to_radians(row['dropoff_longitude'])

    delta_phi = phi_end - phi_start
    delta_lam = lam_end - lam_start
    # Haversine term: sin^2(dphi/2) + cos(phi1) * cos(phi2) * sin^2(dlam/2)
    hav = (
        np.sin(delta_phi * 0.5) ** 2
        + np.cos(phi_start) * np.cos(phi_end) * np.sin(delta_lam * 0.5) ** 2
    )
    return sphere_radius * 2 * np.arcsin(np.sqrt(hav))

# Add the distance column to both frames.
# haversine() only indexes its argument by column name and applies arithmetic
# and NumPy functions, so it also works on whole columns: passing the frame
# computes every distance in one vectorised pass instead of one Python call
# per row through DataFrame.apply(axis=1).
df_train['distance'] = haversine(df_train)
df_test['distance'] = haversine(df_test)

df_train.head()

In [None]:
# Summary statistics again, now including the freshly added `distance` column.
df_train.describe()

In [None]:
# Distribution of the pickup-to-dropoff distance over the training trips.
sns.histplot(data=df_train, x='distance', bins=100)

## Important geographical zones

### Airport

In [None]:
# Longitude / latitude bounding box (degrees) of the airport zone.
AIRPORT_LON = (-73.82, -73.77)
AIRPORT_LAT = (40.63, 40.66)


def _in_airport_zone(df, prefix):
    """Boolean mask: is the `prefix` ('pickup' or 'dropoff') point inside the airport box?"""
    return (
        df[f'{prefix}_longitude'].between(*AIRPORT_LON)
        & df[f'{prefix}_latitude'].between(*AIRPORT_LAT)
    )


# Same three flags on both frames.
for frame in (df_train, df_test):
    frame['pu_airport'] = _in_airport_zone(frame, 'pickup')
    frame['do_airport'] = _in_airport_zone(frame, 'dropoff')
    # A trip counts as an airport trip only when both ends are inside the zone.
    frame['trip_airport'] = frame['pu_airport'] & frame['do_airport']

fig, axes = plt.subplots(3, 2, figsize=(20, 20))
fig.suptitle('Correlation analytics')
sns.countplot(ax=axes[0, 0], x='pu_airport', data=df_train)
# The scatter plots reuse the flags as row filters. No `markers` mapping is
# passed: seaborn only applies it together with a `style` variable, so it had
# no effect and merely tied this cell to a dict defined in an earlier one.
sns.scatterplot(ax=axes[0, 1], x="pickup_longitude", y="pickup_latitude", label="pickup airport", s=2, data=df_train[df_train['pu_airport']])
sns.boxplot(ax=axes[1, 0], x='trip_airport', y='trip_duration', data=df_train)
sns.scatterplot(ax=axes[1, 1], x="dropoff_longitude", y="dropoff_latitude", label="dropoff airport", s=2, data=df_train[df_train['do_airport']])
sns.boxplot(ax=axes[2, 0], x='pu_airport', y='trip_duration', data=df_train)
sns.boxplot(ax=axes[2, 1], x='do_airport', y='trip_duration', data=df_train)
plt.show()


### Borough

In [None]:
# Vertices of the polygon used as the Manhattan zone, written as
# (latitude, longitude) pairs.
manathan_coords = [
    (40.70, -74.02),
    (40.8, -74),
    (40.85, -73.95),
    (40.82, -73.92),
    (40.76, -73.94),
    (40.70, -73.98),
]
manathan = Polygon(manathan_coords)

def pickup_within_manathan(row):
    """Return True when the row's pickup point is within the `manathan` polygon.

    The point is built as (latitude, longitude), i.e. in the same axis order
    as the polygon's vertices.
    """
    pickup = Point(row['pickup_latitude'], row['pickup_longitude'])
    # `within` already yields the truth value; no if/else (or trailing
    # unreachable return) is needed.
    return bool(pickup.within(manathan))

def dropoff_within_manathan(row):
    """Return True when the row's dropoff point is within the `manathan` polygon.

    The point is built as (latitude, longitude), i.e. in the same axis order
    as the polygon's vertices.
    """
    dropoff = Point(row['dropoff_latitude'], row['dropoff_longitude'])
    # `within` already yields the truth value; no if/else (or trailing
    # unreachable return) is needed.
    return bool(dropoff.within(manathan))

def trip_within_manathan(row):
    """Return True when both the pickup and the dropoff point are within `manathan`.

    Delegates to the two single-point checks so the point construction lives
    in one place; `and` short-circuits, so the dropoff is only tested when the
    pickup is inside.
    """
    return pickup_within_manathan(row) and dropoff_within_manathan(row)

# Flag, on both frames, whether each end of the trip is inside the polygon.
for frame in (df_train, df_test):
    frame['pu_manathan'] = frame.apply(pickup_within_manathan, axis=1)
    frame['do_manathan'] = frame.apply(dropoff_within_manathan, axis=1)
    # "Both ends inside" is exactly the AND of the two flags just computed;
    # combining them avoids a third row-by-row pass that would rebuild and
    # re-test the same two points for every trip.
    frame['trip_manathan'] = frame['pu_manathan'] & frame['do_manathan']

In [None]:
# Trip duration against each Manhattan flag, plus the count of in-zone trips.
fig, axes = plt.subplots(2, 2, figsize=(20, 20))
fig.suptitle('Borough analytics')

(ax_pickup, ax_dropoff), (ax_trip, ax_count) = axes
duration_of = dict(y='trip_duration', data=df_train)
sns.boxplot(ax=ax_pickup, x='pu_manathan', **duration_of)
sns.boxplot(ax=ax_dropoff, x='do_manathan', **duration_of)
sns.boxplot(ax=ax_trip, x='trip_manathan', **duration_of)
sns.countplot(ax=ax_count, x='trip_manathan', data=df_train)


In [None]:
def airport2manathan(row):
    """Tell whether a trip links the airport zone and Manhattan, in either direction.

    Parameters
    ----------
    row : mapping (e.g. one DataFrame row)
        Must provide the boolean flags 'pu_airport', 'do_airport',
        'pu_manathan' and 'do_manathan'.

    Returns
    -------
    bool
        True for airport -> Manhattan or Manhattan -> airport trips.
    """
    airport_to_manathan = row['pu_airport'] and row['do_manathan']
    manathan_to_airport = row['do_airport'] and row['pu_manathan']
    # bool() guarantees a plain Python bool whatever boolean type the row holds.
    return bool(airport_to_manathan or manathan_to_airport)

# Flag airport <-> Manhattan trips on both frames.
for frame in (df_train, df_test):
    frame['airport_manathan'] = frame.apply(airport2manathan, axis=1)


# Duration and frequency of those trips in the training data.
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
fig.suptitle('Borough analytics')
ax_duration, ax_count = axes
sns.boxplot(ax=ax_duration, x='airport_manathan', y='trip_duration', data=df_train)
sns.countplot(ax=ax_count, x='airport_manathan', data=df_train)


# Dataframe preparation

In [None]:
# Preview the test frame with the feature columns added in the cells above.
df_test.head()

## Dropping unused columns

In [None]:
# Show the columns currently present in the test frame before any are dropped.
df_test.head()

In [None]:
# Columns removed from both frames before modelling.
shared_unused = [
    'id',
    'pickup_datetime',
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
    'store_and_fwd_flag',
]

# `dropoff_datetime` is dropped from the training frame only.
df_train = df_train.drop(columns=shared_unused + ['dropoff_datetime'])

# Keep the test ids aside before dropping them: the submission file needs them.
id_submission = df_test['id']
df_test = df_test.drop(columns=shared_unused)


df_train.head()

## bool to binary values

In [None]:
# Column dtypes after the drop; the boolean flag columns are converted to integers next.
df_train.info()

In [None]:
# Cast every boolean flag column of the training frame to 0/1 integers.
bool_cols = df_train.select_dtypes(include='bool').columns
df_train = df_train.astype({flag: int for flag in bool_cols})

# Same conversion for the test frame.
bool_cols_test = df_test.select_dtypes(include='bool').columns
df_test = df_test.astype({flag: int for flag in bool_cols_test})

df_train.head()

## Test samples creation & scalability

In [None]:
# Features are every column except the target.
# `columns=` is used because DataFrame.drop no longer accepts `axis` as a
# positional argument (removed in pandas 2.0).
X = df_train.drop(columns=["trip_duration"])
Y = df_train["trip_duration"]
# Hold out 20 % of the training data for evaluation; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)
x_train.describe()

# Model processing

## Model creation

In [None]:
# Model creation
# Alternatives tried earlier and left disabled:
#   RandomForestRegressor(n_jobs=-1)
#   LGBMRegressor(n_jobs=-1, random_state=42)
#   neighbors.KNeighborsRegressor(n_neighbors=5)
lgbm_params = dict(
    n_jobs=-1,
    random_state=42,
    n_estimators=500,
    max_depth=30,
    num_leaves=64,
)
model_optimized = LGBMRegressor(**lgbm_params)
model_optimized.fit(x_train, y_train)

### GridSearchCV : optimizing model and cross validation

In [None]:
#def rmsle(y_test, y_pred): #score function
#    y_pred_without_outliers = np.where(y_pred < 0, 0, y_pred)
#    return np.sqrt(mean_squared_log_error(y_test, y_pred_without_outliers))

#scorer = make_scorer(rmsle, greater_is_better=False)

#parameters ={
#    'max_depth':[30, 48, 64],
#    'n_estimators':[500,1000, 1500], 
#    'num_leaves':[64, 128, 256],
#    'boosting':['gbdt']
#}

#clf = GridSearchCV(model,parameters, scoring=scorer, cv=3, refit=True)
#clf.fit(x_train, y_train)


### Model training and predictions

In [None]:
# Disabled grid-search follow-up (only meaningful when `clf` has been fitted):
#   print('best parameters found : ',clf.best_params_)
#   model_optimized = clf.best_estimator_
#   print('optimized model : ', model_optimized)

def _floor_at_zero(values):
    """Replace every negative prediction by 0 and leave the others unchanged."""
    return np.where(values < 0, 0, values)


y_pred = model_optimized.predict(x_test)        # hold-out predictions, used for scoring
y_submission = model_optimized.predict(df_test)  # predictions for the submission file

# Negative predictions are clamped to 0 before being scored or submitted.
print('check negative values : ', y_pred.min())
y_pred_without_outliers = _floor_at_zero(y_pred)
y_submission_without_outliers = _floor_at_zero(y_submission)

### Check overfitting/underfitting

In [None]:
# Predict on the training features themselves; the resulting error is computed
# in the next cell so it can be compared with the hold-out error.
x_pred = model_optimized.predict(x_train) #predictions

# Clamp negative predictions to 0 (the printed minimum shows whether any exist).
print('check negative values : ', x_pred.min())
x_pred_without_outliers = np.where(x_pred < 0, 0, x_pred)

In [None]:
# RMSLE on the training split. Arguments follow scikit-learn's
# (y_true, y_pred) order: ground truth first, predictions second.
RMLSE_train = np.sqrt(mean_squared_log_error(y_train, x_pred_without_outliers))
print('Root mean logarithm squared error : ',RMLSE_train)

## Model performance

In [None]:
# Hold-out metrics. Every call uses scikit-learn's (y_true, y_pred) argument
# order, so the three metrics are written consistently.
ABS = mean_absolute_error(y_test, y_pred_without_outliers)
RMSE = np.sqrt(mean_squared_error(y_test, y_pred_without_outliers))
RMLSE = np.sqrt(mean_squared_log_error(y_test, y_pred_without_outliers))

print('Mean absolute error : ',ABS)
print('Root mean squared error : ',RMSE)
print('Root mean logarithm squared error : ',RMLSE)

## Feature importance

In [None]:
# Pair each importance score with its feature name, then rank from most to
# least important for the bar chart.
importance_pairs = sorted(zip(model_optimized.feature_importances_, X.columns))
feature_imp = pd.DataFrame(importance_pairs, columns=['Value', 'Feature'])
ranked_features = feature_imp.sort_values(by="Value", ascending=False)

plt.figure(figsize=(20, 10))
sns.barplot(x="Value", y="Feature", data=ranked_features)
plt.title('LightGBM Features')

# Submission

In [None]:
filename = 'NYC_sub.csv'

# One row per kept test id with its clamped predicted duration.
submission = pd.DataFrame({
    'id': id_submission,
    'trip_duration': y_submission_without_outliers,
})

# First 5 rows (not displayed, since this is not the last expression of the cell).
submission.head()

submission.to_csv(filename, index=False)

print(f'Saved file: {filename}')

