# New York Trip Duration Prediction

## 1. Importation des librairies

In [None]:
import os
from pathlib import Path

import datetime as dt
import math
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
sns.set({'figure.figsize':(16,10)})
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit
import matplotlib.pyplot as plt
%matplotlib inline



## 2. Chargement et exploration des données

In [None]:
# Print the entries found in the ../input directory.
print(os.listdir("../input"))

In [None]:
# Load the training set, the test set and the sample submission file
# from the dataset directory into DataFrames.
train = pd.read_csv('../input/dataset-de-landry/train.csv')
test = pd.read_csv('../input/dataset-de-landry/test.csv')
sample = pd.read_csv('../input/dataset-de-landry/sample_submission.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Print the DataFrame summary produced by DataFrame.info().
train.info()

In [None]:
# Preview the last rows of the training data.
train.tail()

In [None]:
# Show the dtype of every column.
train.dtypes

- VALEURS MANQUANTES

In [None]:
# Count the missing values in each column.
train.isna().sum()

- Notre Dataset est propre, pas de valeurs manquantes
- C'est un bon point pour pouvoir faire les prédictions

In [None]:
# Smallest trip_duration present in the training data.
train.trip_duration.min()

In [None]:
# Largest trip_duration present in the training data.
train.trip_duration.max()

- De 1 seconde à 3526282 secondes : on peut soupçonner la présence d'outliers.
- On doit supprimer ces outliers pour obtenir de meilleures prédictions.

In [None]:
# Scatter plot of the pickup coordinates, with the view restricted to
# longitudes in [-74.1, -73.7] and latitudes in [40.6, 40.9].
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(12, 10))
ax.set_ylim(40.6, 40.9)
ax.set_xlim(-74.1, -73.7)
# Very small markers so that individual points stay distinguishable.
ax.scatter(
    train['pickup_longitude'],
    train['pickup_latitude'],
    s=0.0002,
    alpha=1,
)

## 3. Gestion des outliers

In [None]:
# Draw one boxplot per column, stacked vertically on a shared x-axis, to
# look for outliers.
outlier_columns = [
    "vendor_id",
    "passenger_count",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "trip_duration",
]
fig, ax = plt.subplots(len(outlier_columns), sharex=True)
for axis, column in zip(ax, outlier_columns):
    # Pass the values explicitly as `x=`: the meaning of the first positional
    # argument of sns.boxplot differs between seaborn versions, so the
    # keyword keeps the boxes horizontal everywhere.
    sns.boxplot(x=train[column], ax=axis, width=1.5)
    # NOTE(review): a log axis cannot display non-positive values; the
    # longitude columns look negative (see the -74 range used elsewhere in
    # this notebook) -- confirm this scale is what is wanted.
    axis.set_xscale("log")
    axis.set_xlabel("")
    axis.set_ylabel(column, fontsize=15, rotation=45)
fig.suptitle('Analyse des outliers', fontsize=20)

- Histogramme des différentes durées de voyage

In [None]:
# Histogram (120 bins) of the trip durations below 4000.
train.loc[train.trip_duration<4000,"trip_duration"].hist(bins=120)

- Le filtrage appliqué plus bas ne conserve que les durées strictement comprises entre 60 et 3600 secondes.
- Les courses de durée nulle ou quasi nulle (par exemple une course annulée) sont donc écartées.

In [None]:
# Keep only the rows whose passenger_count is strictly between 0 and 9.
passenger_mask = (train['passenger_count'] > 0) & (train['passenger_count'] < 9)
train = train[passenger_mask]

In [None]:


# Keep only the trips whose duration is strictly between 60 and 3600.
# .copy() detaches the result from the original frame so that the column
# assignments below act on a real DataFrame instead of a slice (this avoids
# pandas' SettingWithCopyWarning).
duration_mask = (train['trip_duration'] > 60) & (train['trip_duration'] < 3600)
train = train[duration_mask].copy()

# Replace the target by its natural logarithm.
train['trip_duration'] = np.log(train['trip_duration'])

# Pickup hour: first two characters of the second whitespace-separated token
# of pickup_datetime, converted to int. The vectorized .str accessor
# performs the same parsing as a row-wise apply.
train['hour'] = train['pickup_datetime'].str.split().str[1].str[:2].astype(int)
test['hour'] = test['pickup_datetime'].str.split().str[1].str[:2].astype(int)

# Coordinate outliers: drop rows falling outside these one-sided bounds.
coordinate_mask = (
    (train['pickup_longitude'] > -80)
    & (train['pickup_latitude'] < 44)
    & (train['dropoff_longitude'] > -90)
    & (train['dropoff_latitude'] > 34)
)
# Copy again so that feature columns added later are written to an
# independent frame.
train = train.loc[coordinate_mask].copy()

- Estimation des distances des courses

In [None]:
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance, in meters, between two points.

    The points are given as (lat1, lon1) and (lat2, lon2) in decimal degrees.
    Scalars, sequences, NumPy arrays and pandas Series are all accepted;
    array-like inputs are processed element-wise following NumPy
    broadcasting, so the function can be applied to whole DataFrame columns.

    Returns a NumPy float for scalar inputs, otherwise an array-like with the
    broadcast shape of the inputs.
    """
    R = 6372800  # Earth radius in meters
    phi1, phi2 = np.radians(lat1), np.radians(lat2)
    dphi = np.radians(np.subtract(lat2, lat1))
    dlambda = np.radians(np.subtract(lon2, lon1))

    a = np.sin(dphi / 2) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(dlambda / 2) ** 2
    # Floating-point rounding can push `a` marginally outside [0, 1] (e.g. for
    # antipodal points), which would make sqrt(1 - a) invalid; clamp it.
    a = np.clip(a, 0.0, 1.0)

    return 2 * R * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

# Add the same distance features to both frames: signed coordinate
# differences (pickup minus dropoff) and their Euclidean norm, all expressed
# in the units of the coordinate columns.
for frame in (train, test):
    frame['dist_long'] = frame['pickup_longitude'] - frame['dropoff_longitude']
    frame['dist_lat'] = frame['pickup_latitude'] - frame['dropoff_latitude']
    frame['dist'] = np.sqrt(np.square(frame['dist_long']) + np.square(frame['dist_lat']))

# trip_duration was replaced by its natural logarithm earlier in the
# notebook, so exponentiate it to divide by the actual duration rather than
# by its log.
train['speed'] = 100000 * train['dist'] / np.exp(train['trip_duration'])

- On va supprimer tous les outliers,
- Distances nulles

In [None]:
# Count the null values per column after filtering and feature creation.
train.isnull().sum()

## 4. DataFrame nettoyé

In [None]:
# Names of the columns that exist in train but not in test.
train_only_columns = set(train.columns) - set(test.columns)
col_diff = list(train_only_columns)

# Preview the cleaned training frame.
train.head()

In [None]:
# Feature columns shared by the training and test matrices.
feature_columns = [
    "vendor_id",
    "passenger_count",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "dist",
    "hour",
]

# Target and features used for fitting.
y_train = train["trip_duration"]
X_train = train[feature_columns]

# Same features, taken from the test set, for the final predictions.
X_datatest = test[feature_columns]

In [None]:
# Remove, in place, the engineered columns and every column absent from test.
columns_to_drop = ['speed', 'dist', 'hour'] + col_diff
train.drop(columns=columns_to_drop, inplace=True)

## 5. Sélection des modèles

In [None]:
# Hold out 10% of the rows as a validation split; random_state fixes the
# shuffle so the split is reproducible.
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size = 0.1, random_state=42)

In [None]:
#rfr = RandomForestRegressor(n_estimators=200,min_samples_leaf=5, min_samples_split=15, max_depth=80,verbose=0,max_features="auto",n_jobs=-1)
#rfr.fit(X_train, y_train)

In [None]:
# Somewhat slow to run.
# Compute the model's cross-validation scores over splits of the training set.
# cv_scores = cross_val_score(rfr, X_train, y_train, cv=5, scoring= 'neg_mean_squared_log_error')

In [None]:
# cv_scores

In [None]:
#for i in range(len(cv_scores)):
#    cv_scores[i] = np.sqrt(abs(cv_scores[i]))
#print(np.mean(cv_scores))

## xgb parameters
# Training parameters passed to xgb.train.
params = {
    'booster':            'gbtree',
    # 'reg:squarederror' is the current name of the squared-error regression
    # objective; 'reg:linear' is its deprecated alias.
    'objective':          'reg:squarederror',
    'learning_rate':      0.1,
    'max_depth':          14,
    'subsample':          0.8,
    'colsample_bytree':   0.7,
    'colsample_bylevel':  0.7,
    # 'verbosity': 0 (silent) replaces the deprecated 'silent': 1 flag.
    'verbosity':          0,
}

In [None]:
# Number of boosting rounds.
nrounds = 1200

# The model is fitted on log(y + 1) of the training target.
log_target = np.log(y_train + 1)
dtrain = xgb.DMatrix(X_train, label=log_target)

gbm = xgb.train(params, dtrain, num_boost_round=nrounds)

In [None]:
#train_pred = rfr.predict(X_datatest)
# Predict on the test features, then undo the log(y + 1) transform that was
# applied to the target at training time.
dtest = xgb.DMatrix(X_datatest)
raw_pred = gbm.predict(dtest)
train_pred = np.exp(raw_pred) - 1

In [None]:
# Display the predictions computed for the test set.
train_pred

In [None]:
# Number of predictions produced.
len(train_pred)

In [None]:
# Number of rows in the sample submission, for comparison with the prediction count.
sample.shape[0]

In [None]:
# Assemble the submission: one row per test id. The exponential undoes the
# np.log applied to trip_duration earlier in the notebook.
submission_columns = {
    "id": test["id"],
    "trip_duration": np.exp(train_pred),
}
my_submission = pd.DataFrame(submission_columns)
my_submission.head()


## 6. Export du fichier de soumission

In [None]:


# Write the submission to submission.csv without the index column,
# then preview its first rows.
my_submission.to_csv('submission.csv', index=False)
my_submission.head()