### Library imports

In [None]:
import os

import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import warnings

from datetime import datetime

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply the seaborn theme together with a default figure size and
# title/label font sizes.
# NOTE(review): the dict is passed positionally, i.e. as seaborn's first
# parameter (`context`); `sns.set(rc={...})` may be the intended spelling —
# confirm before changing, since it would also apply the default context.
sns.set({'figure.figsize':(10,6), 'axes.titlesize':20, 'axes.labelsize':8})

## 1 - Data loading

In [None]:
# Load the training set from the relative input directory and preview it.
df_train = pd.read_csv('../input/train.csv')
df_train.head()

In [None]:
# Load the test set from the relative input directory and preview it.
df_test = pd.read_csv('../input/test.csv')
df_test.head()

## 2 - Data exploration

In [None]:
# Print column names, dtypes and non-null counts of the training set.
df_train.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df_train.describe()

## 3 - Data pre-processing

### 3.1 - Outliers

In [None]:
# Histogram of trip durations, restricted to trips with a duration below
# 5000 (seconds, per the x-axis label), split into 100 bins.
plt.hist(
    df_train.loc[df_train['trip_duration'] < 5000, 'trip_duration'],
    bins=100,
)
plt.title('Trip duration distribution')
plt.xlabel('Duration of a trip (in seconds)')
plt.ylabel('Number of trips')
plt.show()

In [None]:
# Keep only the trips whose duration is below 3000; longer trips are treated
# as outliers and dropped from the training set.
# The explicit .copy() detaches the filtered frame from the original one, so
# the column assignments made on df_train further down operate on an
# independent DataFrame instead of a slice (avoids SettingWithCopyWarning).
df_train = df_train[df_train.trip_duration < 3000].copy()
df_train.info()

### 3.2 - Missing & duplicate values

In [None]:
# Number of missing values in each column of the training set.
df_train.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df_train.duplicated().sum()

### 3.3 - Categorical variables

In [None]:
# Names of the columns to be converted to integer category codes.
cat_vars = [
    'store_and_fwd_flag',
]

In [None]:
# Replace every categorical column of the training set by the integer codes
# of its categories.
# NOTE(review): the codes are derived from the values present in this frame
# only; they match the test-set encoding only if both frames contain the same
# set of categories — confirm.
for col in cat_vars:
    as_category = df_train[col].astype('category')
    df_train[col] = as_category.cat.codes
df_train.head()

In [None]:
# Replace every categorical column of the test set by the integer codes of
# its categories.
# NOTE(review): the codes are derived from the values present in this frame
# only; they match the training-set encoding only if both frames contain the
# same set of categories — confirm.
for col in cat_vars:
    as_category = df_test[col].astype('category')
    df_test[col] = as_category.cat.codes
df_test.head()

## 4 - Feature engineering

### 4.1 - Feature creation

In [None]:
# Natural logarithm of the trip duration; this column is the regression
# target used for training (see `target` below).
df_train['log_trip_duration'] = np.log(df_train.trip_duration)

In [None]:
# Straight-line (Euclidean) distance between pickup and dropoff points,
# computed directly on the coordinate differences, so it is expressed in the
# coordinates' own units (presumably degrees — confirm).
lat_diff = df_train.pickup_latitude - df_train.dropoff_latitude
lon_diff = df_train.pickup_longitude - df_train.dropoff_longitude
df_train['distance'] = np.sqrt(lat_diff**2 + lon_diff**2)

In [None]:
# Straight-line (Euclidean) distance between pickup and dropoff points,
# computed directly on the coordinate differences, so it is expressed in the
# coordinates' own units (presumably degrees — confirm).
lat_diff = df_test.pickup_latitude - df_test.dropoff_latitude
lon_diff = df_test.pickup_longitude - df_test.dropoff_longitude
df_test['distance'] = np.sqrt(lat_diff**2 + lon_diff**2)

In [None]:
# Natural logarithm of the pickup-to-dropoff distance (training set).
# NOTE(review): np.log returns -inf for rows whose distance is exactly 0
# (identical pickup and dropoff coordinates) — confirm such rows are handled
# before this column is fed to a model.
df_train['log_distance'] = np.log(df_train.distance)

In [None]:
# Natural logarithm of the pickup-to-dropoff distance (test set).
# NOTE(review): np.log returns -inf for rows whose distance is exactly 0
# (identical pickup and dropoff coordinates) — confirm such rows are handled
# before this column is fed to a model.
df_test['log_distance'] = np.log(df_test.distance)

### 4.2 - Feature selection

In [None]:
# Remove the columns that are not kept for modelling from the training set.
df_train = df_train.drop(columns=['vendor_id', 'store_and_fwd_flag'])
df_train.head()

In [None]:
# Remove the columns that are not kept for modelling from the test set.
df_test = df_test.drop(columns=['vendor_id', 'store_and_fwd_flag'])
df_test.head()

In [None]:
# Name of the column the model is trained to predict.
target = 'log_trip_duration'
# Numeric input columns fed to the model: raw pickup/dropoff coordinates.
num_features = [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
]

In [None]:
# Assemble the model inputs: feature matrices for both sets and the target
# vector for the training set, then show their shapes as a sanity check.
X_train = df_train[num_features]
X_test = df_test[num_features]
y_train = df_train[target]
(X_train.shape, y_train.shape, X_test.shape)

## 5 - Model training

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fit a random forest of 20 trees on the selected features against the
# log-duration target.
# NOTE(review): no random_state is passed, so the fitted forest (and every
# score derived from it) can differ from one run to the next.
m = RandomForestRegressor(n_estimators=20)
m.fit(X_train, y_train)

## 6 - Validation

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
# 5-fold cross-validation of the model on the training data.
# y_train is log_trip_duration, i.e. already on a logarithmic scale, so the
# plain (negated) mean squared error is the right scorer here: its square
# root is the root-mean-squared error of the log durations. A squared *log*
# error scorer would apply a second logarithm to targets and predictions and
# report a number that is not comparable to the competition metric.
# The scorer follows scikit-learn's "greater is better" convention, so the
# returned values are negative.
cv_scores = cross_val_score(m, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
cv_scores

In [None]:
# Turn the negated squared-error scores into root errors, one per fold:
# take the absolute value to undo the sign flip, then the square root.
# Vectorised over the whole score array instead of an index loop.
cv_scores = np.sqrt(np.abs(cv_scores))
cv_scores

## 7 - Predictions

In [None]:
# Predict on the test features. The model was fit on log_trip_duration, so
# these predictions are on the log scale. Preview the first five values.
y_test_pred = m.predict(X_test)
y_test_pred[:5]

## 8 - Submit predictions

In [None]:
# Build the submission table: one row per test trip with its id and the
# predicted duration. Predictions are on the log scale, so exponentiate them
# to get back to the original duration scale.
predicted_duration = np.exp(y_test_pred)
submission = pd.DataFrame(
    {
        'id': df_test.id,
        'trip_duration': predicted_duration,
    }
)
submission.head()

In [None]:
# Write the submission to a CSV file in the working directory, without the
# DataFrame index column.
submission.to_csv('Submission_file.csv', index=False)