## 1. Data loading

In [None]:
import os

import numpy as np
import pandas as pd
import seaborn as sns

import math

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_log_error as MSLE
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import StandardScaler

In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt

mpl.rcParams['axes.titlesize']=12
mpl.rcParams['xtick.labelsize']=12
mpl.rcParams['ytick.labelsize']=12


%matplotlib inline 

In [None]:
from sklearn.preprocessing import StandardScaler

In [None]:
# The CSV files are read from the sibling `input` directory.
BASEPATH = '../input'
TRAIN_PATH, TEST_PATH = (
    os.path.join(BASEPATH, file_name) for file_name in ('train.csv', 'test.csv')
)

In [None]:
# Raw training data; this frame is kept untouched by the cells below.
train = pd.read_csv(filepath_or_buffer=TRAIN_PATH)

## 2. Data exploration

In [None]:
# Peek at the first five rows.
train.head(n=5)

In [None]:
# Summary statistics for the training frame.
train.describe()

In [None]:
# Row count of the raw training set.
print("Train size :", train.shape[0])

In [None]:
# Default figure size for the plots that follow.
mpl.rcParams.update({'figure.figsize': (10, 5)})

# Histogram of every trip duration in the training set.
plt.hist(x=train['trip_duration'])
plt.title(label="Trip Duration Distribution");

In [None]:
# Zoom on trips lasting strictly between 3000 s (50 min) and 10000 s.
# One combined mask replaces the chained `[...][...]` lookups, whose second
# mask had to be re-aligned against the already-filtered series.
duration = train['trip_duration']
duration[(duration > 3000) & (duration < 10000)].hist()
plt.title("Trip Duration Distribution: 3000 s to 10000 s");

In [None]:
# Zoom on trips lasting strictly between 9500 s and 15000 s.
# One combined mask replaces the chained `[...][...]` lookups, whose second
# mask had to be re-aligned against the already-filtered series.
duration = train['trip_duration']
duration[(duration > 9500) & (duration < 15000)].hist()
plt.title("Trip Duration Distribution: 9500 s to 15000 s");

In [None]:
# Load the test set and show its first five rows.
test = pd.read_csv(filepath_or_buffer=TEST_PATH)
test.head(n=5)

In [None]:
# Side-by-side dtype table; NaN marks a column that one of the frames lacks.
pd.concat([train.dtypes, test.dtypes], axis=1, keys=['train', 'test'])

In [None]:
# Numeric columns (each listed once).
NUM_VARS = ['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude',
           'dropoff_longitude', 'dropoff_latitude']
# Columns that are encoded to numbers in section 3.3.
CAT_VARS = ['pickup_datetime','dropoff_datetime', 'store_and_fwd_flag']

## 3. Data preprocessing

### 3.1 Outliers 

In [None]:
# Distribution of vendor ids.
train.vendor_id.hist();

In [None]:
# Distribution of passenger counts.
train.passenger_count.hist();

### Outliers : Pick-up location

In [None]:
# Box plot of pickup longitudes to expose extreme values.
train['pickup_longitude'].plot(kind='box')
plt.title(label="Pickup Longitude Distribution");

In [None]:
# Box plot of pickup latitudes to expose extreme values.
plt.boxplot(train['pickup_latitude'])
plt.title("Pickup Latitude Distribution");

In [None]:
# Every pickup position, longitude against latitude.
plt.scatter(x=train['pickup_longitude'], y=train['pickup_latitude']);

In [None]:
# Same scatter on a larger canvas, restricted to a fixed longitude/latitude window.
mpl.rcParams['figure.figsize']=(15,7)
plt.scatter(train['pickup_longitude'],train['pickup_latitude'])
# Trailing `;` keeps the returned axis limits out of the cell output,
# like the other plotting cells.
plt.axis([-74.02,-73.92,40.7,40.82]);

On supprime les quelques outliers extrêmes sur les positions de prise en charge (pick-up).

#### Outliers : Pick-up location, par rapport à la longitude : 

In [None]:
# Back to the standard figure size, then plot trip duration against pickup longitude.
mpl.rcParams.update({'figure.figsize': (10, 5)})
plt.scatter(x=train['pickup_longitude'], y=train['trip_duration'], color='r');

In [None]:
# First, coarse cut: pickup longitude strictly inside (-80, -60) and trips of
# at most 500000 s. The conditions are combined into one mask so pandas never
# has to re-align a mask built on `train` against an already-filtered frame.
train2 = train[
    (train['pickup_longitude'] > -80)
    & (train['pickup_longitude'] < -60)
    & (train['trip_duration'] <= 500000)
]
plt.scatter(train2['pickup_longitude'],train2['trip_duration'],color='r');

In [None]:
# Tighter cut: pickup longitude strictly inside (-75, -72), applied as a single
# combined mask instead of two chained boolean lookups.
train2 = train2[
    (train2['pickup_longitude'] > -75)
    & (train2['pickup_longitude'] < -72)
]
plt.scatter(train2['pickup_longitude'],train2['trip_duration'],color='r');

#### Outliers : Pick-up location par rapport à la latitude

In [None]:
# Trip duration against pickup latitude, before filtering on latitude.
plt.scatter(x=train2['pickup_latitude'], y=train2['trip_duration'], color='g');

In [None]:
# Keep pickup latitudes strictly inside (39, 42), applied as a single combined
# mask instead of two chained boolean lookups.
train2 = train2[
    (train2['pickup_latitude'] < 42)
    & (train2['pickup_latitude'] > 39)
]
plt.scatter(train2['pickup_latitude'],train2['trip_duration'],color='g');

#### Finalement :

In [None]:
# Pickup positions that survive the longitude and latitude cuts.
plt.scatter(x=train2['pickup_longitude'], y=train2['pickup_latitude']);

### Drop-off location

In [None]:
# Standard figure size, then a box plot of drop-off longitudes.
mpl.rcParams.update({'figure.figsize': (10, 5)})
plt.boxplot(x=train2['dropoff_longitude']);

In [None]:
# Box plot of drop-off latitudes.
plt.boxplot(x=train2['dropoff_latitude']);

In [None]:
# Every drop-off position, longitude against latitude.
plt.scatter(x=train2['dropoff_longitude'], y=train2['dropoff_latitude']);

In [None]:
# Same scatter on a larger canvas, restricted to a fixed longitude/latitude window.
mpl.rcParams['figure.figsize']=(15,7)
plt.scatter(train2['dropoff_longitude'],train2['dropoff_latitude'])
# Trailing `;` keeps the returned axis limits out of the cell output,
# like the other plotting cells.
plt.axis([-74.02,-73.92,40.7,40.82]);

#### Même chose pour ces outliers
#### Drop-off location et longitude

In [None]:
# Standard figure size, then trip duration against drop-off longitude.
mpl.rcParams.update({'figure.figsize': (10, 5)})
plt.scatter(x=train2['dropoff_longitude'], y=train2['trip_duration'], color='r');

In [None]:
# Keep drop-off longitudes strictly inside (-76, -72), applied as a single
# combined mask instead of two chained boolean lookups.
train2 = train2[
    (train2['dropoff_longitude'] > -76)
    & (train2['dropoff_longitude'] < -72)
]
plt.scatter(train2['dropoff_longitude'],train2['trip_duration'],color='r');

#### Drop-off location et latitude

In [None]:
# Trip duration against drop-off latitude, before filtering on latitude.
plt.scatter(x=train2['dropoff_latitude'], y=train2['trip_duration'], color='g');

In [None]:
# Keep drop-off latitudes strictly inside (40, 41.5), applied as a single
# combined mask instead of two chained boolean lookups.
train2 = train2[
    (train2['dropoff_latitude'] > 40)
    & (train2['dropoff_latitude'] < 41.5)
]
plt.scatter(train2['dropoff_latitude'],train2['trip_duration'],color='g');

#### Finalement :

In [None]:
# Drop-off positions that survive the longitude and latitude cuts.
plt.scatter(x=train2['dropoff_longitude'], y=train2['dropoff_latitude']);

### 3.2 Missing values 

In [None]:
# Number of missing entries in each column of the filtered training frame.
missing_val_count = train2.isna().sum()
missing_val_count

### 3.3 Categorical Data

In [None]:
# train
for column in CAT_VARS:
    train2[column] = train2[column].astype('category').cat.codes
train2.head()

In [None]:
# test
for column in CAT_VARS:
    if(column != 'dropoff_datetime'):
        test[column] = test[column].astype('category').cat.codes
test.head()

### 3.4 Log-transformation

In [None]:
# Column dtypes of the filtered training frame after the encoding step.
train2.dtypes

In [None]:
# Model inputs: absolute values of the selected columns, then log(1 + x).
feature_cols = ['vendor_id', 'pickup_datetime', 'passenger_count', 'pickup_longitude',
                'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag']
train3 = train2[feature_cols].abs().pipe(np.log1p)
train3.head()

## 4. Features

In [None]:
# The log-transformed frame is used directly as the feature matrix.
X_train = train3
X_train.head(n=5)

In [None]:
# Target: log(1 + trip_duration).
y_train = train2['trip_duration'].pipe(np.log1p)
y_train.head(n=5)

## 6. Model 

Scaling des données et entraînement : 

In [None]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X_train, y_train, test_size=0.2, random_state=0)
t_X, val_X, t_y, val_y = split_parts

In [None]:
# Learn the scaling statistics on the training split only, then apply them to it.
scaler = StandardScaler().fit(t_X)
X_train_scaled = scaler.transform(t_X)

In [None]:
# Scale the validation split with the statistics learned on the training split.
X_val_scaled = scaler.transform(X=val_X)

In [None]:
# Fit a random forest on the scaled training split (seeded, all cores),
# then predict the log-scale target for the validation split.
rf = RandomForestRegressor(random_state=1, n_jobs=-1).fit(X_train_scaled, t_y)
preds = rf.predict(X=X_val_scaled)

In [None]:
# Validation RMSLE on the original scale. The target was built with log1p,
# so expm1 (not exp) is the matching inverse.
print(np.sqrt(MSLE(np.expm1(val_y), np.expm1(preds))))

Cross Validation :

In [None]:
# Out-of-fold predictions for every training row, using 10 folds.
cv_preds = cross_val_predict(
    estimator=rf,
    X=X_train,
    y=y_train,
    cv=10,
    n_jobs=-1,
)

In [None]:
# check cv_preds size
print(cv_preds.shape)

# Cross-validated RMSLE on the original scale. y_train is log1p(trip_duration),
# so expm1 (not exp) is the matching inverse.
print(np.sqrt(MSLE(np.expm1(y_train), np.expm1(cv_preds))))

## 8. Predictions

In [None]:
# Apply the training transformation to the test set: absolute value,
# log(1 + x), then the scaler fitted on the training split.
feature_cols = ['vendor_id', 'pickup_datetime', 'passenger_count', 'pickup_longitude',
                'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag']
test_p = test[feature_cols].abs().pipe(np.log1p)
test_scaled = scaler.transform(test_p)

In [None]:
# Predict on the log1p scale, then map back to seconds with expm1,
# the exact inverse of the log1p applied to the target.
preds = rf.predict(test_scaled)
np.expm1(preds)

In [None]:
# Submission frame. expm1 undoes the log1p applied to the target;
# using exp would make every predicted duration one second too long.
sub = pd.DataFrame({'id':test.id,'trip_duration':np.expm1(preds)})
sub.head(5)

In [None]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv', index=False)