## Module Import

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import math

%matplotlib inline

## Data Loading

In [None]:
# Read the training and test CSV files from the input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('../input/train.csv', '../input/test.csv')
)

In [None]:
# Preview the first five rows of the training frame.
train.head(n=5)

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Column dtypes of the training frame as parsed by read_csv.
train.dtypes

In [None]:
# Summary statistics of the training frame.
train.describe()

## Data Cleaning

In [None]:
# Count the missing values in each column of the training frame.
train.isnull().sum()

###### Il ne manque aucune valeur ; cependant, il faut vérifier la qualité des valeurs du tableau. D'après le describe ci-dessus, on peut voir qu'il y a des courses avec 0 passager ; on observe également des courses qui durent 1 seconde et d'autres qui durent 40 jours.

In [None]:
# Passenger-count filter, currently disabled: it would keep only the rows
# with more than 0 and fewer than 6 passengers.
#train = train[train['passenger_count']>0]
#train = train[train['passenger_count']<6]

In [None]:
# Histogram of the trip durations below 4000 seconds, currently disabled.
# The closing parenthesis on the last line used to be left uncommented, which
# made this cell a SyntaxError; the whole call is now commented out consistently.
#train.loc[train.trip_duration<4000
#          ,"trip_duration"].hist(bins=120
#                                                        )

On peut voir qu'il y a très peu de courses au-dessus de 4000 secondes ; on va donc admettre qu'une course dure entre 300 secondes (5 min) et 4000 secondes.


In [None]:
# Duration filter, currently disabled.
#train = train[(train['trip_duration'] > 60) & (train['trip_duration'] < 4000*2)]
# Replace the target with its natural logarithm. The column is overwritten,
# so running this cell a second time would apply the logarithm twice.
train['trip_duration'] = np.log(train['trip_duration'])


In [None]:
# Distribution of the four coordinate columns, one panel each on a 2x2 grid.
sns.set(style="white", palette="muted", color_codes=True)
f, axes = plt.subplots(2, 2, figsize=(10, 10), sharex=False, sharey=False)
sns.despine(left=True)
coordinate_columns = (
    'pickup_latitude',
    'pickup_longitude',
    'dropoff_latitude',
    'dropoff_longitude',
)
# axes.flat walks the grid row by row, which matches the column order above.
for column, panel in zip(coordinate_columns, axes.flat):
    sns.distplot(train[column].values, label=column, color="m", bins=100, ax=panel)
plt.setp(axes, yticks=[])
plt.tight_layout()
plt.show()

#### On devrait sélectionner les valeurs de pickup latitude et de dropoff latitude entre 34 et 44 ; pour pickup longitude et dropoff longitude, on devrait prendre les valeurs entre -90 et -80.


In [None]:
# Coordinate filters, currently disabled. Each bound below is one-sided.
# NOTE(review): `pickup_longitude > -80` does not match the -90..-80 longitude
# range stated in the markdown cell above — confirm the intended limits
# before enabling any of these.
#train = train.loc[train['pickup_longitude']> -80]
#train = train.loc[train['pickup_latitude']< 44]
#train = train.loc[train['dropoff_longitude']> -90]
#train = train.loc[train['dropoff_latitude']> 34]


## Features Selection, Extraction, Creation

In [None]:
def haversine(lat1, lon1, lat2, lon2):
   """Return the great-circle distance in meters between two points.

   Coordinates are given in decimal degrees. Each argument may be a scalar
   or an array-like (list, NumPy array, pandas Series); the arguments are
   broadcast against each other, so whole coordinate columns can be
   processed in one call.

   Parameters
   ----------
   lat1, lon1 : float or array-like
       Latitude and longitude of the first point(s), in degrees.
   lat2, lon2 : float or array-like
       Latitude and longitude of the second point(s), in degrees.

   Returns
   -------
   numpy.float64 or numpy.ndarray
       Distance in meters: a float-compatible scalar for scalar inputs,
       otherwise an array of the broadcast shape.
   """
   R = 6372800  # Earth radius in meters
   lat1, lon1, lat2, lon2 = (
      np.asarray(value, dtype=float) for value in (lat1, lon1, lat2, lon2)
   )
   phi1, phi2 = np.radians(lat1), np.radians(lat2)
   dphi       = np.radians(lat2 - lat1)
   dlambda    = np.radians(lon2 - lon1)

   a = np.sin(dphi/2)**2 + \
       np.cos(phi1)*np.cos(phi2)*np.sin(dlambda/2)**2
   # Floating-point rounding can push `a` marginally outside [0, 1] for
   # (near-)antipodal points, which would make sqrt(1 - a) invalid.
   a = np.clip(a, 0.0, 1.0)

   return 2*R*np.arctan2(np.sqrt(a), np.sqrt(1 - a))


In [None]:
# Add the pickup -> drop-off haversine distance (meters) to both data sets.
# DataFrame.apply(axis=1) builds a Series object for every row, which is very
# slow on large frames; np.vectorize instead walks the four coordinate arrays
# directly and still calls haversine() once per row with scalar coordinates.
coord_columns = ["pickup_latitude", "pickup_longitude", "dropoff_latitude", "dropoff_longitude"]
# otypes is given explicitly so that an empty frame yields an empty float array.
haversine_rows = np.vectorize(haversine, otypes=[float])
for frame in (train, test):
    frame["distance"] = haversine_rows(*(frame[col].values for col in coord_columns))



In [None]:
# Only the training frame's drop-off timestamp is parsed here.
train['dropoff_datetime'] = pd.to_datetime(train['dropoff_datetime'])

# Parse the pickup timestamp and derive the calendar features on both frames.
for frame in (train, test):
    frame['pickup_datetime'] = pd.to_datetime(frame['pickup_datetime'])
    pickup = frame['pickup_datetime'].dt
    frame['hour'] = pickup.hour
    frame['day'] = pickup.dayofweek  # day of the week, not day of the month
    frame['month'] = pickup.month
                                                    


In [None]:
# Columns fed to the model, shared by the training and test frames.
feature_columns = [
    "vendor_id",
    "passenger_count",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "month",
    "hour",
    "day",
    "distance",
]

y_train = train["trip_duration"]  # target
X_train = train[feature_columns]  # features

X_testdata = test[feature_columns]

## Model Selection & Training

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import ShuffleSplit
import xgboost as xgb

In [None]:
# Hold out 10% of the training rows. The split is drawn from `train` itself
# (same feature columns, same target) rather than from X_train / y_train:
# this cell rebinds those two names, so splitting them directly would shrink
# the training set a little more every time the cell is re-run.
X_train, X_test, y_train, y_test = train_test_split(
    train[list(X_train.columns)],
    train["trip_duration"],
    random_state=42,
    test_size=0.1,
)


In [None]:
# Shapes of the four pieces produced by the split.
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
# Random forest on the selected features.
# - max_features=1.0 considers every feature at each split, which is what
#   "auto" meant for regressors; the "auto" string is rejected by recent
#   scikit-learn releases.
# - random_state makes the fitted forest (and the submission) reproducible.
rfr = RandomForestRegressor(
    n_estimators=100,
    min_samples_leaf=5,
    min_samples_split=50,
    max_depth=80,
    verbose=0,
    max_features=1.0,
    n_jobs=-1,
    random_state=42,
)
rfr.fit(X_train, y_train)

## Submission

In [None]:
# Predict on the test-set features. The model was fitted on the log of
# trip_duration, so these values are on the log scale. Despite the `train_`
# prefix, they are predictions for the test set.
train_pred = rfr.predict(X_testdata)


In [None]:
# Inspect the predicted values.
train_pred


In [None]:
# Number of predictions produced for X_testdata.
len(train_pred)

In [None]:
# Load the sample submission file.
# NOTE(review): `sample` is not referenced again in the cells visible here.
sample = pd.read_csv('../input/sample_submission.csv')

In [None]:
# Earlier variant that wrote the raw predictions, kept for reference.
#my_submission = pd.DataFrame({'id': test.id, 'trip_duration': train_pred})
# np.exp brings the predictions back from the log scale to the scale of the
# original trip_duration column.
submission_columns = {'id': test['id'], 'trip_duration': np.exp(train_pred)}
my_submission = pd.DataFrame(submission_columns)



In [None]:
# Write the submission to sub.csv without the DataFrame index column.
my_submission.to_csv('sub.csv', index=False)