### Import Libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


import os

In [None]:
# Raise pandas' display limits so long and wide frames are printed in full.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the training and test sets from CSV files in the relative "Data2" folder.
train = pd.read_csv("Data2/Train.csv")
test = pd.read_csv("Data2/Test.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Remove the user/rider identifiers and the arrival-at-destination fields
# from the training frame (the frame is modified in place).
train.drop(
    columns=[
        "UserId",
        "ArrivalatDestination-Weekday(Mo=1)",
        "ArrivalatDestination-Time",
        "ArrivalatDestination-DayofMonth",
        "RiderId",
    ],
    inplace=True,
)

In [None]:
# Remove the user/rider identifiers from the test frame (modified in place).
test.drop(columns=["UserId", "RiderId"], inplace=True)

In [None]:
# Column names left in the training frame after the drop above.
train.columns.values

In [None]:
# Column names left in the test frame, for comparison with the training frame.
test.columns.values

In [None]:
# Check the data type of every column in the training frame.
train.dtypes

In [None]:
# Count the missing values in each column of the training frame.
train.isnull().sum()

#### Replace NaN values with the column mean

In [None]:
# Fill missing weather readings with the TRAINING-set mean of each column.
# The same training statistic is applied to the test frame, so both frames go
# through identical pre-processing and the fill value never depends on test
# data.
for weather_col in ('Temperature', 'Precipitationinmillimeters'):
    train_mean = train[weather_col].mean()
    train[weather_col] = train[weather_col].fillna(train_mean)
    # Guarded in case the test file does not carry this column.
    if weather_col in test.columns:
        test[weather_col] = test[weather_col].fillna(train_mean)

In [None]:
# Re-count missing values per column to confirm the imputation above.
train.isnull().sum()

In [None]:
# Preview the training frame after imputation.
train.head()

## *Data Exploration*

In [None]:
# Extra imports for the exploration section.
# NOTE(review): `norm`, `StandardScaler` and `stats` are not referenced in the
# cells visible in this notebook — confirm they are still needed.
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import warnings
# NOTE(review): this silences every warning category from here on, including
# deprecation notices from the plotting and ML libraries; consider narrowing
# the filter to specific categories or modules.
warnings.filterwarnings('ignore')

In [None]:
# Summary statistics of the prediction target.
train['TimefromPickuptoArrival'].describe()

In [None]:
# Distribution of the prediction target. `sns.distplot` is deprecated, so the
# histogram is drawn with `histplot` on a density scale with a KDE overlay,
# which shows the same information.
ax = sns.histplot(train['TimefromPickuptoArrival'], kde=True, stat="density")
ax.set(title="Distribution of TimefromPickuptoArrival",
       xlabel="TimefromPickuptoArrival")
plt.show()

In [None]:
# Skewness and kurtosis of the target, as computed by pandas' Series.skew()
# and Series.kurt().
print("Skewness: %f" % train['TimefromPickuptoArrival'].skew())
print("Kurtosis: %f" % train['TimefromPickuptoArrival'].kurt())

In [None]:
# Pairwise correlations between the numeric columns only. This cell runs
# before the categorical and time columns are encoded, so the frame may still
# hold non-numeric columns; recent pandas versions raise on those instead of
# silently skipping them.
corr = train.select_dtypes(include="number").corr()

# Boolean mask that hides the upper triangle (diagonal included), so each
# pair of columns is shown only once.
mask = np.triu(np.ones(corr.shape, dtype=bool))

fig, ax = plt.subplots(figsize=(12, 12))
cmap = sns.diverging_palette(240, 10, n=9, as_cmap=True)
sns.heatmap(corr, mask=mask, square=True, annot=True, fmt='.1f', cmap=cmap,
            vmin=-1, vmax=1, center=0, linewidths=.5, ax=ax)
ax.set_title("Correlation between numeric training columns")
plt.show()

#### Label encoding and time-of-day conversion

In [None]:
def LabelEncord_categorical(df):
    """Replace the two categorical columns with integer labels.

    A fresh ``LabelEncoder`` is fitted on each of ``VehicleType`` and
    ``PersonalorBusiness`` using only the values present in ``df``. The
    columns are overwritten in place and the same frame is returned.

    NOTE(review): because the encoder is re-fitted on every call, two frames
    with different category sets can receive different codes for the same
    label — confirm train and test share the same categories.
    """
    for column in ("VehicleType", "PersonalorBusiness"):
        df[column] = LabelEncoder().fit_transform(df[column])
    return df

def to_date_time(df):
    """Convert the four order time columns to whole seconds since midnight.

    The columns ``Placement-Time``, ``Confirmation-Time``,
    ``ArrivalatPickup-Time`` and ``Pickup-Time`` are parsed with
    ``pd.to_datetime`` and replaced by ``hour*3600 + minute*60 + second`` as
    integers. Any date part and sub-second part is discarded.

    Columns that are already numeric are left untouched, so running the
    function (or the notebook cell that calls it) a second time does not
    re-parse the integer seconds as timestamps and collapse them to 0.

    The frame is modified in place and also returned.
    """
    col_parameters = ["Placement-Time", "Confirmation-Time", "ArrivalatPickup-Time", "Pickup-Time"]
    for col in col_parameters:
        # Already converted on an earlier call: nothing to do.
        if pd.api.types.is_numeric_dtype(df[col]):
            continue
        stamps = pd.to_datetime(df[col])
        seconds = stamps.dt.hour * 3600 + stamps.dt.minute * 60 + stamps.dt.second
        df[col] = seconds.astype(int)
    return df


def pre_processing(df):
    """Run the notebook's two pre-processing steps on ``df`` and return it.

    First the categorical columns are label-encoded, then the time columns
    are converted by ``to_date_time``.
    """
    return to_date_time(LabelEncord_categorical(df))

In [None]:
# Apply the label encoding and time conversion to the training frame.
train = pre_processing(train)

In [None]:
# Apply the same pre-processing steps to the test frame.
test = pre_processing(test)

In [None]:
# Preview the training frame after pre-processing.
train.head()

In [None]:
# Preview the test frame after pre-processing.
test.head()

In [None]:
#list(train)

# Model: RandomForestRegressor

In [None]:
from sklearn.model_selection import cross_val_score,cross_val_predict, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

In [None]:
# Target vector, and the feature matrix made of every other training column
# except the order identifier.
y = train["TimefromPickuptoArrival"]
X = train.drop(columns=["TimefromPickuptoArrival", "OrderNo"])

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Baseline random forest with library-default hyper-parameters and a fixed seed.
rf_model = RandomForestRegressor(random_state=1)

In [None]:
# Fit the baseline forest on the training part of the split.
rf_model.fit(X_train,y_train)

In [None]:
# Predict the target for the held-out rows.
predicted = rf_model.predict(X_test)

In [None]:
# Root mean squared error of the baseline forest on the held-out rows.
rmse = np.sqrt(mean_squared_error(y_test,predicted))
print("Root Mean Squared Error:", rmse)

## With GridSearch

In [None]:
# Exhaustive search over tree depth and forest size with 5-fold
# cross-validation, scored by (negated) mean squared error.
# The forest is seeded so that repeated runs select the same parameters; the
# seed matches the one used when the tuned model is rebuilt afterwards.
gsc = GridSearchCV(estimator=RandomForestRegressor(random_state=0),
                   param_grid={'max_depth': range(3, 7),
                               'n_estimators': (10, 50, 100, 1000)},
                   cv=5,
                   scoring='neg_mean_squared_error',
                   verbose=0,
                   n_jobs=-1)

# The search is run on the full training data (X, y), not on the 70% split.
grid_result = gsc.fit(X, y)
best_params = grid_result.best_params_

In [None]:
# Rebuild a forest with the best depth / size found by the grid search.
# `random_state` and `verbose` are documented as integers; the original
# passed the boolean False for both, which is the same value as 0.
rfr = RandomForestRegressor(max_depth=best_params["max_depth"],
                            n_estimators=best_params["n_estimators"],
                            random_state=0,
                            verbose=0)
 

In [None]:
# 10-fold cross-validated scores of the tuned forest (negated mean absolute error).
# NOTE(review): `scores` is not read by any later cell visible here.
scores = cross_val_score(rfr, X, y, cv=10, scoring='neg_mean_absolute_error')

In [None]:
# Out-of-fold predictions for every training row from 10-fold cross-validation.
predictions = cross_val_predict(rfr, X, y, cv=10)

In [None]:
# Root mean squared error of the tuned forest's out-of-fold predictions.
rmse = np.sqrt(mean_squared_error(y,predictions))
print("Root Mean Squared Error:", rmse)

# Model: XGBRegressor

In [None]:
from xgboost import XGBRegressor
from xgboost import plot_importance
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Create training predictors data: every training column except the target
# and the order identifier. `predictor_cols` and `train_y` were previously
# used without ever being defined.
target_col = "TimefromPickuptoArrival"
predictor_cols = [col for col in train.columns if col not in (target_col, "OrderNo")]
train_X = train[predictor_cols]
train_y = train[target_col]

# `n` is not an XGBRegressor hyper-parameter; the number of boosting rounds
# is set with `n_estimators`.
xgb = XGBRegressor(n_estimators=1500)
xgb.fit(train_X, train_y, verbose=False)

In [None]:
# Treat the test data in the same way as the training data: select the same
# predictor columns, in the same order, that the model was fitted on.
test_X = test[train_X.columns]

# Use the model to make predictions for the test frame.
xgb_predictions = xgb.predict(test_X)

# Look at the predicted delivery times to check they are sensible.
print(xgb_predictions)

In [None]:
# `xgb_predictions` were made for the test frame while `y` is the training
# target, so the two do not line up row for row and cannot be scored against
# each other. Estimate the error from out-of-fold predictions on the training
# data instead, using the same 10-fold protocol as the random forest.
# `cross_val_predict` fits clones, so the fitted `xgb` model is not modified.
xgb_cv_predictions = cross_val_predict(xgb, X, y, cv=10)
rmse = np.sqrt(mean_squared_error(y, xgb_cv_predictions))
print("XGB Root Mean Squared Error:", rmse)

In [None]:
# No hyper-parameter search has been run for XGBoost at this point, and the
# prediction array has no `best_params_`, so report the settings of the
# fitted model and its in-sample error instead.
print("XGB parameters used: ", xgb.get_params())
# In-sample error is optimistic; it only confirms that the model has fitted.
print("XGB training RMSE: ", np.sqrt(mean_squared_error(train_y, xgb.predict(train_X))))

In [None]:
# Plot the feature importances of the fitted XGBoost model, limited to 10 features.
plot_importance(xgb, max_num_features=10)

## Grid Search with XGBoost

Let’s take our parameter tuning to the next level by using scikit-learn’s GridSearch and RandomizedSearch capabilities with internal cross-validation using the GridSearchCV and RandomizedSearchCV functions. 

In [None]:
from sklearn.model_selection import RandomizedSearchCV,GridSearchCV

In [None]:
# A parameter grid for XGBoost
params = {
    'min_child_weight': [4, 5],
    'gamma': [i / 10.0 for i in range(3, 6)],
    'subsample': [i / 10.0 for i in range(6, 11)],
    'colsample_bytree': [i / 10.0 for i in range(6, 11)],
    'max_depth': [2, 3, 4],
}

# Initialize XGB and GridSearch.
# A separate name is used so the fitted `xgb` model above is not overwritten,
# and `n_jobs` replaces the older `nthread` spelling for the thread count.
xgb_search_base = XGBRegressor(n_jobs=-1)

# NOTE: 2 * 3 * 5 * 5 * 3 = 450 parameter combinations, each fitted once per
# cross-validation fold, so this cell is slow.
grid = GridSearchCV(xgb_search_base, params)
# The target variable is the lowercase `y`; `Y` was never defined.
grid.fit(X, y)


In [None]:
# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'colsample_bytree': [0.3, 0.7],
    'n_estimators': [10, 50, 100, 1000],
    'max_depth': [2, 7]
}

# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor()

# Perform grid search: grid_mse
grid_mse = GridSearchCV(estimator=gbm, 
                        param_grid=gbm_param_grid,
                        scoring='neg_mean_squared_error', 
                        cv=4,
                        n_jobs = -1,
                        verbose=1)

grid_mse.fit(X, y)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))

## Random Search with XGBoost

Often, GridSearchCV can be really time consuming, so in practice, we may want to use RandomizedSearchCV instead.

In [None]:
# Create the parameter grid: gbm_param_grid
gbm_param_grid = {
    'n_estimators': [25],
    'max_depth': range(2, 12)
}

# Instantiate the regressor: gbm
gbm = xgb.XGBRegressor(n_estimators=10)

# Perform random search: grid_mse
randomized_mse = RandomizedSearchCV(param_distributions=gbm_param_grid, 
                                    estimator=gbm, 
                                    scoring="neg_mean_squared_error", 
                                    n_iter=5, 
                                    cv=4, 
                                    n_jobs = -1,
                                    verbose=1)

# Fit randomized_mse to the data
randomized_mse.fit(X, y)

# Print the best parameters and lowest RMSE
print("Best parameters found: ", randomized_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(randomized_mse.best_score_)))

## Submission

In [None]:
#my_submission = pd.DataFrame({'OrderNo': test.OrderNo, 'TimefromPickuptoArrival': predicted_prices_2})
# you could use any filename. We choose submission here
#my_submission.to_csv('submission2.csv', index=False)