In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.model_selection import train_test_split, GridSearchCV
from haversine import haversine
import statsmodels.formula.api as sm
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
import warnings; warnings.simplefilter('ignore')

In [None]:
#Import the training data from the zipped csv file into a DataFrame.
data = pd.read_csv("../input/nyc-taxi-trip-duration/train.zip")

In [None]:
#Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
#Convert the pickup timestamp to datetime so the .dt accessors (month, weekday, hour) can be used below.
data['pickup_datetime'] = pd.to_datetime(data['pickup_datetime'])
#The dropoff timestamp is left unparsed; the column is deleted before modelling.
#data['dropoff_datetime'] = pd.to_datetime(data['dropoff_datetime'])

In [None]:
#Derive calendar features from the pickup timestamp: month, weekday number and hour of day.
#They give more insight into the data and are dummy-encoded further down.
pickup_dt = data['pickup_datetime'].dt
#data['weekday'] = pickup_dt.weekday_name
data['month'] = pickup_dt.month
data['weekday_num'] = pickup_dt.weekday
data['pickup_hour'] = pickup_dt.hour

In [None]:
#calc_distance computes the distance between the pickup and dropoff coordinates of one trip using the Haversine formula.
def calc_distance(df):
    """Return the haversine distance between a trip's pickup and dropoff points.

    df: a single trip record (e.g. one DataFrame row) that can be indexed with
        'pickup_latitude', 'pickup_longitude', 'dropoff_latitude' and
        'dropoff_longitude'.
    Returns the value produced by `haversine` for the two (latitude, longitude)
    pairs (presumably kilometres, since the speed derived from it is labelled km/h).
    """
    return haversine(
        (df['pickup_latitude'], df['pickup_longitude']),
        (df['dropoff_latitude'], df['dropoff_longitude']),
    )

In [None]:
#Add the per-trip distance column, unless an earlier run of this cell already created it.
if 'distance' not in data.columns:
    # calc_distance already takes a row, so it is passed to apply directly.
    data['distance'] = data.apply(calc_distance, axis=1)

In [None]:
#Average speed in km/h (distance divided by the duration expressed in hours), used for outlier filtering.
#NOTE(review): speed is computed from trip_duration, i.e. from the label itself.
if 'speed' not in data.columns:
    data['speed'] = data['distance'] / (data['trip_duration'] / 3600)

In [None]:
#Dummify the categorical features month, weekday_num, pickup_hour and passenger_count.
#drop_first=True leaves out the first level of each feature to avoid the dummy variable trap.
#NOTE(review): the passenger_count dummies are built before the passenger_count clean-up
#cells further down, so they still reflect the uncleaned values. Re-running this cell
#appends the dummy columns a second time.
for source_col in ('month', 'weekday_num', 'pickup_hour'):
    dummy = pd.get_dummies(data[source_col], prefix=source_col, drop_first=True)
    data = pd.concat([data, dummy], axis=1)

if 'passenger_count' in data.columns:
    dummy = pd.get_dummies(data['passenger_count'], prefix='passenger_count', drop_first=True)
    data = pd.concat([data, dummy], axis=1)

In [None]:
#Preview the frame again now that the dummy columns have been appended.
data.head()

Now our dataset is complete for the further analysis before we train our model with optimal variables.

In [None]:
#Treat trips recorded with 0 passengers as single-passenger trips.
#NOTE(review): the passenger_count dummy columns were created in an earlier cell, so they do not reflect this change.
data['passenger_count'] = data['passenger_count'].replace(0, 1)

In [None]:
#Keep only trips with at most 6 passengers.
data = data.loc[data['passenger_count'] <= 6]

In [None]:
#Keep only trips that lasted at most 86400 seconds (24 hours).
data = data.loc[data['trip_duration'] <= 86400]

In [None]:
#Keep only trips whose average speed is at most 104 km/h.
data = data.loc[data['speed'] <= 104]

In [None]:
#Drop trips that covered no distance yet lasted a minute or more.
zero_distance_long = (data['distance'] == 0) & (data['trip_duration'] >= 60)
data = data[~zero_distance_long]

In [None]:
#Collect distance and duration of the trips of at most 1 km that lasted an hour or more.
#NOTE(review): duo is not used again in the visible part of the notebook; presumably kept for inspection.
duo = data.loc[(data['distance'] <= 1) & (data['trip_duration'] >= 3600),['distance','trip_duration']].reset_index(drop=True)

In [None]:
#Drop trips of at most 1 km that nevertheless lasted an hour or more.
short_but_long = (data['distance'] <= 1) & (data['trip_duration'] >= 3600)
data = data[~short_but_long]

In [None]:
#Drop the row(s) whose pickup longitude equals the current minimum.
#NOTE(review): the same statement is repeated in the next cell, so the two smallest pickup
#longitudes are removed; re-running either cell removes yet another value.
data = data[data.pickup_longitude != data.pickup_longitude.min()]

In [None]:
#Drop the row(s) holding the new minimum pickup longitude (second pass of the statement in the previous cell).
#NOTE(review): not idempotent - every additional run removes one more minimum value.
data = data[data.pickup_longitude != data.pickup_longitude.min()]
#Optional map check; map_marker is not defined in the visible part of the notebook.
#map_marker(data)

In [None]:
#Remove the columns that are not used for modelling, then list every remaining
#column with its positional index (the next cells select columns by position).
for unused_col in ('id', 'dropoff_datetime', 'passenger_count', 'store_and_fwd_flag'):
    del data[unused_col]
list(enumerate(data.columns))

In [None]:
#Extract the label by name so the result does not depend on the column position.
Y = data['trip_duration'].values
del data['trip_duration']
del data['pickup_datetime']
del data['vendor_id']
#speed was derived from trip_duration (distance / duration in hours), so keeping it as a
#feature would leak the label into the inputs. It was only needed for the outlier filter.
#NOTE(review): the positional feature selection in the next cell (first 46 columns) now
#picks up one more dummy column in place of speed; confirm the frame still has >= 46 columns.
del data['speed']
#List the remaining feature columns with their positional index.
list(enumerate(data.columns))

In [None]:
#Build the feature matrix from the first 46 columns of the frame (selected by position).
#NOTE(review): 46 is hard-coded; any column at position 46 or beyond is silently left out - confirm this is intended.
X = data.iloc[:,range(0,46)].values

In [None]:
#Prepend a column of ones to X; it acts as the constant (intercept) term for the stats model.
intercept = np.ones((X.shape[0], 1)).astype(int)
X1 = np.concatenate((intercept, X), axis=1)

In [None]:
#Check the dimensions: X1 should have one more column than X (the added column of ones).
X1.shape

There we go, our feature set is now ready for the feature selection model with 1s in the first column for a0 constant.

Let's fit stats model on the X array to figure out an optimal set of features by recursively checking for the highest p value and removing the feature of that index.

### Note:
Here we take the level of significance as 0.05 (i.e. 5%). This means that we repeatedly drop the feature with the highest p-value from the feature array and re-fit the model until the p-values of all the remaining features are below 0.05, which gives the optimal combination of features for our model.

In [None]:
#Select all the columns of X1 (the column of ones plus the 46 features) as the starting feature set.
#NOTE(review): the backward-elimination loop below is commented out, so X_opt currently
#holds every column of X1 and no feature is actually removed.
X_opt = X1[:,range(0,47)]
#NOTE(review): sm is statsmodels.formula.api here; confirm it exposes OLS before re-enabling this code.
#regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()

#Fetch p values for each feature
#p_Vals = regressor_OLS.pvalues

#define significance level for accepting the feature.
#sig_Level = 0.05

#Loop over the features: while the highest p value is above sig_Level, remove that feature and re-fit
#while max(p_Vals) > sig_Level:
    #print("Probability values of each feature \n")
    #print(p_Vals)
    #X_opt = np.delete(X_opt, np.argmax(p_Vals), axis = 1)
    #print("\n")
    #print("Feature at index {} is removed \n".format(str(np.argmax(p_Vals))))
    #print(str(X_opt.shape[1]-1) + " dimensions remaining now... \n")
    #regressor_OLS = sm.OLS(endog = Y, exog = X_opt).fit()
    #p_Vals = regressor_OLS.pvalues
    #print("=================================================================\n")
    
#Print final summary
#print("Final stat summary with optimal {} features".format(str(X_opt.shape[1]-1)))
#regressor_OLS.summary()

In [None]:
#Split raw data: 80% train / 20% test, with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,Y, random_state=4, test_size=0.2)

#Split data from the feature selection group (X_opt) with the same seed and test size.
X_train_fs, X_test_fs, y_train_fs, y_test_fs = train_test_split(X_opt,Y, random_state=4, test_size=0.2)

In [None]:
#Split for the feature extraction (PCA) group: same inputs, seed and test size as the raw split;
#these arrays are scaled and PCA-transformed in the following cells.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X,Y, random_state=4, test_size=0.2)

### Scale Data
It is suggested to scale the input variables before applying PCA, to standardise their variance and avoid bias towards features with larger scales. Let's scale the data using StandardScaler.

In [None]:
#Standardise the features before PCA.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
#Fit the scaler on the training split only, then apply the same transformation to the
#test split so that no statistics from the test data influence the scaling.
X_train_pca = scaler.fit_transform(X_train_pca)
X_test_pca = scaler.transform(X_test_pca)

### PCA application
Let's apply the PCA technique on the training features to understand how many principal components we should select for our model to capture at least 90% of the variance. For that we will use a plot and the cumsum function of the numpy package.

In [None]:
#Fit a PCA with default settings on the scaled training features to inspect the explained variance.
from sklearn.decomposition import PCA
pca = PCA().fit(X_train_pca)
#Optional plot of the cumulative explained variance:
#plt.plot(np.cumsum(pca.explained_variance_ratio_))
#plt.xlabel("number of components")
#plt.ylabel("Cumulative explained variance")
#plt.show()

In [None]:
#Cumulative explained variance (in %) after each principal component.
arr = np.cumsum(np.round(pca.explained_variance_ratio_, decimals=4)*100)
#Pair each value with its 1-based component count. enumerate covers every entry,
#whereas zipping with range(1, len(arr)) silently dropped the last component.
list(enumerate(arr, start=1))

### Observation
- Here we can see that almost 40 principal components are needed to capture at least 99% of the variance in the training dataset. Hence we will keep roughly that many components (43 in the next cell).

In [None]:
#Project the scaled features onto 43 principal components.
#NOTE(review): the name pca_10 does not match the configured number of components (43).
pca_10 = PCA(n_components=43)
#Fit the projection on the training split only and apply the same projection to the test split.
X_train_pca = pca_10.fit_transform(X_train_pca)
X_test_pca = pca_10.transform(X_test_pca)

PCA is applied on the training and the test dataset. Our input features are now ready for the regression.

### Observations
 - All of the extracted features show **NO** correlation with each other at all, because feature extraction removes all collinearity.
 
Let's move on to the Model now.

<a id=model></a>
# Model
***
We need a model to train on our dataset to serve our purpose of predicting the NYC taxi trip duration given the other features as training and test set. Since our dependent variable contains continuous values, we will use a regression technique to predict our output.

<a id=lin_reg></a>
## Multiple Linear Regression
***
It is used to explain the relationship between one continuous dependent variable and two or more independent variables. Let's proceed

<a id=train></a>
## Model training
***
We will first try with the default instantiation of the regressor object without using any generalization (regularization) parameter. We will also **not perform any scaling** of the features beyond the standardisation already applied before PCA, because the predictions of an ordinary least-squares linear regression do not depend on the scale of its inputs. This is a plus point of the linear regression model. It is also quite fast to train even on very large datasets. So considering the size of our dataset this seems to be the correct approach as of now. Let's see how it performs.

<img src='http://www.sixthcents.net/images/macbook.gif'/>

In [None]:
#Linear regressor for the raw data (disabled)
#regressor = LinearRegression() 
#regressor.fit(X_train,y_train) 

#Linear regressor for the Feature selection group (disabled)
#regressor1 = LinearRegression() 
#regressor1.fit(X_train_fs,y_train_fs) 

#Linear regressor for the Feature extraction group: the only one trained, on the scaled + PCA-transformed features
regressor2 = LinearRegression() 
regressor2.fit(X_train_pca,y_train_pca) 

### Interesting find:
 - It took **approx 1 second to train the model** on dataset of more than 1 million records.
 - It is evident that Linear regression model is **extremely fast** to train on the high dimension datasets consisting of even **millions** of records.
 - Linear regression object for the feature extraction group took less time to train on the input features.

<a id=predict></a>
## Model prediction
***
So now, our model has been fitted to the training set. It's time to predict the dependent variable. Let's do that now.

In [None]:
#Predict from the test features of raw data (disabled)
#y_pred = regressor.predict(X_test) 

#Predict from the test features of Feature Selection group (disabled)
#y_pred = regressor1.predict(X_test_fs) 

#Predict from the PCA-transformed test features of the Feature Extraction group
y_pred_pca = regressor2.predict(X_test_pca) 

<a id=evaluate></a>
## Model evaluation
***
We will evaluate our model's accuracy through two commonly suggested metrics for regression models, i.e. RMSE and the variance score (the R² coefficient of determination returned by the regressor's `score` method). An RMSE of 0 and a variance score of 1 are the best possible scores for a prediction model.

In [None]:
#Evaluate the regressor on the raw data
#print('RMSE score for the Multiple LR raw is : {}'.format(np.sqrt(metrics.mean_squared_error(y_test,y_pred))))
#print('Variance score for the Multiple LR raw is : %.2f' % regressor.score(X_test, y_test))
#print("\n")

#Evaluate the regressor on the Feature selection group
#print('RMSE score for the Multiple LR FS is : {}'.format(np.sqrt(metrics.mean_squared_error(y_test_fs,y_pred))))
#print('Variance score for the Multiple LR FS is : %.2f' % regressor1.score(X_test_fs, y_test_fs))
#print("\n")

#Evaluate the regressor on the Feature extraction group:
#RMSE on the held-out test split and the regressor's own score on the same split.
rmse_lr_pca = np.sqrt(metrics.mean_squared_error(y_test_pca, y_pred_pca))
score_lr_pca = regressor2.score(X_test_pca, y_test_pca)
print(f'RMSE score for the Multiple LR PCA is : {rmse_lr_pca}')
print(f'Variance score for the Multiple LR PCA is : {score_lr_pca:.2f}')

In [None]:
#Persist the fitted linear regressor to disk with joblib.
#NOTE(review): pickle is imported here but not used in this cell.
import pickle
import joblib
filename = 'mlr.pkl'
joblib.dump(regressor2, filename)

### Observations
 - Very poor **Root mean squared** value. 
 - And the low **variance score** which is also bad.
 - Both the models i.e. from the feature selection and the feature extraction group resulted quite bad in prediction.
 
 **Let's find out the reason of this behaviour:-**

In [None]:
#Check the dimensions of the raw training features (rows, feature columns).
X_train.shape

In [None]:
#Find linear correlation of each feature with the target variable
from scipy.stats import pearsonr

#Put the training features and the target side by side; y_train becomes the LAST column.
df1 = pd.DataFrame(np.concatenate((X_train, y_train.reshape(len(y_train), 1)), axis=1))
df1.columns = df1.columns.astype(str)

#Derive the feature/target columns from the frame layout instead of hard-coded positions:
#the previous fixed index 35 pointed at a feature column, not at the appended target.
features = df1.columns[:-1].tolist()
target = df1.columns[-1]

#Pearson correlation coefficient of every feature column against the target column.
correlations = {}
for f in features:
    key = f + ' vs ' + target
    correlations[key] = pearsonr(df1[f].values, df1[target].values)[0]

#Show the correlations ordered by absolute strength, strongest first.
data_correlations = pd.DataFrame(correlations, index=['Value']).T
data_correlations.loc[data_correlations['Value'].abs().sort_values(ascending=False).index]

### Observations
We can see that none of the features is linearly correlated with the target variable **"46"**. That is why linear regression is not a good model for the prediction of the trip duration. So let's move ahead and try the **random forest regressor**. We are not using a decision tree regressor because the random forest will anyway retain almost all of its properties. Also, we will not use SVR because it takes too much time to train on this huge dataset even with the default settings. It does not seem to cope well with high-dimensional datasets or with a huge number of instances.

<a id=rf_reg></a>
## Random Forest Regressor
***
A random forest is a meta estimator that fits a number of decision trees (regression trees, in the case of the regressor) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.

### Model training
***
Now we will train the model on the filtered features. Our data has already been split so we will not split the data further.

#### Note:
We used **GridSearch** to tune the **hyperparameters** of random forest regressor to get the best possible test score. We tried various combination of the allowed hyper params values. _But any kind of combination could not produce significantly better results than the default settings. There can be many reasons for that and it totally depends on the type of data we have in hand. Therefore we will not show tuned regressor results here._

In [None]:
#instantiate the object for the Random Forest Regressor with default params from raw data
#regressor_rfraw = RandomForestRegressor(n_jobs=-1)

#instantiate the object for the Random Forest Regressor with default params for Feature Selection Group
#regressor_rf = RandomForestRegressor(n_jobs=-1)

# #instantiate the object for the Random Forest Regressor with tuned hyper parameters for Feature Selection Group
# regressor_rf1 = RandomForestRegressor(n_estimators = 26,
#                                      max_depth = 22,
#                                      min_samples_split = 9,
#                                      n_jobs=-1)

#instantiate the object for the Random Forest Regressor for Feature Extraction Group.
#random_state is fixed (same seed as the train/test split) so the forest, and therefore
#the reported scores, are reproducible across runs instead of fluctuating on every run.
regressor_rf2 = RandomForestRegressor(n_jobs=-1, random_state=4)


#Train the object with default params for raw data
#regressor_rfraw.fit(X_train,y_train)

#Train the object with default params for Feature Selection Group
#regressor_rf.fit(X_train_fs,y_train_fs)

# #Train the object with tuned params for Feature Selection Group
# regressor_rf1.fit(X_train_fs,y_train_fs)

#Train the object with default params for Feature Extraction Group (scaled + PCA features)
regressor_rf2.fit(X_train_pca,y_train_pca)

print("\n")

### Model prediction

In [None]:
#Predict the output with object of default params for raw data (disabled)
#y_pred_rfraw = regressor_rfraw.predict(X_test)

#Predict the output with object of default params for Feature Selection Group (disabled)
#y_pred_rf = regressor_rf.predict(X_test_fs)

# #Predict the output with object of hyper tuned params for Feature Selection Group (disabled)
# y_pred_rf1 = regressor_rf1.predict(X_test_fs)

#Predict the output for the Feature Extraction Group from the PCA-transformed test features
y_pred_rfpca = regressor_rf2.predict(X_test_pca)

print("\n")

### Model evaluation

In [None]:
#Evaluate the model with default params for raw data
#print('RMSE score for the RF regressor raw is : {}'.format(np.sqrt(metrics.mean_squared_error(y_test,y_pred_rfraw))))
#print('RMSLE score for the RF regressor raw is : {}'.format(np.sqrt(metrics.mean_squared_log_error(y_test,y_pred_rfraw))))
#print('Variance score for the RF regressor raw is : %.2f' % regressor_rfraw.score(X_test, y_test))

#print("\n")

#Evaluate the model with default params for Feature Selection Group
#print('RMSE score for the RF regressor is : {}'.format(np.sqrt(metrics.mean_squared_error(y_test_fs,y_pred_rf))))
#print('RMSLE score for the RF regressor is : {}'.format(np.sqrt(metrics.mean_squared_log_error(y_test_fs,y_pred_rf))))
#print('Variance score for the RF regressor is : %.2f' % regressor_rf.score(X_test_fs, y_test_fs))

# print("\n")

# #Evaluate the model with tuned params for Feature Selection Group
# print('RMSE score for the RF regressor1 is : {}'.format(np.sqrt(metrics.mean_squared_error(y_test_fs,y_pred_rf1))))
# print('RMSLE score for the RF regressor1 is : {}'.format(np.sqrt(metrics.mean_squared_log_error(y_test_fs,y_pred_rf1))))
# print('Variance score for the RF regressor1 is : %.2f' % regressor_rf1.score(X_test_fs, y_test_fs))

#print("\n")

#Evaluate the model trained on the Feature Extraction (PCA) Group:
#RMSE on the held-out test split and the regressor's own score on the same split.
rmse_rf_pca = np.sqrt(metrics.mean_squared_error(y_test_pca, y_pred_rfpca))
score_rf_pca = regressor_rf2.score(X_test_pca, y_test_pca)
print(f'RMSE score for the RF regressor2 is : {rmse_rf_pca}')
print(f'Variance score for the RF regressor2 is : {score_rf_pca:.2f}')

In [None]:
#Persist the fitted random forest regressor to disk (joblib is imported in an earlier cell).
filename = 'rfr.pkl'
joblib.dump(regressor_rf2, filename)

### Interesting find
- There is approx **200% improvement** on the RMSE score for the Random forest regressor over the Linear regressor of the feature selection group.
- Even the variance score is approx 1 which is a good score.
- RMSE score for the RF regressor of feature extraction group is still very bad along with the variance score.
- RMSE score for the feature selection group is more or less same as the raw data score. Sometimes the RMSE score for the raw data is better and vice versa. It fluctuates on every iteration and this is quite weird!

Let's see if we can improve this further with the most sought after algorithm i.e. XGBoost!!

<a id=xgboost></a>
## XGBoost Regressor
***
XGBoost (Extreme Gradient Boosting) is an optimized distributed gradient boosting library. It uses gradient boosting (GBM) framework at core. It belongs to a family of boosting algorithms that convert weak learners into strong learners. A weak learner is one which is slightly better than random guessing.

'Boosting' here is a sequential process; i.e., trees are grown using the information from a previously grown tree one after the other. This process slowly learns from data and tries to improve its prediction in the subsequent iterations.

### Model training
***
We will train the model on the filtered features. Our data has already been split so we will not split the data further.

#### Note:
We used **GridSearch** to tune the **hyperparameters** of XGBoost regressor to get the best possible test score.  We will compare results from the default regressor and the tuned regressor.

In [None]:
#instantiate the object for the XGBoost Regressor with default params for raw data
#regressor_xgbraw = XGBRegressor(n_jobs=-1)

#instantiate the object for the XGBoost Regressor with default params for Feature Selection Group
#regressor_xgb = XGBRegressor(n_jobs=-1)

#instantiate the object for the XGBoost Regressor with tuned hyper parameters.
#verbosity=0 replaces the deprecated silent=1 flag (both request silent training).
regressor_xgb1 = XGBRegressor(n_estimators=300,
                            learning_rate=0.09,
                            gamma=0,
                            subsample=0.75,
                            colsample_bytree=1,
                            max_depth=7,
                            min_child_weight=4,
                            verbosity=0,
                            n_jobs=-1)

#instantiate the object for the XGBoost Regressor for Feature Extraction Group
#regressor_xgb2 = XGBRegressor(n_jobs=-1)


#Train the object with default params for raw data
#regressor_xgbraw.fit(X_train,y_train)

#Train the object with default params for Feature Selection Group
#regressor_xgb.fit(X_train_fs,y_train_fs)

#Train the tuned object on the Feature Extraction Group (scaled + PCA-transformed features)
regressor_xgb1.fit(X_train_pca,y_train_pca)

#Train the object with default params for Feature Extraction Group
#regressor_xgb2.fit(X_train_pca,y_train_pca)

print("\n")

### Model prediction

In [None]:
#Predict the output with object of default params for raw data (disabled)
#y_pred_xgbraw = regressor_xgbraw.predict(X_test)

#Predict the output with object of default params for Feature Selection Group (disabled)
#y_pred_xgb = regressor_xgb.predict(X_test_fs)

#Predict the output with the tuned object from the PCA-transformed test features
y_pred_xgb1 = regressor_xgb1.predict(X_test_pca)

#Predict the output with object of default params for Feature Extraction Group (disabled)
#y_pred_xgb_pca = regressor_xgb2.predict(X_test_pca)

print("\n")

### Model Evaluation

In [None]:
#Evaluate the model with default params for raw data
#print('RMSE score for the XGBoost regressor raw is : {}'.format(np.sqrt(metrics.mean_squared_error(y_test,y_pred_xgbraw))))
# print('RMSLE score for the XGBoost regressor is : {}'.format(np.sqrt(metrics.mean_squared_log_error(y_test,y_pred_xgb))))
#print('Variance score for the XGBoost regressor raw is : %.2f' % regressor_xgbraw.score(X_test, y_test))

print("\n")

#Evaluate the model with default params for Feature Selection Group
#print('RMSE score for the XGBoost regressor is : {}'.format(np.sqrt(metrics.mean_squared_error(y_test_fs,y_pred_xgb))))
# print('RMSLE score for the XGBoost regressor is : {}'.format(np.sqrt(metrics.mean_squared_log_error(y_test,y_pred_xgb))))
#print('Variance score for the XGBoost regressor is : %.2f' % regressor_xgb.score(X_test_fs, y_test_fs))

print("\n")

#Evaluate the model with Tuned params for Feature Selection Group
#print('RMSE score for the XGBoost regressor1 is : {}'.format(np.sqrt(metrics.mean_squared_error(y_test_fs,y_pred_xgb1))))
# print('RMSLE score for the XGBoost regressor1 is : {}'.format(np.sqrt(metrics.mean_squared_log_error(y_test_fs,y_pred_xgb1))))
#print('Variance score for the XGBoost regressor1 is : %.2f' % regressor_xgb1.score(X_test_fs,y_test_fs))

print("\n")

#Evaluate the tuned model (regressor_xgb1) on the Feature Extraction (PCA) Group.
#NOTE(review): the printed label says "regressor2" although the object evaluated is regressor_xgb1.
rmse_xgb_pca = np.sqrt(metrics.mean_squared_error(y_test_pca, y_pred_xgb1))
score_xgb_pca = regressor_xgb1.score(X_test_pca, y_test_pca)
print(f'RMSE score for the XGBoost regressor2 is : {rmse_xgb_pca}')
print(f'Variance score for the XGBoost regressor2 is : {score_xgb_pca:.2f}')

In [None]:
#Persist the fitted XGBoost regressor to disk (joblib is imported in an earlier cell).
filename = 'xgbr.pkl'
joblib.dump(regressor_xgb1, filename)

### Observations
- There is a significant **improvement** in the RMSE score for the **tuned** XGBoost regressor over the Random forest regressor when trained on the feature selection group.
- But the performance of the **default** XGBoost regressor is quite **worse** than the default RF regressor on the same data.
- Also, the RMSE score on the raw data and feature selected data are same, which disproves the theory that it is always better to select the relevant features which are statistically important. As the data behaves differently in different models.
- Not to mention the fact that RMSE score for the XGBoost regressor of the feature extraction group is still bad along with the variance score. 

In [None]:
#Comparing test results for the XGBoost and RF regressor.
#Errors are made absolute BEFORE summing: summing signed differences lets over- and
#under-predictions cancel out, which makes a poor model look as good as an accurate one.
#y_test_pca is the label split that belongs to the PCA features both models predicted from.
print("Total sum of absolute difference between the actual and the predicted values for the RF regressor is : %d"%np.sum(np.abs(np.subtract(y_test_pca,y_pred_rfpca))))
print("Total sum of absolute difference between the actual and the predicted values for the tuned XGB regressor is : %d"%np.sum(np.abs(np.subtract(y_test_pca,y_pred_xgb1))))

### General inference
- XGBoost proved to be much more efficient in predicting the output. But it takes much more time to train over a large, more complex dataset compared to the RF and linear regression models, though less time than the SVR.
- Tuning the hyperparameters of the RF model didn't help us much to generalize the model, as there is not much difference in the RMSE scores of the default model and the tuned model of the feature selection group; in fact both vary on every iteration and sometimes the tuned model gives poorer results than the default model. Though we tried many possible alterations with GSCV, the tuning could not achieve a significant improvement over the default model, which also depends on the contents of the dataset.
- Contrast to the RF regressor, XGBoost regressor prediction results were consistent on every iteration i.e. for each param configuration the results were the same.
- Feature extraction didn't help in any way to improve the RMSE score with any of the regressor models. This shows us that feature extraction is not necessarily a good technique to preprocess the data before feeding it into the regressor models for continuous target value prediction. However, how it behaves with the model also depends on the type and features of the data.

<a id=curve></a>
## Learning curves
***
Learning curves constitute a great tool to diagnose bias and variance in any supervised learning algorithm. It shows how error changes as the training set size increases. We'll use the learning_curve() [function](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.learning_curve.html) from the scikit-learn library to generate a learning curve for the regression model. There's no need to put aside a validation set because learning_curve() will take care of that and that's why we will plot the learning curve over whole dataset.

In [None]:
#Define a function to plot learning curve.
def learning_curves(estimator, title, features, target, train_sizes, cv, n_jobs=-1):
    """Plot training and validation MSE against the training set size.

    estimator:   regressor handed to sklearn's learning_curve.
    title:       model name inserted into the plot title.
    features:    feature matrix passed to learning_curve.
    target:      label vector passed to learning_curve.
    train_sizes: training set sizes forwarded to learning_curve.
    cv:          cross-validation splitter forwarded to learning_curve.
    n_jobs:      number of parallel jobs forwarded to learning_curve.

    Returns the matplotlib.pyplot module so the caller can show or save the figure.
    """
    plt.figure(figsize=(14, 5))
    sizes, fit_scores, val_scores = learning_curve(
        estimator,
        features,
        target,
        train_sizes=train_sizes,
        cv=cv,
        scoring='neg_mean_squared_error',
        n_jobs=n_jobs,
    )

    # The scorer yields negated MSE values; average over the CV splits and flip the sign.
    fit_mse = -fit_scores.mean(axis=1)
    val_mse = -val_scores.mean(axis=1)

    plt.grid()

    curves = (
        (fit_mse, "r", 'Training error'),
        (val_mse, "g", 'Validation error'),
    )
    for errors, colour, label in curves:
        plt.plot(sizes, errors, 'o-', color=colour, label=label)

    plt.ylabel('MSE', fontsize=14)
    plt.xlabel('Training set size', fontsize=14)
    plt.title('Learning curves for a ' + title + ' model', fontsize=18, loc='left')
    plt.legend(loc="best")

    return plt

# score curves, each time with 20% data randomly selected as a validation set.
cv = ShuffleSplit(n_splits=10, test_size=0.2, random_state=4)

# Plot learning curve for the tuned XGBoost Regressor.
# The estimator passed below is regressor_xgb1, so the plot is titled accordingly
# (it was previously labelled "Random Forest Regressor").
title = "XGBoost Regressor"

# Call learning curve with all dataset i.e. training and test combined because CV will take care of data split.
learning_curves(regressor_xgb1, title, X_opt,Y, train_sizes=np.linspace(.1, 1.0, 5), cv=cv, n_jobs=-1)

#Plot learning curve for the XGBoost Regressor
#title = "XGBoost Regressor"

# Call learning curve on less number of estimators than the tuned estimator because it took too much time for the compilation.
#learning_curves(XGBRegressor(n_estimators=111,
                            #learning_rate=0.08,
                            #gamma=0,
                            #subsample=0.75,
                           # colsample_bytree=1,
                            #max_depth=7,
                            #min_child_weight=4,
                            #silent=1), title, X_opt,Y, train_sizes=np.linspace(.1, 1.0, 5), cv=cv, n_jobs=-1)

plt.show()

### Observations:
- We can observe that both the models shows somewhat similar learning rate but with visible differences in error rates. 
- The RF training curve initially starts high but later on improves as the training size increases and then seems to have plateaued by the end.
- XGBoost training curve on the other hand starts quite low and further improves with the increase in the training size and it too plateau towards the end.
- Validation curve seems to show similar trend in both the models i.e. starts very high but improves with the training size with some differences in error rate i.e. XGBoost curve learning is quite fast and more accurate as compared to the RF one.
- Both models seem to suffer from **high variance**, since the training error is very low in both of them while the validation error stays well above it.
- The large gap at the end, together with the low training error, also indicates that the models have quite a **low bias**, i.e. they are overfitting the training data.
- Also, the validation curves of both models still have the potential to decrease and converge towards the training curves by the end.

**At this point, here are a few things we could do to improve our model:**

1. Add more training instances to improve validation curve in the XGBoost model.
2. Increase the regularization for the learning algorithm. This should decrease the variance and increase the bias towards the validation curve.
3. Reduce the numbers of features in the training data that we currently use. The algorithm will still fit the training data very well, but due to the decreased number of features, it will build less complex models. This should increase the bias and decrease the variance.

<a id=final></a>
## End Notes
***
In this project we covered various aspects of the Machine learning development cycle. We observed that the data exploration and variable analysis is a very important aspect of the whole cycle and should be done for thorough understanding of the data. We also cleaned the data while exploring as there were some outliers which should be treated before feature engineering. Further we did feature engineering to filter and gather only the optimal features which are more significant and covered most of the variance in the dataset. Then finally we trained the models on the optimum featureset to get the results. 

## Further Scope..
***
There's always a room for the improvement and a lot more to explore, and **if this helped you** in any way, I'd like to see **One Upvote!**. Also, please **leave comments** about any further improvements to this notebook!! Your feedback or any constructive criticism is highly appreciated.

## Thank you guys... Yayyy!!!!


<img src='https://media.agoramt.com.br/2018/08/Minions.gif' align='left'/>