In [720]:
# Importing the Dependencies
import numpy as np
import pandas as pd
pd.options.display.max_rows = 10
pd.options.display.max_columns = 300
import seaborn as sns
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.express as px
from plotly.subplots import make_subplots
from plotly import graph_objects as go
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

* Features ---- Unit ---- Description
* Clonesize ---- m2 ---- The average blueberry clone size in the field
* Honeybee ---- bees/m2/min ---- Honeybee density in the field
* Bumbles ---- bees/m2/min ---- Bumblebee density in the field
* Andrena ---- bees/m2/min ---- Andrena bee density in the field
* Osmia ---- bees/m2/min ---- Osmia bee density in the field
* MaxOfUpperTRange ---- ℃ ---- The highest record of the upper band daily air temperature during the bloom season
* MinOfUpperTRange ---- ℃ ---- The lowest record of the upper band daily air temperature
* AverageOfUpperTRange ---- ℃ ---- The average of the upper band daily air temperature
* MaxOfLowerTRange ---- ℃ ---- The highest record of the lower band daily air temperature
* MinOfLowerTRange ---- ℃ ---- The lowest record of the lower band daily air temperature
* AverageOfLowerTRange ---- ℃ ---- The average of the lower band daily air temperature
* RainingDays ---- Day ---- The total number of days during the bloom season, each of which has precipitation larger than zero
* AverageRainingDays ---- Day ---- The average of raining days of the entire bloom season


In [721]:
# Read the simulation CSV and discard its 'Row#' column in one chained expression.
Data = (
    pd.read_csv('../input/wild-blueberry-pollination-simulation-data/WildBlueberryPollinationSimulationData.csv')
    .drop(columns='Row#')
)
Data

In [722]:
Data.describe()

In [723]:
def missing_unique_count_skew( df ):
  """Print a short profile of every column of ``df``.

  For each column the function prints its name, the number of distinct
  values, the distinct values themselves, the skewness (NaNs skipped) and
  the percentage of missing entries, followed by a separator line and a
  blank line. Nothing is returned.
  """
  separator = "--------------------------------------------------------------"
  for name in df.columns.values:
    series = df[name]
    distinct = list(series.unique())
    missing_pct = series.isnull().sum()/series.shape[0]*100

    print( "Feature:- ", name )
    print("No. of Unique Values:-", len(distinct) )
    print("Unique Values:-", distinct)
    print("Skewness:-", series.skew(skipna = True) )
    print('Percentage of Missing Values:- ', missing_pct )
    print(separator)
    print("\n")


missing_unique_count_skew( Data )

All features are numerical, and none of them has any missing values.

In [666]:
# Column groups used by the plots below: the continuous predictors, the target,
# and every remaining column (treated by this notebook as discrete).
continous_features = [ 'fruitset', 'fruitmass', 'seeds' ]
target_feature = [ 'yield']
# Filter the columns in their original order. Set subtraction would return them
# in an order that changes between kernel runs, because string hashing is
# randomised per interpreter process.
_non_discrete = set(continous_features) | set(target_feature)
discrete_features = [column for column in Data.columns.values if column not in _non_discrete]
discrete_features

In [667]:
# Correlation matrix of all columns of Data, rendered as a heatmap
px.imshow( Data.corr(),color_continuous_scale='RdBu_r', width=1200, height=1000 )

In [668]:
Data

In [669]:
# Empirical CDF of the 'yield' column, with a histogram drawn in the margin.
fig = px.ecdf(Data, x="yield", lines=True, marginal="histogram")
fig.update_layout( height=750 )
fig.show()

In [670]:
def plot_histograms( df ):
  """Show one histogram per column of ``df``, laid out in a single row.

  Each column gets its own subplot, titled with the column name. The figure
  is 800 px wide per column and 700 px high, and is displayed immediately.
  Returns ``None``.
  """
  feature_names = df.columns.values
  fig = make_subplots( rows=1, cols=df.shape[1], subplot_titles=feature_names )
  for position, feature in enumerate(feature_names, start=1):
    fig.add_trace( go.Histogram( x=df[feature], name=feature ), row=1, col=position )

  fig.update_layout( bargap=0.2, width=len(feature_names)*800, height=700 )
  fig.show()

plot_histograms( Data )

In [671]:
def plot_scatter_trend( df, columns ):
  """Scatter every feature in ``columns`` against ``df['yield']``.

  One subplot per feature is placed in a single row, titled with the feature
  name. The figure is 800 px wide per feature and 700 px high, and is
  displayed immediately. Returns ``None``.
  """
  panel_count = len(columns)
  fig = make_subplots( rows=1, cols=panel_count, subplot_titles=columns )
  for panel, feature in enumerate(columns, start=1):
    markers = go.Scatter( x=df[feature], y=df['yield'], mode='markers' )
    fig.add_trace( markers, row=1, col=panel )

  fig.update_layout( bargap=0.2, width=panel_count*800, height=700, yaxis_title='yield' )
  fig.show()

plot_scatter_trend( Data, columns=continous_features )

In [672]:
# One horizontal box per continuous feature. Each trace must receive that
# feature's own column; indexing with the whole `continous_features` list
# would hand every trace the same multi-column frame.
fig = go.Figure()
for feature in continous_features:
    fig.add_trace( go.Box( x=Data[feature], name=feature)  )

fig.update_layout( height=500 )
fig.show()

In [673]:
# sns.pairplot(Data, size=4.5)

# Train Test Split

In [674]:
from sklearn.model_selection import train_test_split
# Keep the target as a one-column DataFrame so it can later be indexed by 'yield'.
Data_target_df = Data[['yield']]
# 75/25 split with a fixed seed; the features are every column except 'yield'.
X_train,X_test,y_train,y_test = train_test_split( Data.drop(columns=['yield']), Data_target_df, test_size=0.25, random_state=10 )
# Display the shapes of the four resulting pieces.
X_train.shape,X_test.shape,y_train.shape,y_test.shape

# Assumptions of Common Machine Learning Models

In [675]:
# Rebuild a single training frame (features + target) for the assumption checks below.
data=X_train.copy()
# .values attaches the target positionally, ignoring index alignment.
data['yield']=y_train['yield'].values
data

Assumption 1: There is a Linear Relationship between the Independent and Dependent Variables.

In [676]:
# One panel per column of `data` plotted against 'yield'; x_vars is every column,
# so 'yield' itself is also plotted against 'yield'.
fig_sns = sns.pairplot(data, x_vars=data.columns.values, y_vars='yield')

In [677]:
data.corr()[['yield']]

Assumption 2: No Multicollinearity | VIF

In [678]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

def VIF( df ):
    """Return the variance inflation factor of every column of ``df``.

    The values of ``df`` are passed to statsmodels as-is (no constant column
    is added here). The result is a DataFrame with one row per column of
    ``df``: 'Features' holds the column name, 'VIF Value' its VIF.
    """
    design_matrix = df.values
    vif_values = []
    for position in range(len(df.columns)):
        vif_values.append(variance_inflation_factor(design_matrix, position))
    return pd.DataFrame({'Features': df.columns.values, 'VIF Value': vif_values})
    
VIF_df = VIF( data )
VIF_df

In [679]:
fig = px.histogram(x=VIF_df["Features"], y=np.log(VIF_df['VIF Value']))
fig.show()

VIF = 1 → No correlation

In [680]:
VIF_df[ VIF_df['VIF Value']<=1]

VIF = 1 to 5 → Moderate correlation

In [681]:
VIF_df[ (VIF_df['VIF Value']>1) & (VIF_df['VIF Value']<5) ]

VIF >10 → High correlation

In [682]:
VIF_df[ VIF_df['VIF Value']>10 ]

In [683]:
# Summary Report DataFrame
report_df = VIF_df.copy()
report_df['Correlation with Target Variable'] = data.corr()['yield'].values

### Assumption 3: No Autocorrelation

In [684]:
# Lag-1 autocorrelation of every column listed in the report, in report order.
temp = [data[feature_name].autocorr() for feature_name in report_df['Features'].values]

report_df['Autocorrelation Lag_1'] = temp

In [685]:
fig = px.histogram(x=report_df["Features"], y=report_df['Autocorrelation Lag_1'])
fig.show()

In [686]:
from statsmodels.stats.stattools import durbin_watson
# Durbin-Watson statistic of every column listed in the report, in report order.
# NOTE(review): the statistic is computed on each raw column of `data` here, not
# on the residuals of a fitted model - confirm this is what is intended.
temp = []
for columns in report_df['Features'].values:
    temp.append( durbin_watson(data[columns].values) )

report_df['Durbin – Watson (DW) statistic'] = temp

DW = 2, implies no autocorrelation

In [687]:
# NOTE(review): this is an exact floating-point comparison, so a row is selected
# only when its statistic is exactly 2.
report_df[ report_df['Durbin – Watson (DW) statistic']==2 ]

0 < DW < 2 implies positive autocorrelation

In [688]:
report_df[ report_df['Durbin – Watson (DW) statistic']<2 ]

2 < DW < 4 indicates negative autocorrelation

In [689]:
report_df[ (report_df['Durbin – Watson (DW) statistic']>2) & (report_df['Durbin – Watson (DW) statistic']<4)  ]

### Assumption 4: Mean of Residuals

In [690]:
from sklearn.linear_model import LinearRegression
# Fit ordinary least squares on the training split and look at its training residuals.
regr = LinearRegression().fit(X_train, y_train)
y_pred = regr.predict(X_train)
# Flatten the residual array into a plain list; later cells reuse `residuals`.
residuals = list((y_train.values - y_pred).ravel())
mean_residuals = np.mean( residuals )
print("Mean of Residuals",(mean_residuals))

### Assumption 5: Residuals should be Homoskedastic

In [691]:
# Residuals-vs-fitted plot for the homoskedasticity check: the fitted values go
# on x and the training residuals (from the mean-of-residuals cell) go on y, so
# the data matches the axis titles.
# np.ravel accepts both the original 2-D prediction array and the flat list this
# cell leaves behind, so the cell can be re-run safely.
y_pred = list(np.ravel(y_pred))
fig = px.scatter( x=y_pred, y=residuals, trendline="ols")
fig.update_layout(xaxis_title='Fitted Values', yaxis_title='Residuals', title='Residuals vs Fitted Values Plot' )
fig.show()

### Assumption 6: Normal Distribution

In [692]:
# Histogram of the residuals with a KDE overlay. `sns.distplot` is deprecated and
# scheduled for removal; `histplot` with a density-normalised histogram is the
# documented replacement.
p = sns.histplot(residuals, kde=True, stat="density")
p = plt.title('Normality of error terms/residuals')

In [693]:
# Plot the Q-Q plot to graphically check for the hypothesis
from scipy import stats
for columns in data.columns.values:
    print(columns)
    res = stats.probplot(data[columns], plot=plt)
    plt.show()

In [694]:
### Data Normalization using Standard Scaler

# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# scaler.fit( X_train )
# temp = scaler.transform( X_train )
# X_train = pd.DataFrame( data=scaler.transform(X_train),columns=X_train.columns.values )
# X_test = pd.DataFrame( data=scaler.transform(X_test),columns=X_test.columns.values )

In [695]:
Data.describe()

# Feature Selection

Dropping Highly Correlated Features based on Correlation Matrix

In [696]:
# Replace the two average-temperature columns with their mean, then drop the
# columns judged redundant from the correlation matrix.
# 'MaxOfUpperTRange' is listed once only (it used to appear twice).
highly_correlated_features = [ 'AverageOfUpperTRange', 'MaxOfUpperTRange', 'MinOfUpperTRange', 'MaxOfLowerTRange', 'MinOfLowerTRange' , 'AverageOfLowerTRange', 'RainingDays', 'fruitset', 'fruitmass' ]
# Guard so the cell can be re-run: after the first run the source columns are gone
# and the engineered column already exists.
if 'Average_TRange' not in Data.columns:
    Data['Average_TRange'] = (Data['AverageOfUpperTRange'] + Data['AverageOfLowerTRange'])/2
# errors='ignore' keeps a second run from raising KeyError on already-dropped columns.
Data = Data.drop(columns=highly_correlated_features, errors='ignore')
Data

Select top 2 features based on mutual info regression


In [697]:
# from sklearn.feature_selection import SelectKBest, mutual_info_regression
# X = Data.drop(columns='yield')
# y = Data[['yield']]
# selector = SelectKBest(mutual_info_regression, k=6)
# selector.fit(X, y)
# X.columns[selector.get_support()]

When features are selected using SelectKBest, the selected features have high VIF values.

In [699]:
VIF_df_new = VIF( Data )
VIF_df_new

In [700]:
# Correlation matrix of the columns that remain in Data, rendered as a heatmap
px.imshow( Data.corr(),color_continuous_scale='RdBu_r', width=750, height=750 )

In [701]:
from sklearn.model_selection import train_test_split
# Re-split after feature selection. The target column is removed from the feature
# matrix (leaving 'yield' in X would leak the answer to every model), and the
# target is taken from Data directly because no variable named `y` is defined.
# The target stays a one-column DataFrame because later cells index it by 'yield'.
X_train,X_test,y_train,y_test = train_test_split( Data.drop(columns=['yield']), Data[['yield']], test_size=0.25, random_state=10 )
X_train.shape,X_test.shape,y_train.shape,y_test.shape

# Training Models and Evaluation

In [702]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
def Evaluation(model,X_train,X_test,y_train,y_test,hypertuning=False):
  """Print and return RMSE and R2 of a fitted model on the train and test sets.

  Parameters
  ----------
  model : fitted estimator exposing ``predict``. When ``hypertuning`` is true it
      must also expose ``best_params_`` and ``best_score_``.
  X_train, X_test : feature matrices passed to ``model.predict``.
  y_train, y_test : true targets for the corresponding feature matrix.
  hypertuning : bool, default False
      When true, the search's best parameters and best CV score are printed first.

  Returns
  -------
  tuple
      ``(rmse_train, rmse_test, r2_score_train, r2_score_test)``.
  """
  if hypertuning:
    print("Param for GS", model.best_params_)
    print("CV score for GS", model.best_score_)

  print( "-----------------------------------------------------------------------------------------------------------")
  print( " For Train Set :  ")
  y_pred = model.predict(X_train)

  # RMSE is computed as sqrt(MSE) rather than with `squared=False`, which is
  # deprecated and removed in recent scikit-learn releases.
  rmse_train = np.sqrt( mean_squared_error( y_train, y_pred ) )
  print("Train RMSE = ", rmse_train )
  r2_score_train = r2_score( y_train, y_pred )
  print( "Train R2 Score: ", r2_score_train )

  print( " For Test Set :  ")
  y_pred = model.predict(X_test)
  rmse_test = np.sqrt( mean_squared_error( y_test, y_pred ) )
  print("Test RMSE = ", rmse_test )
  r2_score_test = r2_score( y_test, y_pred )
  print( "Test R2 Score: ", r2_score_test )

  print('------------------------------------------------------------------------------------------------------------')
  print("\n")

  return  rmse_train, rmse_test, r2_score_train, r2_score_test

In [703]:
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, LogisticRegression, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

In [704]:
def apply_models_with_default_paramters(X_train,X_test,y_train,y_test):
  """Fit a fixed list of regressors with default settings and tabulate their scores.

  Each model is fitted on ``X_train`` / ``y_train['yield']`` and scored with
  ``Evaluation`` on both splits.

  Parameters
  ----------
  X_train, X_test : feature matrices.
  y_train, y_test : targets; ``y_train`` must be indexable by ``'yield'``.

  Returns
  -------
  pandas.DataFrame
      One row per model with its train/test RMSE and R2 and the train-minus-test
      differences, sorted by ascending test RMSE (ties broken by descending RMSE
      difference).
  """
  models_default = [ RandomForestRegressor(), XGBRegressor(base_score=0.5),
                     GradientBoostingRegressor(), ExtraTreesRegressor(),
                     ElasticNet(), Lasso(), Ridge(), LinearRegression(),
                     SGDRegressor(),AdaBoostRegressor(),
                     KNeighborsRegressor(), # SVR(),
                     ]

  RMSE_train = []
  RMSE_test = []
  R2_Score_train = []
  R2_Score_test = []
  Model_Name = []

  for model in models_default:
    print(model)
    Model_Name.append( model )

    # `.values` yields the 1-D target array; `Series.ravel` is deprecated in recent pandas.
    model.fit(X_train, y_train['yield'].values)
    rmse_train, rmse_test, r2_score_train, r2_score_test = Evaluation(model,X_train,X_test,y_train,y_test,False)

    RMSE_train.append( rmse_train )
    RMSE_test.append( rmse_test )
    R2_Score_train.append( r2_score_train )
    R2_Score_test.append( r2_score_test )

  results = pd.DataFrame()
  results['Model_Name'] = Model_Name

  # Train-minus-test gaps, used to spot over/underfitting. These are built from the
  # per-model lists; the loop's scalar variables only hold the last model's scores.
  train_test_RMSE_difference = np.subtract(RMSE_train,RMSE_test)
  train_test_r2_score_difference = np.subtract(R2_Score_train,R2_Score_test)

  results['RMSE on Test Set'] = RMSE_test
  results['RMSE on Train Set'] = RMSE_train
  results['Difference of RMSE on train and test set'] = train_test_RMSE_difference

  results['R2 Score on Test Set'] = R2_Score_test
  results['R2 Score on Train Set'] = R2_Score_train
  results['Difference of R2 Score on train and test set'] = train_test_r2_score_difference

  results = results.sort_values(by=['RMSE on Test Set','Difference of RMSE on train and test set'],ascending = [True, False])

  return results


In [705]:
results = apply_models_with_default_paramters(X_train,X_test,y_train,y_test)

In [706]:
results

# Hypertuning Using Optuna

In [707]:
!pip install optuna
import optuna
from sklearn.model_selection import cross_val_score

### Define the Tuning function so that it can be reused

In [708]:
def tune(objective, n_trials=25, seed=None):
    """Run an Optuna study that maximises ``objective`` and return the best parameters.

    Parameters
    ----------
    objective : callable taking an Optuna trial and returning the score to maximise.
    n_trials : int, default 25
        Number of trials to run.
    seed : int or None, default None
        Seed for the TPE sampler. ``None`` keeps the sampling unseeded; pass an
        integer to make the search repeatable.

    Returns
    -------
    dict
        The parameters of the best trial.
    """
    sampler = optuna.samplers.TPESampler(seed=seed)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=n_trials)

    params = study.best_params
    best_score = study.best_value
    print(f"Best score: {best_score}\n")
    print(f"Optimized parameters: {params}\n")
    return params

### Define Objective of each model

In [709]:
def randomforest_objective(trial):
    """Optuna objective: mean 5-fold negative RMSE of a random forest on the training split.

    Samples ``n_estimators``, ``max_depth``, ``min_samples_split`` and
    ``min_samples_leaf`` from the trial, and reads ``X_train`` / ``y_train`` from
    the notebook's global scope.
    """
    _n_estimators = trial.suggest_int("n_estimators", 50, 1000)
    _max_depth = trial.suggest_int("max_depth", 2, 20)
    _min_samp_split = trial.suggest_int("min_samples_split", 2, 10)
    _min_samples_leaf = trial.suggest_int("min_samples_leaf", 2, 10)

    rf = RandomForestRegressor( max_depth=_max_depth,
                                min_samples_split=_min_samp_split,
                                min_samples_leaf=_min_samples_leaf,
                                n_estimators=_n_estimators,
                                # Fixed seed (same value as the XGBoost objective and the
                                # split) so trial scores differ only through the parameters.
                                random_state=10
                              )
    # The forest expects a 1-D target; np.ravel flattens the one-column frame.
    scores = cross_val_score( rf, X_train, np.ravel(y_train), cv=5, scoring="neg_root_mean_squared_error"  )
    return scores.mean()

### Tuning Random Forest Regressor

In [710]:
best_params_RF = tune( randomforest_objective )

Tuning results with the original data (i.e., without dropping or modifying any features):

In [711]:
# [I 2022-07-30 12:29:40,754] A new study created in memory with name: no-name-03c06be6-9834-4a1f-9234-dffe16470622
# [I 2022-07-30 12:29:41,366] Trial 0 finished with value: -199.81979283104283 and parameters: {'n_estimators': 56, 'max_depth': 8, 'min_samples_split': 2, 'min_samples_leaf': 7}. Best is trial 0 with value: -199.81979283104283.
# [I 2022-07-30 12:29:42,564] Trial 1 finished with value: -206.20424965935658 and parameters: {'n_estimators': 124, 'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 8}. Best is trial 0 with value: -199.81979283104283.
# [I 2022-07-30 12:29:44,469] Trial 2 finished with value: -208.00736094358845 and parameters: {'n_estimators': 199, 'max_depth': 7, 'min_samples_split': 7, 'min_samples_leaf': 8}. Best is trial 0 with value: -199.81979283104283.
# [I 2022-07-30 12:29:49,537] Trial 3 finished with value: -185.87587546602657 and parameters: {'n_estimators': 501, 'max_depth': 6, 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 3 with value: -185.87587546602657.
# [I 2022-07-30 12:29:52,709] Trial 4 finished with value: -167.44706302177906 and parameters: {'n_estimators': 259, 'max_depth': 12, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:29:57,256] Trial 5 finished with value: -192.22890628771637 and parameters: {'n_estimators': 453, 'max_depth': 6, 'min_samples_split': 6, 'min_samples_leaf': 6}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:29:58,753] Trial 6 finished with value: -181.62321719533864 and parameters: {'n_estimators': 146, 'max_depth': 6, 'min_samples_split': 8, 'min_samples_leaf': 4}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:30:01,225] Trial 7 finished with value: -189.69459536354884 and parameters: {'n_estimators': 234, 'max_depth': 13, 'min_samples_split': 6, 'min_samples_leaf': 6}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:30:02,822] Trial 8 finished with value: -199.8485766074405 and parameters: {'n_estimators': 155, 'max_depth': 9, 'min_samples_split': 10, 'min_samples_leaf': 7}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:30:07,281] Trial 9 finished with value: -177.27135830657346 and parameters: {'n_estimators': 402, 'max_depth': 13, 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:30:17,609] Trial 10 finished with value: -167.9104531562972 and parameters: {'n_estimators': 858, 'max_depth': 19, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:30:28,017] Trial 11 finished with value: -167.54883162537124 and parameters: {'n_estimators': 881, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:30:39,521] Trial 12 finished with value: -167.79527943527674 and parameters: {'n_estimators': 977, 'max_depth': 20, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:30:44,575] Trial 13 finished with value: -417.40064881408364 and parameters: {'n_estimators': 712, 'max_depth': 2, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:30:52,170] Trial 14 finished with value: -167.54800892575128 and parameters: {'n_estimators': 646, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:30:57,834] Trial 15 finished with value: -219.80703784220117 and parameters: {'n_estimators': 633, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 10}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:31:01,293] Trial 16 finished with value: -177.3746603065906 and parameters: {'n_estimators': 335, 'max_depth': 16, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:31:07,812] Trial 17 finished with value: -172.41930116817326 and parameters: {'n_estimators': 603, 'max_depth': 16, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:31:11,254] Trial 18 finished with value: -171.76748770122464 and parameters: {'n_estimators': 317, 'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:31:18,264] Trial 19 finished with value: -182.48711618196444 and parameters: {'n_estimators': 712, 'max_depth': 13, 'min_samples_split': 9, 'min_samples_leaf': 5}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:31:23,335] Trial 20 finished with value: -219.9274604629474 and parameters: {'n_estimators': 573, 'max_depth': 18, 'min_samples_split': 5, 'min_samples_leaf': 10}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:31:33,470] Trial 21 finished with value: -167.7447228086886 and parameters: {'n_estimators': 855, 'max_depth': 18, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:31:44,992] Trial 22 finished with value: -167.6214153841942 and parameters: {'n_estimators': 989, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:31:53,953] Trial 23 finished with value: -171.79971332871446 and parameters: {'n_estimators': 825, 'max_depth': 20, 'min_samples_split': 5, 'min_samples_leaf': 3}. Best is trial 4 with value: -167.44706302177906.
# [I 2022-07-30 12:32:02,185] Trial 24 finished with value: -166.9401217325953 and parameters: {'n_estimators': 705, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:32:09,902] Trial 25 finished with value: -173.06945961399384 and parameters: {'n_estimators': 702, 'max_depth': 11, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:32:18,022] Trial 26 finished with value: -176.54573651860832 and parameters: {'n_estimators': 786, 'max_depth': 9, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:32:23,297] Trial 27 finished with value: -183.28691729608994 and parameters: {'n_estimators': 530, 'max_depth': 14, 'min_samples_split': 4, 'min_samples_leaf': 5}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:32:30,711] Trial 28 finished with value: -169.12714228266037 and parameters: {'n_estimators': 654, 'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:32:37,412] Trial 29 finished with value: -221.22236668932496 and parameters: {'n_estimators': 775, 'max_depth': 4, 'min_samples_split': 2, 'min_samples_leaf': 3}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:32:41,901] Trial 30 finished with value: -176.49280802560006 and parameters: {'n_estimators': 416, 'max_depth': 12, 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:32:53,721] Trial 31 finished with value: -167.9230935576295 and parameters: {'n_estimators': 926, 'max_depth': 18, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:33:05,302] Trial 32 finished with value: -167.9332508744531 and parameters: {'n_estimators': 911, 'max_depth': 15, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:33:06,134] Trial 33 finished with value: -173.63621889685348 and parameters: {'n_estimators': 68, 'max_depth': 9, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:33:15,625] Trial 34 finished with value: -167.44536348261988 and parameters: {'n_estimators': 771, 'max_depth': 17, 'min_samples_split': 3, 'min_samples_leaf': 2}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:33:22,718] Trial 35 finished with value: -213.10207651256442 and parameters: {'n_estimators': 757, 'max_depth': 17, 'min_samples_split': 3, 'min_samples_leaf': 9}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:33:29,443] Trial 36 finished with value: -197.89816718066862 and parameters: {'n_estimators': 670, 'max_depth': 12, 'min_samples_split': 2, 'min_samples_leaf': 7}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:33:35,701] Trial 37 finished with value: -172.10139121056233 and parameters: {'n_estimators': 552, 'max_depth': 8, 'min_samples_split': 3, 'min_samples_leaf': 3}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:33:38,661] Trial 38 finished with value: -172.54999718884469 and parameters: {'n_estimators': 258, 'max_depth': 14, 'min_samples_split': 7, 'min_samples_leaf': 2}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:33:43,862] Trial 39 finished with value: -178.3267088570528 and parameters: {'n_estimators': 482, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 4}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:33:50,192] Trial 40 finished with value: -182.43747176958107 and parameters: {'n_estimators': 604, 'max_depth': 10, 'min_samples_split': 3, 'min_samples_leaf': 5}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:33:59,797] Trial 41 finished with value: -167.87715197522976 and parameters: {'n_estimators': 804, 'max_depth': 19, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:34:08,345] Trial 42 finished with value: -169.9963006570103 and parameters: {'n_estimators': 745, 'max_depth': 17, 'min_samples_split': 5, 'min_samples_leaf': 2}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:34:17,841] Trial 43 finished with value: -172.032153204881 and parameters: {'n_estimators': 865, 'max_depth': 19, 'min_samples_split': 6, 'min_samples_leaf': 3}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:34:28,737] Trial 44 finished with value: -167.98148127876058 and parameters: {'n_estimators': 928, 'max_depth': 17, 'min_samples_split': 4, 'min_samples_leaf': 2}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:34:35,057] Trial 45 finished with value: -206.1185372540681 and parameters: {'n_estimators': 679, 'max_depth': 20, 'min_samples_split': 3, 'min_samples_leaf': 8}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:34:42,673] Trial 46 finished with value: -167.7972180499029 and parameters: {'n_estimators': 615, 'max_depth': 15, 'min_samples_split': 2, 'min_samples_leaf': 2}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:34:50,780] Trial 47 finished with value: -172.2787557905057 and parameters: {'n_estimators': 738, 'max_depth': 14, 'min_samples_split': 4, 'min_samples_leaf': 3}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:34:59,428] Trial 48 finished with value: -176.9746526436473 and parameters: {'n_estimators': 820, 'max_depth': 19, 'min_samples_split': 3, 'min_samples_leaf': 4}. Best is trial 24 with value: -166.9401217325953.
# [I 2022-07-30 12:35:01,407] Trial 49 finished with value: -190.2332827319055 and parameters: {'n_estimators': 192, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 6}. Best is trial 24 with value: -166.9401217325953.
# Best score: -166.9401217325953

# Optimized parameters: {'n_estimators': 705, 'max_depth': 10, 'min_samples_split': 4, 'min_samples_leaf': 2}

### Tuning XGBoost Regressor

In [712]:
def xgboost_objective(trial):
    """Objective handed to ``tune``: cross-validated score of one XGBRegressor setup.

    Parameters
    ----------
    trial : object providing ``suggest_int`` and ``suggest_float``
        Source of the hyper-parameter values for this evaluation
        (presumably an Optuna trial -- confirm against ``tune``).

    Returns
    -------
    The ``.mean()`` of the scores returned by ``cross_val_score`` with
    ``cv=5`` and ``scoring="neg_root_mean_squared_error"``, computed on the
    notebook-level ``X_train`` / ``y_train``.
    """
    # A dict literal is evaluated top to bottom, so the suggest_* calls happen
    # in this exact sequence.
    sampled_params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 2000),
        "max_depth": trial.suggest_int("max_depth", 2, 20),
        "learning_rate": trial.suggest_float("learning_rate", 0.05, 0.30),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 7),
        "gamma": trial.suggest_float("gamma", 0.0, 0.4),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.3, 0.7),
    }

    # The seed is fixed so every trial is scored with the same random_state.
    regressor = XGBRegressor(random_state=10, **sampled_params)
    fold_scores = cross_val_score(
        regressor,
        X_train,
        y_train,
        cv=5,
        scoring="neg_root_mean_squared_error",
    )
    return fold_scores.mean()

In [713]:
# Run the hyper-parameter search for the XGBoost objective and keep what `tune` returns.
# NOTE(review): `tune` is defined earlier in the notebook; the recorded log in the next
# cell suggests it runs 50 trials and reports the best score/parameters -- confirm.
best_params_xgb = tune( xgboost_objective )

Tuning results with the original data (i.e., without dropping or modifying any features):

In [714]:
# [I 2022-07-30 12:35:01,438] A new study created in memory with name: no-name-ce252099-519d-44fe-9893-1b1d5544363b
# [I 2022-07-30 12:35:19,447] Trial 0 finished with value: -160.82065839281583 and parameters: {'n_estimators': 841, 'max_depth': 16, 'learning_rate': 0.13412551473595896, 'min_child_weight': 7, 'gamma': 0.3516176464144207, 'colsample_bytree': 0.5270850013112742}. Best is trial 0 with value: -160.82065839281583.
# [I 2022-07-30 12:35:29,218] Trial 1 finished with value: -145.51529341451345 and parameters: {'n_estimators': 731, 'max_depth': 2, 'learning_rate': 0.27513857427034744, 'min_child_weight': 3, 'gamma': 0.16508955291490152, 'colsample_bytree': 0.4060014466531871}. Best is trial 1 with value: -145.51529341451345.
# [I 2022-07-30 12:35:47,612] Trial 2 finished with value: -167.55835342837327 and parameters: {'n_estimators': 1101, 'max_depth': 8, 'learning_rate': 0.27886610782952576, 'min_child_weight': 3, 'gamma': 0.1823551229296308, 'colsample_bytree': 0.516827166355998}. Best is trial 1 with value: -145.51529341451345.
# [I 2022-07-30 12:36:12,874] Trial 3 finished with value: -177.5436074749714 and parameters: {'n_estimators': 1687, 'max_depth': 6, 'learning_rate': 0.13945072654365726, 'min_child_weight': 7, 'gamma': 0.2787615114626211, 'colsample_bytree': 0.3072085757655237}. Best is trial 1 with value: -145.51529341451345.
# [I 2022-07-30 12:36:33,493] Trial 4 finished with value: -154.2424735962962 and parameters: {'n_estimators': 1311, 'max_depth': 6, 'learning_rate': 0.1821699555067971, 'min_child_weight': 3, 'gamma': 0.08887321329267084, 'colsample_bytree': 0.6668948203631218}. Best is trial 1 with value: -145.51529341451345.
# [I 2022-07-30 12:36:43,140] Trial 5 finished with value: -158.39055123922503 and parameters: {'n_estimators': 445, 'max_depth': 13, 'learning_rate': 0.19950447284352624, 'min_child_weight': 6, 'gamma': 0.32394386198487785, 'colsample_bytree': 0.6554669629168417}. Best is trial 1 with value: -145.51529341451345.
# [I 2022-07-30 12:37:06,524] Trial 6 finished with value: -150.41857443973473 and parameters: {'n_estimators': 1553, 'max_depth': 6, 'learning_rate': 0.14950598131377046, 'min_child_weight': 3, 'gamma': 0.36372964080235454, 'colsample_bytree': 0.6859768273337554}. Best is trial 1 with value: -145.51529341451345.
# [I 2022-07-30 12:37:24,921] Trial 7 finished with value: -156.38087072702356 and parameters: {'n_estimators': 940, 'max_depth': 14, 'learning_rate': 0.10972484658447805, 'min_child_weight': 5, 'gamma': 0.2450636666384951, 'colsample_bytree': 0.5884295507716595}. Best is trial 1 with value: -145.51529341451345.
# [I 2022-07-30 12:38:03,362] Trial 8 finished with value: -160.20413175569308 and parameters: {'n_estimators': 1918, 'max_depth': 12, 'learning_rate': 0.2566304267318222, 'min_child_weight': 2, 'gamma': 0.012850479357483825, 'colsample_bytree': 0.3798875516260073}. Best is trial 1 with value: -145.51529341451345.
# [I 2022-07-30 12:38:26,825] Trial 9 finished with value: -159.0170074707749 and parameters: {'n_estimators': 1261, 'max_depth': 11, 'learning_rate': 0.053890703565576584, 'min_child_weight': 4, 'gamma': 0.2451396755564307, 'colsample_bytree': 0.502921150705515}. Best is trial 1 with value: -145.51529341451345.
# [I 2022-07-30 12:38:29,035] Trial 10 finished with value: -151.07330063204915 and parameters: {'n_estimators': 163, 'max_depth': 2, 'learning_rate': 0.2271230391716922, 'min_child_weight': 1, 'gamma': 0.1523700870563703, 'colsample_bytree': 0.4143056829824321}. Best is trial 1 with value: -145.51529341451345.
# [I 2022-07-30 12:38:36,275] Trial 11 finished with value: -146.5803132179853 and parameters: {'n_estimators': 553, 'max_depth': 2, 'learning_rate': 0.29698163921831683, 'min_child_weight': 3, 'gamma': 0.3929931539387984, 'colsample_bytree': 0.41070364447967084}. Best is trial 1 with value: -145.51529341451345.
# [I 2022-07-30 12:38:43,958] Trial 12 finished with value: -144.69298558666102 and parameters: {'n_estimators': 607, 'max_depth': 2, 'learning_rate': 0.27387715895127, 'min_child_weight': 1, 'gamma': 0.14268431517562824, 'colsample_bytree': 0.4067606216688355}. Best is trial 12 with value: -144.69298558666102.
# [I 2022-07-30 12:38:52,547] Trial 13 finished with value: -144.05648796998383 and parameters: {'n_estimators': 607, 'max_depth': 2, 'learning_rate': 0.22958769356178987, 'min_child_weight': 1, 'gamma': 0.10678637301345291, 'colsample_bytree': 0.3511478831229773}. Best is trial 13 with value: -144.05648796998383.
# [I 2022-07-30 12:38:56,462] Trial 14 finished with value: -172.35342477456973 and parameters: {'n_estimators': 123, 'max_depth': 20, 'learning_rate': 0.2315545118433887, 'min_child_weight': 1, 'gamma': 0.09354675875987131, 'colsample_bytree': 0.3114931077363966}. Best is trial 13 with value: -144.05648796998383.
# [I 2022-07-30 12:39:02,069] Trial 15 finished with value: -154.45295610235502 and parameters: {'n_estimators': 412, 'max_depth': 4, 'learning_rate': 0.23791928879730867, 'min_child_weight': 1, 'gamma': 0.08381099783241212, 'colsample_bytree': 0.3490296966807085}. Best is trial 13 with value: -144.05648796998383.
# [I 2022-07-30 12:39:14,779] Trial 16 finished with value: -161.67533896695267 and parameters: {'n_estimators': 725, 'max_depth': 9, 'learning_rate': 0.2045699432776632, 'min_child_weight': 2, 'gamma': 0.0012682437634140287, 'colsample_bytree': 0.44065878752462817}. Best is trial 13 with value: -144.05648796998383.
# [I 2022-07-30 12:39:19,361] Trial 17 finished with value: -153.94675824276467 and parameters: {'n_estimators': 319, 'max_depth': 4, 'learning_rate': 0.2578074946590302, 'min_child_weight': 2, 'gamma': 0.1363880021212065, 'colsample_bytree': 0.4535464247786477}. Best is trial 13 with value: -144.05648796998383.
# [I 2022-07-30 12:39:32,539] Trial 18 finished with value: -159.33365042707146 and parameters: {'n_estimators': 646, 'max_depth': 9, 'learning_rate': 0.20993851101772049, 'min_child_weight': 1, 'gamma': 0.042394345278563755, 'colsample_bytree': 0.35329144537474977}. Best is trial 13 with value: -144.05648796998383.
# [I 2022-07-30 12:39:48,853] Trial 19 finished with value: -155.01182015983215 and parameters: {'n_estimators': 1071, 'max_depth': 4, 'learning_rate': 0.2999422005006978, 'min_child_weight': 4, 'gamma': 0.12050378287121563, 'colsample_bytree': 0.3529622394534796}. Best is trial 13 with value: -144.05648796998383.
# [I 2022-07-30 12:39:55,192] Trial 20 finished with value: -160.48348887820612 and parameters: {'n_estimators': 264, 'max_depth': 19, 'learning_rate': 0.17073814755835512, 'min_child_weight': 2, 'gamma': 0.23050115250261574, 'colsample_bytree': 0.47636686928208827}. Best is trial 13 with value: -144.05648796998383.
# [I 2022-07-30 12:40:05,656] Trial 21 finished with value: -146.82692369182442 and parameters: {'n_estimators': 740, 'max_depth': 2, 'learning_rate': 0.2626923127115644, 'min_child_weight': 4, 'gamma': 0.1742866191146404, 'colsample_bytree': 0.39155750974909825}. Best is trial 13 with value: -144.05648796998383.
# [I 2022-07-30 12:40:13,474] Trial 22 finished with value: -157.11226025338186 and parameters: {'n_estimators': 535, 'max_depth': 4, 'learning_rate': 0.2760822861047835, 'min_child_weight': 1, 'gamma': 0.19643816096014466, 'colsample_bytree': 0.44782602572534674}. Best is trial 13 with value: -144.05648796998383.
# [I 2022-07-30 12:40:26,182] Trial 23 finished with value: -143.96677260805345 and parameters: {'n_estimators': 892, 'max_depth': 2, 'learning_rate': 0.241616762257219, 'min_child_weight': 2, 'gamma': 0.06560193030863222, 'colsample_bytree': 0.5724802310363611}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:40:44,975] Trial 24 finished with value: -161.03879143247573 and parameters: {'n_estimators': 926, 'max_depth': 7, 'learning_rate': 0.23337440438522006, 'min_child_weight': 2, 'gamma': 0.11529348664750942, 'colsample_bytree': 0.5635693805276544}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:41:09,514] Trial 25 finished with value: -154.7697908305035 and parameters: {'n_estimators': 1187, 'max_depth': 4, 'learning_rate': 0.24743766166284673, 'min_child_weight': 1, 'gamma': 0.06369198813009035, 'colsample_bytree': 0.5876249574334549}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:41:31,190] Trial 26 finished with value: -152.70549603148854 and parameters: {'n_estimators': 1406, 'max_depth': 5, 'learning_rate': 0.21668384279119118, 'min_child_weight': 2, 'gamma': 0.04529135304117293, 'colsample_bytree': 0.5621421394135784}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:41:43,795] Trial 27 finished with value: -148.92916189808463 and parameters: {'n_estimators': 882, 'max_depth': 3, 'learning_rate': 0.19233753258928046, 'min_child_weight': 1, 'gamma': 0.12980435419069164, 'colsample_bytree': 0.6167426335847551}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:41:53,457] Trial 28 finished with value: -149.46850557593658 and parameters: {'n_estimators': 561, 'max_depth': 8, 'learning_rate': 0.1690737592862961, 'min_child_weight': 2, 'gamma': 0.041666626628973275, 'colsample_bytree': 0.6289334468461045}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:42:12,158] Trial 29 finished with value: -161.58972420159512 and parameters: {'n_estimators': 829, 'max_depth': 17, 'learning_rate': 0.09918686716352931, 'min_child_weight': 5, 'gamma': 0.21237861191627702, 'colsample_bytree': 0.4861664565753881}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:42:13,896] Trial 30 finished with value: -174.43340169722597 and parameters: {'n_estimators': 55, 'max_depth': 15, 'learning_rate': 0.28650190373969964, 'min_child_weight': 1, 'gamma': 0.10659389848308509, 'colsample_bytree': 0.5430390733501955}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:42:23,252] Trial 31 finished with value: -145.3974222652434 and parameters: {'n_estimators': 751, 'max_depth': 2, 'learning_rate': 0.2685203164905503, 'min_child_weight': 3, 'gamma': 0.15282436353322243, 'colsample_bytree': 0.33413524473127515}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:42:33,017] Trial 32 finished with value: -148.12258900325068 and parameters: {'n_estimators': 750, 'max_depth': 3, 'learning_rate': 0.25968443914637457, 'min_child_weight': 3, 'gamma': 0.14703802794043447, 'colsample_bytree': 0.3268338196515908}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:42:45,550] Trial 33 finished with value: -144.79568920145775 and parameters: {'n_estimators': 1008, 'max_depth': 2, 'learning_rate': 0.271183119787872, 'min_child_weight': 2, 'gamma': 0.06420568986353098, 'colsample_bytree': 0.38155728733725597}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:42:58,469] Trial 34 finished with value: -145.14119998536563 and parameters: {'n_estimators': 983, 'max_depth': 3, 'learning_rate': 0.28455889651244126, 'min_child_weight': 2, 'gamma': 0.06957136513119795, 'colsample_bytree': 0.38322494523941175}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:43:14,999] Trial 35 finished with value: -155.39249868009523 and parameters: {'n_estimators': 1101, 'max_depth': 5, 'learning_rate': 0.24586783272525792, 'min_child_weight': 1, 'gamma': 0.034525925392269924, 'colsample_bytree': 0.4298234511525623}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:43:24,199] Trial 36 finished with value: -150.93028290396995 and parameters: {'n_estimators': 638, 'max_depth': 5, 'learning_rate': 0.22382068104766212, 'min_child_weight': 2, 'gamma': 0.07630707447575816, 'colsample_bytree': 0.37514972854284667}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:43:46,656] Trial 37 finished with value: -166.49547407224668 and parameters: {'n_estimators': 1416, 'max_depth': 7, 'learning_rate': 0.2478463381402815, 'min_child_weight': 7, 'gamma': 0.10341687399943546, 'colsample_bytree': 0.5240908061431536}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:43:52,286] Trial 38 finished with value: -150.51808937064237 and parameters: {'n_estimators': 423, 'max_depth': 3, 'learning_rate': 0.2722142503671891, 'min_child_weight': 3, 'gamma': 0.06358915738438678, 'colsample_bytree': 0.46283913334973203}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:44:05,820] Trial 39 finished with value: -154.81205786593932 and parameters: {'n_estimators': 834, 'max_depth': 6, 'learning_rate': 0.18823961685054927, 'min_child_weight': 1, 'gamma': 0.29637604108882054, 'colsample_bytree': 0.419006368524295}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:44:20,608] Trial 40 finished with value: -150.36173217890288 and parameters: {'n_estimators': 1187, 'max_depth': 2, 'learning_rate': 0.2896914551727342, 'min_child_weight': 5, 'gamma': 0.01846450729579585, 'colsample_bytree': 0.37006792502871033}. Best is trial 23 with value: -143.96677260805345.
# [I 2022-07-30 12:44:34,557] Trial 41 finished with value: -142.87020283435976 and parameters: {'n_estimators': 991, 'max_depth': 3, 'learning_rate': 0.27769552853850504, 'min_child_weight': 2, 'gamma': 0.06524319397290876, 'colsample_bytree': 0.3883141779959077}. Best is trial 41 with value: -142.87020283435976.
# [I 2022-07-30 12:44:46,816] Trial 42 finished with value: -143.95755907433198 and parameters: {'n_estimators': 940, 'max_depth': 3, 'learning_rate': 0.2762427859569756, 'min_child_weight': 2, 'gamma': 0.09631965345918306, 'colsample_bytree': 0.3960798763005253}. Best is trial 41 with value: -142.87020283435976.
# [I 2022-07-30 12:44:57,827] Trial 43 finished with value: -145.526068741331 and parameters: {'n_estimators': 846, 'max_depth': 3, 'learning_rate': 0.2819680183680695, 'min_child_weight': 3, 'gamma': 0.09577725368875598, 'colsample_bytree': 0.39878216230281527}. Best is trial 41 with value: -142.87020283435976.
# [I 2022-07-30 12:45:07,951] Trial 44 finished with value: -162.14232603185593 and parameters: {'n_estimators': 642, 'max_depth': 5, 'learning_rate': 0.24607254007435156, 'min_child_weight': 2, 'gamma': 0.1776374051286251, 'colsample_bytree': 0.3041546787822086}. Best is trial 41 with value: -142.87020283435976.
# [I 2022-07-30 12:45:21,755] Trial 45 finished with value: -145.7462120433101 and parameters: {'n_estimators': 997, 'max_depth': 3, 'learning_rate': 0.2200926314491082, 'min_child_weight': 1, 'gamma': 0.11416437735492228, 'colsample_bytree': 0.5050512335325078}. Best is trial 41 with value: -142.87020283435976.
# [I 2022-07-30 12:45:39,040] Trial 46 finished with value: -157.99067254633155 and parameters: {'n_estimators': 1191, 'max_depth': 6, 'learning_rate': 0.2924956910131879, 'min_child_weight': 3, 'gamma': 0.08445373853515997, 'colsample_bytree': 0.33094275459246963}. Best is trial 41 with value: -142.87020283435976.
# [I 2022-07-30 12:45:45,320] Trial 47 finished with value: -141.81588934668588 and parameters: {'n_estimators': 492, 'max_depth': 2, 'learning_rate': 0.15352040587092425, 'min_child_weight': 1, 'gamma': 0.16135875510019515, 'colsample_bytree': 0.42632648410185214}. Best is trial 47 with value: -141.81588934668588.
# [I 2022-07-30 12:45:55,266] Trial 48 finished with value: -156.76910822484052 and parameters: {'n_estimators': 485, 'max_depth': 11, 'learning_rate': 0.1298768539041765, 'min_child_weight': 2, 'gamma': 0.16322796468427403, 'colsample_bytree': 0.3652577695685859}. Best is trial 47 with value: -141.81588934668588.
# [I 2022-07-30 12:45:59,871] Trial 49 finished with value: -144.1505626035477 and parameters: {'n_estimators': 338, 'max_depth': 4, 'learning_rate': 0.15845435351313653, 'min_child_weight': 1, 'gamma': 0.025970164399107744, 'colsample_bytree': 0.43778714706718896}. Best is trial 47 with value: -141.81588934668588.
# Best score: -141.81588934668588

# Optimized parameters: {'n_estimators': 492, 'max_depth': 2, 'learning_rate': 0.15352040587092425, 'min_child_weight': 1, 'gamma': 0.16135875510019515, 'colsample_bytree': 0.42632648410185214}


# Final Model

The GradientBoostingRegressor achieved the best scores, but it overfits — as do the other ensemble-based algorithms — so a plain Linear Regression model is used as the final model.

In [715]:
# Final model: a LinearRegression with default settings, fitted on the training split.
final_model = LinearRegression()
final_model.fit( X_train, y_train )

In [716]:
import pickle

# Serialise the fitted estimator to disk, wrapped in a dict under the key
# 'final_model'.
list_objects = {}
list_objects['final_model'] = final_model

with open('model_LR.pkl', 'wb') as files:
    pickle.dump(list_objects, files)

In [717]:
# Reload the bundle stored in 'model_LR.pkl' and pull the estimator back out.
# NOTE: pickle.load can execute arbitrary code while deserialising; it is acceptable
# here only because the file is the one written by this notebook's own dump cell.
with open('model_LR.pkl', 'rb') as file:
    # Previously bound to the name `dict`, which replaced the builtin for every
    # cell executed afterwards in the same kernel.
    loaded_objects = pickle.load(file)

final_model_LR = loaded_objects['final_model']

In [718]:
# Predict with the reloaded model on the train split first, then the test split.
y_train_pred, y_test_pred = (
    final_model_LR.predict(features) for features in (X_train, X_test)
)

# `Evaluation` is defined earlier in the notebook; its four results are unpacked as
# train/test RMSE and train/test R2. The trailing positional flag is passed as False
# -- see the helper's definition for its meaning.
(rmse_train,
 rmse_test,
 r2_score_train,
 r2_score_test) = Evaluation(
    final_model_LR, X_train, X_test, y_train, y_test, False
)

In [719]:
# Overlay the distributions of the predicted and the actual test-set targets.
#
# The figure size has to be configured BEFORE anything is drawn: rcParams are read
# when the figure is created, so the original call placed after the two plots did
# not affect this figure on a fresh run (it only took effect on a re-run).
sns.set(rc={'figure.figsize': (35, 15)})

# Labels plus a legend identify the two curves instead of relying on colour comments.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the minimum seaborn version is confirmed.
sns.distplot(y_test_pred, kde=True, label='Predicted (test)')
sns.distplot(y_test, kde=True, label='Actual (test)')
plt.legend()
plt.show()