# Importing Library 

In [40]:
import pandas as pd
import numpy as np
import math
import datetime as dt
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score 
from sklearn.metrics import mean_poisson_deviance, mean_gamma_deviance, accuracy_score
from sklearn.preprocessing import MinMaxScaler
from itertools import cycle
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots

# Import dataset

In [41]:
# Import dataset
apple_df= pd.read_csv("AAPL.csv")
apple_df.head()

Unnamed: 0,Date,Low,Open,Volume,High,Close,Adjusted Close
0,12-12-1980,0.128348,0.128348,469033600,0.128906,0.128348,0.099874
1,15-12-1980,0.121652,0.12221,175884800,0.12221,0.121652,0.094663
2,16-12-1980,0.112723,0.113281,105728000,0.113281,0.112723,0.087715
3,17-12-1980,0.115513,0.115513,86441600,0.116071,0.115513,0.089886
4,18-12-1980,0.118862,0.118862,73449600,0.11942,0.118862,0.092492


# Convert date from string to date format

In [42]:
# converting date format to datetime
apple_df['Date'] = pd.to_datetime(apple_df.Date, infer_datetime_format=True, dayfirst=True)
apple_df.head()

Unnamed: 0,Date,Low,Open,Volume,High,Close,Adjusted Close
0,1980-12-12,0.128348,0.128348,469033600,0.128906,0.128348,0.099874
1,1980-12-15,0.121652,0.12221,175884800,0.12221,0.121652,0.094663
2,1980-12-16,0.112723,0.113281,105728000,0.113281,0.112723,0.087715
3,1980-12-17,0.115513,0.115513,86441600,0.116071,0.115513,0.089886
4,1980-12-18,0.118862,0.118862,73449600,0.11942,0.118862,0.092492


# Sorting dataset by date format

In [43]:
# sorting by date 
apple_df.sort_values(by='Date', inplace=True)
apple_df.head()
apple_df = apple_df[(apple_df['Date'] > "2019-09-09") & (apple_df['Date'] < "2021-09-09")] 
apple_df

Unnamed: 0,Date,Low,Open,Volume,High,Close,Adjusted Close
9768,2019-09-10,52.927502,53.465000,127111600,54.195000,54.174999,52.915161
9769,2019-09-11,54.432499,54.517502,177158400,55.927502,55.897499,54.597610
9770,2019-09-12,55.715000,56.200001,128906800,56.605000,55.772499,54.475510
9771,2019-09-13,54.255001,55.000000,159053200,55.197498,54.687500,53.415745
9772,2019-09-16,54.389999,54.432499,84632400,55.032501,54.974998,53.696564
...,...,...,...,...,...,...,...
10267,2021-09-01,152.339996,152.830002,80313700,154.979996,152.509995,151.408997
10268,2021-09-02,152.399994,153.869995,71115500,154.720001,153.649994,152.540756
10269,2021-09-03,153.089996,153.759995,57808700,154.630005,154.300003,153.186066
10270,2021-09-07,154.389999,154.970001,82278300,157.259995,156.690002,155.558807


In [44]:
apple_df.shape

(504, 7)

In [45]:
# making a separate dataframe consisting of only closing prices
closedf = apple_df[['Date','Close']]
print("Shape of close dataframe:", closedf.shape)

Shape of close dataframe: (504, 2)


# Plotting close prices

In [46]:
fig = px.line(closedf, x=closedf.Date, y=closedf.Close,labels={'Date':'Date','Close':'Close Stock'})
fig.update_traces(marker_line_width=2, opacity=0.6)
fig.update_layout(title_text='Stock close price chart', plot_bgcolor='white', font_size=15, font_color='black')
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

In [47]:
close_stock = closedf.copy() # making a copy of dataframe
del closedf['Date'] # removing date column
scaler=MinMaxScaler(feature_range=(0,1)) # scaling values between 0 and 1
closedf=scaler.fit_transform(np.array(closedf).reshape(-1,1))
print(closedf.shape)

(504, 1)


# Split data for training and testing

Ratio for training and testing data is 70:30

In [48]:
# splitting dataset into train and test split with 70% data as training
training_size=int(len(closedf)*0.7)
test_size=len(closedf)-training_size
train_data,test_data=closedf[0:training_size,:],closedf[training_size:len(closedf),:1]
print("train_data: ", train_data.shape)
print("test_data: ", test_data.shape)

train_data:  (352, 1)
test_data:  (152, 1)


In [49]:
def create_dataset(dataset, time_step=1):
    """
    creates a dataset according to the time series prediction model
    converts an array of values into a dataset matrix
    """
    dataX, dataY = [], []
    for i in range(len(dataset)-time_step-1):
        a = dataset[i:(i+time_step), 0]  #i=0, 0,1,2,3-----99,100
        dataX.append(a)
        dataY.append(dataset[i + time_step, 0])
    return np.array(dataX), np.array(dataY)

In [50]:
# reshaping 
time_step = 15
X_train, y_train = create_dataset(train_data, time_step)
X_test, y_test = create_dataset(test_data, time_step)

print("X_train: ", X_train.shape)
print("y_train: ", y_train.shape)
print("X_test: ", X_test.shape)
print("y_test", y_test.shape)

X_train:  (336, 15)
y_train:  (336,)
X_test:  (136, 15)
y_test (136,)


In [51]:
# import random forest regressor and GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# create regressor object
# Define the parameter grid for the model
param_grid = {
    'n_estimators': [50, 100, 150],  # The number of trees in the forest
    'max_depth': [None, 10, 20],  # The maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # The minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # The minimum number of samples required to be at a leaf node
    'max_features': [1.0]  # The number of features to consider when looking for the best split
}

# Initialize the regressor
regressor = RandomForestRegressor(random_state=0)

# Initialize the Grid Search CV with the defined parameter grid
grid_search = GridSearchCV(estimator=regressor, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Fit the Grid Search CV to the training data
grid_search.fit(X_train, y_train)

# Get the best estimator from the Grid Search CV
regressor = grid_search.best_estimator_

# Get the parameters of the best estimator
best_parameters = regressor.get_params()

# Print the parameters of the best estimator
print(best_parameters)


{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': None, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 2, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 150, 'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}


In [52]:
# predictions
train_predict=regressor.predict(X_train)
test_predict=regressor.predict(X_test)
 # reshaping for plotting
train_predict = train_predict.reshape(-1,1)
test_predict = test_predict.reshape(-1,1)
print("Train data prediction:", train_predict.shape)
print("Test data prediction:", test_predict.shape)

Train data prediction: (336, 1)
Test data prediction: (136, 1)


In [53]:
# transforming into their original forms
train_predict = scaler.inverse_transform(train_predict)
test_predict = scaler.inverse_transform(test_predict)
original_ytrain = scaler.inverse_transform(y_train.reshape(-1,1)) 
original_ytest = scaler.inverse_transform(y_test.reshape(-1,1)) 

Evaluation metrices RMSE, MSE and MAE

In [54]:
# calculating performance metrics
print("Train data RMSE: ", math.sqrt(mean_squared_error(original_ytrain,train_predict)))
print("Train data MSE: ", mean_squared_error(original_ytrain,train_predict))
print("Test data MAE: ", mean_absolute_error(original_ytrain,train_predict))
print("-------------------------------------------------------------------------------------")
print("Test data RMSE: ", math.sqrt(mean_squared_error(original_ytest,test_predict)))
print("Test data MSE: ", mean_squared_error(original_ytest,test_predict))
print("Test data MAE: ", mean_absolute_error(original_ytest,test_predict))

Train data RMSE:  1.1370865226000515
Train data MSE:  1.2929657598786775
Test data MAE:  0.7515459825953841
-------------------------------------------------------------------------------------
Test data RMSE:  5.9579393998588115
Test data MSE:  35.49704189238998
Test data MAE:  4.754524015655774


The explained variance score explains the dispersion of errors of a given dataset, scores close to 1.0 are highly desired, indicating better squares of standard deviations of errors.

In [55]:
print("Train data explained variance regression score:", explained_variance_score(original_ytrain, train_predict))
print("Test data explained variance regression score:", explained_variance_score(original_ytest, test_predict))

Train data explained variance regression score: 0.9978873381717376
Test data explained variance regression score: 0.7317654522413506


R-squared (R2) is a statistical measure that represents the proportion of the variance for a dependent variable that's explained by an independent variable or variables in a regression model.
1 = Best, 0 or < 0 = worse

In [56]:
print("Train data R2 score:", r2_score(original_ytrain, train_predict))
print("Test data R2 score:", r2_score(original_ytest, test_predict))

Train data R2 score: 0.9978873379104944
Test data R2 score: 0.6786724961106323


Regression Loss Mean Gamma deviance regression loss (MGD) and Mean Poisson deviance regression loss (MPD)

In [57]:
print("Train data MGD: ", mean_gamma_deviance(original_ytrain, train_predict))
print("Test data MGD: ", mean_gamma_deviance(original_ytest, test_predict))
print("----------------------------------------------------------------------")
print("Train data MPD: ", mean_poisson_deviance(original_ytrain, train_predict))
print("Test data MPD: ", mean_poisson_deviance(original_ytest, test_predict))

Train data MGD:  0.00014463950679880462
Test data MGD:  0.0018722678015183542
----------------------------------------------------------------------
Train data MPD:  0.013129142522538448
Test data MPD:  0.2572501342248033


# Comparison between original price vs predicted price from our model

In [58]:
# shifting train predictions for plotting
look_back = time_step
trainPredictPlot = np.empty_like(closedf)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(train_predict)+look_back, :] = train_predict
print("Train predicted data: ", trainPredictPlot.shape)
# shifting test predictions for plotting
testPredictPlot = np.empty_like(closedf)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(train_predict)+(look_back*2)+1:len(closedf)-1, :] = test_predict
print("Test predicted data: ", testPredictPlot.shape)
names = cycle(['Original close price','Train predicted close price','Test predicted close price'])
# plotting the results 
plotdf = pd.DataFrame({'Date': close_stock['Date'],
                       'original_close': close_stock['Close'],
                      'train_predicted_close': trainPredictPlot.reshape(1,-1)[0].tolist(),
                      'test_predicted_close': testPredictPlot.reshape(1,-1)[0].tolist()})

fig = px.line(plotdf,x=plotdf['Date'], y=[plotdf['original_close'],plotdf['train_predicted_close'],
                                          plotdf['test_predicted_close']],
              labels={'value':'Stock price','Date': 'Date'})
fig.update_layout(title_text='Comparision between original close price vs predicted close price',
                  plot_bgcolor='white', font_size=15, font_color='black', legend_title_text='Close Price')
fig.for_each_trace(lambda t:  t.update(name = next(names)))

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
fig.show()

Train predicted data:  (504, 1)
Test predicted data:  (504, 1)


In [59]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

# Random Forest Regressor parameter grid
rf_param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0]
}

# AdaBoost Regressor parameter grid
adaboost_param_grid = {
    'base_estimator__n_estimators': [50, 100, 150],
    'base_estimator__max_depth': [None, 10, 20],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.1, 0.5, 1.0]
}

# Create Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=0)

# Initialize the Randomized Search CV for Random Forest
rf_random_search = RandomizedSearchCV(estimator=rf_regressor, param_distributions=rf_param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
rf_random_search.fit(X_train, y_train)

# Get the best Random Forest Regressor from the Randomized Search CV
best_rf_regressor = rf_random_search.best_estimator_

# Create AdaBoost Regressor with the best Random Forest Regressor as the base estimator
adaboost_regressor = AdaBoostRegressor(base_estimator=best_rf_regressor, random_state=0)

# Initialize the Randomized Search CV for AdaBoost
adaboost_random_search = RandomizedSearchCV(estimator=adaboost_regressor, param_distributions=adaboost_param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
adaboost_random_search.fit(X_train, y_train)

# Get the best AdaBoost Regressor from the Randomized Search CV
best_adaboost_regressor = adaboost_random_search.best_estimator_

# Make predictions on training and testing data with the best models
train_predict_rf = best_rf_regressor.predict(X_train)
test_predict_rf = best_rf_regressor.predict(X_test)

train_predict_adaboost = best_adaboost_regressor.predict(X_train)
test_predict_adaboost = best_adaboost_regressor.predict(X_test)

# Reshape predictions for plotting
train_predict_rf = train_predict_rf.reshape(-1, 1)
test_predict_rf = test_predict_rf.reshape(-1, 1)

train_predict_adaboost = train_predict_adaboost.reshape(-1, 1)
test_predict_adaboost = test_predict_adaboost.reshape(-1, 1)

# Transform predictions into their original forms
train_predict_rf = scaler.inverse_transform(train_predict_rf)
test_predict_rf = scaler.inverse_transform(test_predict_rf)

train_predict_adaboost = scaler.inverse_transform(train_predict_adaboost)
test_predict_adaboost = scaler.inverse_transform(test_predict_adaboost)

# Evaluate performance metrics
print("Random Forest Regressor:")
train_rmse_rf = math.sqrt(mean_squared_error(original_ytrain, train_predict_rf))
test_rmse_rf = math.sqrt(mean_squared_error(original_ytest, test_predict_rf))
print("Train data RMSE: ", train_rmse_rf)
print("Test data RMSE: ", test_rmse_rf)
print("Train data MSE: ", mean_squared_error(original_ytrain, train_predict_rf))
print("Test data MSE: ", mean_squared_error(original_ytest, test_predict_rf))
print("Train data MAE: ", mean_absolute_error(original_ytrain, train_predict_rf))
print("Test data MAE: ", mean_absolute_error(original_ytest, test_predict_rf))

print("\nAdaBoost Regressor:")
train_rmse_adaboost = math.sqrt(mean_squared_error(original_ytrain, train_predict_adaboost))
test_rmse_adaboost = math.sqrt(mean_squared_error(original_ytest, test_predict_adaboost))
print("Train data RMSE: ", train_rmse_adaboost)
print("Test data RMSE: ", test_rmse_adaboost)
print("Train data MSE: ", mean_squared_error(original_ytrain, train_predict_adaboost))
print("Test data MSE: ", mean_squared_error(original_ytest, test_predict_adaboost))
print("Train data MAE: ", mean_absolute_error(original_ytrain, train_predict_adaboost))
print("Test data MAE: ", mean_absolute_error(original_ytest, test_predict_adaboost))



`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.



Random Forest Regressor:
Train data RMSE:  1.5853136763615805
Test data RMSE:  6.6124965709134305
Train data MSE:  2.5132194524590696
Test data MSE:  43.72511090034188
Train data MAE:  1.0689759367582556
Test data MAE:  5.087285037750198

AdaBoost Regressor:
Train data RMSE:  1.2086779185198726
Test data RMSE:  5.722665363243335
Train data MSE:  1.4609023107175316
Test data MSE:  32.748898859664976
Train data MAE:  0.9399075999396425
Test data MAE:  4.580900026236911


In [60]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.metrics import mean_squared_error
import math

# Random Forest Regressor parameter grid
rf_param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [1.0]
}

# AdaBoost Regressor parameter grid
adaboost_param_grid = {
    'base_estimator__n_estimators': [50, 100, 150],
    'base_estimator__max_depth': [None, 10, 20],
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.1, 0.5, 1.0]
}

# Create Random Forest Regressor
rf_regressor = RandomForestRegressor(random_state=0)

# Initialize the Randomized Search CV for Random Forest
rf_random_search = RandomizedSearchCV(estimator=rf_regressor, param_distributions=rf_param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
rf_random_search.fit(X_train, y_train)

# Get the best Random Forest Regressor from the Randomized Search CV
best_rf_regressor = rf_random_search.best_estimator_

# Cross-validation for Random Forest
rf_cv_scores = cross_val_score(best_rf_regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
print("Random Forest Regressor Cross-Validation RMSE: ", math.sqrt(-rf_cv_scores.mean()))

# Create AdaBoost Regressor with the best Random Forest Regressor as the base estimator
adaboost_regressor = AdaBoostRegressor(base_estimator=best_rf_regressor, random_state=0)

# Initialize the Randomized Search CV for AdaBoost
adaboost_random_search = RandomizedSearchCV(estimator=adaboost_regressor, param_distributions=adaboost_param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
adaboost_random_search.fit(X_train, y_train)

# Get the best AdaBoost Regressor from the Randomized Search CV
best_adaboost_regressor = adaboost_random_search.best_estimator_

# Cross-validation for AdaBoost
adaboost_cv_scores = cross_val_score(best_adaboost_regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
print("AdaBoost Regressor Cross-Validation RMSE: ", math.sqrt(-adaboost_cv_scores.mean()))

# Make predictions on training and testing data with the best models
train_predict_rf = best_rf_regressor.predict(X_train)
test_predict_rf = best_rf_regressor.predict(X_test)

train_predict_adaboost = best_adaboost_regressor.predict(X_train)
test_predict_adaboost = best_adaboost_regressor.predict(X_test)

# Reshape predictions for plotting
train_predict_rf = train_predict_rf.reshape(-1, 1)
test_predict_rf = test_predict_rf.reshape(-1, 1)

train_predict_adaboost = train_predict_adaboost.reshape(-1, 1)
test_predict_adaboost = test_predict_adaboost.reshape(-1, 1)

# Transform predictions into their original forms
train_predict_rf = scaler.inverse_transform(train_predict_rf)
test_predict_rf = scaler.inverse_transform(test_predict_rf)

train_predict_adaboost = scaler.inverse_transform(train_predict_adaboost)
test_predict_adaboost = scaler.inverse_transform(test_predict_adaboost)

# Evaluate performance metrics
print("Random Forest Regressor:")
train_rmse_rf = math.sqrt(mean_squared_error(original_ytrain, train_predict_rf))
test_rmse_rf = math.sqrt(mean_squared_error(original_ytest, test_predict_rf))
print("Train data RMSE: ", train_rmse_rf)
print("Test data RMSE: ", test_rmse_rf)
print("Train data MSE: ", mean_squared_error(original_ytrain, train_predict_rf))
print("Test data MSE: ", mean_squared_error(original_ytest, test_predict_rf))
print("Train data MAE: ", mean_absolute_error(original_ytrain, train_predict_rf))
print("Test data MAE: ", mean_absolute_error(original_ytest, test_predict_rf))

print("\nAdaBoost Regressor:")
train_rmse_adaboost = math.sqrt(mean_squared_error(original_ytrain, train_predict_adaboost))
test_rmse_adaboost = math.sqrt(mean_squared_error(original_ytest, test_predict_adaboost))
print("Train data RMSE: ", train_rmse_adaboost)
print("Test data RMSE: ", test_rmse_adaboost)
print("Train data MSE: ", mean_squared_error(original_ytrain, train_predict_adaboost))
print("Test data MSE: ", mean_squared_error(original_ytest, test_predict_adaboost))
print("Train data MAE: ", mean_absolute_error(original_ytrain, train_predict_adaboost))
print("Test data MAE: ", mean_absolute_error(original_ytest, test_predict_adaboost))


Random Forest Regressor Cross-Validation RMSE:  0.04643322155746023



`base_estimator` was renamed to `estimator` in version 1.2 and will be removed in 1.4.



AdaBoost Regressor Cross-Validation RMSE:  0.045432244021827335
Random Forest Regressor:
Train data RMSE:  1.596523807189328
Test data RMSE:  6.720037092323736
Train data MSE:  2.5488882669223063
Test data MSE:  45.158898522206854
Train data MAE:  1.0784030302557834
Test data MAE:  5.1712417975042815

AdaBoost Regressor:
Train data RMSE:  1.2135018140524212
Test data RMSE:  5.8309971355919785
Train data MSE:  1.472586652708517
Test data MSE:  34.00052759528186
Train data MAE:  0.9481195387516512
Test data MAE:  4.642852939252439


In [61]:
import plotly.graph_objects as go
from itertools import cycle

# shifting train predictions for plotting
look_back = time_step
trainPredictPlot = np.empty_like(closedf)
trainPredictPlot[:, :] = np.nan
trainPredictPlot[look_back:len(train_predict)+look_back, :] = train_predict
print("Train predicted data: ", trainPredictPlot.shape)

# shifting test predictions for plotting
testPredictPlot = np.empty_like(closedf)
testPredictPlot[:, :] = np.nan
testPredictPlot[len(train_predict)+(look_back*2)+1:len(closedf)-1, :] = test_predict
print("Test predicted data: ", testPredictPlot.shape)

names = cycle(['Original close price','Train predicted close price','Test predicted close price'])

# plotting the results 
plotdf = pd.DataFrame({'Date': close_stock['Date'],
                       'original_close': close_stock['Close'],
                      'train_predicted_close': trainPredictPlot.reshape(1,-1)[0].tolist(),
                      'test_predicted_close': testPredictPlot.reshape(1,-1)[0].tolist()})

fig = go.Figure()

# Adding original close price to the plot
fig.add_trace(go.Scatter(x=plotdf['Date'], y=plotdf['original_close'], mode='lines', name=next(names)))

# Adding train predicted close price to the plot
fig.add_trace(go.Scatter(x=plotdf['Date'], y=plotdf['train_predicted_close'], mode='lines', name=next(names)))

# Adding test predicted close price to the plot
fig.add_trace(go.Scatter(x=plotdf['Date'], y=plotdf['test_predicted_close'], mode='lines', name=next(names)))

fig.update_layout(title='Comparision between original close price vs predicted close price',
                  xaxis_title='Date',
                  yaxis_title='Stock price',
                  plot_bgcolor='white', 
                  font_size=15, 
                  font_color='black', 
                  legend_title_text='Close Price')

fig.show()

Train predicted data:  (504, 1)
Test predicted data:  (504, 1)


In [62]:
from sklearn.ensemble import GradientBoostingRegressor

# Define parameter grid for Gradient Boosting Regressor
gradient_boost_param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 0.9, 1.0],
    'max_features': [None, 'sqrt', 'log2']
}

# Create Gradient Boosting Regressor
gb_regressor = GradientBoostingRegressor(random_state=0)

# Initialize Randomized Search CV for Gradient Boosting
gb_random_search = RandomizedSearchCV(estimator=gb_regressor, param_distributions=gradient_boost_param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
gb_random_search.fit(X_train, y_train)

# Get the best Gradient Boosting Regressor from the Randomized Search CV
best_gb_regressor = gb_random_search.best_estimator_

# Cross-validation for Gradient Boosting
gb_cv_scores = cross_val_score(best_gb_regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
print("Gradient Boosting Regressor Cross-Validation RMSE: ", math.sqrt(-gb_cv_scores.mean()))

# Make predictions on training and testing data with the best model
train_predict_gb = best_gb_regressor.predict(X_train)
test_predict_gb = best_gb_regressor.predict(X_test)

# Reshape predictions for plotting
train_predict_gb = train_predict_gb.reshape(-1, 1)
test_predict_gb = test_predict_gb.reshape(-1, 1)

# Transform predictions into their original forms
train_predict_gb = scaler.inverse_transform(train_predict_gb)
test_predict_gb = scaler.inverse_transform(test_predict_gb)

# Evaluate performance metrics
print("Gradient Boosting Regressor:")
print("Train data RMSE: ", math.sqrt(mean_squared_error(original_ytrain, train_predict_gb)))
print("Test data RMSE: ", math.sqrt(mean_squared_error(original_ytest, test_predict_gb)))


Gradient Boosting Regressor Cross-Validation RMSE:  0.058979926286618634
Gradient Boosting Regressor:
Train data RMSE:  0.6162433782147769
Test data RMSE:  8.005158059666188


In [63]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

# Define parameter grid for KNN
knn_param_grid = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # 1 for Manhattan distance, 2 for Euclidean distance
}

# Create KNN Regressor
knn_regressor = KNeighborsRegressor()

# Initialize Grid Search CV for KNN
knn_grid_search = GridSearchCV(estimator=knn_regressor, param_grid=knn_param_grid, cv=3, scoring='neg_mean_squared_error', n_jobs=-1)
knn_grid_search.fit(X_train, y_train)

# Get the best KNN Regressor from the Grid Search CV
best_knn_regressor = knn_grid_search.best_estimator_

# Cross-validation for KNN
knn_cv_scores = cross_val_score(best_knn_regressor, X_train, y_train, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
print("KNN Regressor Cross-Validation RMSE: ", math.sqrt(-knn_cv_scores.mean()))

# Make predictions on training and testing data with the best model
train_predict_knn = best_knn_regressor.predict(X_train)
test_predict_knn = best_knn_regressor.predict(X_test)

# Reshape predictions for plotting
train_predict_knn = train_predict_knn.reshape(-1, 1)
test_predict_knn = test_predict_knn.reshape(-1, 1)

# Transform predictions into their original forms
train_predict_knn = scaler.inverse_transform(train_predict_knn)
test_predict_knn = scaler.inverse_transform(test_predict_knn)

# Evaluate performance metrics
print("KNN Regressor:")
print("Train data RMSE: ", math.sqrt(mean_squared_error(original_ytrain, train_predict_knn)))
print("Test data RMSE: ", math.sqrt(mean_squared_error(original_ytest, test_predict_knn)))


KNN Regressor Cross-Validation RMSE:  0.0702937941559015
KNN Regressor:
Train data RMSE:  0.0
Test data RMSE:  7.835669915984876
