In [152]:
!pip install --upgrade pandas
!pip install --upgrade pandas-datareader

from keras.callbacks import EarlyStopping
from prettytable import PrettyTable
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
import pandas_datareader.data as web

import numpy as np
import pandas as pd
from datetime import date, timedelta
import matplotlib.pyplot as plt

!pip install multipledispatch
from multipledispatch import dispatch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [153]:
def get_data_from_web(ticker, start=None, end=None):
    """Download the price history of ``ticker`` through ``web.DataReader`` ('yahoo' source).

    Parameters
    ----------
    ticker : str
        Symbol forwarded unchanged to ``web.DataReader``.
    start : datetime.date, optional
        First day to request. Defaults to 1994-01-01: NSE was incorporated in
        1992, recognised as a stock exchange by SEBI in April 1993 and
        commenced operations in 1994.
    end : datetime.date, optional
        Last day to request. Defaults to the day before today.

    Returns
    -------
    Whatever ``web.DataReader`` returns for the request (the notebook treats it
    as a DataFrame with a 'Close' column).
    """
    print("Getting the data for the ticker ", ticker)
    if start is None:
        start = date(1994, 1, 1)
    if end is None:
        # Stop at yesterday; presumably to avoid a partial record for the current day — confirm.
        end = date.today() - timedelta(days=1)
    return web.DataReader(ticker, 'yahoo', start=start, end=end)


def linear(x_train, y_train, x_test, y_test):
    """Fit an L2-penalised SGD linear regressor with a grid-searched ``alpha``.

    ``alpha`` is chosen among the powers of ten from 1e-10 to 1e5 by 10-fold
    cross-validation scored on negative mean squared error. A fresh regressor
    is then trained on the whole training set with the selected value.

    The values returned as "MAPE" are the mean absolute error divided by the
    mean of the true targets, not a per-sample percentage error.

    Returns
    -------
    tuple
        ``(train_pred, test_pred, train_MAPE, train_MSE, test_MAPE, test_MSE)``
    """

    def _scaled_mae_and_mse(actual, predicted):
        # MAE normalised by the average target value, then the plain MSE.
        scaled_mae = mean_absolute_error(actual, predicted) / (sum(actual) / len(actual))
        return scaled_mae, mean_squared_error(actual, predicted)

    # hyper-parameter tuning
    alpha_grid = {"alpha": [10 ** exponent for exponent in range(-10, 6)]}
    search = GridSearchCV(
        estimator=SGDRegressor(loss="squared_error", penalty="l2"),
        param_grid=alpha_grid,
        scoring="neg_mean_squared_error",
        cv=10,
        verbose=1,
        n_jobs=-1,
    )
    search.fit(x_train, y_train)
    best_alpha = search.best_params_["alpha"]

    # train a new model with the selected regularisation strength
    model = SGDRegressor(loss="squared_error", penalty="l2", alpha=best_alpha)
    model.fit(x_train, y_train)

    train_pred = model.predict(x_train)
    train_MAPE, train_MSE = _scaled_mae_and_mse(y_train, train_pred)

    test_pred = model.predict(x_test)
    test_MAPE, test_MSE = _scaled_mae_and_mse(y_test, test_pred)

    return train_pred, test_pred, train_MAPE, train_MSE, test_MAPE, test_MSE


def randomForest(x_train, y_train, x_test, y_test):
    """Fit a random-forest regressor whose size and depth are grid-searched.

    ``n_estimators`` and ``max_depth`` are selected by 3-fold cross-validation
    scored on negative mean squared error. A fresh forest is then trained on
    the whole training set with the selected pair.

    The values returned as "MAPE" are the mean absolute error divided by the
    mean of the true targets, not a per-sample percentage error.

    Returns
    -------
    tuple
        ``(train_pred, test_pred, train_MAPE, train_MSE, test_MAPE, test_MSE)``
    """

    def _scaled_mae_and_mse(actual, predicted):
        # MAE normalised by the average target value, then the plain MSE.
        scaled_mae = mean_absolute_error(actual, predicted) / (sum(actual) / len(actual))
        return scaled_mae, mean_squared_error(actual, predicted)

    # hyper-parameter tuning
    search_space = {
        "n_estimators": [10, 50, 100, 500],
        "max_depth": [1, 5, 10, 50, 100, 500, 1000],
    }
    search = GridSearchCV(
        estimator=RandomForestRegressor(n_jobs=-1),
        param_grid=search_space,
        scoring="neg_mean_squared_error",
        cv=3,
        verbose=1,
        n_jobs=-1,
    )
    search.fit(x_train, y_train)
    winner = search.best_params_
    best_n_estimators = winner["n_estimators"]
    best_max_depth = winner["max_depth"]

    # train a new forest with the selected hyper-parameters
    forest = RandomForestRegressor(
        n_estimators=best_n_estimators,
        max_depth=best_max_depth,
        verbose=1,
        n_jobs=-1,
    )
    forest.fit(x_train, y_train)

    train_pred = forest.predict(x_train)
    train_MAPE, train_MSE = _scaled_mae_and_mse(y_train, train_pred)

    test_pred = forest.predict(x_test)
    test_MAPE, test_MSE = _scaled_mae_and_mse(y_test, test_pred)

    return train_pred, test_pred, train_MAPE, train_MSE, test_MAPE, test_MSE


def xgboost_reg(x_train, y_train, x_test, y_test):
    """Fit an XGBoost regressor whose depth and number of trees are grid-searched.

    ``max_depth`` and ``n_estimators`` are selected by 3-fold cross-validation
    scored on negative mean squared error. A fresh regressor is then trained on
    the whole training set with the selected pair.

    The values returned as "MAPE" are the mean absolute error divided by the
    mean of the true targets, not a per-sample percentage error.

    Returns
    -------
    tuple
        ``(train_pred, test_pred, train_MAPE, train_MSE, test_MAPE, test_MSE)``
    """
    # hyper-parameter tuning
    hyper_parameter = {"max_depth": [1, 2, 3, 4],
                       "n_estimators": [10, 50, 100, 500]}
    # `verbosity=0` silences XGBoost; it replaces the deprecated `silent=True`
    # flag, which current releases no longer accept as a model parameter.
    clf = xgb.XGBRegressor(verbosity=0)
    best_parameter = GridSearchCV(
        clf, hyper_parameter, scoring="neg_mean_squared_error", cv=3)
    best_parameter.fit(x_train, y_train)
    estimators = best_parameter.best_params_["n_estimators"]
    depth = best_parameter.best_params_["max_depth"]

    # applying xgboost regressor with best hyper-parameter
    clf = xgb.XGBRegressor(
        max_depth=depth, n_estimators=estimators, verbosity=0)
    clf.fit(x_train, y_train)

    train_pred = clf.predict(x_train)
    train_MAPE = mean_absolute_error(
        y_train, train_pred) / (sum(y_train)/len(y_train))
    train_MSE = mean_squared_error(y_train, train_pred)

    test_pred = clf.predict(x_test)
    test_MAPE = mean_absolute_error(
        y_test, test_pred) / (sum(y_test)/len(y_test))
    test_MSE = mean_squared_error(y_test, test_pred)

    return train_pred, test_pred, train_MAPE, train_MSE, test_MAPE, test_MSE

In [154]:
@dispatch(object, object, object)
def plot(data1, d1_label, title=""):
    """Show one series as a blue line on a 25x8 figure with a legend and title.

    NOTE(review): this overload is registered for exactly three positional
    arguments, so the default for ``title`` is presumably never used when the
    call goes through the dispatcher — confirm against multipledispatch.
    """
    _, axes = plt.subplots(figsize=(25, 8))
    axes.set_title(title)
    axes.plot(data1, 'b', label=d1_label)
    axes.legend()
    plt.show()

@dispatch(object, object, object, object, object)
def plot(data1, d1_label, data2, d2_label, title=""):
    """Show two series on one 25x8 figure with a legend and title.

    ``data1`` is drawn in blue and ``data2`` in red so the two curves can be
    told apart.

    NOTE(review): this overload is registered for exactly five positional
    arguments, so the default for ``title`` is presumably never used when the
    call goes through the dispatcher — confirm against multipledispatch.
    """
    plt.figure(figsize=(25, 8))
    plt.title(title)
    plt.plot(data1, 'b', label=d1_label)
    # A different colour from the first series; identical colours made the
    # two lines indistinguishable.
    plt.plot(data2, 'r', label=d2_label)
    plt.legend()
    plt.show()

In [155]:
def train_test_split(data, split_sz):
    """Split the 'Close' column of ``data`` into leading and trailing parts.

    The first ``int(len(data) * split_sz)`` rows form the training frame and
    the remaining rows form the testing frame; row order is left untouched.
    Both results are single-column DataFrames that keep the original index.
    The shapes of the input and of both parts are printed.

    Returns
    -------
    tuple
        ``(data_train, data_test)``
    """
    n_rows = len(data)
    cut = int(n_rows * split_sz)

    close = data['Close']
    data_train = close[:cut].to_frame()
    data_test = close[cut:n_rows].to_frame()

    for caption, frame in (
        ("Size of complete data = ", data),
        ("Size of training data = ", data_train),
        ("Size of testing data = ", data_test),
    ):
        print(caption, frame.shape)

    return data_train, data_test
    
def prepare_train_set(train_data, win_sz):
    """Turn a single-column price frame into sliding-window training samples.

    The values are min-max scaled to [0, 1] with a scaler fitted on
    ``train_data``. For every row ``i >= win_sz`` the previous ``win_sz``
    scaled values become the feature vector, the scaled value at ``i`` the
    target, and the "Date" entry at ``i`` the timestamp.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Frame whose first column holds the series; ``reset_index()`` must
        produce a "Date" column (i.e. the index is named "Date").
    win_sz : int
        Number of past observations per feature vector.

    Returns
    -------
    tuple of numpy.ndarray
        ``(train_x, train_y, train_t)``

    NOTE(review): the fitted scaler is discarded, so callers cannot reuse it to
    scale other data or to invert predictions.
    """
    scaler = MinMaxScaler(feature_range=(0, 1))
    train_data_arr = scaler.fit_transform(train_data)

    # Read the timestamps from the argument itself rather than from any
    # notebook-level variable, so the function works for whatever frame is passed.
    train_data_dt = train_data.reset_index()["Date"]

    positions = range(win_sz, len(train_data_arr))
    train_x = np.array([train_data_arr[i - win_sz: i, 0] for i in positions])
    train_y = np.array([train_data_arr[i, 0] for i in positions])
    train_t = np.array([train_data_dt[i] for i in positions])

    print("Shape of train feature set ", train_x.shape)
    print("Shape of train target ", train_y.shape)
    print("Shape of train timestamp ", train_t.shape)

    return train_x, train_y, train_t

def prepare_test_set(past_n_days_data, test_data, win_sz):
    """Turn a single-column price frame into sliding-window test samples.

    ``past_n_days_data`` is placed in front of ``test_data`` so that the first
    test rows also have ``win_sz`` preceding observations. The combined values
    are min-max scaled to [0, 1]; for every row ``i >= win_sz`` the previous
    ``win_sz`` scaled values become the feature vector, the scaled value at
    ``i`` the target, and the "Date" entry at ``i`` the timestamp.

    Parameters
    ----------
    past_n_days_data : pandas.DataFrame
        Rows that immediately precede the test period.
    test_data : pandas.DataFrame
        The test period itself; ``reset_index()`` on the combined frame must
        produce a "Date" column.
    win_sz : int
        Number of past observations per feature vector.

    Returns
    -------
    tuple of numpy.ndarray
        ``(test_x, test_y, test_t)``

    NOTE(review): a new scaler is fitted on the test data here, so these values
    are not on the same scale as a separately scaled training set — confirm
    this is intended before comparing errors across the two sets.
    """
    test_data = pd.concat([past_n_days_data, test_data])

    scaler = MinMaxScaler(feature_range=(0, 1))
    test_data_arr = scaler.fit_transform(test_data)

    test_data_dt = test_data.reset_index()["Date"]

    positions = range(win_sz, len(test_data_arr))
    test_x = np.array([test_data_arr[i - win_sz: i, 0] for i in positions])
    test_y = np.array([test_data_arr[i, 0] for i in positions])
    test_t = np.array([test_data_dt[i] for i in positions])

    # Labels describe what is actually printed: the test features and the test
    # targets (they were previously reported as "train dataset"/"test dataset").
    print("Size of test feature set : ", test_x.shape)
    print("Size of test target : ", test_y.shape)
    print("Size of test timestamp : ", test_t.shape)

    return test_x, test_y, test_t

In [156]:
# Symbol handed to get_data_from_web; the ".NS" suffix presumably selects the
# NSE listing — confirm.
ticker = "RELIANCE.NS"
# Fetch the price history for that symbol via web.DataReader.
data = get_data_from_web(ticker)

Getting the data for the ticker  RELIANCE.NS


In [157]:
# Keep only the 'Close' column: the first 70% of rows go to training, the rest to testing.
data_train, data_test = train_test_split(data, split_sz=0.7)

Size of complete data =  (6716, 6)
Size of training data =  (4701, 1)
Size of testing data =  (2015, 1)


In [158]:
# Number of consecutive previous closes that make up one feature vector.
win_size = 50
# Scaled sliding-window features, next-step targets and their timestamps for the training split.
train_x, train_y, train_t = prepare_train_set(data_train, win_size)

Shape of train feature set  (4651, 50)
Shape of train target  (4651,)
Shape of train timestamp  (4651,)


In [159]:
# The last `win_size` training rows are passed along so that the first test
# row also has a full look-back window (prepare_test_set prepends them).
past_n_days_data = data_train.tail(win_size)
test_x, test_y, test_t = prepare_test_set(past_n_days_data, data_test, win_size)

Size of train dataset :  (2015, 50)
Size of test dataset :  (2015,)
Size of test timestamp :  (2015,)


In [None]:
# Train and evaluate the three regressors on the same windows. Each call
# returns (train_pred, test_pred, train_MAPE, train_MSE, test_MAPE, test_MSE).
# NOTE(review): `tain_y_hat_rf` and `tain_y_hat_xgb` look like typos for
# `train_y_hat_*` (compare `train_y_hat_lr`); rename them together with every
# later use.
print("Predicting using linear regression model...")
train_y_hat_lr, test_y_hat_lr, trainMAPE_lr, trainMSE_lr, testMAPE_lr, testMSE_lr = linear(train_x, train_y, test_x, test_y)
print("Predicting using randomforest regression model...")
tain_y_hat_rf, test_y_hat_rf, trainMAPE_rf, trainMSE_rf, testMAPE_rf, testMSE_rf = randomForest(train_x, train_y, test_x, test_y)
print("Predicting using xgboost regression model...")
tain_y_hat_xgb, test_y_hat_xgb, trainMAPE_xgb, trainMSE_xgb, testMAPE_xgb, testMSE_xgb = xgboost_reg(train_x, train_y, test_x, test_y)

Predicting using linear regression model...
Fitting 10 folds for each of 16 candidates, totalling 160 fits
Predicting using randomforest regression model...
Fitting 3 folds for each of 28 candidates, totalling 84 fits
