In [8]:
import datetime
import pandas as pd
import yfinance as yf
from functools import reduce
import numpy as np
import plotly.express as px
import plotly.graph_objects as go 
from plotly.offline import init_notebook_mode, iplot
import matplotlib.pyplot as plt

import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from statsmodels.tsa.arima.model import ARIMA
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Dropout
import math
import os
import time
import copy
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from dateparser import parse

## Utils

In [2]:
class Util():
    """Data-loading, plotting and metric helpers used by the modeling code.

    Attributes:
        scaler: Kept for backward compatibility; no method in this class reads it.
        xscaler, yscaler: Placeholders for the feature / target transforms.
            ``Modeling.transform`` assigns them; they start as ``None`` so the
            attributes always exist.
        metric_sig_digits: Number of decimals ``calculate_metric`` rounds to.
    """

    def __init__(self):
        self.scaler = None
        self.xscaler = None
        self.yscaler = None
        self.metric_sig_digits = 2

    def get_stock(self, ticker, start_date=None, end_date=None):
        """Download the closing prices of one ticker through yfinance.

        Args:
            ticker: Symbol passed to ``yf.Ticker``.
            start_date, end_date: Bounds forwarded to ``Ticker.history``. When
                omitted, the notebook-level globals ``start`` and ``end`` are
                used, as before (they must then be defined in another cell).

        Returns:
            A one-column DataFrame holding the ``Close`` series, with the
            column renamed to ``ticker``.
        """
        if start_date is None:
            start_date = start  # notebook-level global, defined outside this class
        if end_date is None:
            end_date = end  # notebook-level global, defined outside this class
        history = yf.Ticker(ticker).history(start=start_date, end=end_date)
        # rename() returns a new frame; renaming a column slice in place is what
        # triggers pandas' SettingWithCopyWarning.
        return history[['Close']].rename(columns={'Close': ticker})

    def combine_stocks(self, tickers):
        """Outer-join the closing prices of several tickers on ``Date``.

        Args:
            tickers: Iterable of symbols, each fetched with ``get_stock``.

        Returns:
            The merged DataFrame with one column per ticker, or an empty
            DataFrame when ``tickers`` is empty.
        """
        frames = [self.get_stock(ticker) for ticker in tickers]
        if not frames:
            # reduce() without an initial value raises on an empty sequence.
            return pd.DataFrame()
        return reduce(
            lambda left, right: pd.merge(left, right, on=['Date'], how='outer'),
            frames,
        )

    def plot_lines(self, df, label=''):
        """Build one plotly ``Scatter`` trace per column of ``df``.

        Args:
            df: Frame whose index becomes the x axis.
            label: Suffix appended to every trace name.

        Returns:
            List of ``go.Scatter`` traces, in column order.
        """
        return [
            go.Scatter(x=df.index, y=df[column].values, name=str(column) + label)
            for column in df.columns
        ]

    def plot_bars(self, df):
        """Build one plotly ``Bar`` trace per column of ``df``.

        Rows are ordered by the first column. The ordering is applied to a
        sorted copy, so the caller's frame is left untouched.

        Returns:
            List of ``go.Bar`` traces, in column order.
        """
        ordered = df.sort_values(df.columns[0])
        return [
            go.Bar(x=ordered.index, y=ordered[column].values, name=str(column))
            for column in ordered.columns
        ]

    def plot_comparison(self, actual, predicted_baseline, predicted_best, trials=None, stock=''):
        """Plot actual values against the baseline and best model predictions.

        Args:
            actual: Observed series (drawn in black).
            predicted_baseline: Baseline model predictions (blue).
            predicted_best: Best model predictions (green).
            trials: Optional sequence of extra prediction series, one line each.
            stock: Name used in the y-axis label.
        """
        plt.plot(actual, color='black', label='Actual')
        plt.plot(predicted_best, color='green', label='Best Model')
        plt.plot(predicted_baseline, color='blue', label='Base Model')
        plt.title('Best vs Baseline model')
        plt.xlabel('Time')
        plt.ylabel(stock + ' Stock Price')
        # len() rather than truthiness so array-like `trials` keep working.
        if trials is not None and len(trials) > 0:
            for i, pred in enumerate(trials):
                plt.plot(pred, label='Trial:' + str(i))
        plt.legend()
        plt.show()

    def calculate_metric(self, metricname, true, pred):
        """Compute one regression metric, rounded to ``metric_sig_digits``.

        Args:
            metricname: One of ``'rmse'``, ``'mse'``, ``'mae'``, ``'mape'``, ``'r2'``.
            true: Observed values.
            pred: Predicted values.

        Returns:
            The rounded metric value.

        Raises:
            ValueError: If ``metricname`` is not a supported metric.
        """
        # Lambdas keep the scikit-learn calls lazy: only the requested metric runs.
        metric_functions = {
            'rmse': lambda: math.sqrt(mean_squared_error(true, pred)),
            'mse': lambda: mean_squared_error(true, pred),
            'mae': lambda: mean_absolute_error(true, pred),
            'mape': lambda: mean_absolute_percentage_error(true, pred),
            'r2': lambda: r2_score(true, pred),
        }
        if metricname not in metric_functions:
            raise ValueError(
                f"Unknown metric '{metricname}'; expected one of {sorted(metric_functions)}"
            )
        return round(metric_functions[metricname](), self.metric_sig_digits)
            


## Modeling

In [11]:
class Modeling():
    """Prepare train/validation data, fit regressors and report validation metrics.

    Typical flow: ``data_preprocessing`` (or ``transform``) stores the
    transformed splits on the instance, ``run_models`` / ``run_optuna`` fit
    models against them, and the ``plot_*`` / ``print_metrics`` methods report
    the results. Predictions and the validation target are both mapped back
    through the target transform before metrics are computed.
    """

    # Fallbacks used by ``lstm`` when a hyper-parameter is not supplied.
    DEFAULT_LSTM_UNITS = 32
    DEFAULT_LSTM_OPTIMIZER = 'adam'
    # ARIMA order used when none was configured through ``arima``.
    DEFAULT_ARIMA_ORDER = (1, 0, 0)
    # Metrics reported by ``train_and_evaluate``, in output order.
    METRIC_NAMES = ('rmse', 'mse', 'mae', 'mape', 'r2')

    def __init__(self):
        self.util = Util()
        self.model_objects = []
        self.g_epochs = 5
        self.g_batch_size = 10
        # Initialised here so methods that read them work before
        # arima() / run_models() / run_optuna() have been called.
        self.arimaparams = None
        self.scores = {}
        self.best_scores = {}

    # ------------------------------------------------------------------ #
    # Internal helpers
    # ------------------------------------------------------------------ #
    @staticmethod
    def _model_name(model):
        """Return the class name of ``model``, whether it is a class or an instance."""
        return model.__name__ if isinstance(model, type) else type(model).__name__

    def _inverse_target(self, values):
        """Undo the target transform and return a one-column validation-indexed frame."""
        scaler = self.util.yscaler
        column = np.asarray(values).reshape(-1, 1)
        if not isinstance(scaler, str):
            restored = scaler.inverse_transform(column)
        elif scaler == 'diff1':
            # NOTE(review): cumsum() rebuilds levels relative to the first
            # validation step only; the last pre-validation level is not stored,
            # so the absolute offset is not restored. Confirm this is intended.
            restored = column.cumsum(axis=0)
        elif scaler == 'log':
            restored = np.exp(column)
        else:
            raise ValueError(f"Unsupported transform '{scaler}'; expected 'diff1' or 'log'.")
        return pd.DataFrame(restored, columns=self.y_train.columns, index=self.y_val.index)

    def _fit_and_predict(self, model, order=None):
        """Fit ``model`` on the training split and predict the validation split.

        A class (as returned by ``arima``) is instantiated with the training
        target and then fitted; any other object is fitted in place.
        Returns ``(fitted_model, prediction)``.
        """
        if isinstance(model, type):
            if order is None:
                configured = getattr(self, 'arimaparams', None) or {}
                order = configured.get('order', self.DEFAULT_ARIMA_ORDER)
            # Pass exogenous regressors only when there are some, mirroring
            # the check get_predictions() applies at forecast time.
            if self.x_train is not None and not self.x_train.empty:
                estimator = model(self.y_train, order=order, exog=self.x_train)
            else:
                estimator = model(self.y_train, order=order)
            fitted = estimator.fit()
        else:
            if self._model_name(model) == 'Sequential':
                model.fit(self.x_train, self.y_train,
                          epochs=self.g_epochs, batch_size=self.g_batch_size)
            else:
                model.fit(self.x_train, self.y_train)
            fitted = model
        return fitted, self.get_predictions(fitted)

    # ------------------------------------------------------------------ #
    # Data preparation
    # ------------------------------------------------------------------ #
    def transform(self, transform, train, val, inputs, target):
        """Transform both splits and store them on the instance.

        Args:
            transform: Either a scaler object exposing ``fit_transform`` /
                ``transform`` / ``inverse_transform``, or one of the strings
                ``'diff1'`` (first difference) and ``'log'``.
            train, val: DataFrames holding the training / validation rows.
            inputs: Feature column names, or ``None`` for a target-only setup.
            target: Name of the target column.

        Sets ``train``, ``val``, ``x_train``, ``x_val``, ``y_train``, ``y_val``
        and ``util.xscaler`` / ``util.yscaler``.

        Raises:
            ValueError: If ``transform`` is a string other than the two above.
        """
        if not isinstance(transform, str):
            # The target needs its own scaler: fitting one shared object on the
            # target column would overwrite the statistics learned for all columns.
            self.util.yscaler = copy.deepcopy(transform)
            self.util.xscaler = transform
            train_scaled = pd.DataFrame(self.util.xscaler.fit_transform(train),
                                        columns=train.columns, index=train.index)
            # Validation rows are scaled with the statistics learned on the
            # training rows; refitting here would leak validation information.
            val_scaled = pd.DataFrame(self.util.xscaler.transform(val),
                                      columns=val.columns, index=val.index)
            self.util.yscaler.fit_transform(train[target].values.reshape(-1, 1))
        else:
            if transform == 'diff1':
                # The first difference is NaN; fill it from the next row.
                train_ = train.diff().bfill()
                val_ = val.diff().bfill()
            elif transform == 'log':
                train_ = np.log(train)
                val_ = np.log(val)
            else:
                raise ValueError(
                    f"Unsupported transform '{transform}'; expected 'diff1', 'log' or a scaler object."
                )
            self.util.xscaler = transform
            self.util.yscaler = transform
            train_scaled = pd.DataFrame(train_, columns=train.columns, index=train.index)
            val_scaled = pd.DataFrame(val_, columns=val.columns, index=val.index)
        self.train = train_scaled
        self.val = val_scaled
        self.y_train = train_scaled[[target]]
        self.y_val = val_scaled[[target]]
        self.x_train = None
        self.x_val = None
        if inputs is not None:
            self.x_train = train_scaled[inputs]
            self.x_val = val_scaled[inputs]

    def data_preprocessing(self, data, split, target, inputs, transformer=None):
        """Split ``data`` at a date and transform both parts.

        Args:
            data: DataFrame indexed by date.
            split: Date expression understood by ``dateparser.parse``.
            target: Name of the target column.
            inputs: Feature column names, or ``None``.
            transformer: Scaler object or transform name (see ``transform``).
                Defaults to a fresh ``StandardScaler`` per call.

        Raises:
            ValueError: If ``split`` cannot be parsed or leaves a split empty.
        """
        if transformer is None:
            # Built per call: a default-argument instance would be shared
            # (and refitted) across every call and every Modeling object.
            transformer = StandardScaler()
        split_date = parse(split, settings={'PREFER_DAY_OF_MONTH': 'first', 'PREFER_DATES_FROM': 'past'})
        if split_date is None:
            raise ValueError(f"Could not parse split date: {split!r}")
        val = data.loc[split_date:]
        train = data.loc[:split_date]
        # Label slicing includes both end points, so a row dated exactly at the
        # split would otherwise land in both parts; it is kept in validation.
        train = train.loc[~train.index.isin(val.index)]
        if train.empty or val.empty:
            raise ValueError(
                f"Split at {split_date} leaves an empty split "
                f"(train rows: {len(train)}, val rows: {len(val)})."
            )
        self.transform(transformer, train, val, inputs, target)
        print("Total data:"+str(data.shape))
        print("Train data:"+str(train.shape))
        print("Val data:"+str(val.shape))

    def preprocessed_check(self):
        """Print the column names of both splits and plot their transformed series."""
        print(self.train.columns, self.val.columns)
        train_traces = self.util.plot_lines(self.train)
        val_traces = self.util.plot_lines(self.val)
        iplot(train_traces+val_traces)

    def corr_check(self):
        """Show an annotated correlation heat map of the training features.

        Raises:
            ValueError: If no input features were configured.
        """
        x_train = getattr(self, 'x_train', None)
        if x_train is None:
            raise ValueError("corr_check requires input features; none were configured.")
        corr = x_train.corr()
        fig, ax = plt.subplots(1, 1, figsize=(6, 6))
        ax.matshow(corr, cmap='coolwarm')
        for i in range(corr.shape[0]):
            for j in range(corr.shape[1]):
                ax.text(j, i, f"{corr.iloc[i, j]:.2f}", ha='center', va='center', color='black')
        # Fix the tick positions first; labels set on the default ticks do not
        # line up with the matrix cells.
        ax.set_xticks(range(corr.shape[1]))
        ax.set_yticks(range(corr.shape[0]))
        ax.set_xticklabels(corr.columns)
        ax.set_yticklabels(corr.index)
        plt.show()

    # ------------------------------------------------------------------ #
    # Hyper-parameter search
    # ------------------------------------------------------------------ #
    def run_optuna(self, models, hyperparams, metric, optuna_trials=20, storage=None):
        """Tune every model that has a search space and score all models.

        Args:
            models: Model instances (or classes) whose names select the builder.
            hyperparams: Mapping of model name to its search space. Supported
                keys: ``positive`` / ``fit_intercept`` / ``optimizer``
                (categorical choices), ``alpha`` / ``learning_rate``
                (``[low, high]`` floats) and ``units`` (``[low, high]`` ints).
            metric: Metric name understood by ``Util.calculate_metric``.
            optuna_trials: Number of trials per study.
            storage: Optuna storage URL. When omitted, the notebook-level global
                ``db_name`` is used to build a SQLite URL, as before.

        Populates ``studies``, ``best_params`` and ``best_scores``.
        """
        self.hyperparams = hyperparams
        self.studies = []
        self.best_params = {}
        self.best_scores = {}
        # r2 is a score (higher is better); every other supported metric is an error.
        direction = "maximize" if metric == 'r2' else "minimize"

        def make_objective(model_name):
            def optuna_objective(trial):
                params = copy.deepcopy(self.hyperparams[model_name])
                for p in params:
                    if p in ('positive', 'fit_intercept', 'optimizer'):
                        params[p] = trial.suggest_categorical(p, params[p])
                    elif p in ('alpha', 'learning_rate'):
                        params[p] = trial.suggest_float(p, params[p][0], params[p][1])
                    elif p == 'units':
                        params[p] = trial.suggest_int(p, params[p][0], params[p][1])
                regressor = self.get_model(model_name, params)
                return self.train_and_evaluate_optimize(regressor, metric)
            return optuna_objective

        total_start = time.time()
        for model in models:
            name = self._model_name(model)
            if name in self.hyperparams:
                if storage is None:
                    study_storage = "sqlite:///" + db_name  # notebook-level global
                else:
                    study_storage = storage
                study = optuna.create_study(
                    storage=study_storage,
                    study_name="Optimize-Experiment-"+name,
                    direction=direction,
                    sampler=optuna.samplers.TPESampler(),
                    pruner=optuna.pruners.MedianPruner(
                        n_startup_trials=2, n_warmup_steps=optuna_trials // 2
                    ),
                )
                study.set_user_attr("current_model", name)
                model_start = time.time()
                study.optimize(make_objective(name), n_trials=optuna_trials)
                self.studies.append(study)
                self.best_params[name] = study.best_params
                self.best_scores[name] = self.train_and_evaluate(self.get_model(name, study.best_params))
                print(model, "Best parameters: ", study.best_params)
                time_taken = time.time() - model_start
                print(f"Optimization for Model-{name} completed in {time_taken} seconds.")
            else:
                print(name, 'non-optuna')
                self.best_scores[name] = self.train_and_evaluate(self.get_model(name, {}))
        # Measured from before the loop so it covers every model, not just the last one.
        time_taken = time.time() - total_start
        print(f"Total Optimization completed in {time_taken} seconds.")

    def train_and_evaluate_optimize(self, model, metric):
        """Fit ``model`` and return a single validation metric (used as Optuna objective).

        Keras ``Sequential`` models are trained with ``g_epochs`` /
        ``g_batch_size`` so trials match the final ``train_and_evaluate`` run.
        A model class is fitted with the fixed order ``DEFAULT_ARIMA_ORDER``.
        The metric compares values in the original target scale.
        """
        _, prediction = self._fit_and_predict(model, order=self.DEFAULT_ARIMA_ORDER)
        actual = self._inverse_target(self.y_val)
        return self.util.calculate_metric(metric, actual, prediction)

    # ------------------------------------------------------------------ #
    # Model builders
    # ------------------------------------------------------------------ #
    def get_model(self, model_name, params):
        """Build the model registered under ``model_name`` with ``params``.

        Unknown names fall back to a default ``LinearRegression`` (and say so).
        """
        builders = {
            'LinearRegression': self.linear_reg,
            'Ridge': self.ridge,
            'Lasso': self.lasso,
            'DecisionTreeRegressor': self.dt,
            'RandomForestRegressor': self.rf,
            'LGBMRegressor': self.lg,
            'XGBRegressor': self.xg,
            'ARIMA': self.arima,
            'Sequential': self.lstm,
        }
        builder = builders.get(model_name)
        if builder is None:
            print(f"Unknown model '{model_name}'; falling back to LinearRegression.")
            return self.linear_reg({})
        return builder(params)

    def lstm_model(self, params, x_shape):
        """Build a four-layer stacked LSTM with dropout after every layer.

        Args:
            params: Needs ``'optimizer'`` and the layer width under ``'n_unit'``
                (``'units'`` is accepted as well, matching ``lstm``).
            x_shape: Number of time steps / features per sample.
        """
        units = params['n_unit'] if 'n_unit' in params else params['units']
        # NOTE(review): a 'learning_rate' entry in params is not applied here;
        # the optimizer is compiled by name with its default rate.
        regressor = Sequential()
        # First LSTM layer with Dropout regularisation
        regressor.add(LSTM(units=units, return_sequences=True, input_shape=(x_shape, 1)))
        regressor.add(Dropout(0.2))
        # Second LSTM layer
        regressor.add(LSTM(units=units, return_sequences=True))
        regressor.add(Dropout(0.2))
        # Third LSTM layer
        regressor.add(LSTM(units=units, return_sequences=True))
        regressor.add(Dropout(0.2))
        # Fourth LSTM layer
        regressor.add(LSTM(units=units))
        regressor.add(Dropout(0.2))
        # The output layer
        regressor.add(Dense(units=1))
        # Compiling the RNN
        regressor.compile(optimizer=params['optimizer'], loss='mean_squared_error')
        return regressor

    # Each builder accepts ``None`` (its declared default) as "no parameters".
    def linear_reg(self, params=None):
        """Return a ``LinearRegression`` built from ``params``."""
        return LinearRegression(**(params or {}))

    def ridge(self, params=None):
        """Return a ``Ridge`` regressor built from ``params``."""
        return Ridge(**(params or {}))

    def lasso(self, params=None):
        """Return a ``Lasso`` regressor built from ``params``."""
        return Lasso(**(params or {}))

    def dt(self, params=None):
        """Return a ``DecisionTreeRegressor`` built from ``params``."""
        return DecisionTreeRegressor(**(params or {}))

    def rf(self, params=None):
        """Return a ``RandomForestRegressor`` built from ``params``."""
        return RandomForestRegressor(**(params or {}))

    def lg(self, params=None):
        """Return an ``LGBMRegressor`` built from ``params``."""
        return LGBMRegressor(**(params or {}))

    def xg(self, params=None):
        """Return an ``XGBRegressor`` built from ``params``."""
        return XGBRegressor(**(params or {}))

    def arima(self, params=None):
        """Return the ``ARIMA`` class and remember ``params`` for fitting.

        The class is instantiated later, once the training target is known.
        """
        self.arimaparams = params
        return ARIMA

    def lstm(self, params=None):
        """Build a single-layer LSTM regressor sized for the training features.

        Args:
            params: Optional mapping with ``'units'`` and ``'optimizer'``;
                missing entries use the class-level defaults.
        """
        params = params or {}
        units = params.get('units', self.DEFAULT_LSTM_UNITS)
        optimizer = params.get('optimizer', self.DEFAULT_LSTM_OPTIMIZER)
        # NOTE(review): x_train is handed to fit() as a 2-D frame while the
        # layer declares (n_features, 1) inputs — presumably Keras adds the
        # trailing axis; confirm against the installed version.
        regressor = Sequential()
        regressor.add(LSTM(units=units, input_shape=(self.x_train.shape[1], 1)))
        regressor.add(Dense(1))
        regressor.compile(optimizer=optimizer, loss='mse')
        return regressor

    # ------------------------------------------------------------------ #
    # Training and evaluation
    # ------------------------------------------------------------------ #
    def get_predictions(self, model):
        """Predict the validation split and undo the target transform.

        Objects whose type name contains ``'Results'`` are forecast with
        ``forecast(steps=...)``; everything else uses ``predict(x_val)``.

        Returns:
            One-column DataFrame indexed like ``y_val``.
        """
        if 'Results' in str(type(model)):
            steps = self.y_val.shape[0]
            if self.x_val is not None and not self.x_val.empty:
                raw_prediction = model.forecast(steps=steps, exog=self.x_val)
            else:
                raw_prediction = model.forecast(steps=steps)
        else:
            raw_prediction = model.predict(self.x_val)
        return self._inverse_target(raw_prediction)

    def train_and_evaluate(self, model):
        """Fit ``model``, keep the fitted object and return all validation metrics.

        Args:
            model: A model instance, or a model class (as returned by ``arima``)
                that is instantiated with the training target before fitting.

        Returns:
            Dict mapping each name in ``METRIC_NAMES`` to its value, computed
            between the validation target and the predictions in the original
            target scale.
        """
        fitted, prediction = self._fit_and_predict(model)
        if isinstance(model, type):
            print(fitted.summary(), '\n\n\n')
        self.model_objects.append(fitted)
        actual = self._inverse_target(self.y_val)
        return {m: self.util.calculate_metric(m, actual, prediction) for m in self.METRIC_NAMES}

    def run_models(self, models):
        """Fit every model and store its metrics in ``scores`` keyed by class name."""
        self.scores = {}
        for model in models:
            self.scores[self._model_name(model)] = self.train_and_evaluate(model)

    def objective(self, trial, metric='rmse'):
        """Optuna objective that tunes the stacked LSTM from ``lstm_model``.

        Args:
            trial: Optuna trial used to sample the hyper-parameters.
            metric: Metric returned as the objective value.

        Returns:
            The validation metric of the sampled configuration.
        """
        lstm_params = {
            'learning_rate': trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True),
            'optimizer': trial.suggest_categorical("optimizer", ["rmsprop", "adam", "sgd", "adagrad", "adadelta"]),
            'units': trial.suggest_int("units", 20, 40),
        }
        regressor = self.lstm_model(lstm_params, self.x_train.shape[1])
        return self.train_and_evaluate_optimize(regressor, metric)

    # ------------------------------------------------------------------ #
    # Reporting
    # ------------------------------------------------------------------ #
    def plot_predictions(self, num):
        """Plot predictions of the ``num``-th fitted model against the actual target."""
        model_object = self.model_objects[num]
        mname = type(model_object).__name__
        prediction = self.get_predictions(model_object)
        pred = self.util.plot_lines(prediction, '_prediction_'+mname)
        true = self.util.plot_lines(self._inverse_target(self.y_val))
        iplot(pred+true)

    def plot_metrics(self, best_models=False, baseline=False):
        """Plot the stored metrics as bars, one group per metric.

        Args:
            best_models: Plot ``best_scores`` (from ``run_optuna``) instead of ``scores``.
            baseline: Additionally plot ``scores`` for comparison.
        """
        metric_frame = pd.DataFrame(self.best_scores if best_models else self.scores).T
        og_metrics = pd.DataFrame(self.scores).T if baseline else None
        traces = []
        og_traces = []
        for metric in metric_frame.columns:
            # Accumulate across metrics; plain assignment would keep only the last one.
            traces.extend(self.util.plot_bars(metric_frame[[metric]].T))
            if baseline:
                og_traces.extend(self.util.plot_bars(og_metrics[[metric]].T))
        iplot(traces+og_traces)

    def print_metrics(self):
        """Print every stored metric, grouped by model name."""
        for k in self.scores:
            print('\n'+str(k))
            for m, v in self.scores[k].items():
                print(m + " : " + str(v))