In [None]:
# External Libs
import json
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import random
import os
import datetime

In [None]:
import sys
# Extend the import search path with the directory four levels above the
# working directory; presumably this is the repository root that provides
# `zeno_etl_libs` — confirm against the repo layout.
sys.path.append('../../../..')

In [None]:
from zeno_etl_libs.helper.aws.s3 import S3
from zeno_etl_libs.logger import get_logger
from zeno_etl_libs.db.db import DB

In [None]:
# Set env
# Name of the environment this notebook runs against; it is exported to
# os.environ and logged in the following cells.
env = "stage"

In [None]:
# Publish the selected environment through the 'env' process environment variable.
# NOTE(review): presumably read by zeno_etl_libs when building clients — confirm.
os.environ['env'] = env

In [None]:
# Notebook-wide logger obtained from the project helper, requested at INFO level.
logger = get_logger(level="INFO")

In [None]:
# Record which environment value this run is using.
logger.info(f"env: {env}")

### Import modified dataset and category splits

In [None]:
# Project S3 helper bound to the bucket the input CSV is downloaded from below.
s3 = S3(bucket_name="sagemaker-ap-south-1-921939243643")

In [None]:
# Fetch data/modified_data.csv from the bucket; the helper's return value is
# passed to pd.read_csv in the next cell, i.e. it is used as the CSV location.
csv_full_path = s3.download_file_from_s3(file_name="data/modified_data.csv")

In [None]:
# Raw dataset exactly as stored in the downloaded CSV.
df1 = pd.read_csv(csv_full_path)

In [None]:
# Build the modelling frame in one chain: keep the rows labelled 3 through 27
# (label slicing is inclusive), remove the 'Unnamed: 0' column, parse 'Dates'
# and use it as the index.
df2 = (
    df1.loc[3:27, :]
    .drop(['Unnamed: 0'], axis=1)
    .assign(Dates=lambda frame: pd.to_datetime(frame['Dates']))  # converting to datetime format
    .set_index('Dates')
)
df2.tail()

### Train - Test Split Function

<b>for_valid=True</b> implies that the split is for validation. In this case, only the data before March 2021 is used: the last three months of that data are taken as the Test set, and all the prior datapoints are taken into the Train set.

<b>for_valid=False</b> implies that the split is for the final model. Hence all the datapoints before March 2021 are taken into the Train set, and only the data of March 2021 is taken into the Test set.

In [None]:
def test_train_split(drug_id, for_valid=True):
    """Split one drug's column of the module-level ``df2`` into train and test frames.

    Args:
        drug_id: Column label to select from ``df2``.
        for_valid: When True, the train part stops five rows before the end and
            the test part is the following three rows (the last two rows are
            left out). When False, the train part stops two rows before the end
            and the test part is the single second-to-last row.

    Returns:
        A ``(train, test)`` tuple of single-column DataFrames.
    """
    series_frame = df2[[drug_id]]
    # (train end, test end) expressed as positions counted from the end
    train_end, test_end = (-5, -2) if for_valid else (-2, -1)
    train_part = series_frame.iloc[:train_end]          # Training Split
    test_part = series_frame.iloc[train_end:test_end]   # Testing Split / 2021 March
    return train_part, test_part

### Model Libraries -- Imports

In [None]:
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
# From this point on every warning is suppressed, regardless of category.
warnings.filterwarnings("ignore")

In [None]:
# from statsmodels.tsa.api import SimpleExpSmoothing, Holt

### Define Exponential Smoothing Model

A simple exponential smoothing model. The <i><b>smoothing constant</b></i> hyperparameter is not provided, so that the algorithm finds the best parameter itself. This model does not capture any trend, it simply forecasts a constant value for all the future predictions. The forecasting is performed by giving higher weightage to the most recent values decided by the <i><b>smoothing constant</b></i>. 

In [None]:
def exp_mod(drug_id, for_valid=True):
    """Fit simple exponential smoothing for one drug and score or forecast it.

    Args:
        drug_id: Column label forwarded to ``test_train_split``.
        for_valid: When True, forecast as many steps as the test split has
            rows and return the mean absolute error against it. When False,
            forecast a single step and return it rounded up to an int.

    Returns:
        The MAE (validation mode) or the rounded-up one-step forecast.

    NOTE(review): ``SimpleExpSmoothing`` must be in scope when this is called;
    the statsmodels import in this notebook is currently commented out.
    """
    train, test = test_train_split(drug_id, for_valid)
    # No smoothing level is passed to fit(), so its selection is left to the library.
    fitted = SimpleExpSmoothing(train).fit()
    if not for_valid:
        next_value = fitted.forecast(1).values[0]
        return int(math.ceil(next_value))
    predictions = fitted.forecast(len(test))
    return mean_absolute_error(test, predictions)

In [None]:
# result_1 = exp_mod('216583') # for testing

In [None]:
# result_2 = exp_mod('216583', for_valid=False) # for testing

### Define Holt's Linear Trend Model

This is a better model, due to its ability to capture trend. In order to fit the model, a <b><i>linear trend</i></b> is assumed. The hyperparameter values <i><b>smoothing_level</b></i> and <i><b>smoothing_trend</b></i> are not optimized; they are fixed at <b>0.8</b> and <b>0.2</b> respectively.

In [None]:
def holts_mod(drug_id, for_valid=True):
    """Fit Holt's model with fixed smoothing parameters and score or forecast it.

    Args:
        drug_id: Column label forwarded to ``test_train_split``.
        for_valid: When True, return the mean absolute error of a forecast
            spanning the test split. When False, return the one-step-ahead
            forecast rounded up to an int.

    Returns:
        The MAE (validation mode) or the rounded-up one-step forecast.

    NOTE(review): ``Holt`` must be in scope when this is called; the
    statsmodels import in this notebook is currently commented out.
    """
    train, test = test_train_split(drug_id, for_valid)
    # assume hyp-params and linear trend: values are fixed, optimisation is off
    holt_params = dict(smoothing_level=0.8, smoothing_trend=0.2, optimized=False)
    fitted = Holt(train).fit(**holt_params)
    horizon = len(test) if for_valid else 1
    predictions = fitted.forecast(horizon)
    if for_valid:
        return mean_absolute_error(test, predictions)
    return int(math.ceil(predictions.values[0]))

In [None]:
# result_3 = holts_mod('216583') # for testing

In [None]:
# result_4 = holts_mod('216583', for_valid=False) # for testing

### Define LSTM Model

A basic LSTM model, much more powerful than the previous models. It uses a special class of Neural Networks to train the model. The hyperparameters such as <i><b>number of layers, number of neurons, activation function, optimizers</b></i> and <i><b>number of epochs</b></i> are assumed.

In [None]:
# preparing independent and dependent features
def prepare_lstm_data(timeseries_data, n_features):
    """Turn a sequence into sliding-window inputs and next-value targets.

    Args:
        timeseries_data: Indexable, sliceable sequence of observations.
        n_features: Length of each input window (despite the name, it is used
            here as the number of consecutive observations per sample).

    Returns:
        ``(X, y)`` as numpy arrays: ``X[k]`` is the window starting at
        position ``k`` and ``y[k]`` is the observation right after it. Both
        are empty when the sequence is too short for a single window.
    """
    total = len(timeseries_data)
    # A window is usable only while the element right after it still exists.
    window_starts = range(min(total, total - n_features))
    inputs = [timeseries_data[start:start + n_features] for start in window_starts]
    targets = [timeseries_data[start + n_features] for start in window_starts]
    return np.array(inputs), np.array(targets)

In [None]:
def lstm_mod(drug_id, for_valid=True, n_steps=3, epochs=500):
    """Train a two-layer LSTM on one drug's history, then validate or forecast.

    Args:
        drug_id: Column label forwarded to ``test_train_split``.
        for_valid: When True, roll the model forward over the test split
            (feeding each prediction back as input) and return the mean
            absolute error. When False, return a single forecast for the
            step right after the training data.
        n_steps: Number of past observations fed to the model per sample.
        epochs: Number of training epochs.

    Returns:
        The MAE (validation mode) or the one-step forecast rounded up to int.

    Raises:
        ValueError: If ``n_steps`` is not positive or the training split does
            not contain more than ``n_steps`` rows, so that no training sample
            can be built.
    """
    n_features = 1
    train, test = test_train_split(drug_id, for_valid)
    if n_steps < 1 or len(train) <= n_steps:
        raise ValueError(
            f"need n_steps >= 1 and more than n_steps training rows; "
            f"got n_steps={n_steps}, training rows={len(train)}"
        )

    X, y = prepare_lstm_data(train.values, n_steps) # function call
    X = X.reshape((X.shape[0], X.shape[1], n_features))

    # define model
    model = Sequential()
    model.add(LSTM(50, activation='relu', return_sequences=True, input_shape=(n_steps, n_features)))
    model.add(LSTM(50, activation='relu'))
    model.add(Dense(1))
    model.compile(optimizer='adam', loss='mse')
    # fit model
    model.fit(X, y, epochs=epochs, verbose=0)

    history = train[drug_id].values

    if for_valid:
        # Seed the rolling window with the last n_steps training values (not
        # len(test) values) so the model input always matches its input shape.
        window = list(history[-n_steps:])
        forecasts = []
        for _ in range(len(test)):
            x_input = np.array(window).reshape((1, n_steps, n_features))
            yhat = model.predict(x_input, verbose=0)
            forecasts.append(yhat[0][0])
            # slide the window: drop the oldest value, append the new prediction
            window = window[1:] + [yhat[0][0]]
        # The metric compares values position by position, so no date index
        # has to be fabricated for the predictions.
        return mean_absolute_error(test[drug_id].values, forecasts)

    # final model: forecast the step that follows the training data
    x_input = history[-n_steps:].reshape((1, n_steps, n_features))
    yhat = model.predict(x_input, verbose=0)
    return int(math.ceil(yhat[0][0]))

In [None]:
# result_5 = lstm_mod('216583') # for testing

In [None]:
# result_6 = lstm_mod(drug_id='216583', for_valid=False) # for testing

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
# NOTE(review): this cell appears to be a TensorFlow/GPU sanity check that is
# independent of the forecasting helpers above — confirm it still belongs here.
# It prints the TensorFlow version and the number of visible GPUs, then trains
# a small CNN on MNIST.
print("TensorFlow version:", tf.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
# Bare expression in the middle of the cell: its value is not displayed.
tf.config.list_physical_devices('GPU')
# Load the MNIST train/test splits as (image, label) pairs plus dataset metadata.
(ds_train, ds_test), ds_info = tfds.load(
    'mnist',
    split=['train', 'test'],
    shuffle_files=True,
    as_supervised=True,
    with_info=True,
)
def normalize_img(image, label):
  """Normalizes images: `uint8` -> `float32`."""
  return tf.cast(image, tf.float32) / 255., label
batch_size = 128
# Training pipeline: normalize -> cache -> shuffle over the whole split -> batch -> prefetch.
ds_train = ds_train.map(
    normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
ds_train = ds_train.cache()
ds_train = ds_train.shuffle(ds_info.splits['train'].num_examples)
ds_train = ds_train.batch(batch_size)
ds_train = ds_train.prefetch(tf.data.experimental.AUTOTUNE)
# Evaluation pipeline: normalize -> batch -> cache -> prefetch (no shuffling).
ds_test = ds_test.map(
    normalize_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
ds_test = ds_test.batch(batch_size)
ds_test = ds_test.cache()
ds_test = ds_test.prefetch(tf.data.experimental.AUTOTUNE)
# This binds the notebook-level name `model`; it is separate from the local
# `model` created inside lstm_mod.
model = tf.keras.models.Sequential([
  tf.keras.layers.Conv2D(32, kernel_size=(3, 3),
                 activation='relu'),
  tf.keras.layers.Conv2D(64, kernel_size=(3, 3),
                 activation='relu'),
  tf.keras.layers.MaxPooling2D(pool_size=(2, 2)),
#   tf.keras.layers.Dropout(0.25),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(128, activation='relu'),
#   tf.keras.layers.Dropout(0.5),
  tf.keras.layers.Dense(10, activation='softmax')
])
# Integer labels with a 10-way softmax output, hence the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=tf.keras.optimizers.Adam(0.001),
    metrics=['accuracy'],
)
# Train for 6 epochs, evaluating on the test split after each epoch.
model.fit(
    ds_train,
    epochs=6,
    validation_data=ds_test,
)

In [None]:
## Write result to DB

In [None]:
# Project DB helper created with read_only=False, as required for the insert below.
db = DB(read_only=False)

In [None]:
# Open the connection; it is closed explicitly in the last cell.
# NOTE(review): if a cell in between fails, the connection stays open.
db.open_connection()

In [None]:
# Payload for the insert: a fixed "result_5" label followed by the current
# timestamp. No model result is included, since the model calls above are
# commented out.
all_result = f"result_5: {datetime.datetime.now()}"

In [None]:
# Insert statement built by string interpolation into a single-quoted SQL
# literal; a quote character inside `all_result` would break the statement.
# NOTE(review): the schema is hard-coded to "prod2-generico" although `env` is
# set to "stage" at the top of the notebook — confirm the intended target.
query = f""" 
        insert into "prod2-generico"."temp-str" (col1) values ('Hello at {datetime.datetime.now()}: {all_result}')
    """

In [None]:
# Run the insert built in the previous cell.
db.execute(query=query)

In [None]:
# Release the connection opened earlier in the notebook.
db.close_connection()