In [1]:
import pandas as pd
import numpy as np
import os
import traceback
from datetime import datetime
from numpy import array
import joblib
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')

In [2]:
# required for training only
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense,Dropout
import keras
from keras.callbacks import EarlyStopping
import tensorflow as tf
import tf2onnx

2022-11-15 19:11:33.703650: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-15 19:11:33.703674: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
#@title TRAINING CLASS WITH TRY EXCEPT

class LSTM_MODEL:
    """
    Attributes
    ----------------------------------------------------------------------------
    df : DataFrame
        Input DataFrame 

    model_1_independent_features : list
        List of the independent features to train model 1.

    model_1_target_feature : str
        Target feature name to train model 1.

    n_steps : tuple 
        Tuple of integer values from which model will find best n_step value.
        e.g (1,10) -> model will find best_n_step from this range

    train_size : float
        float value which indicates the percentage of fraction for train split.
        e.g 0.70 -> in will take 70% of data for training

    root_path : str
        Root path where model will be stored.
        
    Methods
    ----------------------------------------------------------------------------
    get_records_with_akm_combinations(df, AD_GROUP_ID, KEYWORD_ID, MATCH_TYPE)
        -> Return the pandas dataframe with given AKM combination

    data_preprocessing(df,features)
        -> It remove rows with all zeroes or Nan values for given subset of features

    train_test_split(dataset,train_size)
        -> Split the dataset into train and test data based on train_size fraction value.

    split_sequences(sequences, n_steps)
        -> It returns independent feature value sequences along with target feature value
           based on n_steps value to feed into LSTM model
        
    split_sequences_inference(sequences, n_steps)
        -> It returns test sequences based on n_steps to get prediction from LSTM model.

    serialize_model(self,model,root_path,AD_GROUP_ID,KEYWORD_ID,MATCH_TYPE,target_feature)
        -> It will serialize model and dump it at given root_path location with AKM values with target_feature as model name in ".h5" format

    get_stats(self,df,feature)
        -> It returns min, max, mean, std and max_roc for the given feature from the training data

    smape(self,actual, predicted)
        -> It return symetric mean absolute percentage error

    rolling_diff(self,df,target_feature,n_step)
        -> It calculates delta value from the current predicted value and previous real value for the given feature

    serialize_akm_metadata_model_1(self,root_path=None,df = None,metric_df = None,independent_features = None,inference_date = None,onnx_output_names=None)
        -> It will serialize AKM values along with the stats of model 1 as a json and store at a given location.

    append_pred_result(df,y_pred,n_steps,target_feature)
        -> It will return dataframe with predicted target value appended to the original dataframe based on n_step value

    get_metrics_df(df,n_steps = (1,20),train_size=0.70)
        -> It will return dataframe with diffrent n_step_values with corresponding metrics (RMSE, R2) of train and test data.

    tuned_lstm(df,n_steps, train_size)
        -> It will train LSTM model with given best n_step value and return dict of trained model object and metrics info.

    min_max_scaling(self,train_data, test_data)
        -> It will return scaled_train_data and scaled_test_data and scaler object as a dictionary.
    
    keras_to_onnx_model(self,keras_model,best_n_steps,n_features,root_path,AD_GROUP_ID,KEYWORD_ID,MATCH_TYPE,target_feature)
        -> It will convert and store keras model as onnx and return output names of onnx model

    lstm_model(self,train_data,test_data,n_steps)
        -> It will return dictionary with trained model object and metrics values of train and test data.

    tsa_model(self,df,independent_features,target_feature,n_steps,train_size,root_path,serialize_model=None)
        -> It will train lstm model by finding best n_step value from the given range of n_steps and
           serialize model and dump at given location if serialize_model = True.

    model_1()
        -> It will use tsa_model() method to train model by using model_1 independent_feature and target feature 
           and return dataframe with prediction appended to it, y_test and y_test_pred dataframe and metrics df.

    main(self)
        -> It will call model_1() method and return model_1() outputs
    """
    def __init__(self,
                 df,
                 model_1_independent_features,
                 model_1_target_feature,
                 n_steps,
                 train_size,
                 root_path):
        
        self.df = df                                                     # input dataframe
        self.model_1_independent_features = model_1_independent_features # model 1 independent features
        self.model_1_target_feature = model_1_target_feature             # model 1 target features
        self.n_steps = n_steps                                           # max n steps to be tuned e.g (1,10)
        self.train_size = train_size                                     # train size e.g 0.70 (70%)
        self.root_path = root_path                                       # root path when model would be stored
        self.todays_date = datetime.now()

  
    def get_records_with_akm_combinations(self,df, AD_GROUP_ID, KEYWORD_ID, MATCH_TYPE):
        '''
        Args :
            df -> Input dataframe
            AD_GROUP_ID -> value of AD_GROUP_ID
            KEYWORD_ID  -> value of KEYWORD_ID
            MATCH_TYPE  -> value of MATCH_TYPE
        Returns:
            Returns filtered dataframe with given AKM combination
        '''
        df = df.loc[(df['AD_GROUP_ID'] == AD_GROUP_ID) & (df['KEYWORD_ID'] == KEYWORD_ID) & 
                    (df['MATCH_TYPE'] == MATCH_TYPE)].reset_index(drop=True)
        return df

    def data_preprocessing(self,df,features):
        '''
        Args :
            df -> Input dataframe
            features -> features list to be preprocessed
        Returns:
            Remove rows with all 0s and NaN in a Dataframe'''
        df = df.loc[(df[features] != 0).any(axis=1)].dropna(subset=features).reset_index(drop=True)
        return df

    def train_test_split(self,dataset,train_size = 0.70):
        '''
        Args :
            dataset -> full input dataframe
            train_size -> train size in float
        Returns :
            Split into train and test sets and return train and test dataframes'''
        train_size = int(len(dataset) * train_size)
        test_size = len(dataset) - train_size
        train, test = dataset.iloc[0:train_size,:], dataset.iloc[train_size:len(dataset),:]
        return train.reset_index(drop=True), test.reset_index(drop=True)

    def split_sequences(self,dataset,n_steps):
        '''
        Args :
            dataset : input dataframe
            n_steps : look back window size (int)   
        Returns :
            Split a multivariate sequence into samples to feed data in LSTM model
        '''
        # if n_steps is greater then or equal to length of data
        if n_steps >= len(dataset):
            n_steps = len(dataset) - 1

        sequences = []
        labels = []

        start_idx = 0

        for stop_idx in range(n_steps,len(dataset)):
            sequences.append(dataset.iloc[start_idx:stop_idx,:-1])
            labels.append(dataset.iloc[stop_idx,-1])
            start_idx += 1

        return (np.array(sequences),np.array(labels))

    
    # Function to create sequence of data for inference
    def split_sequences_inference(self,sequences, n_steps):
        '''
        Args :
            sequences -> input dataset as a series
            n_steps -> look back window size(int) which was used during training
        Returns : 
            It returns test sequences based on n_steps to get prediction from LSTM model.
        '''
        # if n_steps is greater then or equal to length of data
        if n_steps >= len(sequences):
            n_steps = len(sequences) - 1
            
        X = list()
        for i in range(len(sequences)):
            # find the end of this pattern
            end_ix = i + n_steps
            # check if we are beyond the dataset
            if end_ix > len(sequences):
                break
            # gather input and output parts of the pattern
            seq_x = sequences[i:end_ix, :]
            X.append(seq_x)
        return array(X)
            
    def get_stats(self,df,feature):
        '''
        Args :
            df -> input dataframe
            feature -> feature name for which statistics to be returned
        Returns :
            It returns min, max, mean, std and max_roc for the given feature from the training data
        '''
        max_val = round(df[feature].max(),2)   # get maximum value for the given feature
        mean_val = round(df[feature].mean(),2) # get mean value for the given feature
        std_val = round(df[feature].std(),2)   # get standard deviation for the given feature
        max_roc = round(df[f'{feature}_DELTA(%)'].max(),2)  # get maximum roc from the delta values

        non_zero_indicies = df[feature].to_numpy().nonzero()  # find non zero value indicies for given feature
        non_zero_values = df[feature].iloc[non_zero_indicies] # get the non zero value array
        min_val = min(non_zero_values)                        # get the minimum value from the non zero values
 
        return {'min':min_val,
                'max':max_val,
                'mean':mean_val,
                'std':std_val,
                'max_roc':max_roc}

    
    def smape(self,act,pred):
        '''
        Args :
            actual -> array/series/list of actual values 
            predicted -> array/series/list of predicted values
        Returns :
            It return symetric mean absolute percentage error

        Assumption: actuals and predictions are not logarithms
        '''
        
        smape = 0
        skippedcnt = 0
        for a,p in zip(act,pred):
            den = abs(a) + abs(p)
            if (den > 0):
                num = 2*abs(p-a)
            else:
                den = 1
                num = 2*abs(p-a)
                skippedcnt += 1
                
            smape += (num/den)
        
        smape = (100*smape)/len(act)
        smapedatalength = len(act)
                            
        return smapedatalength, skippedcnt, round(smape,2)


    def rolling_diff(self,df,target_feature,n_step):
        '''
        Args :
            df -> input dataframe
            target_feature -> Name of the target feature
            n_step -> look back window size
        Returns :
            It calculates delta value from the current predicted value and previous real value for the given feature.
        '''
        i = 0 
        diff_lst = []
        for idx in range(len(df)):
            if i < len(df) - n_step:
                j = i + n_step
                current_pred_val =  df[f'predicted_{target_feature}'].iloc[j] # get the current predicted value of the target fetaure
                previous_real_val = df[target_feature].iloc[j-1]              # get the previous real value of the target feature
                if previous_real_val != 0 :
                    delta = round(((current_pred_val - previous_real_val)/previous_real_val)*100,2)  # calculate delta if previous real value is nonzero
                else:
                    # get previous values from the current index
                    previous_values = df[target_feature].iloc[:idx]
                    # get previous non zero indices
                    previous_non_zero_indicies = previous_values.to_numpy().nonzero()[0]
                    
                    # check if there are any nonzero indicies or not
                    if previous_non_zero_indicies.size > 0:
                        last_non_zero_index = previous_non_zero_indicies[-1] # most recent non zero index
                        previous_real_val = previous_values.iloc[last_non_zero_index] # most recent non zero value
                    
                    else:
                        # find non zero value indicies for given feature
                        non_zero_indicies = df[target_feature].to_numpy().nonzero()
                        # get the non zero value array
                        non_zero_values = df[target_feature].iloc[non_zero_indicies]
                        # take the min value from the non zero values
                        min_val = min(non_zero_values)
                        previous_real_val = min_val
                
                    delta = round(((current_pred_val - previous_real_val)/previous_real_val)*100,2)

                diff_lst.append(delta)
                i = i+1
        df.loc[n_step:, f'{target_feature}_DELTA(%)'] = diff_lst
        
        return df


    def serialize_akm_metadata_model_1(self,
                           root_path=None, 
                           df = None,
                           metric_df = None,
                           independent_features = None,
                           inference_date = None,
                           onnx_output_names=None):
        '''
        Args :
            root_path -> path where akm metadata of all the models to be stored.
            df -> input dataframe 
            metric_df -> metric df with metric info of model 1
            independent_features -> Independent feature list for model 1
            inference_date -> t+1 date from the training data
        Returns :
            It will serialize AKM values along with the stats of model 1 as a json and store at a given location.
        '''
        # fetch value of AKM from the given df
        AD_GROUP_ID = df.iloc[0]['AD_GROUP_ID']
        KEYWORD_ID = df.iloc[0]['KEYWORD_ID']
        MATCH_TYPE = df.iloc[0]['MATCH_TYPE']

        # fetch the best n_step value for model 1
        N_STEPS_MODEL_1 = int(metric_df.iloc[0]['BEST_N_STEPS'])

        # fetch the MAPE and SMAPE value for model 1
        model_1_test_mape = round(float(metric_df.iloc[0]['TEST_MAPE']), 2)
        model_1_test_smape = round(float(metric_df.iloc[0]['TEST_SMAPE']), 2)

        #get scaler from metric df
        model_1_scaler = metric_df.iloc[0]['SCALER']
        
        # get model 1 statistics
        model_1_stats = self.get_stats(df,self.model_1_target_feature)
       
        
        # year
        year = self.todays_date.strftime("%Y")
        # Month
        month = self.todays_date.strftime("%m")
        # Day
        day = self.todays_date.strftime("%d")

        year_month_day =  str(year) + str(month) + str(day)

        model_1_name = '_'.join([year_month_day, 
                                 str(int(AD_GROUP_ID)),
                                 str(int(KEYWORD_ID)),
                                 str(MATCH_TYPE),
                                 f'{self.model_1_target_feature}',
                                 'LSTMModel'])

        # model_1_name = '_'.join([year_month_day, 
        #                        str(int(AD_GROUP_ID)),
        #                        str(int(KEYWORD_ID)),
        #                        str(MATCH_TYPE),
        #                        f'{self.model_1_target_feature}Model'])
        
        # create akm json to store at specified location
        akm_dict = {'AD_GROUP_ID' : int(AD_GROUP_ID),
                    'KEYWORD_ID' : int(KEYWORD_ID),
                    'MATCH_TYPE' : str(MATCH_TYPE),
                    'INFERENCE_DATE' : str(inference_date),
                    'MODEL_VERSION' : str(self.todays_date),
                    'MODEL_1' : {
                        'NAME' : model_1_name,
                        'INDEPENDENT_FEATURES' : independent_features,
                        'TARGET_FEATURE' : self.model_1_target_feature,
                        'ONNX_OUTPUT_NAMES' : onnx_output_names,
                        'N_STEP' : N_STEPS_MODEL_1,
                        'SCALER' : model_1_scaler,
                        'STATS' : {
                            f'{self.model_1_target_feature}_MIN' : model_1_stats['min'],
                            f'{self.model_1_target_feature}_MAX' : model_1_stats['max'],
                            f'{self.model_1_target_feature}_MEAN' : model_1_stats['mean'],
                            f'{self.model_1_target_feature}_SD' : model_1_stats['std'],
                            f'{self.model_1_target_feature}_MAX_ROC' : model_1_stats['max_roc']
                        },
                        'MODEL_STATS' : {
                            'MAPE' : model_1_test_mape,
                            'SMAPE' : model_1_test_smape
                        }   
                    }
                }
                
        #metadata_file_name = '_'.join([str(int(AD_GROUP_ID)),str(int(KEYWORD_ID)),str(MATCH_TYPE)])
        full_path = os.path.join(root_path,model_1_name)
        # dump json at specified location
        joblib.dump(akm_dict,filename=full_path+'.json')
        print('AKM metadata Dictionary saved at {}'.format(full_path))
        return 'AKM metadata Dictionary saved at {}'.format(full_path)

    
    def append_pred_result(self,df,y_pred,n_steps,target_feature):
        '''
        Args :
            df -> input dataframe to which predicted column will be appended
            y_pred -> predicted series/list of the given target feature
            n_steps -> look back window size
            target_feature -> name of the target feature
        Returns :
            It will return dataframe with predicted target value appended to the original dataframe based on n_step value
        '''
        # df : dataframe of the specific AKM combination
        df_out = df.copy()
        AD_GROUP_ID = int(df_out.iloc[0]['AD_GROUP_ID'])
        KEYWORD_ID = int(df_out.iloc[0]['KEYWORD_ID'])
        MATCH_TYPE = str(df_out.iloc[0]['MATCH_TYPE'])
        
        df_out['REPORT_DATE'] = pd.to_datetime(df_out['REPORT_DATE'],yearfirst = True)
        # assign predicted target feature to the dataframe
        df_out.loc[n_steps:,f'predicted_{target_feature}'] = y_pred[:-1]
        # get next date
        next_date = df_out.iloc[-1]['REPORT_DATE'] + pd.Timedelta(days=1)
        # assign values to the column for t+1 prediction
        df_out.at[len(df_out.index),['REPORT_DATE','AD_GROUP_ID','KEYWORD_ID','MATCH_TYPE',f'predicted_{target_feature}']] = [next_date,
                                                                                                                              AD_GROUP_ID,
                                                                                                                              KEYWORD_ID,
                                                                                                                              MATCH_TYPE,
                                                                                                                              y_pred[-1]]

        return df_out

    def get_metrics_df(self,train_data, test_data, n_steps = (1,20)):
        '''
        Args :
            train_data -> training dataframe
            test_data -> test dataframe
            n_steps -> tuple of n_steps to be used for training, 
                       model will be trained for each n_step value from the given range.
        Returns :
            It will return dataframe with diffrent n_step_values with corresponding metrics (RMSE, R2) of train and test data.
        
        Note : This method is used to find the best n_step value from the given n_steps range.
        '''
        N_steps = []
        train_r2 = []
        test_r2 = []
        train_rmse = []
        test_rmse = []
        train_mape = []
        test_mape = []

        # iterate over specified n_steps values and store results in the above lists
        for i in range(n_steps[0],n_steps[1]+1):
            if i >= len(train_data) or i >= len(test_data):
                continue
            # train lstm model for the given n_step value in for loop
            model1_metrics = self.lstm_model(train_data=train_data, test_data=test_data, n_steps=i)
            # append current n_step and metrics to the above lists
            N_steps.append(i)
            train_r2.append(model1_metrics['train_R2'])
            test_r2.append(model1_metrics['test_R2'])
            train_rmse.append(model1_metrics['TRAIN_RMSE'])
            test_rmse.append(model1_metrics['TEST_RMSE'])
            train_mape.append(model1_metrics['TRAIN_MAPE'])
            test_mape.append(model1_metrics['TEST_MAPE'])
        
        # return dataframe of metrics associated with each n_step value
        return pd.DataFrame({
            'n_steps' : N_steps,
            'train_r2' : train_r2,
            'test_r2' : test_r2,
            'train_rmse' : train_rmse,
            'test_rmse' : test_rmse,
            'train_mape' : train_mape,
            'test_mape' : test_mape
        })

    def tuned_lstm(self,train_data,test_data,n_steps):
        '''
        Args :
            train_data -> train dataframe
            test_data -> test dataframe
            n_steps -> look back window size
        Returns :
            It will train LSTM model with given best n_step value and return dict of trained model object and metrics info.

        Note : this method used to train lstm model with best found n_step value.
        '''
        
        model_output = self.lstm_model(train_data=train_data, test_data=test_data, n_steps= n_steps)
        return model_output

    
    def min_max_scaling(self,train_data, test_data):
        '''
        Args :
            train_data -> train dataframe to be scaled
            test_data -> test dataframe to be scaled
        Returns :
            Dictionary of scaled_train_data, scaled_test_data and scaler object
        '''
        df_columns = train_data.columns.tolist()
        sc = MinMaxScaler(feature_range=(0, 1))
        
        # fit transform on training dataframe
        scaled_train_data = pd.DataFrame(sc.fit_transform(train_data),columns = df_columns)
        # transform on test dataframe
        scaled_test_data = pd.DataFrame(sc.transform(test_data),columns = df_columns)
        
        return {'scaled_train_data' : scaled_train_data,
                'scaled_test_data' : scaled_test_data,
                'scaler' : sc}

    def keras_to_onnx_model(self,keras_model,best_n_steps,n_features,root_path,AD_GROUP_ID,KEYWORD_ID,MATCH_TYPE,target_feature):
        # year
        year = self.todays_date.strftime("%Y")
        # Month
        month = self.todays_date.strftime("%m")
        # Day
        day = self.todays_date.strftime("%d")
        # join string with current year, month and day
        year_month_day =  str(year) + str(month) + str(day)
        
        # model name as a string of year_month_day_AD_GROUP_ID_KEYWORD_ID_MATCH_TYPE_{target_feature}Model
        model_name = '_'.join([year_month_day,
                               str(int(AD_GROUP_ID)),
                               str(int(KEYWORD_ID)),
                               str(MATCH_TYPE),
                               f'{target_feature}',
                               'LSTMModel'])
        
        full_path = os.path.join(root_path,model_name) + '.onnx'

        ########### onnx model ########################
        spec = (tf.TensorSpec((None,best_n_steps,n_features), tf.double, name="input"),)
        # output_path = 'output/20_oct/onnx_test/model2.onnx'
        model_proto, _ = tf2onnx.convert.from_keras(keras_model, input_signature=spec, opset=13, output_path=full_path)
        output_names = [n.name for n in model_proto.graph.output]
        return output_names

    def create_keras_layers(self,n_steps,n_features):
        # define model(to decide best params hyperparameter tuning is required)
        model = Sequential()
        model.add(LSTM(16, activation='relu', return_sequences=True,input_shape=(n_steps, n_features)))
        model.add(Dropout(0.2))
        model.add(LSTM(16, activation='relu', return_sequences=False,input_shape=(n_steps, n_features)))
        model.add(Dropout(0.2))
        model.add(Dense(1))
        model.compile(optimizer='adam', loss='mse')
        return model

    def lstm_model(self,train_data,test_data,n_steps):
        '''
        Args :
            train_data -> train dataframe
            test_data -> test dataframe
            n_steps -> look back window size
        Returns :
            It will return dictionary with trained model object and metrics values of train and test data.
        '''
        # convert into input/output (Traing data)
        x_train, y_train = self.split_sequences(train_data, n_steps)

        # convert into input/output (Test data)
        x_test, y_test = self.split_sequences(test_data, n_steps)
        
        # convert into input/output (full data)
        scaled_dataset_full = train_data.append(test_data)
        
        x_full = self.split_sequences_inference(scaled_dataset_full.iloc[:,:-1].values,n_steps)
        
        # the dataset knows the number of features, e.g. 2
        n_features = x_train.shape[2]

        # define model(to decide best params hyperparameter tuning is required)
        # model = Sequential()
        # model.add(LSTM(16, activation='relu', return_sequences=True,input_shape=(n_steps, n_features)))
        # model.add(Dropout(0.2))
        # model.add(LSTM(16, activation='relu', return_sequences=False,input_shape=(n_steps, n_features)))
        # model.add(Dropout(0.2))
        # model.add(Dense(1))
        # model.compile(optimizer='adam', loss='mse')

        model = self.create_keras_layers(n_steps, n_features)
        # fit model
        model.fit(x_train, y_train, epochs=50,batch_size=1,verbose=0)

        
        # demonstrate prediction on test data
        y_test_pred = model.predict(x_test)
        # get prediction on training data
        y_train_pred = model.predict(x_train)
        # get prediction on full data
        y_full_pred = model.predict(x_full)[:,0]
        
        # calculate root mean squared error for train and test data
        train_RMSE = np.sqrt(mean_squared_error(y_train, y_train_pred[:,0]))
        test_RMSE = np.sqrt(mean_squared_error(y_test, y_test_pred[:,0]))
        # calculate R2 score for train and test data
        train_r2 = r2_score(y_train,y_train_pred[:,0])
        test_r2 = r2_score(y_test,y_test_pred[:,0])
        # calculate mean absolute percentage error for train and test data
        train_MAPE = mean_absolute_percentage_error(y_train, y_train_pred[:,0])
        test_MAPE = mean_absolute_percentage_error(y_test, y_test_pred[:,0])
        # calculate symmetric mean absolute percentage error for train and test data
        train_smape_datalength, train_smape_skippedcnt, train_SMAPE = self.smape(y_train, y_train_pred[:,0])
        test_smape_datalength, test_smape_skippedcnt, test_SMAPE = self.smape(y_test, y_test_pred[:,0])
        
        # to overcome the issue of nan value round off of metrics 
        if not np.isnan(train_RMSE):
            train_RMSE = round(train_RMSE,2)     
        if not np.isnan(test_RMSE):
            test_RMSE = round(test_RMSE,2)
        if not np.isnan(train_MAPE):
            train_MAPE = round(train_MAPE,2)     
        if not np.isnan(test_MAPE):
            test_MAPE = round(test_MAPE,2)
        if not np.isnan(train_SMAPE):
            train_SMAPE = round(train_SMAPE,2)     
        if not np.isnan(test_SMAPE):
            test_SMAPE = round(test_SMAPE,2)
        if not np.isnan(train_r2):
            train_r2 = round(train_r2,2)     
        if not np.isnan(test_r2):
            test_r2 = round(test_r2,2)

        return {'MODEL' : model,
                'TRAIN_RMSE': train_RMSE, 
                'TEST_RMSE' : test_RMSE,
                'TRAIN_MAPE' : train_MAPE,
                'TEST_MAPE' : test_MAPE,
                'TRAIN_SMAPE' : train_SMAPE,
                'TEST_SMAPE' : test_SMAPE,
                'train_R2' : train_r2, 
                'test_R2' : test_r2,
                'Y_TEST' :  y_test.round(2),
                'Y_TEST_PRED' : y_test_pred[:,0].round(2),
                'Y_PRED_FULL' : y_full_pred.round(2),
                # added on 19th october for comparision purpose
                'train_smape_datalength' : train_smape_datalength,
                'test_smape_datalength' :test_smape_datalength}

    def lstm_model_training_full(self,data,n_steps):
        '''
        Args :
            data -> full dataframe
            n_steps -> look back window size
        Returns :
            It will return trained model object on 100% data.
        '''
        # convert into input/output (Full data)
        x_train, y_train = self.split_sequences(data, n_steps)

        # the dataset knows the number of features, e.g. 2
        n_features = x_train.shape[2]

        # define model(to decide best params hyperparameter tuning is required)
        lstm_model_full_Data = self.create_keras_layers(n_steps, n_features)
        # fit model
        lstm_model_full_Data.fit(x_train, y_train, epochs=50,batch_size=1,verbose=0)

        return lstm_model_full_Data
        
    def tsa_model(self,df,independent_features,target_feature,n_steps,train_size,root_path,serialize_model=None):
        '''
        Args:
            df -> input dataframe
            independent_features -> list of independent feature
            target_feature -> name of the target feature
            n_steps -> tuple of n_steps i.e (1,5)
            train_size -> train_size between 0 to 1 (i.e 0.70)
            root_path -> path to store the trained model
            serialize_model -> flag True/False, if True then model will be stored at given path
        Returns:
            -> It will train lstm model by finding best n_step value from the given range of n_steps and
               serialize model and dump at given location if serialize_model = True.
        '''
        features = independent_features.copy()
        features.append(target_feature)
        # preprocess dataframe with independent + target features
        df = self.data_preprocessing(df,features)
        # create copy of the df with independent + target features
        dataset = df[features].copy()
        
        # get train test data
        train_data,test_data = self.train_test_split(dataset,train_size)
        
        # added to overcome the issue of target feature being present as both independent feature and target feature
        if target_feature in independent_features:
            # take only independdent feature from train/test data and exclude target feature
            train_data = train_data.iloc[:,:-1]
            test_data = test_data.iloc[:,:-1]
        
        # scale train and test data
        features_to_scale = independent_features.copy()
        #print('\n','Features to scale before',features_to_scale)
        if 'ROAS_TARGET' in independent_features:
            # get index of ROAS_TARGET to append back in later stage
            ROAS_TARGET_index = independent_features.index('ROAS_TARGET')
            features_to_scale.remove('ROAS_TARGET') # log transformation is done on ROAS_TARGET so we are removing it from scaling
            
        #print('Features to scale : ',features_to_scale)
        
        # take only those feature for scaling which are in features_to_scale list
        train_data_to_scale = train_data[features_to_scale] 
        test_data_to_scale = test_data[features_to_scale]
        
        # scale train and test data and return dictionary of  scaled train data, scaled test data and scaler 
        scaled_output = self.min_max_scaling(train_data_to_scale,test_data_to_scale)

        # print('train scaled output columns before appending ROAS and target','\n',scaled_output['scaled_train_data'].columns)
        # print('test scaled output columns before appending ROAS and target','\n',scaled_output['scaled_test_data'].columns)
        
        if 'ROAS_TARGET' in independent_features:
            # append back non scaled ROAS_TARGET to the df if it is in independent features
            scaled_output['scaled_train_data'].insert(loc=ROAS_TARGET_index,
                                                    column='ROAS_TARGET',
                                                    value=train_data['ROAS_TARGET'])
            
            scaled_output['scaled_test_data'].insert(loc=ROAS_TARGET_index,
                                                    column='ROAS_TARGET',
                                                    value=train_data['ROAS_TARGET'])
        
    
        # append back non scaled target feature
        scaled_output['scaled_train_data'].insert(loc=scaled_output['scaled_train_data'].shape[1],
                                                  column=target_feature,
                                                  value=train_data[target_feature],
                                                  allow_duplicates=True)
        
        scaled_output['scaled_test_data'].insert(loc=scaled_output['scaled_test_data'].shape[1],
                                                  column=target_feature,
                                                  value=train_data[target_feature],
                                                  allow_duplicates=True)
        #print('\n Fetures of the scaled_train_data \n',scaled_output['scaled_train_data'].columns,scaled_output['scaled_train_data'].shape)
        #print('\n Fetures of the scaled_test_data \n',scaled_output['scaled_test_data'].columns, scaled_output['scaled_test_data'].shape)
        
        # get the dataframe of metric values associated with each n_step value
        metric_df_ = self.get_metrics_df(train_data=scaled_output['scaled_train_data'],
                                         test_data=scaled_output['scaled_test_data'],
                                         n_steps = n_steps)
        
        # get the best n_step value by sorting test_rmse column in ascending order
        best_n_steps = int(metric_df_.sort_values('test_rmse').iloc[0]['n_steps'])
        
        # use best_n_steps value from above and get output from tuned_lstm model on 80-20 split data
        model_output = self.tuned_lstm(train_data=scaled_output['scaled_train_data'],
                                       test_data=scaled_output['scaled_test_data'],
                                       n_steps= int(best_n_steps))
        
        ############## get trained model object from 100% data ##########################
        full_scaled_data = scaled_output['scaled_train_data'].append(scaled_output['scaled_test_data'])
        #print('shape of full_sclaed_data = ',full_scaled_data.shape)
        trained_model_on_full_data = self.lstm_model_training_full(full_scaled_data,best_n_steps)

        AD_GROUP_ID = df['AD_GROUP_ID'].iloc[0]
        KEYWORD_ID = df['KEYWORD_ID'].iloc[0]
        MATCH_TYPE = df['MATCH_TYPE'].iloc[0]
        
        # print AKM with RMSE, MAPE, SMAPE, R2 for train and test data
        print('\n',f'AD_GROUP_ID = {AD_GROUP_ID}, KEYWORD_ID = {KEYWORD_ID}, MATCH_TYPE = {MATCH_TYPE}','\n')
        print(f'Best n_steps: {int(best_n_steps)}')
        print('Train Score:  RMSE = %.2f, MAPE = %.2f, SMAPE = %.2f, R2 = %.2f' % (model_output['TRAIN_RMSE'], 
                                                                                   model_output['TRAIN_MAPE'],
                                                                                   model_output['TRAIN_SMAPE'],
                                                                                   model_output['train_R2']))
        print('Test Score: RMSE = %.2f, MAPE = %.2f, SMAPE = %.2f, R2 = %.2f' % (model_output['TEST_RMSE'], 
                                                                                 model_output['TEST_MAPE'],
                                                                                 model_output['TEST_SMAPE'],
                                                                                 model_output['test_R2']))
        
        print('\n','-'*100,'\n')
        
        # create dataframe of AKM values along with scaler object, best n_steps, RMSE, MAPE, SMAPE and R2 metrics
        metric_df = pd.DataFrame({'AD_GROUP_ID' : [AD_GROUP_ID], 
                       'KEYWORD_ID': [KEYWORD_ID], 
                       'MATCH_TYPE': [MATCH_TYPE],
                       'SCALER' : [scaled_output['scaler']],
                       'BEST_N_STEPS' : [best_n_steps],
                       'TRAIN_RMSE' : [model_output['TRAIN_RMSE']],
                       'TEST_RMSE' : [model_output['TEST_RMSE']],
                       'TRAIN_MAPE' : [model_output['TRAIN_MAPE']],
                       'TEST_MAPE' : [model_output['TEST_MAPE']],
                       'TRAIN_SMAPE' : [model_output['TRAIN_SMAPE']],
                       'TEST_SMAPE' : [model_output['TEST_SMAPE']],
                       'TRAIN_R2' : [model_output['train_R2']],
                       'TEST_R2' : [model_output['test_R2']]})
        
        # added on 19th october
        # this dataframe is temporory only for comparision with knn person
        df_metrics_model_comparision = pd.DataFrame({'akm' : str(AD_GROUP_ID)+'-'+str(KEYWORD_ID)+'-'+str(MATCH_TYPE),
                                                     'total_record_post_processed' : len(df),
                                                     'best_n_steps' : [best_n_steps],
                                                     'test_smape_lstm' : [model_output['TEST_SMAPE']],
                                                     'best_n_step_df_total_records' : model_output['train_smape_datalength'] + model_output['test_smape_datalength'],
                                                     'best_n_step_df_train_records' : model_output['train_smape_datalength'],
                                                     'best_n_step_df_test_records' :  model_output['test_smape_datalength']
                                                     })
        
        # create dataframe of the y_test real values and y_test predicted values
        y_test_and_pred = pd.DataFrame({f'{target_feature}_test' : model_output['Y_TEST'],
                                        f'{target_feature}_pred' : model_output['Y_TEST_PRED']})

        if serialize_model :
            #serialize model on 80 % data
            # onnx_output_names = self.keras_to_onnx_model(keras_model=model_output['MODEL'],
            #                                              best_n_steps=int(best_n_steps),
            #                                              n_features=(scaled_output['scaled_train_data'].shape[1])-1,
            #                                              root_path=root_path,
            #                                              AD_GROUP_ID=AD_GROUP_ID,
            #                                              KEYWORD_ID=KEYWORD_ID,
            #                                              MATCH_TYPE=MATCH_TYPE,
            #                                              target_feature=target_feature)
            
            # serialize model on 100% data
            onnx_output_names = self.keras_to_onnx_model(keras_model=trained_model_on_full_data,
                                                         best_n_steps=int(best_n_steps),
                                                         n_features=len(independent_features),
                                                         root_path=root_path,
                                                         AD_GROUP_ID=AD_GROUP_ID,
                                                         KEYWORD_ID=KEYWORD_ID,
                                                         MATCH_TYPE=MATCH_TYPE,
                                                         target_feature=target_feature)
            
        
        # append predicted result to the original dataframe
        output_df = self.append_pred_result(df,model_output['Y_PRED_FULL'],best_n_steps,target_feature)
        # append delta change
        out_df = self.rolling_diff(output_df,target_feature,best_n_steps)
        
        return out_df, y_test_and_pred, metric_df, df_metrics_model_comparision,onnx_output_names # added on 20th october

    def model_1(self):
        '''
        Returns:
            It will use tsa_model() method to train model by using model_1 independent_feature and target feature 
            and return dataframe with prediction appended to it, y_test and y_test_pred dataframe and metrics df. 
        '''
        try:
            # create copy of the input dataframe
            df = self.df.copy()
            # make REPORT_DATE as datetime dtype
            df["REPORT_DATE"]= pd.to_datetime(df["REPORT_DATE"],yearfirst=True)
            # sort dataframe by report_date
            df = df.sort_values(by='REPORT_DATE').reset_index(drop=True)
            
            inference_date = df.iloc[-1]['REPORT_DATE'] + pd.Timedelta(days=1) # this is just for recording training inference date as statistic
            independent_features = self.model_1_independent_features.copy()    # make a copy of the independent features list
            
            # if ROAS_TARGET is in independet features then create and assign ROAS_TARGET feature to the dataframe
            # if 'ROAS_TARGET' in independent_features:
            #     # create ROAS feature
            #     df['ROAS_TARGET'] = np.log(((df['SALES_USD']+1)/(df['SPEND_USD']+1)))
            
            
            target_feature = self.model_1_target_feature
            n_steps = self.n_steps
            train_size = self.train_size
            root_path = self.root_path 
            #print('\n','MODEL 1 Independent Features :',independent_features)
            
           
            model_1_output, model1_test_and_pred, model_1_metric_df,df_metrics_model_comparision,onnx_output_names = self.tsa_model(df=df,
                                                                                                                    independent_features = independent_features,
                                                                                                                    target_feature = target_feature, 
                                                                                                                    n_steps = n_steps,
                                                                                                                    train_size = train_size,
                                                                                                                    root_path = root_path,
                                                                                                                    serialize_model = True)
            
            # serialize akm and best n_steps from each model
            self.serialize_akm_metadata_model_1(root_path,model_1_output,model_1_metric_df,independent_features,inference_date,onnx_output_names)
        
            #return model_1_output, model1_test_and_pred, model_1_metric_df
            return model_1_output, model1_test_and_pred, model_1_metric_df, df_metrics_model_comparision # added on 19th october for comparision

        except:
            AD_GROUP_ID = df.iloc[0]['AD_GROUP_ID']
            KEYWORD_ID = df.iloc[0]['KEYWORD_ID']
            MATCH_TYPE = df.iloc[0]['MATCH_TYPE']
            print('*'*100)
            print('\n',f'{target_feature}_MODEL training failed for AD_GROUP_ID = {AD_GROUP_ID}, KEYWORD_ID = {KEYWORD_ID}, MATCH_TYPE = {MATCH_TYPE}','\n')
            print(traceback.format_exc())
            print('*'*100)
        
       
    def main(self):
        try:
            bid_model_output, bid_model_test_and_pred, metric_df, df_metrics_model_comparision = self.model_1()
            return bid_model_output, bid_model_test_and_pred, metric_df, df_metrics_model_comparision
        except:
            print(traceback.format_exc())

# Train model for multiple AKMs

In [4]:
def run_lstm_training(input_path,akm_to_model_path,independent_features,target_feature,train_size,root_path,n_steps):
    '''
    Args:
        input_path -> input file path with all akm
        akm_to_model_path -> Csv of the AD_GROUP_ID, KEYWORD_ID and MATCH_TYPE to train LSTM models only
        independent_features -> Independent feature list
        target_feature -> target feature name
        train_size -> train size (eg. 0.70)
        root_path -> Path to store LSTM model
        n_steps -> tupe of n_steps (eg. (1,2))
    Returns:
        This method is used to run training for n number of akm and return dataframes .
    '''
    df = pd.read_csv(input_path,parse_dates=['REPORT_DATE']) # read full csv
    df_akm_to_model = pd.read_csv(akm_to_model_path)       # read akm_to_model csv

    def get_records_with_akm_combinations(df, AD_GROUP_ID, KEYWORD_ID, MATCH_TYPE):
        df = df.loc[(df['AD_GROUP_ID'] == AD_GROUP_ID) & (df['KEYWORD_ID'] == KEYWORD_ID) & 
                    (df['MATCH_TYPE'] == MATCH_TYPE)].reset_index(drop=True)
        return df
    
    def isEnoughData(df,column):
        for col in column:
            distcnt = np.count_nonzero(df[col], axis=0)
            if (distcnt < 10):
                return False
        return True

    # # this function will remove rows which have all zeros/NaN for given feature list
    def data_sufficiency_check(df,features):
        df = df.loc[(df[features] != 0).any(axis=1)].dropna(subset=features).reset_index(drop=True)
        return df
    
    insuff_akm_count = 0
    failed_count = 0
    comparision_metric_df = pd.DataFrame()
    bid_model_metric_df = pd.DataFrame()

    for idx, row in df_akm_to_model.iterrows():
        try:
            print(f'\n ************************** Iteration {idx+1} ************************************ \n')
            AD_GROUP_ID, KEYWORD_ID, MATCH_TYPE = row['AD_GROUP_ID'], row['KEYWORD_ID'], row['MATCH_TYPE']
            akm = str(AD_GROUP_ID)+'-'+str(KEYWORD_ID)+'-'+str(MATCH_TYPE)
            # testakm = '157188419817864-177620140515274-PHRASE'
            # if (akm != testakm):
            #     continue
            df_akm = get_records_with_akm_combinations(df,
                                                    AD_GROUP_ID =AD_GROUP_ID,
                                                    KEYWORD_ID=KEYWORD_ID,
                                                    MATCH_TYPE=MATCH_TYPE)
            
            if 'ROAS_TARGET' in independent_features:
                df_akm['ROAS_TARGET'] = np.log(((df_akm['SALES_USD']+1)/(df_akm['SPEND_USD']+1)))
                    
            df_akm = data_sufficiency_check(df_akm,independent_features+[target_feature]) # df_akm is processed and it will be used further

            print('df_akm shape = ', df_akm.shape, 'isEnoughData flag = ',isEnoughData(df_akm,[target_feature]))
        
            if not isEnoughData(df_akm,[target_feature]):
                # print('*'*100)
                print(f'\n ***** Insufficient data for AD_GROUP_ID = {AD_GROUP_ID},KEYWORD_ID = {KEYWORD_ID}, MATCH_TYPE = {MATCH_TYPE} ***** \n')
                # print('*'*100)
                insuff_akm_count+=1
                continue

            if (len(df_akm) <= 10):
                # print('*'*100)
                print(f'\n ***** Insufficient data for AD_GROUP_ID = {AD_GROUP_ID},KEYWORD_ID = {KEYWORD_ID}, MATCH_TYPE = {MATCH_TYPE} ***** \n')
                # print('*'*100)
                insuff_akm_count+=1
                continue

            lstm = LSTM_MODEL(df = df_akm,
                        model_1_independent_features = independent_features,
                        model_1_target_feature = target_feature,
                        n_steps = n_steps,
                        train_size = train_size,
                        root_path = root_path)
            
            bid_model_output, bid_model_test_and_pred, bid_model_metric_df, df_metrics_model_comparision = lstm.main()
            comparision_metric_df = comparision_metric_df.append(df_metrics_model_comparision)
            bid_model_metric_df = bid_model_metric_df.append(bid_model_metric_df)

        except:
            failed_count+=1
            print('*'*100)
            print('\n',f'Training failed for AD_GROUP_ID = {AD_GROUP_ID}, KEYWORD_ID = {KEYWORD_ID}, MATCH_TYPE = {MATCH_TYPE}','\n')
            print(traceback.format_exc())
            print('*'*100)
            pass

    print(f'\n ***** Total skipped AKM combination due to insufficient data = {insuff_akm_count} ***** \n')
    print(f'\n ***** Total Failed AKM combination = {failed_count} ***** \n')

    return bid_model_output, bid_model_test_and_pred, bid_model_metric_df, comparision_metric_df.reset_index(drop=True)

In [5]:
input_path='akmtrainingdata.csv'
akm_to_model_path = 'akm.csv'
train_size = 0.80
root_path = 'LSTMModels'
n_steps=(1,15)
independent_features = ['CPC_USD','SALES_USD','CONVERSIONS','CLICKS','ROAS_TARGET','IMPRESSIONS','SPEND_USD']
target_feature='CPC_USD'

In [6]:
bid_model_output, bid_model_test_and_pred, bid_model_metric_df,comparision_metric_df  = run_lstm_training(input_path=input_path,
                                                                                                  akm_to_model_path=akm_to_model_path,
                                                                                                  independent_features=independent_features,
                                                                                                  target_feature=target_feature,
                                                                                                  train_size=train_size,
                                                                                                  root_path=root_path,
                                                                                                  n_steps=n_steps)


 ************************** Iteration 1 ************************************ 

df_akm shape =  (100, 56) isEnoughData flag =  True


2022-11-15 19:11:52.231648: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-11-15 19:11:52.231701: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-11-15 19:11:52.231728: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (beejay-ThinkPad): /proc/driver/nvidia/version does not exist
2022-11-15 19:11:52.232031: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.



 AD_GROUP_ID = 157188419817864, KEYWORD_ID = 177620140515274, MATCH_TYPE = PHRASE 

Best n_steps: 13
Train Score:  RMSE = 0.24, MAPE = 0.07, SMAPE = 7.43, R2 = 0.53
Test Score: RMSE = 0.55, MAPE = 0.19, SMAPE = 17.05, R2 = -197.94

 ---------------------------------------------------------------------------------------------------- 

Instructions for updating:
Use `tf.compat.v1.graph_util.extract_sub_graph`


2022-11-15 19:16:04.065598: I tensorflow/core/grappler/devices.cc:66] Number of eligible GPUs (core count >= 8, compute capability >= 0.0): 0
2022-11-15 19:16:04.065749: I tensorflow/core/grappler/clusters/single_machine.cc:358] Starting new session
2022-11-15 19:16:04.076451: I tensorflow/core/grappler/optimizers/meta_optimizer.cc:1164] Optimization results for grappler item: graph_to_optimize
  function_optimizer: Graph size after: 140 nodes (0), 171 edges (0), time = 1.74ms.
  function_optimizer: Graph size after: 140 nodes (0), 171 edges (0), time = 1.85ms.
Optimization results for grappler item: sequential_16_lstm_33_while_cond_239620
  function_optimizer: function_optimizer did nothing. time = 0.006ms.
  function_optimizer: function_optimizer did nothing. time = 0.001ms.
Optimization results for grappler item: sequential_16_lstm_32_while_body_239481
  function_optimizer: function_optimizer did nothing. time = 0.003ms.
  function_optimizer: function_optimizer did nothing. time = 0

AKM metadata Dictionary saved at LSTMModels/20221115_157188419817864_177620140515274_PHRASE_CPC_USD_LSTMModel

 ***** Total skipped AKM combination due to insufficient data = 0 ***** 


 ***** Total Failed AKM combination = 0 ***** 

