## Install Packages

In [None]:
import subprocess
import sys

def install(package):
    """Install ``package`` with pip for the interpreter running this kernel.

    Args:
        package: Requirement specifier handed to ``pip install`` as a single
            argument (for example ``"s3fs==2022.01.0"``).

    Raises:
        subprocess.CalledProcessError: If the pip process exits non-zero.
    """
    # Use the kernel's own interpreter so the package lands in its environment.
    pip_command = [sys.executable, "-m", "pip", "install"]
    pip_command.append(package)
    subprocess.check_call(pip_command)

#install("awscli==1.22.3")
#install("s3fs==2022.01.0")

## Import Libraries

In [2]:
import s3fs
import pandas as pd
import numpy as np 
import pickle
import sys
import os 
from typing import List
from typing import Optional 
import re
import pyarrow.parquet as pq
import pyarrow as pa

import xgboost  
from tensorflow.keras.models import load_model 
import warnings
warnings.filterwarnings("ignore")

2022-03-28 04:20:46.206209: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1


## Test Code Block

In [9]:
# Ad-hoc check that both buckets can be listed with the given credentials.
# Only the value of the last expression is rendered as the cell output, so the
# first listing is executed but not displayed.
fs = s3fs.S3FileSystem(anon=False, key='****', secret='****')
fs.ls("s3://zigbang-data/stage/dw_member_loyalty_inference_result/tmpdata")
fs.ls("s3://zigbang-mlops/loyalty")

['zigbang-mlops/loyalty/',
 'zigbang-mlops/loyalty/mean.npy',
 'zigbang-mlops/loyalty/model.json',
 'zigbang-mlops/loyalty/model_logit.pkl',
 'zigbang-mlops/loyalty/model_nn.h5',
 'zigbang-mlops/loyalty/model_xgb.json',
 'zigbang-mlops/loyalty/std.npy']

## Pipeline Parameters

In [None]:
# Parameter settings required for running a pipeline

# NOTE(review): the credentials are hard-coded (masked here); consider
# injecting them from the environment or a secret store instead.
AWS_KEY = "*****"
AWS_SECRET = "*****"
BASE_DATE = "2022-04-05"  # searched for in the feature partition paths to pick the batch
TARGET_DB = "user"  # path segment placed between the bucket URI and the table folder

## Define Functions

In [4]:
def timer(fn):
    """Decorator that prints how long each call to ``fn`` takes.

    The wrapped callable keeps ``fn``'s name and docstring, forwards all
    positional and keyword arguments unchanged and returns ``fn``'s result.
    The timing line is printed only when ``fn`` returns normally.

    Args:
        fn: Callable to time.

    Returns:
        A wrapper with the same call signature as ``fn``.
    """
    from functools import wraps
    from time import perf_counter

    @wraps(fn)  # keep __name__/__doc__ so decorated methods stay introspectable
    def inner(*args, **kwargs):
        start_time = perf_counter()
        to_execute = fn(*args, **kwargs)
        execution_time = perf_counter() - start_time
        print('{0} took {1:.8f}s to execute'.format(fn.__name__, execution_time))
        return to_execute

    return inner

In [5]:
class BatchInput:
    """Parameters describing one batch-inference run.

    Both fields may be passed to the constructor or assigned afterwards;
    ``BatchInput()`` with no arguments remains valid and leaves them ``None``
    until they are set.

    Args:
        base_date: Date string used to select the feature partition.
        target_db: Path segment inserted between the base S3 URI and the
            table folder.
    """
    base_date: Optional[str]
    target_db: Optional[str]

    def __init__(self, base_date: Optional[str] = None, target_db: Optional[str] = None) -> None:
        self.base_date = base_date
        self.target_db = target_db

In [6]:
class LoyaltyClassifier:
    """Batch scorer that averages three pre-trained models over the loyalty features.

    Creating an instance downloads the model artefacts from S3 into the local
    ``model/`` directory and loads them (an XGBoost booster, a pickled model
    and a Keras network, plus the mean/std arrays used for standardisation).
    ``batch_classify`` then scores the feature partition matching a base date.

    S3 access uses the module-level ``AWS_KEY`` and ``AWS_SECRET`` values.
    """
    _CONST_MODEL_BUCKET_PATH = "s3://zigbang-mlops/loyalty"
    _CONST_MODEL_XGB_PATH = "model/model_xgb.json"
    _CONST_MODEL_LR_PATH = "model/model_logit.pkl"
    _CONST_MODEL_NN_PATH = "model/model_nn.h5"
    _CONST_MODEL_MEAN_PATH = "model/mean.npy"
    _CONST_MODEL_STD_PATH = "model/std.npy"

    _CONST_BASE_S3_URI = "s3://zigbang-data/"
    _CONST_DATA_PATH = "/dw_member_loyalty_feature/"
    _CONST_OUTPUT_PATH = "/dw_member_loyalty_inference_result"

    def __init__(self) -> None:
        """Download the model artefacts from S3 and load them into memory."""
        self._s3_model_loader()
        self.models, self.mean, self.std = self._load_model()

    def _s3_filesystem(self):
        """Return an authenticated S3 filesystem built from the notebook credentials."""
        return s3fs.S3FileSystem(anon=False, key=AWS_KEY, secret=AWS_SECRET)

    @timer
    def batch_classify(self, input):
        """Score every row of the feature partition selected by ``input``.

        Args:
            input: Object exposing ``base_date`` and ``target_db`` attributes.

        Returns:
            DataFrame with ``user_no``, the standardised features ``f1``..``f18``,
            ``call`` (1 where ``call_cnt`` > 0, else 0), ``prob`` (mean of the
            three model outputs), ``prob_xgb``, ``prob_lr``, ``prob_nn`` and
            ``roll_period``.

        Raises:
            FileNotFoundError: If no feature partition matches ``input.base_date``.
        """
        df = self._s3_data_loader(input)
        feature_list = [f"f{i}" for i in range(1, 19)]

        user_no = df["user_no"]
        y_pred = np.where(df["call_cnt"] > 0, 1, 0)

        # standardization (builds a new frame instead of mutating a column slice)
        X_pred = (df[feature_list] - self.mean) / self.std

        d_pred = xgboost.DMatrix(X_pred, label=y_pred)
        prob_xgb = np.asarray(self.models['xgb'].predict(d_pred))
        prob_lr = np.asarray(self.models['lr'].predict(X_pred))
        prob_nn = np.asarray(self.models['nn'].predict(X_pred)).flatten()

        # Give every derived column the frame's own index: concat(axis=1)
        # aligns on index, so a default RangeIndex would misalign the rows
        # whenever the loaded frame is not indexed 0..n-1.
        row_index = df.index
        result = pd.concat(
            [
                user_no,
                X_pred,
                pd.Series(y_pred, index=row_index, name="call"),
                pd.Series((prob_xgb + prob_lr + prob_nn) / 3, index=row_index, name="prob"),
                pd.Series(prob_xgb, index=row_index, name="prob_xgb"),
                pd.Series(prob_lr, index=row_index, name="prob_lr"),
                pd.Series(prob_nn, index=row_index, name="prob_nn"),
                df["roll_period"],
            ],
            axis=1,
        )
        # add base date on dag - task2
        #result['base_date'] = pd.to_datetime(input.base_date).strftime("%Y-%m-%d")

        return result

    @timer
    def _load_model(self):
        """Load the downloaded artefacts from the local ``model/`` paths.

        Returns:
            Tuple ``(models, mean, std)`` where ``models`` maps ``'xgb'``,
            ``'lr'`` and ``'nn'`` to the loaded model objects and ``mean`` /
            ``std`` are the arrays read from the ``.npy`` files.
        """
        models = {}
        model_xgb = xgboost.Booster()
        model_xgb.load_model(self._CONST_MODEL_XGB_PATH)
        models['xgb'] = model_xgb

        # NOTE(review): unpickling executes arbitrary code; this is only safe
        # while the model bucket is trusted.
        with open(self._CONST_MODEL_LR_PATH, "rb") as model_file:
            models['lr'] = pickle.load(model_file)

        models['nn'] = load_model(self._CONST_MODEL_NN_PATH)

        mean = np.load(self._CONST_MODEL_MEAN_PATH)
        std = np.load(self._CONST_MODEL_STD_PATH)
        return models, mean, std

    @timer
    def _s3_model_loader(self):
        """Download every model artefact from the model bucket to its local path."""
        fs = self._s3_filesystem()
        # The listing result is not used; presumably kept as a reachability check.
        fs.ls(self._CONST_MODEL_BUCKET_PATH)

        local_paths = (
            self._CONST_MODEL_XGB_PATH,
            self._CONST_MODEL_LR_PATH,
            self._CONST_MODEL_NN_PATH,
            self._CONST_MODEL_MEAN_PATH,
            self._CONST_MODEL_STD_PATH,
        )
        for local_path in local_paths:
            local_dir = os.path.dirname(local_path)
            if local_dir:
                os.makedirs(local_dir, exist_ok=True)
            # Remote objects carry the same file name as their local copy.
            remote_path = f"{self._CONST_MODEL_BUCKET_PATH}/{os.path.basename(local_path)}"
            fs.download(remote_path, local_path)

    @timer
    def _s3_data_loader(self, input):
        """Read the feature parquet file for ``input.base_date`` into a DataFrame.

        Lists the feature table folder of ``input.target_db``, keeps the entries
        whose path matches ``input.base_date`` and reads the first object of the
        last matching entry.

        Raises:
            FileNotFoundError: If no entry matches ``input.base_date`` or the
                matching entries contain no objects.
        """
        fs = self._s3_filesystem()
        target_url = self._CONST_BASE_S3_URI + input.target_db + self._CONST_DATA_PATH
        print("target url: ", target_url)
        batch_data = fs.ls(target_url, refresh=True)

        data = ''
        for batch in batch_data:
            if re.search(input.base_date, batch):
                partition_files = fs.ls(f's3://{batch}', refresh=True)
                if partition_files:
                    data = partition_files[0]

        if not data:
            # Fail here with a clear message instead of trying to read "s3://".
            raise FileNotFoundError(
                f"no feature data found for base_date {input.base_date!r} under {target_url}"
            )

        bucket_uri = f's3://{data}'
        df = pq.ParquetDataset(bucket_uri, filesystem=fs).read_pandas().to_pandas()

        return df

## Batch Inference Step

In [7]:
# Package the pipeline parameters into the input object the classifier reads.
batch_input = BatchInput()
batch_input.base_date = BASE_DATE
batch_input.target_db = TARGET_DB

# Constructing the classifier downloads the model files from S3 and loads them.
classifier = LoyaltyClassifier()

 
# Load the feature data for the configured base date and score it.
result = classifier.batch_classify(batch_input) 

_s3_model_loader took 1.08582731s to execute


2022-03-28 04:21:00.624090: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcuda.so.1
2022-03-28 04:21:00.624126: E tensorflow/stream_executor/cuda/cuda_driver.cc:314] failed call to cuInit: UNKNOWN ERROR (-1)
2022-03-28 04:21:00.624142: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (kale8g-0): /proc/driver/nvidia/version does not exist
2022-03-28 04:21:00.624369: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-03-28 04:21:00.662494: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2499995000 Hz
2022-03-28 04:21:00.662788: I tensorflow/compiler/xla/service/service.c

_load_model took 0.37627557s to execute
target url:  s3://zigbang-data/stage/dw_member_loyalty_feature/
_s3_data_loader took 2.40398365s to execute


2022-03-28 04:21:28.176477: W tensorflow/core/framework/cpu_allocator_impl.cc:81] Allocation of 390602448 exceeds 10% of free system memory.


batch_classify took 65.71268230s to execute


In [9]:
# Display the inference output as the cell result.
result

Unnamed: 0,user_no,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f15,f16,f17,f18,call,prob,prob_xgb,prob_lr,prob_nn,roll_period
0,10000887,-0.117087,-0.163458,-0.301380,0.020191,-0.375167,-0.106524,-0.204909,-0.160836,-0.266485,...,-0.343018,-0.526745,0.235529,-0.200286,0,0.142203,0.023167,0.334836,0.068605,30
1,10005768,-0.117087,-0.163458,-0.301380,-0.292318,-0.487006,-0.106524,-0.204909,-0.160836,-0.266485,...,-0.548881,-0.465996,-0.459681,-0.200286,0,0.138044,0.008374,0.374444,0.031312,30
2,10008430,-0.117087,-0.163458,0.387632,0.020191,-0.487006,0.980519,-0.204909,-0.160836,0.102615,...,0.018925,0.258135,0.533476,0.484647,1,0.511797,0.413050,0.671630,0.450711,30
3,10009709,-0.117087,-0.163458,-0.301380,-0.292318,-0.487006,-0.106524,-0.204909,-0.285277,-0.266485,...,-0.706306,-0.647029,-0.459681,-0.200286,0,0.125228,0.006059,0.360146,0.009480,30
4,10010187,-0.117087,-0.163458,-0.301380,-0.292318,-0.598846,-0.106524,-0.204909,-0.285277,-0.266485,...,-0.921588,-0.729648,-0.459681,-0.200286,0,0.134131,0.009366,0.363075,0.029952,30
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2712512,9983054,-0.117087,-0.163458,-0.301380,-0.292318,-0.598846,-0.106524,-0.204909,-0.285277,-0.266485,...,-1.015774,-0.863296,-0.459681,-0.200286,0,0.126174,0.007833,0.350850,0.019839,7
2712513,9987343,-0.117087,-0.163458,-0.301380,-0.292318,-0.487006,0.436997,0.023918,-0.223057,-0.081935,...,-0.445277,-0.600860,-0.459681,-0.200286,0,0.138394,0.022314,0.345934,0.046932,7
2712514,9988464,0.086430,-0.163458,-0.301380,-0.292318,-0.598846,-0.106524,-0.204909,-0.285277,0.287165,...,-0.963299,-0.772172,-0.261049,-0.200286,0,0.170383,0.092186,0.354473,0.064492,7
2712515,9989063,-0.117087,-0.163458,-0.301380,-0.292318,-0.598846,-0.106524,-0.204909,-0.223057,-0.266485,...,-0.957917,-0.842034,-0.459681,-0.200286,0,0.125945,0.009708,0.350088,0.018038,7


## Upload parquet temp table to S3

In [10]:
# Convert the inference result into an Arrow table for the parquet writer.
table = pa.Table.from_pandas(result)

print("uploading tmp table...")
fs = s3fs.S3FileSystem(anon=False, key=AWS_KEY, secret=AWS_SECRET)
tmp_path = "s3://zigbang-data/" + batch_input.target_db + "/dw_member_loyalty_inference_result/tmpdata"

# temp file clean up
# A prefix that does not exist yet (first run) makes ls raise; treat it as empty.
# refresh=True bypasses any cached listing of this prefix.
try:
    stale_files = fs.ls(tmp_path, refresh=True)
except FileNotFoundError:
    stale_files = []
if stale_files:
    print("clean up tmp files...")
    # rm's second parameter is the boolean `recursive`, not a shell-style flag.
    fs.rm(tmp_path, recursive=True)
print("writing tmp table...")
pq.write_to_dataset(table=table, root_path=tmp_path, filesystem=fs)
print("done")

uploading tmp table...
clean up tmp files...
writing tmp table...
done
