  
#   1  Model Deployment


 
##   2  Objective and Function Description

The objective of this notebook is to develop deployment API for the best performing ML model.   The functions are
-  process_data
-  regr_metric

### process_data(in_point)    
__input__ : csv file with sample  data point(s).   
__output__ : predictions for the input data.     
__processing__: This function takes the input data sample and produces prediction for the input data.The input data is pre-processed to handle nulls.Then helper functions perform feature engineering to get additional features. These features undergo feature transformation. The transformed features are used to compute the final prediction.    
__display__: The functions has display for process time 

### regr_metric(test_sampl,y_pred)
__input__ : input data ,model preiction.   
__output__ : model metrics  mean_squared_error and mean_absolute_error.     
__processing__: This function takes input data sample and model prediction for this data.Using these two inputs this function computes the metrics for the regression model like mean_squared_error and mean_absolute_error.    
__display__:   mean_squared_error , mean_absolute_error

 
##   3  Create input data point

In [1]:
import pandas as pd
import json
import seaborn as sns#Plots
import matplotlib.pylab as plt
import numpy as np
from scipy import stats
from datetime import datetime, timedelta
import time
# Hyperparameter tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import lightgbm as lgb
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn import metrics 
from sklearn.externals import joblib
# Import label encoder
from sklearn.preprocessing import LabelEncoder
import warnings
warnings.filterwarnings('ignore')

In [21]:
%%time
# extract single data point from data
test_clean = pd.read_csv('test_clean.csv',converters={"fullVisitorId": str})

CPU times: user 1.53 s, sys: 89.1 ms, total: 1.62 s
Wall time: 1.62 s


In [22]:
in_point = test_clean.head(1)

In [23]:
# save data to file 
in_point.to_csv('in_point.csv',index=False)

In [24]:
in_point

Unnamed: 0,channelGrouping,date,fullVisitorId,visitId,visitNumber,visitStartTime,device.browser,device.operatingSystem,device.isMobile,device.deviceCategory,...,totals.timeOnSite,totals.sessionQualityDim,totals.transactions,totals.transactionRevenue,totals.totalTransactionRevenue,trafficSource.referralPath,trafficSource.campaign,trafficSource.source,trafficSource.medium,trafficSource.keyword
0,Organic Search,20180511,7460955084541987166,1526099341,2,1526099341,Chrome,Android,True,mobile,...,973.0,1,,,,(not set),(not set),google,organic,(not provided)


##   4 process_data()

##   4.1  helper functions for process_data()

In [26]:
def transform_cat(indata):
    '''
    encode categorical features
    '''
    # categorical features
    cat_cols =  ['channelGrouping', 'device.browser', 'device.operatingSystem',
                 'device.deviceCategory',  'geoNetwork.continent', 'geoNetwork.subContinent',
                 'geoNetwork.country',  'geoNetwork.region',  'geoNetwork.metro',
                 'geoNetwork.city', 'geoNetwork.networkDomain', 'trafficSource.campaign',
                 'trafficSource.source',  'trafficSource.medium', 'trafficSource.keyword',
                 'trafficSource.referralPath',
                 'browser_category', 'browser_os','source_country',
                 'channelGrouping_browser','channelGrouping_OS']
    for col in cat_cols: 
        labelencoder = LabelEncoder()
        indata[col] = labelencoder.fit_transform(indata[col].values.astype('str' ))

In [27]:
#https://www.kaggle.com/robikscube/tutorial-time-series-forecasting-with-xgboost
def transform_date_int(indata):
    """
    Create temporal features from date fields
    """
    indata['date'] = pd.to_datetime(indata['date'])
    indata['dayofweek'] = indata['date'].dt.dayofweek 
    indata['quarter'] = indata['date'].dt.quarter 
    indata['month'] = indata['date'].dt.month 
    indata['year'] = indata['date'].dt.year 
    indata['dayofyear'] = indata['date'].dt.dayofyear 
    indata['dayofmonth'] = indata['date'].dt.day 
    indata['weekofyear'] = indata['date'].dt.isocalendar().week.astype(float)
    #features for visitStartTime
    indata['vis_date'] = pd.to_datetime(indata['visitStartTime'], unit='s')
    indata['sess_date_hours'] = indata['vis_date'].dt.hour 
    #https://www.kaggle.com/ashishpatel26/permutation-importance-feature-imp-measure-gacrp/notebook
    indata['hits_per_day']   = indata.groupby('dayofyear')['totals.hits'].transform('nunique') 
    indata['hits_per_month'] = indata.groupby('month')['totals.hits'].transform('nunique') 
    indata['hits_per_dom'] = indata.groupby('dayofmonth')['totals.hits'].transform('nunique') 
    indata['hits_per_dow'] = indata.groupby('dayofweek')['totals.hits'].transform('nunique') 
    indata['pageviews_per_day'] = indata.groupby('dayofyear')['totals.pageviews'].transform('nunique') 
    indata['pageviews_per_month'] = indata.groupby('month')['totals.pageviews'].transform('nunique') 
    indata['pageviews_per_dom'] = indata.groupby('dayofmonth')['totals.pageviews'].transform('nunique') 
    indata['pageviews_per_dow'] = indata.groupby('dayofweek')['totals.pageviews'].transform('nunique') 
    indata['month_unique_user_count'] = indata.groupby('month')['fullVisitorId'].transform('nunique')
    indata['day_unique_user_count'] = indata.groupby('dayofyear')['fullVisitorId'].transform('nunique')
    indata['weekday_unique_user_count'] = indata.groupby('dayofweek')['fullVisitorId'].transform('nunique')
    indata['monthday_unique_user_count'] = indata.groupby('dayofmonth')['fullVisitorId'].transform('nunique') 
    indata['browser_category'] = indata['device.browser'] + '_' + indata['device.deviceCategory']
    indata['browser_os'] = indata['device.browser'] + '_' + indata['device.operatingSystem']
    indata['source_country'] = indata['trafficSource.source'] + '_' + indata['geoNetwork.country']
    indata['channelGrouping_browser'] = indata['device.browser'] + "_" + indata['channelGrouping']
    indata['channelGrouping_OS'] = indata['device.operatingSystem'] + "_" + indata['channelGrouping']
    indata['tran_per_day']   = indata.groupby('dayofyear')['totals.transactions'].transform('nunique') 
    indata['tran_per_month'] = indata.groupby('month')['totals.transactions'].transform('nunique') 
    indata['tran_per_dom'] = indata.groupby('dayofmonth')['totals.transactions'].transform('nunique') 
    indata['tran_per_dow'] = indata.groupby('dayofweek')['totals.transactions'].transform('nunique') 
    indata['timeOnSite_per_day']   = indata.groupby('dayofyear')['totals.timeOnSite'].transform('nunique') 
    indata['timeOnSite_per_month'] = indata.groupby('month')['totals.timeOnSite'].transform('nunique') 
    indata['timeOnSite_per_dom'] = indata.groupby('dayofmonth')['totals.timeOnSite'].transform('nunique') 
    indata['timeOnSite_per_dow'] = indata.groupby('dayofweek')['totals.timeOnSite'].transform('nunique') 
    
    #convert int to float to avoid truncations
    for col in indata.columns:
        if indata[col].dtype == 'int64':   
           indata[col] = indata[col].astype(float)
    return indata

In [28]:
def data_transform(indata):
    '''
    transform user level features
    '''
    #https://stackoverflow.com/questions/19078325/naming-returned-columns-in-pandas-aggregate-function
    def diff(x):
      #get difference between max & min date  
      time_d =  x.max() -x.min() 
      return float(time_d.days)
    #https://stackoverflow.com/questions/17071871/how-do-i-select-rows-from-a-dataframe-based-on-column-values  
    def vcount(x):
      #get count of rows for visitStartTime
      return len(indata.loc[indata['visitStartTime'].isin(x)])
    #
    date_min= min(indata['date'])
    date_max= max(indata['date'])   
    # difference between preiod start and first visit
    def min_diff(x):
      time_d= x.min() - date_min
      return float(time_d.days)
    # difference between preiod end and last visit
    def max_diff(x):
      time_d=  date_max - x.max()
      return float(time_d.days)
        

    dfg=indata.groupby('fullVisitorId').agg({ 
    'channelGrouping'             : ['max'],              
    'device.browser'              : 'max',                   
    'device.operatingSystem'      : 'max',                     
    'device.isMobile'             : 'max',      
    'device.deviceCategory'       : 'max',                      
    'geoNetwork.continent'        : 'max',                      
    'geoNetwork.subContinent'     : 'max',                        
    'geoNetwork.country'          : 'max',                       
    'geoNetwork.region'           : 'max',                        
    'geoNetwork.metro'            : 'max',                          
    'geoNetwork.city'             : 'max',                         
    'geoNetwork.networkDomain'    : 'max',                          
    'trafficSource.campaign'      : 'max',                         
    'trafficSource.source'        : 'max',                          
    'trafficSource.medium'        : 'max',                         
    'trafficSource.keyword'       : 'max',                          
    'trafficSource.referralPath'  : 'max',                         
    'date'                      : [('_diff', diff),('_min',min_diff),('_max',max_diff)], 
    'visitStartTime'            : [('_count', vcount)],                    
    'totals.hits'               : [ 'max', 'min', 'sum','mean'],    
    'totals.pageviews'          : ['max', 'min', 'sum','mean'],
    'totals.timeOnSite'         : ['max', 'min', 'sum','mean'], 
    'totals.sessionQualityDim'  : 'max',
    'totals.transactions'       : 'sum',                
    'totals.transactionRevenue' : 'sum',               
    'totals.totalTransactionRevenue' :  'sum',
    'dayofweek'                 :  [ 'max', 'min'],                
    'quarter'                   :  [ 'max', 'min'],                                
    'month'                     :  [ 'max', 'min'],                                   
    'year'                      :  [ 'max', 'min'],                                 
    'dayofyear'                 :  [ 'max', 'min'],                                 
    'dayofmonth'                :  [ 'max', 'min'],                                   
    'weekofyear'                :  [ 'max', 'min'],      
    'sess_date_hours'           :  [ 'max', 'min', 'sum','mean'],
    'hits_per_day'              :  [ 'max', 'min', 'sum','mean'],   
    'hits_per_month'            :  [ 'max', 'min', 'sum','mean'],    
    'hits_per_dom'              :  [ 'max', 'min', 'sum','mean'],    
    'hits_per_dow'              :  [ 'max', 'min', 'sum','mean'],    
    'pageviews_per_day'         :  [ 'max', 'min', 'sum','mean'], 
    'pageviews_per_month'       :  [ 'max', 'min', 'sum','mean'], 
    'pageviews_per_dom'         :  [ 'max', 'min', 'sum','mean'], 
    'pageviews_per_dow'         :  [ 'max', 'min', 'sum','mean'], 
    'month_unique_user_count'   :  [ 'max', 'min', 'sum','mean'],    
    'day_unique_user_count'     :  [ 'max', 'min', 'sum','mean'],    
    'weekday_unique_user_count' :  [ 'max', 'min', 'sum','mean'],    
    'monthday_unique_user_count' :  [ 'max', 'min', 'sum','mean'],    
    'browser_category'          : 'max', 
    'browser_os'                : 'max',
    'source_country'            : 'max',   
    'channelGrouping_browser'   : 'max',   
    'channelGrouping_OS'        : 'max',   
    'tran_per_day'              :  [ 'max', 'min', 'sum','mean'],   
    'tran_per_month'            :  [ 'max', 'min', 'sum','mean'],  
    'tran_per_dom'              :  [ 'max', 'min', 'sum','mean'],  
    'tran_per_dow'              :  [ 'max', 'min', 'sum','mean'],  
    'timeOnSite_per_day'        :  [ 'max', 'min', 'sum','mean'],   
    'timeOnSite_per_month'      :  [ 'max', 'min', 'sum','mean'],  
    'timeOnSite_per_dom'        :  [ 'max', 'min', 'sum','mean'],  
    'timeOnSite_per_dow'        :  [ 'max', 'min', 'sum','mean']  
     
        
    })
    #rename column by appending the aggregate name
    dfg.columns = ["_".join(x) for x in dfg.columns.ravel()]
    return dfg

##    4.2  process_data() function 

In [29]:
def process_data(test_data):
    '''
    process input query point and compute predicted value
    '''
    start = time.process_time()
    #-----------------------------------
    #load saved models using joblib
    cat_cols =  ['channelGrouping', 'device.browser', 'device.operatingSystem',
                 'device.deviceCategory',  'geoNetwork.continent', 'geoNetwork.subContinent',
                 'geoNetwork.country',  'geoNetwork.region',  'geoNetwork.metro',
                 'geoNetwork.city', 'geoNetwork.networkDomain', 'trafficSource.campaign',
                 'trafficSource.source',  'trafficSource.medium', 'trafficSource.keyword',
                 'trafficSource.referralPath',
                 'browser_category', 'browser_os','source_country',
                 'channelGrouping_browser','channelGrouping_OS']
    lbgclf = joblib.load('pkllbgclf')
    lbgreg = joblib.load('pkllbgreg')
    for col in cat_cols:
        model =  'le' + col
        pkl_fl = 'pkl' + col
        model = joblib.load(pkl_fl)
    #-----------------------------------
    #pre-processing data
    #handle boolean data
    test_data['device.isMobile'] = test_data['device.isMobile'].astype(bool)
    #handle numeric data for nulls
    #numeric features
    num_cols =  ['visitId','visitNumber','visitStartTime',
                 'totals.hits','totals.pageviews','totals.sessionQualityDim',
                 'totals.timeOnSite','totals.transactions','totals.transactionRevenue',
                 'totals.totalTransactionRevenue']
    null_cols = in_point[in_point.columns[in_point.isna().any()]]
    for col in num_cols:
        test_data[col] = test_data[col].astype('float')
        if  col in null_cols:
            test_data[col] =0          # replace nulls with zeros
            
    #-----------------------------------     
    #feature transforms
    test_data = transform_date_int(test_data)
    transform_cat(test_data)
    trans_data =  data_transform(test_data)
    #add missing column
    trans_data['fullVisitorId'] = 0
    trans_data= trans_data.drop('totals.totalTransactionRevenue_sum',axis=1)
    #-----------------------------------
    target_cols= [ 'fullVisitorId']
    #Predicting  on test data
    classifier_pred = lbgclf.predict_proba(trans_data.drop(target_cols,axis=1))
    regressor_pred = lbgreg.predict(trans_data.drop(target_cols,axis=1))
    final_pred = (classifier_pred[:,1]*regressor_pred)
    elapsed_time = time.process_time() - start
    print('time',elapsed_time)
    return final_pred 

In [30]:
# pass sample of test data for predictions:
predictions = process_data(in_point)

time 0.23521489500000037


In [31]:
print('predictions',predictions)

predictions [0.01041864]


##   5 regr_metric()

In [41]:
def regr_metric(test_sampl,y_pred):
    ''' 
    Calculate & print regresion metrics
    '''
    #
    submit_df=pd.DataFrame()
    submit_df['fullVisitorId']=test_sampl['fullVisitorId']
    submit_df['PredictedLogRevenue'] = y_pred
    #
    test_df =  test_sampl[['fullVisitorId','totals.totalTransactionRevenue'] ] 
    test_df['totals.totalTransactionRevenue'].fillna(0, inplace=True) 
    test_grp= test_df.groupby('fullVisitorId')\
           ['totals.totalTransactionRevenue'].sum().apply(np.log1p,ais=1).reset_index()
    test_grp = pd.merge(test_grp,submit_df,on='fullVisitorId')
    rms= np.sqrt(metrics.mean_squared_error(test_grp['totals.totalTransactionRevenue'],
                                        test_grp['PredictedLogRevenue']))
    map = metrics.mean_absolute_error(test_grp['totals.totalTransactionRevenue'],test_grp['PredictedLogRevenue'])
    print('mean_squared_error  =',rms)
    print('mean_absolute_error =',map)

In [42]:
regr_metric(in_point,predictions)

mean_squared_error  = 0.010418636002181562
mean_absolute_error = 0.010418636002181562
