In [1]:
from datetime import datetime
from pytz import timezone
import math
import sys
import os

# Define Variables and Functions

In [2]:
# Local file names: the API response is saved as <log_file>.gz and then unpacked.
log_file = "churn-model-api-logs.txt"
output_file = "{}.gz".format(log_file)

# Domino deployment base URL and the model version whose logs are requested.
deployment = "https://vip.domino.tech"
model_version_id = "5bb6a0d3c9e77c0007cdd20a"

In [3]:
def get_timestamp_ms(year, month, day, hour, minute):
    """Convert a US/Pacific wall-clock time to epoch milliseconds.

    Args:
        year, month, day, hour, minute: Components of the local
            (US/Pacific) date and time.

    Returns:
        int: Milliseconds since the Unix epoch, at whole-second resolution
        (the value is floored to seconds before scaling by 1000).

    Side effects:
        Prints the computed value.
    """
    pacific = timezone("US/Pacific")
    # pytz zones must be attached with localize(); passing them through the
    # datetime constructor's tzinfo= argument silently applies the zone's
    # historical LMT offset instead of PST/PDT, shifting the result.
    naive = datetime(year=year, month=month, day=day, hour=hour, minute=minute)
    dt = pacific.localize(naive)
    ts_ms = math.floor(dt.timestamp()) * 1000
    print(ts_ms)
    return ts_ms

In [4]:
# Switches for the log window requested from the API:
# NOW       -- when truthy, the window ends at the current time; otherwise a
#              fixed end time is used.
# ZEROSTART -- when truthy, the window starts at 0 ms; otherwise a fixed start
#              time is used.
NOW = True
ZEROSTART = True

In [5]:
# Upper bound of the log window, in epoch milliseconds (whole seconds * 1000).
if not NOW:
    # Fixed end time, interpreted by get_timestamp_ms.
    endMillis = get_timestamp_ms(2018, 10, 4, 16, 28)
else:
    # Current instant.
    endMillis = 1000 * math.floor(datetime.now(timezone("US/Pacific")).timestamp())

In [6]:
# Lower bound of the log window, in epoch milliseconds.
# NOTE(review): the fixed start used when ZEROSTART is off (2018-10-05 00:00)
# is later than the fixed end used when NOW is off (2018-10-04 16:28), so
# turning both flags off yields an empty window -- confirm the intended dates.
startMillis = 0 if ZEROSTART else get_timestamp_ms(2018, 10, 5, 0, 0)

# Construct CURL String

In [7]:
# Shell command that downloads the model logs for the selected window.
# The API key is read from the DOMINO_USER_API_KEY environment variable
# (KeyError if it is not set) and is embedded in the command text, so avoid
# printing or logging get_logs.
get_logs = (
    'curl --header "accept: application/x-ndjson"'
    ' --header "X-Domino-Api-Key: {api_key}"'
    " '{base}/v4/modelManager/{model}/logs?startMillis={start}&endMillis={end}'"
    " --output {target}"
).format(
    api_key=os.environ['DOMINO_USER_API_KEY'],
    base=deployment,
    model=model_version_id,
    start=startMillis,
    end=endMillis,
    target=output_file,
)

In [8]:
! rm $log_file
! $get_logs
! gunzip $output_file

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  8305    0  8305    0     0    179      0 --:--:--  0:00:46 --:--:--  1036


# Define Historical File

In [9]:
# Path of the historical log file that is parsed later as the baseline.
historical_file = "/mnt/data/churn_model_logs.txt"

# Inspect Log File

In [10]:
# view raw text: print the first two lines of the downloaded log

# The context manager closes the file handle as soon as the text is read.
with open(log_file, "r") as f:
    data = f.read()

printlist = data.split("\n")
# Slice instead of counting, so only the lines being shown are visited.
for line in printlist[:2]:
    print(line)

{"timeStamp": "20190531T030022.497Z", "requestId": "QARLHXD852FZO7BI", "input": {"data": {"dropperc": 0.03, "mins": 242, "consecmonths": 23, "income": 20}}, "httpResponse": 200, "output": [3.7094176262899946e-06], "timingMillis": 0}
{"timeStamp": "20190531T090010.468Z", "requestId": "G3E5TIXZNC8KDLVB", "input": {"data": {"dropperc": 0.007013206782193444, "mins": 339.4465483354942, "consecmonths": 33.342765613049224, "income": 50.71623066140516}}, "httpResponse": 200, "output": [0.0001454727516093291], "timingMillis": 0}


# Parse Log File

In [11]:
# parse with json library
# the log is newline-delimited JSON: one record per line

import json

# one parsed record per line, collected in a list
data = []
with open(log_file, 'r') as log_handle:  # closed automatically after parsing
    for line in log_handle:
        # Skip blank lines (e.g. a trailing newline); json.loads would raise on them.
        if line.strip():
            data.append(json.loads(line))

In [12]:
# load the parsed records into a DataFrame

import pandas as pd
from io import StringIO

# read_json is kept (rather than building the frame directly) so its type
# inference still applies. The JSON text is wrapped in StringIO because passing
# a literal JSON string to read_json is deprecated in current pandas releases;
# file-like input is accepted by old and new versions alike.
df = pd.read_json(StringIO(json.dumps(data)))

In [13]:
# flatten 'input': expand each nested input record into its own columns

# pandas.io.json.json_normalize has been deprecated since pandas 1.0 in favour
# of the top-level pandas.json_normalize; fall back to the old location only
# on older installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

input_df = json_normalize(df.input)

In [14]:
# flatten 'output'
# the values are not shaped for json_normalize, so each one is unwrapped
# directly into a single scalar per row: a dict is read through its 'churn_Y'
# key, a list/tuple through its first element, and anything missing becomes NaN.
# Working per row keeps out_df aligned with df even when an output holds more
# than one value (splitting a joined string on "," would add extra rows).
# NOTE(review): only the first element of a multi-value output is kept --
# confirm that is the intended value.

out_list = []
for v in df.output.values:
    while isinstance(v, (dict, list, tuple)):
        if isinstance(v, dict):
            v = v.get('churn_Y')
        else:
            v = v[0] if v else None
    out_list.append(v)

# errors='coerce' turns anything non-numeric into NaN; errors='ignore' is
# deprecated in current pandas releases.
out_df = pd.DataFrame(
    {'output': pd.to_numeric(out_list, errors='coerce')},
    index=df.index,
)

In [15]:
# swap the nested 'input'/'output' columns for their flattened versions

df = df.drop(['input', 'output'], axis=1)
collist = df.columns.tolist()
# Both joins align on the row index.
df = df[collist].join(input_df).join(out_df)

# Parse Historical File

In [16]:
# grab historical data
# Parses the historical (baseline) log with the same steps used for the fresh
# log: read newline-delimited JSON, load it into a DataFrame, flatten 'input',
# reduce 'output' to one number per row, and join everything back together.

import json
from io import StringIO

import pandas as pd
# pandas.io.json.json_normalize has been deprecated since pandas 1.0; prefer
# the top-level function and fall back only on older installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

# one parsed record per non-blank line; the handle is closed automatically
data_baseline = []
with open(historical_file, 'r') as baseline_handle:
    for line in baseline_handle:
        if line.strip():  # json.loads would raise on a blank line
            data_baseline.append(json.loads(line))

# StringIO wrapper: passing a literal JSON string to read_json is deprecated
# in current pandas releases.
df_baseline = pd.read_json(StringIO(json.dumps(data_baseline)))

input_df_baseline = json_normalize(df_baseline.input)

# Unwrap each output into a single scalar: a dict is read through its
# 'churn_Y' key, a list/tuple through its first element, missing -> NaN.
# Working per row keeps the result aligned with df_baseline.
# NOTE(review): only the first element of a multi-value output is kept --
# confirm that is the intended value.
out_list_baseline = []
for v in df_baseline.output.values:
    while isinstance(v, (dict, list, tuple)):
        if isinstance(v, dict):
            v = v.get('churn_Y')
        else:
            v = v[0] if v else None
    out_list_baseline.append(v)

# errors='coerce' maps non-numeric values to NaN (errors='ignore' is deprecated).
out_df_baseline = pd.DataFrame(
    {'output': pd.to_numeric(out_list_baseline, errors='coerce')},
    index=df_baseline.index,
)

df_baseline = df_baseline.drop(['input', 'output'], axis=1)
collist = df_baseline.columns.values.tolist()
df_baseline = df_baseline[collist].join(input_df_baseline).join(out_df_baseline)

# Compare Log and Historical

In [17]:
# libraries for graphing

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import NullFormatter
import matplotlib.dates as mdates
%matplotlib inline
from scipy.stats import kurtosis

In [18]:
# Sort both frames by their timeStamp column, then write each one to CSV
# (the row index is written as the first column).
df = df.sort_values(by='timeStamp')
df_baseline = df_baseline.sort_values(by='timeStamp')

df.to_csv('/mnt/data/parsed_model_log.csv')
df_baseline.to_csv('/mnt/data/parsed_model_log_baseline.csv')