# Import modules and define parameters

In [1]:
import time
import os
import gzip
import pickle
import platform
import numpy as np
import pandas as pd

# Highest pickle protocol supported by this interpreter; its number is
# appended to the file name when the evaluation results are saved.
pklhp = pickle.HIGHEST_PROTOCOL

In [2]:
from wbce_fitter import RecurrentWBCE

Using TensorFlow backend.


In [3]:
# Time unit passed to the transaction loader as `timeunit`.
# valid choices: day, week, month, quarter
timeunit = 'week' 

In [4]:
# Run-time settings: `workers` and `batch_size` are handed to the
# RecurrentWBCE constructor, `epochs` is handed to model.fit.
workers = 4
batch_size = 10*4096
epochs = 1

# Labels for this run.
# NOTE(review): `purpose` is not referenced anywhere else in the visible
# notebook -- confirm it is still needed.
purpose = 'gprun'
# Model specification file given to RecurrentWBCE; also part of the name of
# the evaluation file written at the end.
modelspecfname = 'spec05.py'

In [5]:
# Position, within the sorted unique training SKUs, of the single SKU to
# model. Set to None to use every SKU.
#skudex = None
skudex = 1

# SKU list file given to the transaction loader; also used in the model's
# dbidentifier and in the name of the evaluation file.
skusfname = 'sodapopskus.txt'
# Number of leading time units removed by discardfirst().
discardfirstn = 450 # previously 250

## Load Transaction Data

In [6]:
import funforsql_blagdon as ffs

In [7]:
# Fetch the (cached) transaction table for the configured SKU list and time
# unit, then keep only the rows with a strictly positive ordered quantity.
transactions = ffs.gettransactions_cached(skusfname, timeunit=timeunit)
transactions = transactions[transactions['ordqty'].gt(0)]

OperationalError: could not connect to server: Operation timed out
	Is the server running on host "prod-blagdon.c3dswjg84n8q.eu-west-1.redshift.amazonaws.com" (52.210.174.154) and accepting
	TCP/IP connections on port 5439?


In [None]:
import matplotlib.pyplot as plt

def get_date_counts(obj):
    """Turn one ``(key, group)`` pair into ``(key, number of items in group)``."""
    key, members = obj
    return key, len(members)

# Count the transactions in each order-date bucket and plot the counts
# against the order-date index.
groupedby_orddate = transactions.groupby('orddate_index')
transactions_perorddate = np.array(
    [get_date_counts(group) for group in groupedby_orddate]
)

plt.plot(transactions_perorddate[:, 0], transactions_perorddate[:, 1])
plt.show()

In [None]:
def discardfirst(transactions, n):
    """Drop the first ``n`` time units and shift ``orddate_index`` down by ``n``.

    Parameters
    ----------
    transactions : pandas.DataFrame
        Frame with an ``orddate_index`` column.
    n : int
        Number of leading time units to discard.

    Returns
    -------
    pandas.DataFrame
        The rows whose shifted ``orddate_index`` is >= 0, with the column
        already reduced by ``n``.

    The input frame is left untouched: the shifted column is written to a new
    frame rather than back into ``transactions``, which may itself be a slice
    of another frame.
    """
    shifted = transactions.assign(orddate_index=transactions['orddate_index'] - n)
    return shifted.loc[shifted['orddate_index'] >= 0, :]

# NOTE: re-running this cell shifts orddate_index by discardfirstn again,
# because the result is assigned back to `transactions`.
transactions = discardfirst(transactions, discardfirstn)

com, split, fin = ffs.gettimes(transactions)
# NOTE: com must be 0 -- the case where it is non-zero is not handled.
print ('com:%i, split:%i, fin:%i' % (com, split, fin))

# Split by time: com..split for training, split..fin for testing
# (boundary handling is inside ffs.getdf_intimerange -- TODO confirm).
transtrain = ffs.getdf_intimerange(transactions, com, split)
transtest  = ffs.getdf_intimerange(transactions, split, fin)

# Sorted unique SKUs that occur in the training rows.
uniqueskus = np.sort(np.unique(transtrain.loc[:, 'product_sku']))

## Choose SKUs, set the database identifier suffix, and subset the transaction tables if necessary

In [None]:
# Model every training SKU unless `skudex` singles one out by position.
useskus = uniqueskus if skudex is None else np.array([uniqueskus[skudex]])
print (useskus)

In [None]:
# `particularsku` is appended to the model's dbidentifier and to the output
# file name; it stays empty when every training SKU is used.
particularsku = ''
# Compare the arrays element-wise rather than through their str() text, which
# NumPy abbreviates for long arrays.
if not np.array_equal(useskus, uniqueskus):
    particularsku = str(useskus[0])
    # Restrict both the training and the test rows to the selected SKU(s).
    useobs = np.in1d(transtrain.loc[:,'product_sku'], useskus)
    transtrain = transtrain.loc[useobs,:]
    useobs = np.in1d(transtest.loc[:,'product_sku'], useskus)
    transtest = transtest.loc[useobs,:]

In [None]:
# Customer ids taken from the training rows; they later index the rows of the
# truth array and of the validation frame.
cids = ffs.getcustid(transtrain)

In [None]:
# Problem dimensions: number of modelled SKUs and number of customer ids.
nprods = len(useskus)
nobs = len(cids)

In [None]:
# Report how many customer ids were extracted from the training rows.
print ('len cids', len(cids))

## Model Fitting!

In [None]:
# Show the first training rows before the event table is built from them.
print (transtrain.head())

In [None]:
# Event table handed to the model: a pandas DataFrame with the columns
# 'id', 'event_time' and 'event_type', one row per training transaction.
d = dict(id=transtrain['custid'],
         event_time=transtrain['orddate_index'],
         event_type=transtrain['product_sku'])
evdetails = pd.DataFrame(d)
print (evdetails.head())

In [None]:
# Build the model from the event table and the model specification file.
# seqlen is the number of time steps from com through split, both ends counted.
# dbidentifier is the SKU file name plus the selected SKU (empty if all are used).
# NOTE(review): the meaning of each argument is defined in wbce_fitter -- confirm.
model = RecurrentWBCE(evdetails, 
                      modelspecfname, 
                      seqlen = split-com+1, 
                      batch_size = batch_size,
                      workers = workers,
                      dbidentifier = skusfname+particularsku)

In [None]:
# Print the shapes of model.datginny[0][0] and model.datginny[0][1], labelled
# xf and ytrue. NOTE(review): presumably the first batch's inputs and targets --
# datginny is defined in wbce_fitter, confirm there.
print ('xf shape: %s' % str(model.datginny[0][0].shape))
print ('ytrue shape: %s' % str(model.datginny[0][1].shape))

In [None]:
# Fit with lr=0.01 (presumably the learning rate -- confirm in wbce_fitter)
# for the number of epochs set in the configuration cell.
model.fit(lr=.01, epochs=epochs)

## Get Metric and Truth

In [None]:
# Predict over fin - split time units, i.e. the length of the test window.
# NOTE(review): the result is later reshaped to nobs*nprods and indexed as
# metric[:, sku, 0] -- confirm predict() returns an array of that layout.
metric = model.predict(fin-split)

In [None]:
# Recompute the dimensions so this cell does not rely on the earlier one.
nobs, nprods = len(cids), len(useskus)

# truth[i, j, 0] is 1.0 when customer cids[i] appears among the customers of
# the test rows for SKU useskus[j], and 0.0 otherwise.
truth = np.empty((nobs, nprods, 1))
for prod, sku in enumerate(useskus):
    sku_rows = transtest.loc[:, 'product_sku'] == sku
    test_buyers = ffs.getcustid(transtest.loc[sku_rows, :])
    truth[:, prod, :] = np.in1d(cids, test_buyers).reshape((nobs, 1))

In [None]:
from evaluate import roceval
# Evaluate the predictions against the truth, both flattened to length
# nobs*nprods (the reshape fails if either array has a different size).
# NOTE(review): what roceval returns in `dump` is not visible here.
dump = roceval(truth.reshape(nobs*nprods), metric.reshape(nobs*nprods))

### Format and save...

In [None]:
# One row per customer id, with a truth/metric column pair for every SKU.
# The loop uses its own counter so the `skudex` configuration value is not
# overwritten (which would change the SKU selection if earlier cells re-ran).
validationdf = pd.DataFrame(index=cids)
for skucol, sku in enumerate(useskus):
    skuname = str(sku)
    validationdf['truth' + skuname] = pd.Series(truth[:, skucol, 0], index=cids)
    validationdf['metric' + skuname] = pd.Series(metric[:, skucol, 0], index=cids)
print (validationdf.head(10))

# Save as a gzip-compressed pickle; the file name ends with the pickle
# protocol number. The `with` block closes the file so the gzip stream is
# flushed and finalised.
evalfname = skusfname+particularsku+'_'+modelspecfname+'_eval.pkl'+str(pklhp)
with gzip.open(evalfname, 'wb') as evalfile:
    pickle.dump(validationdf, evalfile, pklhp)