In [1]:
import gzip
import pickle

import numpy as np
import pandas as pd
import psycopg2 as pg

# Highest pickle protocol supported by this interpreter; used below as a
# filename suffix for pickle files and to choose the load path in getrfm.
pklhp = pickle.HIGHEST_PROTOCOL

import os
# Working directory of the kernel; getrfm builds its cache paths from it.
cwd = os.getcwd()
print (cwd)

/Users/tianlechen/Documents/GitHub/Togepi/tianle-rxplenishment/gru_arrivaltimes/run_crusoely/blagdon_sodapop_jointwrfm


In [2]:
# Debug switch: when True, later cells print diagnostics and restrict the
# target customers / times / skus (and the pool size) to small values.
testing = True

# Source transactions of interest

In [3]:
import funforsql as ffs
# Fetch the transactions table through the project helper.
# NOTE(review): presumably returns a pandas DataFrame (it is used with
# .head / .loc / .sort_values below) -- confirm against funforsql.
alltrans = ffs.gettransactions()
if testing: print (alltrans.head(5))

skusfname: sodapopskus.txt
timeunit: week
      custid  ordqty  product_sku                date
0  9623203.0     2.0      10201.0 2013-02-09 20:20:47
1  9475196.0     1.0      10201.0 2013-01-08 14:16:45
2   524466.0     1.0      10201.0 2013-01-08 13:39:34
3  7646254.0     2.0      10201.0 2013-03-05 17:13:34
4  9074093.0     1.0      10201.0 2014-06-04 13:39:00


In [4]:
# Load the dict of reference dates (keys are printed below when testing).
# The file is opened in a `with` block so the gzip handle is closed
# deterministically instead of being left to the garbage collector.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with gzip.open('dates.pkl' + str(pklhp), 'rb') as datesfile:
    d = pickle.load(datesfile)
if testing:
    for k in d:
        print ('%s:\t%s' % (k, d[k]))

tstart:	2014-01-01 00:00:00
ttrain:	2015-06-30 00:00:00
tend:	2016-12-31 00:00:00


In [5]:
# Keep only transactions dated inside the closed window [tstart, tend].
onorafterstart = alltrans['date'] >= d['tstart']
onorbeforeend = alltrans['date'] <= d['tend']
intraining = onorafterstart & onorbeforeend
alltrans = alltrans.loc[intraining, :]

In [6]:
# Report how many transactions survived the date-window filter.
if testing:
    print ('len alltrans', len(alltrans))

len alltrans 2042361


In [7]:
# Show the five earliest transactions as a sanity check on the lower bound.
if testing:
    earliest = alltrans.sort_values(by = 'date').head(5)
    print (earliest)

            custid  ordqty  product_sku                date
40776    9314863.0     1.0      64330.0 2014-01-01 09:21:16
58099   12698719.0     2.0     181042.0 2014-01-01 09:29:13
792836   8367321.0     2.0      83092.0 2014-01-01 09:52:32
61724   11020411.0     4.0     140363.0 2014-01-01 10:10:41
62444   10199683.0     1.0     140363.0 2014-01-01 10:14:52


# Set up some global parameters

In [8]:
def datestrrep(dt):
    '''Return dt formatted as a 'YYYY-MM-DD' string (via dt.strftime).'''
    isoday = dt.strftime('%Y-%m-%d')
    return str(isoday)

In [24]:
# Distinct customer ids present in the filtered transactions
# (np.unique returns them sorted).
allcids = np.unique(alltrans['custid'].values)
if testing:
    print ('len unique allcids', len(allcids))

len unique allcids 596811


In [9]:
# Sorted array of the distinct skus; a sku's position in this array is later
# used to pick its per-sku rfm table.
uniqueskus = np.unique(np.array(alltrans['product_sku']))
allskus = np.sort(uniqueskus)
if testing:
    print (allskus)
    print('len(allskus):', len(allskus))

[  10201.   37852.   64330.   83092.  120768.  137538.  140363.  181042.]


In [11]:
# Every date from tstart to tend inclusive (pd.date_range default frequency).
alltimes = pd.date_range(start = d['tstart'], end = d['tend'])
if testing:
    print ('len alltimes', len(alltimes))

len alltimes 1096


# Source an rfm file

In [12]:
def getrfm(valdate, verbose = False):
    '''
    Load the cached rfm object for one valuation date.

    Reads the gzip-compressed pickle
    <cwd>/rfmcache/rfm_valdate_<YYYY-MM-DD>.pkl2 and returns its content.
    NOTE(review): callers index the result as [0], [1], [skuindex+2] and
    treat each entry as a table with 'custid', 'r', 'f', 'm' columns --
    confirm against whatever writes the cache.

    valdate: date-like object accepted by datestrrep.
    verbose: when True, print how long the load took.

    NOTE: pickle.load can execute arbitrary code; only load trusted files.
    '''
    import time

    fname = cwd + '/rfmcache/rfm_valdate_%s.pkl2' % str(datestrrep(valdate))
    t0 = time.time()
    # `with` guarantees the gzip handle is closed on every call; this function
    # is called once per (customer, sku, date), so leaked handles add up fast.
    with gzip.open(fname, 'rb') as rfmfile:
        if pklhp == 2:
            out = pickle.load(rfmfile)
        else:
            # cache files carry the .pkl2 suffix; latin1 lets Python 3 decode
            # byte strings pickled under Python 2
            out = pickle.load(rfmfile, encoding = 'latin1')
    if verbose: print ('loading took', time.time()-t0)
    return out

In [13]:
if testing:
    # Smoke test: load the rfm cache for the first date and report its length.
    x = getrfm(alltimes[0])
    print ('len x:', len(x))

len x: 10


## Set up target cids, times, skus

In [14]:
# Customers to build rows for: all of them, or only the first 10 when testing.
tgtcids = allcids[:10] if testing else allcids
if testing:
    print (len(tgtcids))

10


In [15]:
# Dates to build rows for: all of them, or only the first 100 when testing.
tgttimes = alltimes[:100] if testing else alltimes
if testing:
    print (len(tgttimes))

100


In [16]:
# Skus to build rows for: all of them, or only the first 2 when testing.
tgtskus = allskus[:2] if testing else allskus
if testing:
    print (tgtskus)

[ 10201.  37852.]


## Try to get all covariates for tgtcids, tgtskus, tgttimes

The data matrix has shape:
```
len(tgtcids) BY len(tgtskus) BY len(tgttimes) BY 13
```

since there are 13 covariates:
```
(tse, tte, unc, pcs), 3*(rfmall, rfmcat, rfmthis)
```

In [17]:
# Dimensions of the (cid, sku, time) grid the data matrix is built over.
dims = (len(tgtcids), len(tgtskus), len(tgttimes))
print ('len(tgtcids):%s, len(tgtskus):%s, len(tgttimes):%s' % dims)
# NOTE(review): this assumes 32 bytes per grid cell, but each cell holds 13
# covariates built from np.zeros arrays (float64, i.e. 104 bytes) -- the
# estimate looks low by roughly 3x; confirm before using it for sizing.
ncells = np.prod(dims)
print ('max size in MB:', 32*ncells/np.power(10,6))

len(tgtcids):10, len(tgtskus):2, len(tgttimes):100
max size in MB: 0.064


In [25]:
import TimeSeriesTransforms as tstf

def formattedrow(cidtemp, skutemp, tgttimes, verbose = False):
    '''
    Build the covariate rows for one (customer, sku) pair.

    cidtemp:  customer id, matched against alltrans['custid'] and against the
              'custid' column of every rfm table.
    skutemp:  sku, matched against alltrans['product_sku']; it must occur in
              the global allskus because its position there selects the
              per-sku rfm table (getrfm(...)[position + 2]).
    tgttimes: indexable sequence of dates, one output row per entry. Events
              are placed by whole days elapsed since min(tgttimes).
              NOTE(review): this assumes tgttimes is a run of consecutive
              days starting at its minimum -- confirm for other callers.
    verbose:  when True, print timings of the two stages.

    returns np.array of shape (len(tgttimes), 13)
    where 13 = #(tse,tte,unc,pcs) + 3*#(rfmall,rfmcat,rfmthis)

    raises ValueError if skutemp does not occur in allskus.

    Events that fall outside the tgttimes window (before its first day or
    after its last) are ignored.
    '''
    import time

    ntimes = len(tgttimes)

    # position of the sku in allskus; fail with a clear message when absent
    skumatches = np.where(allskus == skutemp)[0]
    if len(skumatches) == 0:
        raise ValueError('sku %s not found in allskus' % skutemp)
    skuindex = int(skumatches[0])

    t0 = time.time()
    # get event indicator for cidtemp, skutemp
    evtind = alltrans['custid'] == cidtemp
    evtind = np.logical_and(evtind, alltrans['product_sku'] == skutemp)
    # event times as number of days from (origin = min(tgttimes))
    origin = min(tgttimes)
    evt = np.sort(np.array((alltrans.loc[evtind, 'date'] - origin).dt.days))
    # keep only events inside the window: offsets >= ntimes are out of range,
    # and negative offsets would silently index from the END of the arrays
    evt = evt[np.logical_and(evt >= 0, evt < ntimes)]

    # default is uncensored=False, purchstatus=False, eventindicator=False
    evind = np.zeros(ntimes)
    unc   = np.zeros(ntimes)
    pcs   = np.zeros(ntimes)
    # if there has been an event...
    if len(evt) > 0:
        evind[evt] = 1
        # uncensored strictly before the last observed event
        unc[:evt.max()] = 1
        # purchase status switches on at the first observed event
        pcs[evt.min():] = 1
    # transform the event indicator with the project helpers to get tse, tte
    tse = tstf.tse(evind); tte = tstf.tte(evind)
    # reshape so that each is of shape (len(tgttimes), 1)
    tse = tse.reshape((ntimes, 1))
    tte = tte.reshape((ntimes, 1))
    unc = unc.reshape((ntimes, 1))
    pcs = pcs.reshape((ntimes, 1))
    if verbose: t1 = time.time(); print('(tse, tte, unc, pcs) done in:', t1-t0); t0 = t1

    # set up rfms: axis 1 is the level (all, category, this sku), axis 2 is (r, f, m)
    rfmm = np.zeros((ntimes, 3, 3))
    for t in range(ntimes):
        # NOTE: the cache file is re-read from disk for every (cid, sku, date);
        # the benchmark in this notebook shows this loop dominates the runtime.
        x = getrfm(tgttimes[t], verbose = verbose)
        for rfmlevel, rfmtemp in enumerate((x[0], x[1], x[skuindex+2])):
            cidind = rfmtemp['custid'] == cidtemp
            # customers missing from a table keep the all-zero default
            if np.any(cidind):
                rfmm[t, rfmlevel, :] = np.array(rfmtemp.loc[cidind, ['r', 'f', 'm']]).reshape(3)
    if verbose: t1 = time.time(); print('rfm source and stuff done in:', t1-t0); t0 = t1

    # flatten rfm
    rfmm_rs = rfmm.reshape((ntimes, 9))
    # concatenate all and return
    collection = (tse, tte, unc, pcs, rfmm_rs)
    rowtemp = np.concatenate(collection, axis = 1)
    return rowtemp

In [26]:
def formattedrow_allskus(cidtemp, tgtskus, tgttimes, verbose = False):
    '''
    Run formattedrow for every sku in tgtskus and stack the results.

    Each per-sku result is reshaped to (1, 1, len(tgttimes), 13) and the
    pieces are joined along axis 1, giving an array of shape
    (1, len(tgtskus), len(tgttimes), 13).
    '''
    blockshape = (1, 1, len(tgttimes), 13)
    perskublocks = []
    for skutemp in tgtskus:
        skurows = formattedrow(cidtemp, skutemp, tgttimes, verbose = verbose)
        perskublocks.append(skurows.reshape(blockshape))
    return np.concatenate(perskublocks, axis = 1)

In [27]:
if testing:
    import time
    print ('benchmarking for one cid...')
    t0 = time.time()
    out = formattedrow_allskus(tgtcids[0], tgtskus, tgttimes, verbose = True)
    print (time.time() - t0)

benchmarking for one cid...
(tse, tte, unc, pcs) done in: 0.016637802124023438
rfm source and stuff done in: 42.918596029281616
(tse, tte, unc, pcs) done in: 0.007244110107421875
rfm source and stuff done in: 42.55528497695923
85.50242829322815


Runtime estimates, extrapolated from the single-customer benchmark above (about 85 s, taken as 100 s):
```
for nseq=100, nskus=2, timed=100s
for nseq=1000, nskus=8, expect=4000s
```
Repeating this for every customer gives:
```
nids=600,000
nthreads=4: expect=4000*600,000/4=20yrs
nthreads=24: expect=4000*600,000/24=3yrs
```

In [21]:
def worker(cidtemp):
    ''' wrapper for formattedrow_allskus for use in Pool.map'''
    print ('doing cid: %s' % cidtemp)
    return formattedrow_allskus(cidtemp, tgtskus, tgttimes)

from multiprocessing import Pool

# number of worker processes (kept small when testing)
nthreads = 24
if testing: nthreads = 2

p = Pool(nthreads)
try:
    # one task per customer; map blocks until every result has been collected
    resl = p.map(worker, tgtcids)
finally:
    # always shut the workers down, even if a task raised, so that re-running
    # this cell does not pile up orphaned processes
    p.terminate()
    p.join()
# stack the per-customer blocks along the customer axis
data = np.concatenate(resl, axis = 0)

doing cid: 28.0
doing cid: 51.0
doing cid: 68.0
doing cid: 39.0
doing cid: 70.0
doing cid: 152.0
doing cid: 101.0
doing cid: 157.0
doing cid: 163.0
doing cid: 190.0


In [22]:
if testing: print(data.shape)

(10, 2, 100, 13)


In [23]:
# Persist the data matrix as a gzip-compressed pickle.
# The `with` block closes the file, which is what flushes the remaining
# compressed data and the gzip trailer to disk.
# NOTE(review): the filename suffix advertises protocol pklhp but the dump
# uses pickle's default protocol -- confirm which one is intended.
fname = 'data.pkl' + str(pklhp)
with gzip.open(fname, 'wb') as datafile:
    pickle.dump(data, datafile)

Expected file sizes...

```
len(tgtcids):10, len(tgtskus):2, len(tgttimes):100
max size in MB: 0.064
actual size on disk in MB: 0.009

len(tgtcids):600,000, len(tgtskus):8, len(tgttimes):1,000
max size in MB: 153,600
expect size on disk in MB: 21,600

len(tgtcids):1,000, len(tgtskus):8, len(tgttimes):1,000
max size in MB: 256
expect size on disk in MB: 36
```