In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import tensorflow as tf
import functools

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under /kaggle/input so the dataset paths used below can be checked.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

print(tf.__version__)

# Load the raw competition tables.
sales_df = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv")
items_df = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/items.csv")
item_cats_df = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv")
shops_df = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/shops.csv")
test_df = pd.read_csv("/kaggle/input/competitive-data-science-predict-future-sales/test.csv")

# Attach the category of each item to every sales row.
train_df = sales_df.merge(items_df[['item_id','item_category_id']], left_on='item_id', right_on='item_id', how='left')

# Sum item_cnt_day per (shop, item, month): one row per triple that has at least one sale record.
sales_month_df = train_df.sort_values(['shop_id','item_id','date_block_num']).groupby(['shop_id','item_id','date_block_num'])[['item_cnt_day']].sum().reset_index()

test_shop_ids = set(test_df.shop_id.unique())
train_shop_ids = set(sales_month_df.shop_id.unique())
# check whether some shops or items are missing from train
test_item_ids = set(test_df.item_id.unique())
train_item_ids = set(sales_month_df.item_id.unique())
all_item_ids = set(items_df.item_id.unique())
#del tmp_sales_months_df
# The item catalogue must contain items that never appear in the training sales.
assert len(train_item_ids) < len(all_item_ids)

# Every month index from 0 up to the last month present in train.
month_nbs = np.arange(0,sales_month_df.date_block_num.max()+1)
shops = shops_df.shop_id.unique()
# We first work only with items present in train. The ones that appear in test only are added later
# (with values inherited from their categories).
items = sales_month_df.item_id.unique()


# Distinct (shop_id, item_id) pairs seen in train, as a list of 2-element lists.
shop_item_ids = sales_month_df.drop_duplicates(subset = ['shop_id','item_id'])[['shop_id','item_id']].values.tolist()


# Expand every train pair over every month so that each pair gets a complete monthly history.
index_tuples = [(shop_item[0], shop_item[1], m) for shop_item in shop_item_ids for m in month_nbs ]


sales_idx = pd.MultiIndex.from_tuples(index_tuples, names=('shop_id','item_id','date_block_num'))

#sales_idx = pd.MultiIndex.from_product([shops, items, month_nbs], names=('shop_id','item_id','date_block_num'))

# Reindex on the full (pair x month) grid; months without any sale record are filled with 0.
tmp_sales_months_df = sales_month_df.set_index(['shop_id',
                                                'item_id',
                                                'date_block_num']).reindex(sales_idx, 
                                                                           fill_value=0.).reset_index()
# free the memory held by intermediate objects
del sales_idx
del sales_month_df
del sales_df
del train_df


# Re-attach the item category, which was dropped by the monthly aggregation.
full_sales_month_df = tmp_sales_months_df.merge(items_df[['item_id','item_category_id']],
                            left_on='item_id',
                            right_on='item_id', 
                            how='left')


# Notebook display: sizes of the item sets and of the expanded table.
len(test_item_ids), len(train_item_ids), len(all_item_ids), len(full_sales_month_df)


# Notebook display: the monthly history of one (shop, item) pair as a spot check.
full_sales_month_df.query('shop_id==59 & item_id==30')

## Compute the mean and standard deviation so we can sample from the distribution for each item, shop and month
We assume a normal distribution.

### analyze test data

In [None]:
# Build a synthetic monthly history for the (shop, item) pairs that appear in test but never in
# train. For each month they inherit a value sampled from a normal distribution parameterised by
# the mean/std of the monthly sales of their (shop, category) in train.
test_shop_item_ids = test_df.drop_duplicates(subset = ['shop_id','item_id'])[['shop_id','item_id']].values.tolist()

test_shop_item_set = set([(shop_item[0],shop_item[1]) for shop_item in test_shop_item_ids]) 
train_shop_item_set = set([(shop_item[0],shop_item[1]) for shop_item in shop_item_ids])

test_only_shop_items = list(test_shop_item_set.difference(train_shop_item_set))
train_test_shop_items = test_shop_item_set.union(train_shop_item_set)
# Notebook display: pair counts, to compare with the number of test rows.
len(test_only_shop_items)+ len(train_test_shop_items), len(test_shop_item_set.intersection(train_shop_item_set)), len(test_df)

print("nb of items with no categories:", len(items_df[items_df.item_category_id.isna()]))

# Mean and standard deviation of monthly sales per (shop, category, month), computed on train.
shop_cat_month_df = full_sales_month_df.groupby(['shop_id',
                                                 'item_category_id',
                                                 'date_block_num']).item_cnt_day.agg([np.mean, 
                                                                                      np.std]).reset_index()

# One row per (test-only pair, month).
test_index_tuples = [(shop_item[0], shop_item[1], m) for shop_item in test_only_shop_items for m in month_nbs ]
test_only_sales_idx = pd.MultiIndex.from_tuples(test_index_tuples, names=('shop_id','item_id','date_block_num'))
# create a dataframe of sales based on items that are only in test.

test_only_sales_df = pd.DataFrame(index=test_only_sales_idx)
test_only_sales_df.reset_index(inplace=True)

# Sanity check on a sample: a test-only (shop, item) pair must not be present in the train table.
def by_len(it):
    return len(full_sales_month_df.query(f"shop_id=={it[0]} & item_id=={it[1]}"))>0
assert len(list(filter( by_len, test_only_shop_items[:100])))==0

test_only_sales_cat_df = test_only_sales_df.merge(items_df[['item_id','item_category_id']], left_on='item_id', right_on='item_id', how='left')


test_only_sales_cat_df.head(10)

# Left merge: keep every (pair, month) row even when the shop has no history for the category.
# Such rows get NaN mean/std, hence a NaN sample, which is replaced by 0 at the end of this cell.
# (The previous default inner merge silently dropped those rows.)
test_only_sales_cat_avg_df = test_only_sales_cat_df.merge(shop_cat_month_df, 
                                                          left_on=['shop_id','item_category_id','date_block_num'], 
                                                          right_on=['shop_id','item_category_id','date_block_num'],
                                                          how='left')
# The merge must neither drop nor duplicate rows. The parenthesis used to be misplaced
# (`len(a == len(b))`), which made this check always pass.
assert len(test_only_sales_cat_avg_df) == len(test_only_sales_cat_df)

test_only_sales_cat_avg_df.query('(shop_id==22) &(item_id==13463)').head(10)

# Fixed seed so the sampled history is reproducible.
np.random.seed(42)
test_only_sales_cat_avg_df['item_cnt_day_sampled']=np.random.normal(test_only_sales_cat_avg_df['mean'].values, 
                                                            test_only_sales_cat_avg_df['std'].values)

test_only_sales_cat_avg_df.query('(shop_id==22) &(item_id==13463)').head(10)

# Round the samples to whole units; the replace normalises negative zero to 0.
test_only_sales_cat_avg_df.loc[:, 'item_cnt_day']=test_only_sales_cat_avg_df.item_cnt_day_sampled.round(0).replace([-0],0)
test_only_item_id = test_only_sales_cat_avg_df.item_id.unique()[10]
assert len(full_sales_month_df.query(f'(shop_id==0)&(item_id=={test_only_item_id})'))==0, "in full_sales_month_df, there should not be items in test only"

# columns should be the same
# Concatenate avg based sales for test only items and regular items
assert len(set(full_sales_month_df.columns).difference(set(test_only_sales_cat_avg_df.columns)))==0
all_sales_df = pd.concat([full_sales_month_df, test_only_sales_cat_avg_df[list(full_sales_month_df.columns.values)]])
assert len(all_sales_df)== len(full_sales_month_df)+len(test_only_sales_cat_avg_df)
assert len(all_sales_df.item_id.unique())==len(items_df.item_id.unique())

# NaN samples (missing mean or std) count as "no sale". Filling at the frame level avoids the
# chained `Series.fillna(inplace=True)` form, which may act on a temporary copy.
all_sales_df.fillna({'item_cnt_day': 0.}, inplace=True)

# free the memory held by intermediate objects
del test_only_sales_cat_avg_df
del test_only_sales_cat_df
del test_only_sales_df
del test_only_sales_idx
del full_sales_month_df

## Train-Validation split
Train on all months before the last one (`date_block_num < 33`).
Validate on the full history, so that the held-out last month is the validation target.

In [None]:
# Training data stops before month 33; validation keeps the complete history.
train_df = all_sales_df.loc[all_sales_df['date_block_num'] < 33]
val_df = all_sales_df

# Notebook display: last month available in each frame.
(all_sales_df['date_block_num'].max(), train_df['date_block_num'].max())
# Spot check of one (shop, item) history in the training frame.
print(train_df.loc[(train_df['shop_id'] == 3) & (train_df['item_id'] == 138)])

## Prepare features

In [None]:
WIN_SIZE = 5


def prep_win_x(x, win_size=None,):
    """Return the feature rows of one series, always leaving out its last row.

    The last row is the one to predict, so it is never part of the features.

    Args:
        x: array whose first axis is the time step.
        win_size: number of rows to keep immediately before the last row.
            When falsy (``None`` or 0), every row but the last is returned.

    Returns:
        A slice of ``x`` along its first axis. When the series holds fewer
        than ``win_size + 1`` rows, all rows but the last are returned.
    """
    if not win_size:
        return x[:-1]

    last = x.shape[0] - 1
    # Clamp at 0: a negative start would wrap around and select the wrong rows
    # when the series is shorter than the requested window.
    start = max(last - win_size, 0)
    return x[start:last]

def prep_y(x, y_col_idx=None):
    """Return the target of one series: column ``y_col_idx`` of its last row.

    With the default ``y_col_idx=None`` the whole last row is returned with a
    leading axis of length 1.
    """
    return x[-1, y_col_idx]

# Two identical toy series of 7 time steps x 5 columns, used to sanity-check the helpers.
tst_x = [
    np.array([[59., 30., float(step + 1), sales, step]
              for step, sales in enumerate([13., 10., 4., 0., 0., 1., 1.])])
    for _ in range(2)
]

# With a window size, exactly that many rows precede the (excluded) last row.
res_win = [prep_win_x(series, win_size=5) for series in tst_x]
assert res_win[0].shape[0]==5
# Without a window size, every row but the last one is kept.
res = [prep_win_x(series) for series in tst_x]
assert res[0].shape[0]==6

# Notebook display: the target (column 4 of the last row) of each series.
[prep_y(series, y_col_idx=4) for series in tst_x]

In [None]:
def create_win(df, win_size=30):
    """Build one (window, target) sample per (shop_id, item_id) series.

    Args:
        df: frame with the columns ``shop_id``, ``item_id``,
            ``item_category_id``, ``date_block_num`` and ``item_cnt_day``.
        win_size: number of rows kept before the last row of each series.
            This used to be ignored in favour of the global ``SEQLEN``.

    Returns:
        ``(x, y)`` where ``x`` stacks the windows returned by ``prep_win_x``
        and ``y`` holds, for each series, ``item_cnt_day`` of its last row.
    """
    cols = ['shop_id', 'item_id', 'item_category_id', 'date_block_num', 'item_cnt_day']
    arr = df[cols].sort_values(['shop_id', 'item_id', 'date_block_num']).values

    # The rows are sorted, so the first row index of every distinct (shop, item) pair marks
    # where a new series starts.
    _, first_rows = np.unique(arr[:, :2], axis=0, return_index=True)
    group_list = np.split(arr, first_rows[1:])

    x = np.array([prep_win_x(group, win_size=win_size) for group in group_list])
    # Column 4 of `cols` is item_cnt_day.
    y = np.array([prep_y(group, y_col_idx=4) for group in group_list])

    return x, y

# Number of months fed to the model for each sample.
SEQLEN=30
# One sample per (shop, item) series for training...
train_x, train_y = create_win(train_df[:], win_size=SEQLEN)
assert train_x.shape[0]== train_y.shape[0]    

# ...and one per series for validation.
val_x, val_y = create_win(val_df, win_size=SEQLEN)
assert val_x.shape[0]== val_y.shape[0]    
# Notebook display: shapes of the four arrays.
train_x.shape, train_y.shape, val_x.shape, val_y.shape

In [None]:
# Peek at the first training sample (expression with no effect outside a notebook display).
train_x[:1], train_y[:1]
# np.argwhere returns one row per NaN position, so zero rows means no NaN.
assert np.argwhere(np.isnan(train_x)).shape[0]==0, "there should be no nan value in train"
#train_x[13729]

In [None]:
def prep_seq_x(x):
    """Split windowed samples into static, sequence and sequence-categorical parts.

    Args:
        x: array indexed as (sample, time step, column); the last axis is read
            in the order ``shop_id, item_id, item_category_id, month_nb, sales``.

    Returns:
        ``(X_cat, X_seq, X_seq_cat)`` holding respectively the 3 id columns,
        the ``sales`` column and the ``month_nb`` column. The sample and time
        axes are always kept, even when their length is 1 (a bare
        ``squeeze()`` used to drop them in that case).
    """
    cols = ['shop_id', 'item_id', 'item_category_id', 'month_nb', 'sales']

    cat_cols = ['shop_id', 'item_id', 'item_category_id']
    seq_cat_cols = ['month_nb']
    seq_cols = ['sales']

    def _take(names):
        # Positions of `names` within `cols`, used to index the last axis only.
        return x[:, :, np.where(np.isin(cols, names))[0]]

    X_cat = _take(cat_cols)
    X_seq = _take(seq_cols)
    # kept separate so that an embedding can be created for each month
    X_seq_cat = _take(seq_cat_cols)

    return X_cat, X_seq, X_seq_cat


# Split the windowed arrays into their static / sequence / month parts.
train_X_cat, train_X_seq, train_X_seq_cat = prep_seq_x(train_x[:],)
val_X_cat, val_X_seq, val_X_seq_cat = prep_seq_x(val_x[:],)
# Notebook display: shapes of the training parts and of the targets.
train_X_cat.shape, train_X_seq.shape, train_X_seq_cat.shape, train_y.shape

In [None]:
# Notebook display: shape of the sales sequences with and without their size-1 axes.
train_X_seq.squeeze().shape, train_X_seq.shape

In [None]:
# baseline with linear model
from sklearn.linear_model import LinearRegression

lr = LinearRegression()
train_X = train_X_seq.squeeze()
val_X = val_X_seq.squeeze()
lr.fit(train_X, train_y)
lr.score(train_X, train_y), lr.score(val_X, val_y)

In [None]:
# Create the directory that receives the TFRecord files. Unlike `!mkdir data`, this is plain
# Python and does not complain when the cell is run again and the directory already exists.
os.makedirs('data', exist_ok=True)

In [None]:
# Notebook display: shapes of the static and month parts of the training samples.
train_X_cat.shape, train_X_seq_cat.shape

In [None]:
def to_tf_records(X_cat, X_seq, X_seq_cat,y, file_name):
    """Write one ``tf.train.Example`` per sample to a TFRecord file.

    Args:
        X_cat, X_seq, X_seq_cat: arrays indexed by sample on their first axis.
            Each per-sample array is stored as a serialized tensor (bytes).
        y: per-sample target, stored as a single float.
        file_name: path of the TFRecord file to write.
    """
    def _tensor_feature(array):
        # .numpy() turns the serialized tensor into the raw bytes expected by BytesList.
        serialized = tf.io.serialize_tensor(array).numpy()
        return tf.train.Feature(bytes_list=tf.train.BytesList(value=[serialized]))

    with tf.io.TFRecordWriter(file_name) as writer:
        # Iterate over indices only: converting X_seq to nested lists just to count the samples
        # copied the whole array for nothing.
        for i in range(len(X_seq)):
            ex_schema = {
                'x_cat': _tensor_feature(X_cat[i]),
                'x_seq': _tensor_feature(X_seq[i]),
                'x_seq_cat': _tensor_feature(X_seq_cat[i]),
                'y': tf.train.Feature(float_list=tf.train.FloatList(value=[y[i]])),
            }
            ex = tf.train.Example(features=tf.train.Features(feature=ex_schema))
            writer.write(ex.SerializeToString())
                                  
                                  
# Upper bound on the number of samples written; slicing keeps at most the first nb_ex samples.
nb_ex = 1000000  #train_X_cat.shape[0]
to_tf_records(train_X_cat[:nb_ex],train_X_seq[:nb_ex], train_X_seq_cat[:nb_ex],train_y[:nb_ex],'data/train_seq.proto')                                  

In [None]:
def parse_tfrecord_fn(example):
    """Decode one serialized ``tf.train.Example`` into a dict of tensors.

    The three array features are stored as serialized float64 tensors; each is
    parsed back and pinned to its static shape. ``y`` is a float32 scalar.
    """
    # Static shape of each serialized array feature, in decoding order.
    tensor_shapes = {
        'x_seq': (SEQLEN, 1),
        'x_seq_cat': (SEQLEN, 1),
        'x_cat': (SEQLEN, 3),
    }

    feat_desc = {
        name: tf.io.FixedLenFeature([], dtype=tf.string)
        for name in ('x_cat', 'x_seq', 'x_seq_cat')
    }
    feat_desc['y'] = tf.io.FixedLenFeature([], dtype=tf.float32)

    example = tf.io.parse_single_example(example, feat_desc)
    for name, shape in tensor_shapes.items():
        decoded = tf.io.parse_tensor(example[name], out_type=tf.float64)
        example[name] = tf.ensure_shape(decoded, shape)
    example['y'] = tf.ensure_shape(example['y'], ())
    return example


# Set to True to print every feature of the first parsed batch.
debug_ex = False
raw_dataset = tf.data.TFRecordDataset(['data/train_seq.proto'])
for features in raw_dataset.map(parse_tfrecord_fn).batch(batch_size=2):
    for key in features.keys():       
        if debug_ex:
            print(f"{key}: {features[key]}")
    # only the first batch is inspected
    break



In [None]:
def prepare_sample(features):
    '''Keep only what the model consumes: the sales sequence and its scalar target.'''
    sequence = features['x_seq']
    target = tf.ensure_shape(features['y'], ())
    print("x seq", sequence.shape, target.shape)
    return sequence, target

AUTOTUNE = tf.data.AUTOTUNE


def get_dataset(filenames, batch_size):
    """Build the input pipeline that feeds (sequence, target) batches to the model.

    Args:
        filenames: file pattern (or list of patterns) of the TFRecord files.
        batch_size: number of samples per batch; the shuffle buffer holds
            ``batch_size * 10`` samples.
    """
    # Read the matching files in an interleaved way rather than one after the other.
    files = tf.data.Dataset.list_files(filenames)
    records = files.interleave(tf.data.TFRecordDataset,
                               num_parallel_calls=tf.data.experimental.AUTOTUNE)

    parsed = records.map(parse_tfrecord_fn, num_parallel_calls=AUTOTUNE)
    samples = parsed.map(prepare_sample, num_parallel_calls=AUTOTUNE)

    batches = samples.shuffle(batch_size * 10).batch(batch_size)
    return batches.prefetch(AUTOTUNE)

#get_dataset('data/*.proto', batch_size=2)
# Smoke test of the pipeline with a tiny batch size.
ds = get_dataset('data/*.proto', batch_size=4)
X_batch, y_batch = next(iter(ds))
# Print the shapes of the first batches only (the loop stops once i exceeds 10).
for i, row in enumerate(iter(ds)):
    print(i, row[0].shape, row[1].shape)
    if i>10:        
        break

In [None]:
# detect and init the TPU
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()

# instantiate a distribution strategy
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)


In [None]:
class SequenceModel(tf.keras.Model):
    """GRU regressor mapping a sequence of monthly sales to one predicted value.

    Args:
        seq_len: accepted for signature compatibility; not used by the layers.
        shop_nb, item_nb, month_nb: stored on the instance; not used by the
            layers yet.
        size: number of units of the GRU layer.
        dropout: accepted for signature compatibility; not used.
        output_bias: optional constant used to initialise the bias of the
            output layer (zeros when ``None``).
    """

    def __init__(self, seq_len, shop_nb, item_nb, month_nb=SEQLEN, size=64, dropout=0.5, output_bias=None):
        super().__init__()
        self.size = size
        self.shop_nb = shop_nb
        self.item_nb = item_nb
        self.month_nb = month_nb

        # https://www.tensorflow.org/tutorials/structured_data/imbalanced_data#checkpoint_the_initial_weights
        if output_bias is None:
            bias_initializer = 'zeros'
        else:
            bias_initializer = tf.keras.initializers.Constant(output_bias)

        # No `keras.layers.Input` attribute here: Input creates a symbolic tensor, it is not a
        # layer that can be called on the data inside `call`.
        self.rnn = tf.keras.layers.GRU(size, activation='relu')
        self.dnn = tf.keras.layers.Dense(1, bias_initializer=bias_initializer)

    def call(self, X):
        """Run the recurrent layer on ``X``, then the dense output layer."""
        out = self.rnn(X)
        return self.dnn(out)


        
def get_model(seq_len, shop_nb, item_nb, month_nb=SEQLEN, size=64, dropout=0.5, output_bias=None):
    """Build and compile the GRU regression model.

    Args:
        seq_len, shop_nb, item_nb, month_nb, dropout: accepted for signature
            compatibility; not used by the current architecture.
        size: number of units of the GRU layer (default 64, the value that
            was previously hard-coded).
        output_bias: optional constant used to initialise the bias of the
            output layer (zeros when ``None``).

    Returns:
        A compiled, not yet built, ``tf.keras.Sequential`` model.
    """
    if output_bias is None:
        bias_initializer = 'zeros'
    else:
        bias_initializer = tf.keras.initializers.Constant(output_bias)

    # A Flatten+Dense functional model used to be built here and then overwritten right away
    # by this Sequential model; that dead code is gone.
    model = tf.keras.Sequential(
        [
            tf.keras.layers.GRU(size, activation="relu"),
            tf.keras.layers.Dense(1, bias_initializer=bias_initializer),
        ]
    )

    model.compile(loss=tf.keras.losses.Huber(),
                  optimizer=tf.keras.optimizers.Adam(1e-4),
                  metrics=[tf.keras.metrics.RootMeanSquaredError()])

    return model


# Log the device on which each operation is placed.
tf.debugging.set_log_device_placement(True)
shop_nb = len(shops)
item_nb = len(items)
if tpu:
    # The model must be created inside the scope of the strategy defined in the TPU cell.
    # (This used to reference an undefined name, `strategy`.)
    with tpu_strategy.scope():
        model = get_model(SEQLEN, shop_nb, item_nb)
    BATCH_SIZE = 16 * tpu_strategy.num_replicas_in_sync
else:
    model = get_model(SEQLEN, shop_nb, item_nb)
    BATCH_SIZE=64

# The Sequential model has no input layer, so its input shape is given explicitly.
model.build(input_shape=(None,SEQLEN,1))
model.summary()
model.fit(get_dataset('data/*.proto', batch_size=BATCH_SIZE), epochs=5, verbose=1)
#model.fit(train_X_seq[:1000], train_y[:1000], batch_size=64, epochs=1)

In [None]:
# Notebook display: shapes of the first 100 targets and sales sequences.
train_y[:100].shape, train_X_seq[:100].shape

In [None]:
# Report how many GPUs TensorFlow can see in this session.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:

#model.build((None, SEQLEN, 1))


## Model

- don't forget to add an out-of-vocabulary category
- no need to rescale/standardize values, because we only have categories and the value to regress (sales prediction)

In [None]:
# compute regression value mean to init bias


In [None]:
# Evaluate the fitted model on the in-memory validation sequences and targets.
model.evaluate(val_X_seq, val_y)

In [None]:
# NOTE(review): `win_X` is not defined anywhere in the visible notebook; this cell presumably
# relied on stale kernel state. Confirm, or remove the cell.
win_X

In [None]:
#https://www.youtube.com/watch?v=ZnukSLKEw34
ds = tf.data.Dataset.list_files('data/*.proto')
ds = ds.interleave(tf.data.TFRecordDataset, num_parallel_calls=tf.data.experimental.AUTOTUNE)
#ds = tf.data.TFRecordDataset(['data/*.proto'])
ds = ds.map(decode_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)
ds = ds.batch(batch_size=2)
# to enable pipelining
ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)

In [None]:
# Walk-through: tensor -> serialized bytes -> tf.train.Example -> string, and back again.
t1 = [[1, 2]]
t2 = [[7, 8]]
nonscalar = tf.concat([t1, t2], 0)
nonscalar_np = np.array([t1,t2])
print(nonscalar)
print("nonscalarnonscalar", nonscalar)

serialized_nonscalar = tf.io.serialize_tensor(nonscalar)

print("serialized_nonscalar", serialized_nonscalar.numpy())
# to numpy to make it bytes instead of tensor
feature_of_bytes = tf.train.Feature(bytes_list=tf.train.BytesList(value=[serialized_nonscalar.numpy()]))


print("feat of bytes",feature_of_bytes)
features_for_example = {
  'x1': feature_of_bytes
}
example_proto = tf.train.Example(features=tf.train.Features(feature=features_for_example))
print("example",example_proto)

# will be written to tf.record file.
ex_string = example_proto.SerializeToString()
print("serialized ex in string", ex_string)
ex_from_str = tf.train.Example.FromString(ex_string)
print("ex from string", ex_from_str)

# Schema used to parse the serialized example: one scalar string feature named x1.
feat_desc = {"x1": tf.io.FixedLenFeature([], dtype=tf.string)}

# it takes a serialized string as input, like a row in a tf.record.
parsed_ex = tf.io.parse_single_example(ex_string, feat_desc)
print("parsed ex", parsed_ex)
print("parsed x1 feat as tensor", parsed_ex['x1'])

x1_str = parsed_ex['x1'].numpy()
print("x1 as str", x1_str)

# Round trip: the tensor parsed from the bytes is compared element-wise with the original.
(tf.io.parse_tensor(x1_str, out_type=tf.int32) == nonscalar)
tf.io.parse_tensor(x1_str, out_type=tf.int32)


## Windowing

In [None]:
# no need to pad because we got a 33 fixed size len 
def rolling_win(x, y=None, win_len=1, verbose=False):
    '''
    Cut ``x`` into overlapping windows of ``win_len`` consecutive rows and pair
    each window with the row of ``y`` that immediately follows it.

    The last possible window has no following row, so it is always dropped.
    ``y`` may be None (test mode): the windows are then returned with None.
    '''
    nb_row, nb_col = x.shape[0], x.shape[1]
    if verbose:
        print('nb row', nb_row)
        print('nb col', nb_col)

    win_nb = nb_row - win_len + 1

    # Row vector of in-window offsets + column vector of window starts
    # -> matrix of row indices, one line per window.
    win_len_dim = np.arange(0, win_len)[np.newaxis, :]
    win_nb_dim = np.arange(win_nb)[:, np.newaxis]
    win_idx = win_len_dim + win_nb_dim
    win_x = x[win_idx]
    if verbose:
        print("win nb", win_nb)
        print("win len dim", win_len_dim)
        print("win nb dim", win_nb_dim)
        print("win idx", win_idx)
        print("win x", win_x)

    if y is None:
        return win_x[:-1], None

    # Target of window i is the row right after it: index i + win_len.
    y_idx = np.arange(win_len, win_len + win_nb - 1)
    win_y = y[y_idx]
    if verbose:
        print("y idx", y_idx)
        print("win y", win_y)

    return win_x[:-1], win_y
    
    

# Toy series of 7 rows x 4 columns used to try out rolling_win.
tst_x = np.array([[59., 30.,  1., 13.],
 [59., 30.,  2., 10.],
 [59., 30.,  3.,  4.],
 [59., 30.,  4.,  0.],
 [59., 30.,  5.,  0.],
 [59., 30.,  6.,  1.],
 [59., 30.,  7.,  1.]])

# Targets as a column vector: one value per row of tst_x.
tst_y = np.expand_dims(np.array([0,1,2,3,4,5,6]),axis=1)
#tst_y = np.array([[0,1,2,3,4,5,6]])

tst_win_x, tst_win_y = rolling_win(tst_x, tst_y, win_len=3, verbose=False)
#tst_win_x.shape, tst_win_y.shape, tst_y.shape



# new approach -> one time serie window per group as input of model

def prep_win(x):
    """Window one series: features are all rows but the last, targets are
    column 4 of all rows but the first; windows are 29 rows long."""
    features = x[:-1]
    targets = x[1:, 4]
    return rolling_win(features, targets, win_len=29)

# Window the first 100 groups.
# NOTE(review): `train_groups` is not defined in the visible notebook; presumably a list of
# per-series arrays left in the kernel by an earlier experiment. Confirm.
it = map(prep_win, train_groups[:100])
tmp = list(it)

In [None]:
#%%timeit
def prep_win(df, win_len=10):
    """Build the rolling windows of every (shop_id, item_id) series of ``df``.

    Args:
        df: frame with at least the columns ``shop_id``, ``item_id``,
            ``date_block_num`` and ``item_cnt_day``.
        win_len: number of consecutive rows per window.

    Returns:
        ``(all_x, all_y)``: the windows of all series stacked along the first
        axis, and their targets (the ``item_cnt_day`` of the row following
        each window). ``(None, None)`` when ``df`` contains no series.
    """
    grp_df = df.sort_values(['shop_id', 'item_id', 'date_block_num']).groupby(['shop_id', 'item_id'])

    # Collect the per-series results and concatenate once at the end: concatenating inside the
    # loop copied the whole accumulated array at every iteration (quadratic time).
    x_parts = []
    y_parts = []
    for _, grp in grp_df:
        x = np.array(grp.values)
        y = np.expand_dims(np.array(grp.item_cnt_day.values), axis=1)
        # rolling_win drops the last window: the goal is to predict next month's sales.
        win_x, win_y = rolling_win(x, y, win_len=win_len, verbose=False)
        x_parts.append(win_x)
        y_parts.append(win_y)

    if not x_parts:
        return None, None

    return np.concatenate(x_parts, axis=0), np.concatenate(y_parts, axis=0)
            

# Set to True to work on a handful of shops and items only (faster iterations).
small_data = False
SEQ_LEN = 30

df = train_df
if small_data:
    sm_shops = [2, 3, 5]  # shops[:10]
    sm_items = [31, 12]  # items[:10]
    df = train_df.loc[(train_df.shop_id.isin(sm_shops)) & (train_df.item_id.isin(sm_items))]

# Window the selected frame, whichever it is.
seq, y = prep_win(df, win_len=SEQ_LEN)

In [None]:
def prep_x_y(seq, seq_len):
    """Split windowed sequences into model inputs and target.

    Args:
        seq: array indexed as (window, time step, column); the last axis is
            read in the order ``shop_id, item_id, month_nb, sales,
            item_category_id``.
        seq_len: window length; the target is read at time step
            ``seq_len - 1``.

    Returns:
        ``(X_cat, X_seq, X_seq_cat, y)``: the 3 id columns, the ``sales``
        column, the ``month_nb`` column, and ``sales`` at the last time step
        of each window. The window and time axes are always kept, even when
        their length is 1 (a bare ``squeeze()`` used to drop them and then
        fail on the target indexing).
    """
    cols = ['shop_id', 'item_id', 'month_nb', 'sales', 'item_category_id']
    cat_cols = ['shop_id', 'item_id', 'item_category_id']
    seq_cat_cols = ['month_nb']
    seq_cols = ['sales']
    pred_col = 'sales'

    def _take(names):
        # Positions of `names` within `cols`, used to index the last axis only.
        return seq[:, :, np.where(np.isin(cols, names))[0]]

    X_cat = _take(cat_cols)
    X_seq = _take(seq_cols)
    # kept separate so that an embedding can be created for each month
    X_seq_cat = _take(seq_cat_cols)
    # NOTE(review): the target is the last step of `sales`, which is also part of X_seq;
    # confirm that this overlap is intended.
    y = seq[:, seq_len - 1, cols.index(pred_col)]

    return X_cat, X_seq, X_seq_cat,  y


# Split the windowed sequences into model inputs and target.
X_cat, X_seq, X_seq_cat, y = prep_x_y(seq, SEQ_LEN)

In [None]:
def rolling_win_padded(x, win_len, pad_nb=None, win_nb=None, verbose=False):
    '''
    Build rolling windows over a timeserie, left-padding it with rows of zeros when needed.

    Args:
        x: timeserie numpy array of shape (timestep, nb_features )
        win_len: len of window on timeserie
        pad_nb: the nb of rows of zeros added before the first timestep. It is only used when
            win_nb is None and x has more rows than win_len (None then means no padding).
            When x has at most win_len rows, it is replaced by win_len - nb of rows. When
            win_nb is given, it is recomputed from win_len and win_nb.
        win_nb: forced nb of window to be returned. If None, win_nb is computed.
        verbose: true if debug info is displayed

    Returns:
        The windows taken from the padded timeserie, indexed as (window, position in window,
        feature). When win_nb is given, the windows are the last win_nb ones.
    '''
    nb_row = x.shape[0]
    nb_col = x.shape[1]

    # if not defined then compute it.
    if win_nb is None:
        if nb_row <= win_len:
            # pad just enough to obtain exactly one full window
            pad_nb = win_len-nb_row
        elif pad_nb is None:
            pad_nb = 0
        # else we use pad_nb given as parameter.
        # nb of rows including pad
        nb_row_total = nb_row + pad_nb
        win_nb = nb_row_total - win_len + 1
        start_rolling_idx = 0
        if verbose:
            print("nb of windows is computed")

    else:
        # the len on which win will be rolled over.
        rolling_len = win_len + (win_nb-1)
        if nb_row >= rolling_len:
            pad_nb = 0
        else:
            pad_nb = rolling_len - nb_row

        nb_row_total = nb_row + pad_nb
        # skip the oldest rows so that the last window ends on the last row
        start_rolling_idx = nb_row_total - rolling_len
        if verbose:
            print("nb of windows is forced")

    if verbose:
        print('> nb of windows', win_nb)
        print("> win len:", win_len)
        print("> nb rows:", nb_row)
        print("> nb rows total (with pad):", nb_row_total)
        print("> nb col:", nb_col)
        print("> start index (where we start rolling)", start_rolling_idx)

    # rows of zeros placed before the first timestep
    pad = np.zeros((pad_nb, nb_col))
    padded_x = np.concatenate([pad, x])

    if verbose:
        print("Nb row included pad:", nb_row_total)
        print("nb windows:", win_nb)
        print("\n pad", pad, "\n padded x", padded_x, '\n padd nb:', pad_nb)
    # -----
    # create a vectorized index based on a rolling index
    # rolling index defines windows with index pointing to data in x
    # we create a matrix of shape (win_len_dim, win_nb_dim)
    win_len_dim = np.expand_dims(
        np.arange(start_rolling_idx, win_len+start_rolling_idx), 0)
    win_nb_dim = np.expand_dims(np.arange(win_nb), 0).T
    if verbose:
        print("win dim", win_len_dim, "\n nb of window dimension", win_nb_dim)
    # we add timestep shift to first window of index
    win_idx = win_len_dim + win_nb_dim
    if verbose:
        print("rolling idx", win_idx)
        print("rolling x", padded_x[win_idx])
    return padded_x[win_idx]


# Toy timeserie of 6 timesteps x 3 features used to check rolling_win_padded.
x = np.array([[1, 'a', 1.5],
              [2, 'b', 3.2],
              [3, 'c', 3.5],
              [4, 'd', 3.3],
              [5, 'e', 5.2],
              [6, 'f', 8.2]])


# 6 rows, windows of 3 rows, no padding -> 4 windows
rolling_x = rolling_win_padded(x[:, :], win_len=3, pad_nb=0, verbose=True)
assert rolling_x.shape == (4, 3, 3)

# windows of 1 row -> one window per row
rolling_x = rolling_win_padded(x[:, :], win_len=1, pad_nb=0)
assert rolling_x.shape == (6, 1, 3)

# window as long as the serie -> a single window
rolling_x = rolling_win_padded(x[:, :], win_len=6, pad_nb=0)
assert rolling_x.shape == (1, 6, 3)

# window longer than the serie -> padded up to a single window, whatever pad_nb is
rolling_x = rolling_win_padded(x[:, :], win_len=8, pad_nb=0)
assert rolling_x.shape == (1, 8, 3)


rolling_x = rolling_win_padded(x[:, :], win_len=8, pad_nb=2)
assert rolling_x.shape == (1, 8, 3)

# forced nb of windows, with padding
rolling_x = rolling_win_padded(x[:, :], win_len=8, win_nb=3, verbose=False)
assert rolling_x.shape == (3, 8, 3)

# forced nb of windows, without padding: the last window ends on the last row
rolling_x = rolling_win_padded(x[:, :], win_len=4, win_nb=2, verbose=False)
assert rolling_x.shape == (2, 4, 3)
assert rolling_x[-1, -1, 1] == 'f'  # last item is 6,'f',...

In [None]:
# %%time
from numpy import save, load

INDEX_COL_USE_TO_PREDICT = 3


def prep_windowed_data(df, seq_len):
    """Build the padded rolling windows of every ``object_id`` series of ``df``.

    For each series, one window is produced per row located at or after the
    first row whose ``INDEX_COL_USE_TO_PREDICT`` column equals True.

    Args:
        df: frame with at least the columns ``object_id`` and ``evt_at``.
        seq_len: length of each window.

    Returns:
        The windows of all series stacked along the first axis, or ``None``
        when ``df`` contains no series.

    Raises:
        IndexError: when a series has no row flagged True in the
            ``INDEX_COL_USE_TO_PREDICT`` column.
    """
    grp_df = df.sort_values(['object_id', 'evt_at']).groupby('object_id')

    # Collect the per-series windows and concatenate once at the end: concatenating inside the
    # loop copied the whole accumulated array at every iteration (quadratic time).
    parts = []
    for _, group in grp_df:
        x = np.array(group.values)
        # Only windows whose last timestep has use_to_predict == True are kept; they start at
        # first_true_idx. `== True` is an element-wise comparison on the array column.
        first_true_idx = np.where(x[:, INDEX_COL_USE_TO_PREDICT] == True)[0][0]
        # nb of windows = nb of rows from first_true_idx to the end
        win_nb = x.shape[0] - first_true_idx
        parts.append(rolling_win_padded(
            x[:], win_len=seq_len, win_nb=win_nb, verbose=False))

    if not parts:
        return None

    return np.concatenate(parts, axis=0)


# NOTE(review): this cell uses names that are not defined in the visible notebook
# (MIN_NB_EVT_PER_CPNY, np_persistence_client, BASE_OUTPUTS_DIR, train_prep_df, test_prep_df);
# it presumably comes from another project. Confirm before running.
# len of sequence (window)
SEQ_LEN = 10
MIN_NB_ROW_WIN = MIN_NB_EVT_PER_CPNY

# True: reload previously saved sequences; False: recompute them.
load_data = False
if load_data:
    train_seq = np_persistence_client.load_np_array(
        BASE_OUTPUTS_DIR+"train_seq.npy")
    test_seq = np_persistence_client.load_np_array(
        BASE_OUTPUTS_DIR+"test_seq.npy")
else:
    # True: only build test sequences for the hand-picked objects below (nothing is saved).
    is_dev = False
    if is_dev:
        object_id_list = train_prep_df.object_id.unique()
        # object_id_list=['c:13529']
        # object_id_list=['c:1']
        #train_seq = prep_windowed_data(train_prep_df[train_prep_df.object_id.isin(object_id_list[:1000])], seq_len=SEQ_LEN)
        object_id_list = ['c:40456']
        object_id_list = ['c:104015']
        #object_id_list = test_prep_df.object_id.unique()
        test_seq = prep_windowed_data(test_prep_df[test_prep_df.object_id.isin(
            object_id_list[:1000])], seq_len=SEQ_LEN)

    else:
        train_seq = prep_windowed_data(train_prep_df, seq_len=SEQ_LEN)
        np_persistence_client.save_np_array(
            train_seq, BASE_OUTPUTS_DIR+"train_seq.npy")
        # ---
        test_seq = prep_windowed_data(test_prep_df, seq_len=SEQ_LEN)
        np_persistence_client.save_np_array(
            test_seq, BASE_OUTPUTS_DIR+"test_seq.npy")

        # NOTE(review): if the column holds more than one distinct value, `np.unique(...) == [True]`
        # yields a multi-element array and `assert` raises ValueError instead of AssertionError.
        assert np.unique(test_seq[:, -1, INDEX_COL_USE_TO_PREDICT]) == [
            True], "The last timestep must always have a use_to_predict==True in test"
        assert np.unique(train_seq[:, -1, INDEX_COL_USE_TO_PREDICT]) == [
            True], "The last timestep must always have a use_to_predict==True in train"

In [None]:
def prep_x_y(seq, cols, seq_len):
    '''
    Return X for the company static data, X for the sequence, X for the
    categorical sequence data, and y.

    Args:
        seq: array indexed as (window, time step, column).
        cols: list with the name of each column of the last axis of seq.
        seq_len: window length; y is read at time step seq_len - 1.

    Returns:
        (X_cpny, X_seq, X_seq_cat, y). The window and time axes are always
        kept, even when their length is 1 (a bare squeeze() used to drop them
        and then fail on the target indexing); the column axis is kept too.
    '''
    seq_cols = [
        'cum_acquisition_nb', 'cum_rounds_nb', 'cum_raised_usd',
        'cum_nb_unique_investors', 'cum_nb_successfull_invest_by_investor',
        'nb_days_since_last_evt_type', 'nb_days_since_last_any_evt',
        'nb_days_since_first_evt', 'participants'
    ]
    #seq_cols = ['participants', 'cum_raised_usd', 'cum_nb_unique_investors', 'cum_nb_successfull_invest_by_investor', 'nb_days_since_last_any_evt']
    # try a model without cum counters
    seq_cat_cols = ['evt_type']
    cpny_cols = ['country_code', 'category_code']
    # Other targets tried previously: 'success_horizon', 'success_less_5_year'.
    pred_col = 'success_less_2_year'
    assert seq.shape[-1] == len(
        cols), "There should be as many columns (cols) as the last dimension of seq"

    def _take(names):
        # Positions of `names` within `cols`, used to index the last axis only.
        return seq[:, :, np.where(np.isin(cols, names))[0]]

    X_cpny = _take(cpny_cols)
    X_seq = _take(seq_cols)
    X_seq_cat = _take(seq_cat_cols)
    y = seq[:, seq_len - 1, cols.index(pred_col)]

    return X_cpny, X_seq, X_seq_cat, y


# Split the train and test sequences into model inputs and target.
# NOTE(review): `cols` is not defined in the visible notebook; presumably the list of column
# names of the frames the sequences were built from. Confirm.
X_cpny_train, X_seq_train, X_seq_cat_train, y_train = prep_x_y(
    train_seq, cols, SEQ_LEN)
X_cpny_test, X_seq_test, X_seq_cat_test, y_test = prep_x_y(
    test_seq, cols, SEQ_LEN)