In [1]:
import pandas as pd
import numpy as np
from numpy import save
from numpy import load
from matplotlib import pyplot
import tensorflow as tf
from keras.layers import BatchNormalization
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Flatten
from sklearn import preprocessing
from keras.layers import Bidirectional

In [2]:
path=r"C:\Users\Shibbs\Desktop\Praxis\CAPP\cwd\datasets\final\df.csv"

In [12]:
def get_dataset(path):
    
    df = pd.read_csv(path, dtype={ 'event_type':object, 'product_id':int, 'category_id':object,
                                   'category_code':object, 'brand':str, 'price':float, 'user_id':object, 
                                   'user_session':object, 'pid':int})
    le = preprocessing.LabelEncoder()
    df["pid"] = le.fit_transform(df["product_id"])
    df = df.groupby(['user_id', 'event_time'], as_index=False).first()
    df = df.drop_duplicates(subset=['user_id', 'event_time'])
    df = df[['user_id','pid','event_type']]
    
    return df

In [13]:
def prepare_dataset(df):
    
    # add sequence index for every uniuqe user id
    for i in df['user_id'].unique():
        seq = []
        for j in range(df[df.user_id == i].shape[0]):
            seq.append(f"{j}")   
        df.loc[df['user_id']==i,'seq_index'] = [k for k in seq]
    
    # Only keep sequence of length 20
    df.seq_index = df.seq_index.astype(int)  
    df = df[df['seq_index']<20].reset_index()
    
    # pivot, encode and post padding both event_type and product_id sequences
    df2 = pd.pivot_table(data=df,index='user_id',columns=['seq_index'],values=['event_type'],aggfunc='sum')
    df2 = df2.fillna(0)
    df2 = df2.replace(['view'],1)
    df2 = df2.replace(['cart'],2)
    df2 = df2.replace(['purchase'],3)
    df2 = df2[df2['event_type',  1 ]!= 0]
    
    df3 = pd.pivot_table(data=df,index='user_id',columns=['seq_index'],values=['pid'],aggfunc='sum')
    df3 = df3.fillna(0)
    df3 = df3[df3['pid',  1 ]!= 0]
    
    # convert post padding to pre padding because it makes more sense
    event_type_list = []
    for i in range(len(df2.index)):
        event_type = df2.iloc[i].tolist()
        event_type.sort(reverse = True,key=lambda x: x==0)
        event_type_list.append(event_type)
        event_type_array = np.array(event_type_list)
        event_type_array = event_type_array.astype(int)
        event_type_array = event_type_array.flatten()
        
        
    product_id_list = []
    for i in range(len(df3.index)):
        product_id = df3.iloc[i].tolist()
        product_id.sort(reverse = True,key=lambda x: x==0)
        product_id_list.append(product_id)
        product_id_array = np.array(product_id_list)
        product_id_array = product_id_array.flatten()
        
    
    # delete extra sequences to match dimensions
    event_type_=np.delete(event_type_array,[range(217580-1580,217580)])
    product_id_=np.delete(product_id_array,[range(217580-1580,217580)])
    
    # combine both sequences into a 2d numpy array
    X = np.column_stack((product_id_,event_type_))
    X = X.reshape(10800, 20, 2)
    X = X.astype(int)
        
    return X
    

In [14]:
df = get_dataset(path)

In [15]:
df

Unnamed: 0,user_id,pid,event_type
0,1515915625353230000,48545,view
1,1515915625353230000,48545,view
2,1515915625353230000,48545,view
3,1515915625353230000,50637,view
4,1515915625353230000,42316,view
...,...,...,...
865818,1515915625611020000,47813,view
865819,1515915625611020000,25464,view
865820,1515915625611020000,38146,view
865821,1515915625611020000,52972,view


In [None]:
et, pid = prepare_dataset(df) # approx runtime 30 mins, instead just load the numpy array

In [None]:
pid = pid.astype(int)

In [None]:
et.shape[0]/20

In [None]:
a,b=np.unique(pid,return_counts=True)

In [None]:
len(b)

In [None]:
pid

In [None]:
X = np.column_stack((pid,et))

In [None]:
X = X.reshape(10800, 20, 2)

In [None]:
X

In [None]:
#save('x.npy', x)

In [3]:
x = load('x.npy')

In [4]:
x = x.astype(int)

In [10]:
x

array([[[4005145,       1],
        [4005145,       1],
        [4005145,       1],
        ...,
        [  73593,       1],
        [4012993,       1],
        [4101974,       1]],

       [[      0,       0],
        [      0,       0],
        [      0,       0],
        ...,
        [1547839,       1],
        [3620813,       1],
        [3721192,       1]],

       [[1747855,       1],
        [1747856,       1],
        [1747856,       1],
        ...,
        [ 886022,       1],
        [ 886022,       1],
        [1023383,       1]],

       ...,

       [[1821813,       1],
        [1821813,       2],
        [1660969,       1],
        ...,
        [4013582,       1],
        [ 453833,       1],
        [3759430,       1]],

       [[ 470796,       1],
        [1306533,       1],
        [2813900,       1],
        ...,
        [1452896,       1],
        [1453969,       1],
        [1682392,       1]],

       [[ 475904,       1],
        [3829355,       1],
        [3829355

In [None]:
x=x.tolist()

In [19]:
len(x[1])

20

In [5]:
#split into train test
train_size = 10000
train = x[:train_size, :, :]
test = x[train_size:, :, :]

In [None]:
train.shape

In [None]:
test.shape

In [6]:
# split into input and outputs
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]

Spliting the sequence of 20 into 19(input sequence) and 1(predicting sequence)

In [7]:
train_X.shape, train_y.shape

((10000, 19, 2), (10000, 2))

In [13]:
train_X.shape[2]

2

In [8]:
# split into input and outputs
test_X.shape, test_y.shape

((800, 19, 2), (800, 2))

In [19]:
    model = Sequential()
    model.add(LSTM(50, input_shape=(train_X.shape[1], train_X.shape[2]), dropout=0.5))
    model.add(BatchNormalization())
    
    model.add(Dense(1024,activation='relu'))
    model.add(Flatten())
    model.add(Dense(53453+3 ,activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # fit network
    history = model.fit(train_X, train_y, epochs=5, batch_size=1, validation_data=(test_X, test_y), verbose=1, shuffle=False)
    # plot history
    pyplot.plot(history.history['loss'], label='train')
    pyplot.plot(history.history['val_loss'], label='test')
    pyplot.legend()
    pyplot.show()

Epoch 1/5


ValueError: in user code:

    File "C:\Users\Shibbs\anaconda3\lib\site-packages\keras\engine\training.py", line 1021, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\Shibbs\anaconda3\lib\site-packages\keras\engine\training.py", line 1010, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\Shibbs\anaconda3\lib\site-packages\keras\engine\training.py", line 1000, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\Shibbs\anaconda3\lib\site-packages\keras\engine\training.py", line 860, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "C:\Users\Shibbs\anaconda3\lib\site-packages\keras\engine\training.py", line 918, in compute_loss
        return self.compiled_loss(
    File "C:\Users\Shibbs\anaconda3\lib\site-packages\keras\engine\compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\Shibbs\anaconda3\lib\site-packages\keras\losses.py", line 141, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\Shibbs\anaconda3\lib\site-packages\keras\losses.py", line 245, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\Shibbs\anaconda3\lib\site-packages\keras\losses.py", line 1862, in sparse_categorical_crossentropy
        return backend.sparse_categorical_crossentropy(
    File "C:\Users\Shibbs\anaconda3\lib\site-packages\keras\backend.py", line 5202, in sparse_categorical_crossentropy
        res = tf.nn.sparse_softmax_cross_entropy_with_logits(

    ValueError: `labels.shape` must equal `logits.shape` except for the last dimension. Received: labels.shape=(2,) and logits.shape=(1, 53456)


In [None]:
model = Sequential()
model.add(LSTM(20, activation='relu', input_shape=(19,2), dropout=0.3, return_sequences=True))
model.add(BatchNormalization())
model.add(LSTM(20, activation='relu', input_shape=(19,2), dropout=0.3))
model.add(BatchNormalization())
model.add(Dense(64,activation='relu'))
model.add(Dense(2,activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy',metrics=['categorical_accuracy']) # changed loss to categorical_crossentropy but loss = nan
history = model.fit(train_X, train_y, batch_size=1,epochs=3, validation_data=(test_X, test_y), verbose=1)

In [None]:
test_input =       [[      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      0,       0],
                    [      48545,   1],
                    [48545  ,       1],
                    [48545  ,       2]]
    
test_input=np.array(test_input)
test_input=test_input.reshape(1,19,2)

test_output = model.predict(test_input, verbose=1)
test_output

# END

In [3]:
path=r"C:\Users\Shibbs\Desktop\lstm_datasets\lstm_with_seq_100.csv"

In [4]:
df = pd.read_csv(path)

In [5]:
df

Unnamed: 0.1,Unnamed: 0,user_id,event_time,product_id,event_type,seq_index
0,0,1.515916e+18,2020-09-29 16:01:54 UTC,4005145,view,0
1,1,1.515916e+18,2020-09-29 16:02:34 UTC,4005145,view,1
2,2,1.515916e+18,2020-10-01 05:57:54 UTC,4005145,view,2
3,3,1.515916e+18,2020-10-02 08:23:40 UTC,4099840,view,3
4,4,1.515916e+18,2020-10-06 06:30:32 UTC,3506650,view,4
...,...,...,...,...,...,...
596253,596253,1.515920e+18,2020-09-24 21:31:54 UTC,1716641,view,95
596254,596254,1.515920e+18,2020-09-24 21:33:40 UTC,4051545,view,96
596255,596255,1.515920e+18,2020-09-24 21:34:07 UTC,1716639,view,97
596256,596256,1.515920e+18,2020-09-24 22:17:54 UTC,1682604,view,98


In [6]:
df.seq_index = df.seq_index.astype(int)  
df = df[df['seq_index']<20].reset_index()

In [10]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

In [11]:
df['product_id'] = le.fit_transform(df['product_id'])

In [12]:
    df2 = pd.pivot_table(data=df,index='user_id',columns=['seq_index'],values=['event_type'],aggfunc='sum')
    df2 = df2.fillna(0)
    df2 = df2.replace(['view'],1)
    df2 = df2.replace(['cart'],2)
    df2 = df2.replace(['purchase'],3)
    df2 = df2[df2['event_type',  1 ]!= 0]
    
    df3 = pd.pivot_table(data=df,index='user_id',columns=['seq_index'],values=['product_id'],aggfunc='sum')
    df3 = df3.fillna(0)
    df3 = df3[df3['product_id',  1 ]!= 0]
    
    # convert post padding to pre padding because it makes more sense
event_type_list = []
    for i in range(len(df2.index)):
        event_type = df2.iloc[i].tolist()
        event_type.sort(reverse = True,key=lambda x: x==0)
        event_type_list.append(event_type)
        event_type_array = np.array(event_type_list)
        event_type_array = event_type_array.astype(int)
        event_type_array = event_type_array.flatten()
        
        
product_id_list = []
    for i in range(len(df3.index)):
        product_id = df3.iloc[i].tolist()
        product_id.sort(reverse = True,key=lambda x: x==0)
        product_id_list.append(product_id)
        product_id_array = np.array(product_id_list)
        product_id_array = product_id_array.flatten()

In [13]:
event_type_array.size

214940

In [14]:
product_id_array.size

214940

In [15]:
event_type_array=np.delete(event_type_array,[range(214940-940,214940)])
product_id_array=np.delete(product_id_array,[range(214940-940,214940)])

In [16]:
X = np.column_stack((event_type_array,product_id_array))

In [26]:
X = X.astype(int)

In [27]:
X = X.reshape(10700, 20, 2)

In [28]:
#split into train test
train_size = 10000
train = X[:train_size, :, :]
test = X[train_size:, :, :]

In [29]:
train_X, train_y = train[:, :-1], train[:, -1]
test_X, test_y = test[:, :-1], test[:, -1]

In [44]:
model = Sequential()
model.add(Bidirectional(LSTM(50, activation='relu'), input_shape=(19, 2)))
# model.add(LSTM(50, activation='relu', input_shape=(19, 2),return_sequences=True))
model.add(Flatten())
model.add(Dense(2))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['categorical_accuracy'])

history = model.fit(train_X, train_y, epochs=3, verbose=2, batch_size=1)

Epoch 1/3
10000/10000 - 40s - loss: 12516.6348 - categorical_accuracy: 0.2195 - 40s/epoch - 4ms/step
Epoch 2/3
10000/10000 - 38s - loss: 130.6804 - categorical_accuracy: 0.0012 - 38s/epoch - 4ms/step
Epoch 3/3
10000/10000 - 38s - loss: 130.6794 - categorical_accuracy: 0.0012 - 38s/epoch - 4ms/step


In [45]:
test_input = [[    0,     0],
       [    0,     0],
       [    0,     0],
       [    0,     0],
       [    0,     0],
       [    0,     0],
       [    0,     0],
       [    0,     0],
       [    0,     0],
       [    0,     0],
       [    0,     0],
       [    0,     0],
       [    0,     0],
       [    0,     0],
       [    0,     0],
       [    1,  3813],
       [    1, 11603],
       [    1, 17366],
       [    1, 22156]]

In [46]:
test_input = np.array(test_input)

In [47]:
test_input = test_input.reshape((1, 19, 2))

In [48]:
test_output = model.predict(test_input, verbose=0)

In [49]:
test_output

array([[ 294041.38, -410986.66]], dtype=float32)