In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from collections import defaultdict
from tensorflow.keras.utils import plot_model
from pprint import pprint
# Step 1: Load Data
events = pd.read_csv("/events.csv")
print(len(events))

item_counts = events['itemid'].value_counts()
frequent_items = item_counts[item_counts >= 50].index  # keep items clicked >=5 times
events = events[events['itemid'].isin(frequent_items)]
print(len(events))

events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')
events = events.sort_values(by=["visitorid", "timestamp"])

from itertools import groupby
# Group by visitor and build sessions without consecutive duplicates
def remove_consecutive_duplicates(items):
    return [key for key, _ in groupby(items)]


visitor_id = events["visitorid"].iloc[10]  # or set manually like visitor_id = 123456
print(visitor_id)
# Filter events for that visitor
visitor_events = events[events["visitorid"] == visitor_id][["timestamp", "itemid"]]

# Convert to list of tuples and pprint
pprint(visitor_events.values.tolist())




sessions = events.groupby("visitorid")["itemid"].apply(list).apply(remove_consecutive_duplicates)


2756101
1330854


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  events['timestamp'] = pd.to_datetime(events['timestamp'], unit='ms')


2
[[Timestamp('2015-08-07 17:51:44.567000'), 325215],
 [Timestamp('2015-08-07 17:53:33.790000'), 325215],
 [Timestamp('2015-08-07 17:56:52.664000'), 259884],
 [Timestamp('2015-08-07 18:01:08.920000'), 216305],
 [Timestamp('2015-08-07 18:08:25.669000'), 342816],
 [Timestamp('2015-08-07 18:17:24.375000'), 342816],
 [Timestamp('2015-08-07 18:17:43.170000'), 216305],
 [Timestamp('2015-08-07 18:20:57.845000'), 325215]]


In [68]:

print(len(events))

1330854


In [69]:

# Step 2: Group by session
MAX_LEN = 5  # window size
sessions_raw = events.groupby("visitorid")["itemid"].apply(list)
print(f"Total items: {sum(len(session) for session in sessions_raw)}")
sessions = events.groupby("visitorid")["itemid"].apply(list).apply(remove_consecutive_duplicates)
print(f"Total items: {sum(len(session) for session in sessions)}")



Total items: 1330854
Total items: 1067244


In [71]:

window_size=MAX_LEN
sequences = []
targets = []


for session in sessions:
    if len(session) >= 2:
        for i in range(len(session) - window_size):
            input_seq = session[i : i + window_size]
            target_item = session[i + window_size]
            sequences.append(input_seq)
            targets.append(target_item)

print('sessions --> ', len(sessions))
print(sessions[2])
ind = 1
print(sequences[ind])
print(targets[ind])


sessions -->  680641
[325215, 259884, 216305, 342816, 216305, 325215]
[283115, 38965, 319680, 283115, 319680]
38965


In [72]:

# Step 4: Encode items to integers
all_items = set([item for seq in sequences for item in seq] + targets)
item2idx = {item: idx + 1 for idx, item in enumerate(sorted(all_items))}
idx2item = {idx: item for item, idx in item2idx.items()}

sequences_encoded = [[item2idx[item] for item in seq] for seq in sequences]
targets_encoded = [item2idx[item] for item in targets]

# Step 5: Pad sequences
X = pad_sequences(sequences_encoded, maxlen=MAX_LEN, padding='pre', dtype='int32')
y = np.array(targets_encoded)

# Step 6: Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 7: Build the model
num_items = len(item2idx) + 1  # +1 for padding index 0
embedding_dim = 64

model = Sequential([
    Embedding(input_dim=num_items, output_dim=embedding_dim, input_length=MAX_LEN),
    GRU(64),
    Dense(num_items, activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()




In [73]:
print(len(X_train))


111704


In [74]:

# Step 8: Train
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.1)

# Step 9: Evaluate
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {accuracy:.4f}")

# Step 10: Predict next item
def recommend_next_item(user_history):
    encoded = [item2idx.get(item, 0) for item in user_history]
    padded = pad_sequences([encoded], maxlen=MAX_LEN, padding='pre')
    print
    prediction = model.predict(padded, verbose=0)
    top_index = np.argmax(prediction[0])
    return idx2item.get(top_index, "Unknown")

# Example usage
sample_session = sessions.iloc[100][:4]  # Simulate an incomplete session
print("User clicked:", sample_session)
next_item = recommend_next_item(sample_session)
print("Predicted next item:", next_item)


Epoch 1/5
[1m3142/3142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 27ms/step - accuracy: 0.0047 - loss: 8.7998 - val_accuracy: 0.0138 - val_loss: 8.1120
Epoch 2/5
[1m3142/3142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 27ms/step - accuracy: 0.0222 - loss: 7.7149 - val_accuracy: 0.0455 - val_loss: 7.3851
Epoch 3/5
[1m3142/3142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 27ms/step - accuracy: 0.0538 - loss: 6.7738 - val_accuracy: 0.0595 - val_loss: 7.0184
Epoch 4/5
[1m3142/3142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m143s[0m 27ms/step - accuracy: 0.0851 - loss: 6.0971 - val_accuracy: 0.0713 - val_loss: 6.8719
Epoch 5/5
[1m3142/3142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 26ms/step - accuracy: 0.1146 - loss: 5.5684 - val_accuracy: 0.0745 - val_loss: 6.8312
[1m873/873[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 11ms/step - accuracy: 0.0737 - loss: 6.8323
Test Accuracy: 0.0749
User clicked: [260546]
Predicted next it

In [75]:
sample_session = sessions.iloc[100][:4]  # Simulate an incomplete session
print("User clicked:", sample_session)
next_item = recommend_next_item(sample_session)
print("Predicted next item:", next_item)

User clicked: [260546]
Predicted next item: 71733


In [76]:
def recommend_next_item1(user_history):
    encoded = [item2idx.get(item, 0) for item in user_history]
    padded = pad_sequences([encoded], maxlen=MAX_LEN, padding='pre')
    print('padded --> ', padded)
    prediction = model.predict(padded, verbose=0)
    top_index = np.argmax(prediction[0])
    print('top_index --> ', top_index)

    return idx2item.get(top_index, "Unknown")



In [80]:

print(sessions.iloc[70])

[212357, 354724]


In [82]:
sample_session = sessions.iloc[70][:3]  # Simulate an incomplete session
print("User clicked:", sample_session)
next_item = recommend_next_item1(sample_session)
print("Predicted next item:", next_item)

User clicked: [212357, 354724]
padded -->  [[   0    0    0    0 7614]]
top_index -->  3056
Predicted next item: 146613
