In [1]:
import pandas as pd
from collections import defaultdict


In [2]:
def create_sequences(ratings_path, output_path, min_len=3):
    """Build next-item (sequence, target) pairs from a ratings CSV and pickle them.

    The CSV at ``ratings_path`` is read with ``pd.read_csv`` and its
    ``book_isbn`` values are grouped by ``user_id``.  For every user with at
    least ``min_len`` interactions, one record is emitted per proper prefix of
    that user's history: ``input_sequence`` is ``books[:i]`` and ``target`` is
    ``books[i]`` for ``i`` in ``1 .. len(books) - 1``.

    Ordering: users are emitted in ascending ``user_id`` order, and within a
    user the interactions keep the order in which they appear in the CSV.

    Args:
        ratings_path: Path of the input CSV.  Must contain the columns
            ``user_id`` and ``book_isbn``; other columns are ignored.
        output_path: Path the resulting DataFrame is pickled to.
        min_len: Minimum number of interactions a user needs in order to
            contribute any records.  Defaults to 3.

    Returns:
        None.  The result is written to ``output_path`` as a pickled
        DataFrame with the columns ``user_id``, ``input_sequence`` and
        ``target``, and a one-line summary is printed.

    Raises:
        KeyError: If ``user_id`` or ``book_isbn`` is missing from the CSV.
    """
    df = pd.read_csv(ratings_path)
    # NOTE(review): if ISBNs can carry leading zeros or a trailing "X",
    # read_csv's dtype inference may not keep them as strings — confirm
    # whether ``dtype={"book_isbn": str}`` is wanted for this pipeline.

    missing = [col for col in ("user_id", "book_isbn") if col not in df.columns]
    if missing:
        raise KeyError(f"{ratings_path} is missing required column(s): {missing}")

    # groupby(sort=True) orders the users by key while preserving the original
    # row order inside each group.  A plain ``sort_values(by="user_id")`` uses
    # an unstable quicksort by default and may shuffle a user's interactions.
    # Selecting the single column also keeps its dtype intact, unlike
    # ``iterrows()``, which upcasts every row to one common dtype.
    # Rows whose ``user_id`` is missing are dropped by groupby's default
    # ``dropna=True``.
    sequences = []
    for user_id, user_books in df.groupby("user_id", sort=True)["book_isbn"]:
        books = user_books.tolist()
        if len(books) < min_len:
            continue
        # One record per prefix: a user with k books yields k - 1 records.
        for i in range(1, len(books)):
            sequences.append({
                "user_id": user_id,
                "input_sequence": books[:i],
                "target": books[i]
            })

    # Explicit columns keep the schema stable even when no user qualifies.
    seq_df = pd.DataFrame(sequences, columns=["user_id", "input_sequence", "target"])
    seq_df.to_pickle(output_path)
    print(f"✅ Saved {len(seq_df)} sequences to {output_path}")



In [3]:
if __name__ == "__main__":
    # Entry point: turn the cleaned ratings CSV into the pickled sequence table.
    create_sequences(
        ratings_path="./../clean_data/ratings.csv",
        output_path="./../clean_data/sequences.pkl",
    )

✅ Saved 692910 sequences to ./../clean_data/sequences.pkl
