Merge all files

In [None]:
import pandas as pd
import os

# Source files and the kind of value stored in each file's third column.
file_map = {
    "ubc_data/page_visit.parquet": "url",
    "ubc_data/search_query.parquet": "query",
    "ubc_data/add_to_cart.parquet": "sku",
    "ubc_data/product_buy.parquet": "sku",
    "ubc_data/remove_from_cart.parquet": "sku",
}
required_cols = {"client_id", "timestamp"}

# One normalised DataFrame per file that loaded and validated successfully
dfs = []

for file_name, value_type in file_map.items():
    # Best effort: a file that cannot be read is reported and skipped
    try:
        df = pd.read_parquet(file_name)
    except Exception as e:
        print(f"Error reading {file_name}: {e}")
        continue

    # Ensure required columns
    if not required_cols.issubset(df.columns):
        print(f"Skipping {file_name}: missing client_id or timestamp")
        continue

    # The value column is whatever single column remains
    value_cols = [col for col in df.columns if col not in required_cols]
    if len(value_cols) != 1:
        print(f"Skipping {file_name}: expected 1 value column, found {len(value_cols)}")
        continue

    value_col = value_cols[0]

    # Bring the file into the unified (client_id, timestamp, source, type, value) layout
    tmp = df[["client_id", "timestamp", value_col]].rename(columns={value_col: "value"})
    tmp["type"] = value_type
    tmp["source"] = os.path.basename(file_name)

    # Convert value to string to avoid mixed-type Parquet errors
    tmp["value"] = tmp["value"].astype(str)

    dfs.append(tmp[["client_id", "timestamp", "source", "type", "value"]])

# pd.concat raises an opaque "No objects to concatenate" on an empty list;
# fail with a message that names the actual problem instead.
if not dfs:
    raise ValueError("No input file could be loaded; nothing to merge.")

# Combine all into one DataFrame
merged_df = pd.concat(dfs, ignore_index=True)

# The later cells walk this table row by row and expect each client's events
# to be contiguous and in chronological order (they reset state when
# client_id changes and subtract consecutive timestamps). Plain concatenation
# leaves the rows grouped by source file, so order them explicitly.
merged_df = merged_df.sort_values(["client_id", "timestamp"], ignore_index=True)

# Save to Parquet
merged_df.to_parquet("merged_behavior.parquet", index=False)


Load the merged DataFrame

In [None]:
import pandas as pd

# Read the merged event log back from disk
merged_path = "merged_behavior.parquet"
merged_behavior_df = pd.read_parquet(merged_path)

# Report (rows, columns), then preview the first five rows
n_rows, n_cols = merged_behavior_df.shape
print((n_rows, n_cols))
merged_behavior_df.head(5)

Create sequences of URLs that end with a SKU event, within a defined session window

In [None]:
from collections import defaultdict
from datetime import datetime, timedelta

# sku -> list of URL sequences. Each sequence holds the page visits of one
# client that preceded a SKU event and fall within `delta` of that event.
sku_to_urls = defaultdict(list)
current_client_id = 0
session_urls = []  # (url, timestamp) pairs seen for the current client since its last SKU event
current_timestamp = ""
delta = timedelta(minutes=30)

for idx, row in enumerate(merged_behavior_df.itertuples(index=False)):
    if (idx+1) %10_000_000 == 0:
        print(f"processed {idx+1} items.")

    client_id = row.client_id
    timestamp = row.timestamp
    event_type = row.type
    value = row.value

    # A different client starts with an empty visit history
    if client_id != current_client_id:
        current_client_id = client_id
        session_urls = []

    if event_type == "url":
        # Drop a visit that repeats the previous one (same url, same timestamp)
        visit = (value, timestamp)
        if not session_urls or session_urls[-1] != visit:
            session_urls.append(visit)

    elif event_type == "sku":
        if not session_urls:
            continue

        # Timestamps are compared directly. Formatting them with str() and
        # re-parsing with a fixed "%Y-%m-%d %H:%M:%S" pattern costs one parse
        # per comparison and raises as soon as a timestamp carries
        # sub-second precision.
        if abs(session_urls[0][1] - timestamp) <= delta:
            # Oldest visit is inside the window: keep the whole history
            sku_to_urls[value].append([url for url, _ in session_urls])
        else:
            # Otherwise keep only the visits inside the window (may be empty)
            sku_to_urls[value].append([
                url for url, visited_at in session_urls
                if abs(visited_at - timestamp) <= delta
            ])

        # The history is consumed by this SKU event
        session_urls = []

In [None]:
import pandas as pd

# Read the product catalogue from disk
properties_path = "ubc_data/product_properties.parquet"
product_properties_df = pd.read_parquet(properties_path)

# Report (rows, columns), then preview the first five rows
n_products, n_properties = product_properties_df.shape
print((n_products, n_properties))
product_properties_df.head(5)

In [None]:
from collections import defaultdict

# sku (as str) -> str() form of the product name parsed into a list of ints
sku_name = {}
names = []
for product in product_properties_df.itertuples(index=False):
    # The name is parsed as bracketed, whitespace-separated integers
    name_tokens = product.name.strip().strip("[]").split()
    sku_name[str(product.sku)] = str([int(tok) for tok in name_tokens])

In [None]:
# Distinct name vectors, still in their str(list) form. dict.fromkeys keeps
# first-seen order; list(set(...)) over strings changes order from one
# interpreter run to the next, which reorders the clustering input and makes
# the resulting cluster labels irreproducible.
names_str = list(dict.fromkeys(sku_name.values()))

# Parse each "[a, b, c]" string back into a list of ints
names = [[int(part) for part in name.strip("[]").split(",")] for name in names_str]


In [None]:
from sklearn.cluster import KMeans

# Number of clusters for the product-name vectors
best_cluster = 60
kmeans = KMeans(n_init='auto', random_state=42, n_clusters=best_cluster)
labels = kmeans.fit_predict(names)

# Key every cluster label by the str() form of its name vector
name_cluster = dict(zip(map(str, names), labels))

In [None]:
# sku -> cluster label of that sku's name vector
sku_cluster = {sku: name_cluster[name_key] for sku, name_key in sku_name.items()}

In [None]:
from collections import defaultdict
from collections import Counter

# url -> cluster labels of every SKU whose preceding URL sequences contain it
url_to_sku_cluster = defaultdict(list)
for sku, url_sequences in sku_to_urls.items():
    # A SKU seen in the behaviour log but absent from the product properties
    # has no cluster; skip it instead of aborting with a KeyError.
    cluster = sku_cluster.get(sku)
    if cluster is None:
        continue
    for urls in url_sequences:
        for url in urls:
            url_to_sku_cluster[url].append(cluster)

# Each URL takes the SKU cluster it co-occurs with most often
url_cluster = {
    url: Counter(clusters).most_common(1)[0][0]
    for url, clusters in url_to_sku_cluster.items()
}
    

Query cluster

In [None]:
# Query values collected since the last chunk file was written
queries_raw = []

for row_number, event in enumerate(merged_behavior_df.itertuples(index=False), start=1):
    # Every 10M rows: spill what was collected so far to its own file and
    # start over. This happens before the current row is looked at.
    if row_number % 10_000_000 == 0:
        print(row_number)
        with open(f"query_{row_number}.txt", 'w') as chunk_file:
            chunk_file.writelines(query + "\n" for query in queries_raw)

        queries_raw = []

    if event.type == "query":
        queries_raw.append(event.value)

# Whatever remains after the last full chunk
with open("query_last.txt", 'w') as tail_file:
    tail_file.writelines(query + "\n" for query in queries_raw)

In [None]:
import glob

queries_raw = []
# Sorted so the chunk files are read in a reproducible order; glob itself
# gives no ordering guarantee.
query_files = sorted(glob.glob("query_*.txt"))

for f in query_files:
    print(f)
    # The context manager closes each file instead of leaking the handle
    with open(f) as query_file:
        queries_raw.extend(line.strip("\n") for line in query_file)

print(len(queries_raw))
# Deduplicate while keeping first-seen order. list(set(...)) over strings
# changes order between interpreter runs, which would reorder the clustering
# input built from this list.
queries_raw = list(dict.fromkeys(queries_raw))
print(len(queries_raw))


In [None]:
# Parse every raw query (bracketed, whitespace-separated integers) into a list of ints
queries = [
    [int(tok) for tok in raw_query.strip().strip("[]").split()]
    for raw_query in queries_raw
]

In [None]:
from sklearn.cluster import KMeans

# Cluster the query vectors with the same cluster count as the product names
kmeans = KMeans(n_init='auto', random_state=42, n_clusters=best_cluster)
labels = kmeans.fit_predict(queries)

# Key every cluster label by the str() form of its query vector
query_cluster = dict(zip(map(str, queries), labels))

In [None]:
import pickle

# Bundle the three value -> cluster lookups under their event value type
combined_cluster = dict(url=url_cluster, query=query_cluster, sku=sku_cluster)

# Persist the bundle for the sequence-encoding stage
with open("data_clusters.pkl", "wb") as clusters_file:
    pickle.dump(combined_cluster, clusters_file)

Create sequence data to feed to the seq2seq model

In [None]:
import pickle

# Reload the url / query / sku cluster lookups.
# NOTE: pickle.load executes arbitrary code; only open files this notebook wrote itself.
with open("data_clusters.pkl", "rb") as clusters_file:
    loaded_dict = pickle.load(clusters_file)

In [None]:
from collections import defaultdict
from datetime import timedelta

def _write_sequences(path, records):
    """Write (client_id, input_seq, target_seq) records to `path`, one per line.

    Each line has the form ``client_id#in1_in2_...#tgt1_tgt2_...``.
    """
    with open(path, 'w') as sq:
        for record_client, record_input, record_target in records:
            # The joins are built outside the f-string: reusing the enclosing
            # quote character inside an f-string expression is a SyntaxError
            # on Python versions before 3.12.
            input_part = "_".join(map(str, record_input))
            target_part = "_".join(map(str, record_target))
            sq.write(f"{record_client}#{input_part}#{target_part}\n")


# Finished (client_id, input_seq, target_seq) records not yet written to disk
data = []

# Sequences being built for the current client. Each event contributes two
# elements: the seconds elapsed since the previous event (int), then an
# action token (str).
seq_input = []   # page visits ("v...") and searches ("s...")
seq_target = []  # add to cart ("a..."), buy ("b..."), remove from cart ("r...")

max_len_thereshold = 500

for idx, row in enumerate(merged_behavior_df.itertuples(index=False)):
    if (idx+1) %10_000_000 == 0:
        print(f"processed {idx+1} items.")
        # Spill finished records to a chunk file to bound memory use
        _write_sequences(f'sequnce_data_{idx+1}.csv', data)
        data = []

    # extract row items
    client_id = row.client_id
    timestamp = row.timestamp
    source = row.source
    value = row.value

    # define variable in first loop
    if idx == 0:
        current_client_id = client_id
        current_timestamp = timestamp

    # if user changed or seq len was long: close the current record
    if current_client_id != client_id or len(seq_input) > max_len_thereshold:
        data.append((current_client_id, seq_input, seq_target))

        current_client_id = client_id
        current_timestamp = timestamp
        seq_input = []
        seq_target = []

    # Whole seconds since the previous event of this record (0 for its first event)
    time_diff = int((timestamp - current_timestamp).total_seconds())
    source_name = source.split(".")[0]

    if source_name in ("page_visit", "search_query"):
        if source_name == "page_visit":
            action = "v"
        else:
            action = "s"
            # Queries are stored as bracketed, whitespace-separated integers
            value = list(map(int, value.strip().strip("[]").split()))

        # A sequence always ends with an action token, so the gap is always
        # appended as a new element and never merged into a previous one.
        seq_input.append(time_diff)
        seq_input.append(action + str(value))

    else:
        if source_name == "product_buy":
            action = "b"
        elif source_name == "add_to_cart":
            action = "a"
        else:
            action = "r"

        seq_target.append(time_diff)
        seq_target.append(action + str(value))

    current_timestamp = timestamp

# Records are only closed when the next client (or an over-long sequence) is
# seen, so the sequence still being built when the loop ends must be flushed
# here or the last client's events are lost.
if seq_input or seq_target:
    data.append((current_client_id, seq_input, seq_target))

_write_sequences('sequnce_data_last.csv', data)


In [None]:
import glob

# Sorted so the chunk files are concatenated in a reproducible order; glob
# itself gives no ordering guarantee.
seq_files = sorted(glob.glob("sequnce_data_*.csv"))

with open("seq2seq_raw_data.csv", "w") as f:
    for sf in seq_files:
        print(sf)
        # Stream each chunk line by line; the context manager closes the
        # handle, and no chunk has to fit in memory at once.
        with open(sf) as chunk_file:
            for l in chunk_file:
                f.write(l.strip("\n")+ "\n")

In [None]:
# Read every raw sequence line; the context manager closes the file afterwards
# instead of leaving the handle open for the life of the kernel.
with open("seq2seq_raw_data.csv") as raw_file:
    data = raw_file.readlines()


duration cluster

In [None]:
durations = []

# Prefixes marking action tokens on each side of a record; every other
# non-empty token is parsed as a duration.
input_actions = ("v", "s")         # page visit, query search
target_actions = ("a", "r", "b")   # add to cart, remove from cart, buy product

for l in data:
    fields = l.split("#")
    client_id = fields[0].strip()
    input_seq = fields[1].strip()
    output_seq = fields[2].strip().strip("\n")

    # Input side first, then target side
    for sequence, action_prefixes in ((input_seq, input_actions), (output_seq, target_actions)):
        for tok in sequence.split("_"):
            if tok.startswith(action_prefixes) or tok == '':
                continue
            durations.append(int(tok))

# Keep only durations strictly greater than 300
durations = [duration for duration in durations if duration > 300]


In [None]:
import numpy as np
from sklearn.cluster import KMeans

# Column vector: one duration per row, a single feature
X = np.array(durations).reshape(len(durations), 1)

kmeans = KMeans(n_init='auto', random_state=42, n_clusters=5)
labels = kmeans.fit_predict(X)

# duration (as str) -> cluster label; repeated durations keep their last label
durations_cluster = dict(zip(map(str, durations), labels))


In [None]:
import pickle
# Persist the duration -> cluster lookup
with open("durations_cluster.pkl", "wb") as durations_file:
    pickle.dump(durations_cluster, durations_file)

Create cluster-based sequence data

In [None]:
import pickle

# NOTE: pickle.load executes arbitrary code; only open files this notebook wrote itself.

# Load the duration (as str) -> cluster label lookup
with open("durations_cluster.pkl", "rb") as durations_file:
    durations_clusters = pickle.load(durations_file)

# Load the "url" / "query" / "sku" value -> cluster label lookups
with open("data_clusters.pkl", "rb") as clusters_file:
    loaded_dict = pickle.load(clusters_file)


In [None]:
def _encode_tokens(sequence, action_lookups, duration_lookup):
    """Translate one side of a raw record into cluster tokens.

    sequence: "_"-separated raw tokens (action tokens and durations).
    action_lookups: action prefix -> {raw value -> cluster label}.
    duration_lookup: {duration as str -> cluster label}.

    Returns the encoded tokens in order. An action token becomes
    ``<prefix><cluster>`` and a duration becomes ``d<cluster>``; tokens whose
    value has no cluster are dropped.
    """
    encoded = []
    for tok in sequence.split("_"):
        prefix = tok[:1]
        if prefix in action_lookups:
            lookup, key, out_prefix = action_lookups[prefix], tok[1:], prefix
        else:
            # Anything that is not an action token is treated as a duration
            lookup, key, out_prefix = duration_lookup, tok, "d"
        if key in lookup:
            encoded.append(f"{out_prefix}{lookup[key]}")
    return encoded


# Which lookup applies to each action prefix on each side of a record
input_lookups = {
    "v": loaded_dict['url'],    # page visit
    "s": loaded_dict['query'],  # query search
}
target_lookups = {
    "a": loaded_dict['sku'],    # add to cart
    "r": loaded_dict['sku'],    # remove from cart
    "b": loaded_dict['sku'],    # buy product
}

seq2seq_data = []

for l in data:
    fields = l.split("#")
    client_id = fields[0].strip()
    input_seq = fields[1].strip()
    output_seq = fields[2].strip().strip("\n")

    input_tokens = _encode_tokens(input_seq, input_lookups, durations_clusters)
    target_tokens = _encode_tokens(output_seq, target_lookups, durations_clusters)

    # Build "client_id,input tokens,target tokens" directly. Joining every
    # piece with spaces and then replacing " , " leaves a stray space next to
    # the comma whenever one of the two sequences is empty.
    seq2seq_data.append(",".join([
        client_id,
        " ".join(input_tokens),
        " ".join(target_tokens),
    ]))


In [None]:
# Write the encoded records as a three-column CSV: header first, then one record per line
with open("recsys_data.csv", "w") as out_csv:
    out_csv.write("client_id,input_seq,target_seq\n")
    out_csv.writelines(record + "\n" for record in seq2seq_data)