In [15]:
from google.colab import userdata, drive
import os

# Mount Google Drive at /content/drive; the data paths used further down are
# given relative to this mount's parent directory.
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import scipy.sparse as sp
import numpy as np
import pandas as pd

# Project root on the mounted Drive. NOTE(review): this is a *relative* path,
# so it only resolves when the working directory is the parent of the mount
# point ("/content") — confirm before running from another directory.
base_path = "drive/MyDrive/Education/STAT 5330 Project/"
# Location of the saved song graph, relative to the project root.
fp = "data/phase_2/song_graph.npz"
path = os.path.join(base_path, fp)

# Load the sparse song-to-song weight matrix saved in .npz format.
W = sp.load_npz(path)
# Prints the matrix summary (format, dtype, stored-element count, shape)
# followed by a truncated listing of stored coordinates and values.
print(W)

<Compressed Sparse Row sparse matrix of dtype 'float32'
	with 2535765 stored elements and shape (100000, 100000)>
  Coords	Values
  (0, 3416)	0.4327099919319153
  (0, 3920)	0.4180808365345001
  (0, 5298)	0.42188534140586853
  (0, 8927)	0.4175543785095215
  (0, 9904)	0.41491302847862244
  (0, 13951)	0.8769318461418152
  (0, 25689)	0.43012791872024536
  (0, 26496)	0.4161738455295563
  (0, 32368)	0.42478662729263306
  (0, 36501)	0.8529096841812134
  (0, 36996)	0.4266842305660248
  (0, 41734)	0.42238083481788635
  (0, 54912)	0.4143146574497223
  (0, 56011)	0.8329386711120605
  (0, 63271)	0.8696879744529724
  (0, 68974)	0.41649216413497925
  (0, 70896)	0.40914779901504517
  (0, 89065)	0.8758329153060913
  (0, 91254)	0.4196739196777344
  (0, 93758)	0.4257985055446625
  (0, 96032)	0.4265986382961273
  (0, 97477)	0.4124544560909271
  (1, 3749)	0.7684120535850525
  (1, 7507)	0.7616057395935059
  (1, 15327)	0.7709869146347046
  :	:
  (99999, 1475)	0.8625028133392334
  (99999, 2573)	0.91229343414

In [17]:
def build_transition_matrix(W):
    """Row-normalize a sparse weight matrix into a transition matrix.

    Every row of ``W`` whose sum is strictly positive is divided by that sum,
    so its entries add up to 1. Rows whose sum is not positive are multiplied
    by zero instead of being divided.

    Parameters
    ----------
    W : scipy.sparse matrix
        Edge-weight matrix; it is converted to CSR before use.

    Returns
    -------
    scipy.sparse matrix
        The product ``diag(1 / row_sum) @ W``.
    """
    adjacency = W.tocsr()
    row_sums = np.array(adjacency.sum(axis=1)).ravel()

    # Invert only the strictly positive row sums; all other rows keep scale 0.
    positive = row_sums > 0
    scale = np.zeros_like(row_sums, dtype=float)
    scale[positive] = 1.0 / row_sums[positive]

    return sp.diags(scale) @ adjacency


In [18]:
# Row-normalize the song graph so each row can be sampled as a probability
# distribution over neighboring songs.
P = build_transition_matrix(W)

In [19]:
def random_walk(P, start_node, walk_length, rng=None):
    """Simulate a single random walk on the graph described by ``P``.

    At every step the next node is drawn from the stored entries of the
    current node's row, with probability proportional to the stored values.

    Parameters
    ----------
    P : scipy.sparse matrix
        Transition (or non-negative weight) matrix; converted to CSR.
    start_node : int
        Index of the node the walk starts from; must satisfy
        ``0 <= start_node < P.shape[0]``.
    walk_length : int
        Maximum number of nodes in the returned path, start node included.
        Values of 1 or less yield a path holding only the start node.
    rng : numpy.random.Generator, optional
        Source of randomness. When omitted, a fresh generator seeded with 42
        is created, so repeated calls without ``rng`` give the same walk.

    Returns
    -------
    list of int
        Visited node indices in order. The walk stops early when the current
        node has no outgoing probability mass (a dead end).

    Raises
    ------
    IndexError
        If ``start_node`` is outside the matrix's row range.
    """
    if rng is None:
        rng = np.random.default_rng(42)

    P = P.tocsr()
    n = P.shape[0]
    indptr, indices, data = P.indptr, P.indices, P.data

    current = int(start_node)
    # A negative index would silently wrap around in ``indptr`` and produce a
    # bogus one-node path, so reject out-of-range starts explicitly.
    if not 0 <= current < n:
        raise IndexError(f"start_node {current} is out of range for {n} nodes")

    path = [current]

    for _ in range(walk_length - 1):
        row_start, row_end = indptr[current], indptr[current + 1]
        neighbors = indices[row_start:row_end]
        probs = data[row_start:row_end]

        # Dead end: no stored neighbors, or stored entries carrying no mass
        # (explicit zeros / NaN). Dividing by such a total would hand NaNs to
        # rng.choice, which raises.
        total = probs.sum()
        if not total > 0:
            break

        # Rows should already sum to 1; renormalize to absorb rounding error.
        current = int(rng.choice(neighbors, p=probs / total))
        path.append(current)

    return path


In [20]:
def simulate_user_sessions(P, base_songs, n_sessions, session_length, rng=None):
    """Generate ``n_sessions`` listening sessions for one user.

    Each session is a random walk of at most ``session_length`` nodes whose
    starting node is drawn uniformly from ``base_songs``.

    Parameters
    ----------
    P : scipy.sparse matrix
        Transition matrix forwarded to ``random_walk``.
    base_songs : array-like of int
        Candidate starting nodes for the sessions.
    n_sessions : int
        Number of sessions to simulate.
    session_length : int
        Walk length passed to ``random_walk`` for every session.
    rng : numpy.random.Generator, optional
        Source of randomness shared by the start-node draws and the walks.
        Defaults to a new generator seeded with 123.

    Returns
    -------
    list of list
        One path per session, in generation order.
    """
    generator = np.random.default_rng(123) if rng is None else rng

    # The start node is drawn while the call's arguments are evaluated, i.e.
    # before the walk itself consumes any randomness from the generator.
    return [
        random_walk(P, generator.choice(base_songs), session_length, rng=generator)
        for _ in range(n_sessions)
    ]

In [21]:
def simulate_users(
    P,
    song_idx,
    n_users=5000,
    rng=None,
    n_base_songs=5,
    sessions_range=(250, 1000),
    session_length_range=(20, 75),
):
    """Simulate flattened listening histories for a population of users.

    For every user a set of base songs, a session count and a session length
    are drawn; the user's sessions are then simulated and concatenated into a
    single history list.

    Parameters
    ----------
    P : scipy.sparse matrix
        Transition matrix forwarded to ``simulate_user_sessions``.
    song_idx : array-like of int
        Node indices from which each user's base songs are drawn.
    n_users : int, default 5000
        Number of users to simulate.
    rng : numpy.random.Generator, optional
        Source of randomness shared by all draws. Defaults to a new generator
        seeded with 5330.
    n_base_songs : int, default 5
        How many base songs are drawn per user. They are drawn with
        ``rng.choice`` without overriding its defaults, i.e. with replacement,
        so duplicates are possible.
    sessions_range : tuple of (int, int), default (250, 1000)
        Half-open interval ``[low, high)`` for the per-user session count.
    session_length_range : tuple of (int, int), default (20, 75)
        Half-open interval ``[low, high)`` for the per-user session length.

    Returns
    -------
    list of list
        One flattened history per user, sessions concatenated in order.
    """
    if rng is None:
        rng = np.random.default_rng(5330)

    min_sessions, max_sessions = sessions_range
    min_length, max_length = session_length_range

    users = []
    for _ in range(n_users):
        # Draw order (base songs, session count, session length) is kept
        # fixed so results for a given seed stay reproducible.
        base_songs = rng.choice(song_idx, n_base_songs)
        n_sessions = int(rng.integers(min_sessions, max_sessions))
        session_length = int(rng.integers(min_length, max_length))
        sessions = simulate_user_sessions(P, base_songs, n_sessions, session_length, rng)

        # Flatten the per-session paths into one chronological history.
        users.append([track for session in sessions for track in session])

    return users


In [22]:
# NOTE: `fp` and `path` are rebound here; they no longer point at the graph file.
fp = "data/general/sampled_track_metadata.csv"
path = os.path.join(base_path, fp)

# Track metadata table; no index column is specified, so rows get pandas'
# default positional index.
meta = pd.read_csv(path)
# Displayed as the cell result: (number of rows, number of columns).
meta.shape

(100000, 7)

In [23]:
# Candidate song ids are the metadata row labels.
# NOTE(review): assumes row i of `meta` describes node i of the graph behind
# `P` — the two have the same length, but the ordering is not checked here.
idx = meta.index.to_list()
# Simulate 1000 users (overriding the function's default of 5000); no rng is
# passed, so the function's built-in seed is used.
users = simulate_users(P, idx, 1000)

In [24]:
# One row per simulated user: a sequential id plus that user's full history,
# stored as a Python list inside a single cell of the "history" column.
df_users = pd.DataFrame({
    "user_id": range(len(users)),
    "history": users
})

In [28]:
# Summary statistics of how many plays each simulated user's history contains.
history_lengths = df_users["history"].map(len)
history_lengths.describe()

Unnamed: 0,history
count,1000.0
mean,29109.752
std,14157.69289
min,5500.0
25%,18385.0
50%,26015.5
75%,37978.5
max,73408.0


In [29]:
# NOTE: `fp` and `path` are rebound again, now to the export destination.
fp = "data/general/user_playthroughs.csv"
path = os.path.join(base_path, fp)

# Write the simulated users to CSV without the DataFrame index.
# NOTE(review): the "history" cells are Python lists, which CSV stores as
# their text form — a reader will presumably need to parse them back into
# lists (e.g. with ast.literal_eval); confirm against the consumer.
df_users.to_csv(path, index=False)