In [7]:
import os
import json
import sys
from tqdm import tqdm

sys.path.insert(0, "../src/")
from data.utils import PlaylistIterator

In [2]:
files = filter(lambda x: ".json" in x and "chunk" in x, os.listdir("../data/processed"))
files = sorted(files, key=lambda x: int(x[6:].split(".")[0]))
files = list(map(lambda x: os.path.join("../data/processed", x), files))
#files = files[:-2]

In [3]:
pi = PlaylistIterator(files)

In [6]:
songs = set()

for pl in tqdm(pi, total=50000*18):
    songs.update(pl)

1000000it [00:33, 29626.48it/s]                                                 


In [7]:
song2idx = {s: i for i,s in enumerate(songs)}
with open("../models/sparse_full/idx2song.json", "w") as f:
    json.dump({v:k for k,v in song2idx.items()}, f)

In [8]:
rows, cols, data = [], [], []
for i, pl in tqdm(enumerate(pi), total=50000*18):
    for song in set(pl):
        rows.append(i)
        cols.append(song2idx[song])
        data.append(1)

1000000it [01:13, 13619.32it/s]                                                 


In [9]:
from scipy.sparse import csr_matrix

R = csr_matrix((data, (rows, cols)), shape=(max(rows)+1, max(cols)+1))

In [10]:
from scipy.sparse import save_npz

save_npz("../models/sparse_full/r.npz", R)

In [11]:
idx2song = json.load(open("../models/sparse_full/idx2song.json"))
song2idx = {v: int(k) for k,v in idx2song.items()}

In [8]:
# song frequencies
from collections import Counter
c = Counter()
for pl in tqdm(pi, total=50000*20):
    c.update(pl)
c = dict(c)

100%|██████████████████████████████| 1000000/1000000 [00:43<00:00, 23190.12it/s]


In [12]:
c_ = {song2idx[song]: freq for song,freq in c.items()}

In [14]:
with open("../models/sparse_full/frequencies.json", "w") as f:
        json.dump(c_, f)

In [1]:
# Dictionary based sparse
import os
import json
import sys
from tqdm import tqdm

sys.path.insert(0, "../src/")
from data.utils import PlaylistIterator

In [2]:
files = filter(lambda x: ".json" in x and "chunk" in x, os.listdir("../data/processed"))
files = sorted(files, key=lambda x: int(x[6:].split(".")[0]))
files = list(map(lambda x: os.path.join("../data/processed", x), files))

In [3]:
pi = PlaylistIterator(files)

In [4]:
songs = set()

for pl in tqdm(pi, total=50000*20):
    songs.update(pl)

1000000it [00:24, 40852.48it/s]                                                 


In [6]:
song2idx = {s: i for i,s in enumerate(songs)}
with open("../models/sparse_dictionary/idx2song.json", "w") as f:
    json.dump({v:k for k,v in song2idx.items()}, f)

In [7]:
s2p, p2s = {}, {}
for i, pl in tqdm(enumerate(pi), total=50000*20):
    p2s[i] = [song2idx[s] for s in set(pl)]
    for song in p2s[i]:
        if song not in s2p:
            s2p[song] = []
        s2p[song].append(i)
        

100%|██████████████████████████████| 1000000/1000000 [01:14<00:00, 13384.12it/s]


In [8]:
with open("../models/sparse_dictionary/songs.json", "w") as f:
    json.dump(s2p, f)
with open("../models/sparse_dictionary/playlist.json", "w") as f:
    json.dump(p2s, f)

In [11]:
list(p2s.items())[:10]

[(0,
  [815809,
   1026577,
   1321287,
   1861488,
   1635249,
   1864056,
   1999949,
   460708,
   1939368,
   731503,
   1534661,
   124250,
   206101,
   528162,
   913571,
   1959981,
   1902440,
   1544631,
   1533040,
   1457681,
   1620964,
   732369,
   171083,
   1955133,
   567744,
   1438021,
   2146007,
   1909374,
   1553056,
   1080848,
   1350722,
   61608,
   2026467,
   2025165,
   1127375,
   1569764,
   704708,
   81798,
   27602,
   293943,
   581977,
   1767303,
   1425609,
   1583756,
   1961831,
   1291800,
   1738480,
   1744738,
   92399,
   1349292,
   1669414]),
 (1,
  [546284,
   1077717,
   618747,
   567081,
   189957,
   1133579,
   1508177,
   1076263,
   2116652,
   1783537,
   120862,
   103093,
   54121,
   87393,
   1666611,
   2134752,
   1430826,
   505302,
   1888259,
   687174,
   2238752,
   1781526,
   815569,
   2057845,
   1312828,
   1562214,
   1942651,
   2080318,
   1162181,
   844282,
   2093816,
   1840294,
   1273662,
   2118508,
   