# Proyecto Final

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import json

In [2]:
! mkdir tmp
! mkdir tmp/model

mkdir: tmp: File exists
mkdir: tmp/model: File exists


## Set de datos

In [3]:
path = './dataset/data.json'
with open(path) as f:
    data = json.load(f)

ids = []
albums_uri = []
tracks_uri = []
artists_uri = []
position = []
almbums_name = []
artists_name = []
tracks_name = []
playlist_ids = []
playlist_name = []

playlist_id = 0

for playlist in data['playlists']:
    playlist_id += 1
    for track in playlist['tracks']:
        playlist_name.append(playlist['name'])
        playlist_ids.append(playlist_id)
        albums_uri.append(track['album_uri'])
        tracks_uri.append(track['track_uri'])
        artists_uri.append(track['artist_uri'])
        position.append(int(track['pos']))
        almbums_name.append(track['album_name'])
        artists_name.append(track['artist_name'])
        tracks_name.append(track['track_name'])
        
d = {
    'playlist_ids': playlist_ids,
    'playlist_name': playlist_name,
    'albums_uri': albums_uri,
    'tracks_uri': tracks_uri,
    'artists_uri': artists_uri,
    'position': position,
    'albums_name': almbums_name,
    'artists_name': artists_name,
    'tracks_name': tracks_name
}
df = pd.DataFrame(data=d)
df[:10]

Unnamed: 0,albums_name,albums_uri,artists_name,artists_uri,playlist_ids,playlist_name,position,tracks_name,tracks_uri
0,The Cookbook,spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,Missy Elliott,spotify:artist:2wIVse2owClT7go1WT98tk,1,Throwbacks,0,Lose Control (feat. Ciara & Fat Man Scoop),spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,In The Zone,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,Britney Spears,spotify:artist:26dSoYclwsYLMAKD3tpOr4,1,Throwbacks,1,Toxic,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,Dangerously In Love (Alben für die Ewigkeit),spotify:album:25hVFAxTlDvXbx2X2QkUkE,Beyoncé,spotify:artist:6vWDO969PvNqNYHIOW5v0m,1,Throwbacks,2,Crazy In Love,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,Justified,spotify:album:6QPkyl04rXwTGlGlcYaRoW,Justin Timberlake,spotify:artist:31TPClRtHm23RisEBtV3X7,1,Throwbacks,3,Rock Your Body,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,Hot Shot,spotify:album:6NmFmPX56pcLBOFMhIiKvF,Shaggy,spotify:artist:5EvFsr3kj42KNv97ZEnqij,1,Throwbacks,4,It Wasn't Me,spotify:track:1lzr43nnXAijIGYnCT8M8H
5,Confessions,spotify:album:0vO0b1AvY49CPQyVisJLj0,Usher,spotify:artist:23zg3TcAtWQy7J6upgbUnj,1,Throwbacks,5,Yeah!,spotify:track:0XUfyU2QviPAs6bxSpXYG4
6,Confessions,spotify:album:1RM6MGv6bcl6NrAG8PGoZk,Usher,spotify:artist:23zg3TcAtWQy7J6upgbUnj,1,Throwbacks,6,My Boo,spotify:track:68vgtRHr7iZHpzGpon6Jlo
7,PCD,spotify:album:5x8e8UcCeOgrOzSnDGuPye,The Pussycat Dolls,spotify:artist:6wPhSqRtPu1UhRCDX5yaDJ,1,Throwbacks,7,Buttons,spotify:track:3BxWKCI06eQ5Od8TY2JBeA
8,The Writing's On The Wall,spotify:album:283NWqNsCA9GwVHrJk59CG,Destiny's Child,spotify:artist:1Y8cdNmUJH7yBTd9yOvr5i,1,Throwbacks,8,Say My Name,spotify:track:7H6ev70Weq6DdpZyyTmUXk
9,Speakerboxxx/The Love Below,spotify:album:1UsmQ3bpJTyK6ygoOOjG1r,OutKast,spotify:artist:1G9G7WwrXka3Z1r7aIDjI7,1,Throwbacks,9,Hey Ya! - Radio Mix / Club Mix,spotify:track:2PpruBYCo4H7WOBJ7Q2EwM


In [4]:
n_playlist = df['playlist_ids'].nunique()
n_artist= df['artists_uri'].nunique()
n_albumns = df['albums_uri'].nunique()
n_tracks = df['tracks_uri'].nunique()

print('N Playlists:', n_playlist)
print('N Artists:', n_artist)
print('N Albums:', n_albumns)
print('N Tracks:', n_albumns)

N Playlists: 1000
N Artists: 9754
N Albums: 19261
N Tracks: 19261


In [5]:
# Max values to -1 pad arrays
tmp = df.groupby(['playlist_ids'])[['tracks_uri', 'albums_uri', 'artists_uri']].nunique()
max_tracks = tmp['tracks_uri'].max()
max_albums = tmp['albums_uri'].max()
max_artists = tmp['artists_uri'].max()
del tmp

print('Max Tracks:', max_tracks)
print('Max Albums:', max_albums)
print('Max Artists:', max_artists)

Max Tracks: 241
Max Albums: 225
Max Artists: 175


In [6]:
## Entities maps
album_to_int = {entity: i for i, entity in enumerate(df['albums_uri'].unique())}
artist_to_int = {entity: i for i, entity in enumerate(df['artists_uri'].unique())}
track_to_int = {entity: i for i, entity in enumerate(df['tracks_uri'].unique())}
playlist_to_int = {entity: i for i, entity in enumerate(df['playlist_ids'].unique())}

In [7]:
## Prepare dataset
labels = []
playlist_tracks = []
playlist_artists = []
playlist_albums = []

target_playlists = []
target_artists = []
target_tracks = []
target_albums = []

for playlist in df['playlist_ids'].unique():
    n = df[ df['playlist_ids'] == playlist ].shape[0]

    track_ids = df[ df['playlist_ids'] == playlist ]['tracks_uri'].map(lambda x: track_to_int[x]).tolist()
    target_tracks.extend(track_ids)
    track_ids = np.unique(track_ids)
    track_ids = np.pad(track_ids, (0, max_tracks-len(track_ids)), 'constant', constant_values=-1)
    [playlist_tracks.append(track_ids) for _ in range(n)]
    
    artist_ids = df[ df['playlist_ids'] == playlist ]['artists_uri'].map(lambda x: artist_to_int[x]).tolist()
    target_artists.extend(artist_ids)
    artist_ids = np.unique(artist_ids)
    artist_ids = np.pad(artist_ids, (0, max_artists-len(artist_ids)), 'constant', constant_values=-1)
    [playlist_artists.append(artist_ids) for _ in range(n)]
    
    albums_ids = df[ df['playlist_ids'] == playlist ]['albums_uri'].map(lambda x: album_to_int[x]).tolist()
    target_albums.extend(albums_ids)
    albums_ids = np.unique(albums_ids)
    albums_ids = np.pad(albums_ids, (0, max_albums-len(albums_ids)), 'constant', constant_values=-1)
    [playlist_albums.append(albums_ids) for _ in range(n)]
    
    playlist_id = playlist_to_int[playlist]
    [target_playlists.append(playlist_id) for _ in range(n)]
    [labels.append(1) for _ in range(n)]
    
playlist_tracks = np.array(playlist_tracks)
playlist_artists = np.array(playlist_artists)
playlist_albums = np.array(playlist_albums)
# target_artists = np.array(target_artists, dtype=np.int32)
# target_playlists = np.array(target_playlists, dtype=np.int32)
# target_tracks = np.array(target_tracks, dtype=np.int32)
# target_albums = np.array(target_albums, dtype=np.int32)
# labels = np.array(labels, dtype=np.float32)

In [8]:
print('Len playlist_tracks:', len(playlist_tracks))
print('Len playlist_artists:', len(playlist_artists))
print('Len playlist_albums:', len(playlist_albums))

print('Len target_playlists:', len(target_playlists))
print('Len target_artists:', len(target_artists))
print('Len target_tracks:', len(target_tracks))
print('Len target_albums:', len(target_albums))

print('Len labels:', len(labels))

Len playlist_tracks: 67503
Len playlist_artists: 67503
Len playlist_albums: 67503
Len target_playlists: 67503
Len target_artists: 67503
Len target_tracks: 67503
Len target_albums: 67503
Len labels: 67503


In [9]:
# sess = tf.InteractiveSession()

## Modelo Wide & Deep

In [10]:
K = 32

# Define columns input
list_track_column = tf.feature_column.categorical_column_with_identity(
    key='playlist_tracks',
    num_buckets=n_tracks)

list_artist_column = tf.feature_column.categorical_column_with_identity(
    key='playlist_artists',
    num_buckets=n_artist)

list_album_column = tf.feature_column.categorical_column_with_identity(
    key='playlist_albums',
    num_buckets=n_albumns)

target_artist_column = tf.feature_column.categorical_column_with_identity(
    key='target_artist',
    num_buckets=n_artist)

target_playlist_column = tf.feature_column.categorical_column_with_identity(
    key='target_playlist',
    num_buckets=n_playlist)

target_track_column = tf.feature_column.categorical_column_with_identity(
    key='target_track',
    num_buckets=n_tracks)

target_album_column = tf.feature_column.categorical_column_with_identity(
    key='target_album',
    num_buckets=n_albumns)

# TODO: Cross columsn with target

# Wide columns
wide_columns = [
    tf.feature_column.indicator_column(list_track_column),
    tf.feature_column.indicator_column(list_artist_column),
    tf.feature_column.indicator_column(list_album_column)
]

deep_columns = [
    tf.feature_column.embedding_column(
        categorical_column=target_artist_column,
        dimension=K),
    tf.feature_column.embedding_column(
        categorical_column=target_playlist_column,
        dimension=K),
    tf.feature_column.embedding_column(
        categorical_column=target_track_column,
        dimension=K),
    tf.feature_column.embedding_column(
        categorical_column=target_album_column,
        dimension=K)
]

In [11]:
# Dataset API to consume data
def input_fn(batch_size, sample=None):
    dataset = tf.data.Dataset.from_tensor_slices(({
        'playlist_tracks': playlist_tracks,
        'playlist_artists': playlist_artists,
        'playlist_albums': playlist_albums,
        'target_artist': target_artists,
        'target_playlist': target_playlists,
        'target_track': target_tracks,
        'target_album': target_albums}, labels))
    
    if sample is not None:
        print('Subset')
        dataset = dataset.shuffle(buffer_size=sample)
    else:
        sample = len(labels)
        dataset = dataset.shuffle(buffer_size=sample)

    dataset = dataset.batch(batch_size)
    return dataset

In [16]:
num_epochs = 20
batch_size = 126

model = tf.estimator.DNNLinearCombinedRegressor(
        model_dir='tmp/model',
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[100, 50])

for e in range(num_epochs):
    model.train(input_fn=lambda: input_fn(batch_size))
    print('Done epoch', e)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': 'tmp/model', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x11c782be0>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into tmp/model/model.ckpt.
INFO:tensorflow:loss = 107.547905, step = 1
I

KeyboardInterrupt: 