# Proyecto Final

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
import json

In [None]:
# Create the checkpoint directory that the estimator cells pass as
# `model_dir`. `exist_ok=True` keeps this cell re-runnable: a bare shell
# `mkdir` reports an error once the directories already exist.
import os

os.makedirs('tmp/model', exist_ok=True)

## Set de datos

In [None]:
def load_dataset(path):
    """Flatten a playlist JSON file into one DataFrame row per playlist track.

    The file must contain a top-level ``'playlists'`` list. Every playlist
    needs a ``'name'`` and a ``'tracks'`` list, and every track needs the keys
    ``album_uri``, ``track_uri``, ``artist_uri``, ``pos``, ``album_name``,
    ``artist_name`` and ``track_name``.

    Args:
        path: Location of the JSON file.

    Returns:
        A ``pandas.DataFrame`` with the columns ``playlist_ids``,
        ``playlist_name``, ``albums_uri``, ``tracks_uri``, ``artists_uri``,
        ``position``, ``albums_name``, ``artists_name``, ``tracks_name`` and
        ``labels``. ``playlist_ids`` is a 1-based counter in file order (a
        playlist without tracks still consumes an id), ``position`` is
        ``pos`` converted with ``int`` and ``labels`` is always ``1.0``.

    Raises:
        KeyError: If a playlist or track lacks one of the keys listed above.
    """
    # JSON is UTF-8 encoded; do not depend on the platform's default encoding,
    # which would garble or reject non-ASCII names on some systems.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)

    columns = [
        'playlist_ids',
        'playlist_name',
        'albums_uri',
        'tracks_uri',
        'artists_uri',
        'position',
        'albums_name',
        'artists_name',
        'tracks_name',
        'labels',
    ]
    rows = [
        (
            playlist_id,
            playlist['name'],
            track['album_uri'],
            track['track_uri'],
            track['artist_uri'],
            int(track['pos']),
            track['album_name'],
            track['artist_name'],
            track['track_name'],
            1.,  # every track listed in the file is a positive example
        )
        for playlist_id, playlist in enumerate(data['playlists'], start=1)
        for track in playlist['tracks']
    ]
    return pd.DataFrame(rows, columns=columns)

# Read the whole playlist dump and preview its first ten rows.
df = load_dataset('./dataset/data.json')
df.head(10)

In [None]:
# Create train / test samples
def split_train_test(df, ratio=.2, seed=None):
    """Hold out a random share of every playlist's tracks as test data.

    For each playlist (in order of first appearance) the track URIs are
    shuffled and the first ``int(n * ratio)`` of them are held out, where
    ``n`` is the playlist's row count. A URI that occurs several times in the
    same playlist is held out with all of its occurrences.

    Side effect: a ``'test'`` column (0 = train, 1 = test) is written onto the
    ``df`` passed in.

    Args:
        df: Frame with at least ``playlist_ids`` and ``tracks_uri`` columns.
        ratio: Fraction of each playlist to hold out.
        seed: Optional seed for a private random generator. With ``None`` the
            global ``np.random`` state is used, so ``np.random.seed`` keeps
            controlling the split as before.

    Returns:
        ``(train_df, test_df)`` as independent copies, both carrying the
        ``'test'`` column.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    track_uris = df['tracks_uri'].tolist()

    # Group row positions by playlist in a single pass instead of filtering
    # the whole frame once per playlist; dicts keep first-appearance order.
    rows_by_playlist = {}
    for row, playlist in enumerate(df['playlist_ids'].tolist()):
        rows_by_playlist.setdefault(playlist, []).append(row)

    is_test = np.zeros(len(df), dtype=np.int64)
    for rows in rows_by_playlist.values():
        shuffled = [track_uris[row] for row in rows]
        rng.shuffle(shuffled)
        held_out = set(shuffled[:int(len(shuffled) * ratio)])
        for row in rows:
            if track_uris[row] in held_out:
                is_test[row] = 1

    df['test'] = is_test

    # Copies, so that later column assignments on the parts neither touch
    # `df` nor trigger pandas' chained-assignment warnings.
    return df[df['test'] == 0].copy(), df[df['test'] == 1].copy()

# Split once, then report the size of each part next to the full frame.
train_df, test_df = split_train_test(df)

for _title, _frame in (
    ('Train shape:', train_df),
    ('Test shape:', test_df),
    ('All data shape:', df),
):
    print(_title, _frame.shape)

In [None]:
# Negative sampling
def sample_negative_data(df, seed=None):
    """Append randomly drawn negative (label 0) tracks for every playlist.

    For each playlist, tracks that occur somewhere in ``df`` but not in that
    playlist are shuffled, and as many of them as the playlist has rows are
    appended (fewer when not enough such tracks exist). Negative rows carry
    the playlist's id and name, the sampled track's metadata,
    ``position = -1`` and ``labels = 0.0``.

    Args:
        df: Frame with the columns ``playlist_ids``, ``playlist_name``,
            ``albums_uri``, ``tracks_uri``, ``artists_uri``, ``albums_name``,
            ``artists_name`` and ``tracks_name``.
        seed: Optional seed for a private random generator. With ``None`` the
            global ``np.random`` state is used.

    Returns:
        ``df`` followed by the negative rows. The index is not reset, so
        index labels may repeat; columns of ``df`` that negatives do not
        define are NaN on the negative rows.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    track_columns = ['albums_name', 'albums_uri', 'artists_name', 'artists_uri', 'tracks_name', 'tracks_uri']
    all_tracks = df[track_columns].drop_duplicates()

    # Metadata by URI. If one URI occurs with several metadata variants, the
    # variant that appears last in `all_tracks` is used.
    track_info = {record['tracks_uri']: record for record in all_tracks.to_dict('records')}

    # The candidate pool is identical for every playlist: build it once.
    candidate_uris = np.asarray(all_tracks['tracks_uri'].unique().tolist())

    n_playlists = df['playlist_ids'].nunique()
    negative_rows = []

    # sort=False walks the playlists in order of first appearance, and each
    # group already holds the playlist's rows, so the frame is scanned once.
    for i, (playlist_id, group) in enumerate(df.groupby('playlist_ids', sort=False)):
        if i % 100 == 0:
            print('Sampling negative data... processing playlist {}/{}'.format(i+1, n_playlists))

        name = group['playlist_name'].iloc[0]
        positive_tracks = group['tracks_uri'].tolist()
        negative_tracks = np.setdiff1d(candidate_uris, positive_tracks)
        rng.shuffle(negative_tracks)

        for uri in negative_tracks[:len(positive_tracks)]:
            info = track_info[uri]
            negative_rows.append({
                'playlist_ids': playlist_id,
                'playlist_name': name,
                'albums_uri': info['albums_uri'],
                'tracks_uri': info['tracks_uri'],
                'artists_uri': info['artists_uri'],
                'position': -1,  # negatives have no position in the playlist
                'albums_name': info['albums_name'],
                'artists_name': info['artists_name'],
                'tracks_name': info['tracks_name'],
                'labels': 0.,
            })

    neg_df = pd.DataFrame(negative_rows, columns=[
        'playlist_ids',
        'playlist_name',
        'albums_uri',
        'tracks_uri',
        'artists_uri',
        'position',
        'albums_name',
        'artists_name',
        'tracks_name',
        'labels',
    ])
    return pd.concat([df, neg_df])

# Extend the training rows with sampled negatives and preview the result.
train_df = sample_negative_data(train_df)
train_df.head(10)

In [None]:
# Define train / test split
# Tags every row of `df` with a uniform random number ('sample') and an
# independent random 0/1 flag ('test') that is 1 with probability `ratio`.
# NOTE(review): this replaces the per-playlist 'test' column that
# split_train_test() wrote onto `df`; train_df / test_df were extracted
# before this cell and are not affected. Confirm this row-level split is
# still wanted.
ratio = .2
df['sample'] = np.random.rand(len(df))
df['test'] = (np.random.rand(len(df)) < ratio).astype(int)
df.head(10)

In [None]:
# Number of distinct playlists / artists / albums / tracks in the full data
# set; these are used as `num_buckets` for the identity feature columns.
n_playlist = df['playlist_ids'].nunique()
n_artist = df['artists_uri'].nunique()
n_albumns = df['albums_uri'].nunique()
n_tracks = df['tracks_uri'].nunique()

print('N Playlists:', n_playlist)
print('N Artists:', n_artist)
print('N Albums:', n_albumns)
# Report the track count here (the album count used to be printed twice).
print('N Tracks:', n_tracks)

In [None]:
# Max values to -1 pad arrays
# Largest number of distinct tracks / albums / artists found in any single
# training playlist; prepare_dataset() pads each playlist's id arrays with -1
# up to these widths.
_widest = (
    train_df
    .groupby(['playlist_ids'])[['tracks_uri', 'albums_uri', 'artists_uri']]
    .nunique()
    .max()
)
max_tracks = _widest['tracks_uri']
max_albums = _widest['albums_uri']
max_artists = _widest['artists_uri']
del _widest

for _title, _width in (
    ('Max Tracks:', max_tracks),
    ('Max Albums:', max_albums),
    ('Max Artists:', max_artists),
):
    print(_title, _width)

In [None]:
## Entities maps
def _index_entities(column):
    """Number the distinct values of ``df[column]`` from 0 in the order ``unique()`` returns them."""
    distinct = df[column].unique()
    return dict(zip(distinct, range(len(distinct))))


# Lookup tables from raw identifiers to the dense integer ids that the
# feature columns consume.
album_to_int = _index_entities('albums_uri')
artist_to_int = _index_entities('artists_uri')
track_to_int = _index_entities('tracks_uri')
playlist_to_int = _index_entities('playlist_ids')

In [None]:
## Prepare dataset
def _padded_unique(ids, width):
    """Return the sorted distinct ``ids``, right-padded with -1 to ``width`` entries."""
    distinct = np.unique(ids)
    return np.pad(distinct, (0, width - len(distinct)), 'constant', constant_values=-1)


def prepare_dataset(df):
    """Convert playlist/track rows into the arrays consumed by the model.

    Each input row becomes one example. Its ``target_*`` values are the
    integer ids of the row's own playlist, track, artist and album; its
    ``playlist_*`` vectors hold the distinct track / artist / album ids of the
    whole playlist as found in ``df``, padded with -1 to ``max_tracks`` /
    ``max_artists`` / ``max_albums``. Examples are emitted playlist by
    playlist, in order of first appearance, keeping the row order inside each
    playlist.

    Relies on the module-level ``track_to_int``, ``artist_to_int``,
    ``album_to_int``, ``playlist_to_int`` maps and the ``max_*`` widths.

    Args:
        df: Frame with ``playlist_ids``, ``tracks_uri``, ``artists_uri``,
            ``albums_uri`` and ``labels`` columns.

    Returns:
        Dict with the keys ``labels``, ``target_playlists``,
        ``target_artists``, ``target_tracks``, ``target_albums`` (lists with
        one entry per row) and ``playlist_tracks``, ``playlist_artists``,
        ``playlist_albums`` (NumPy arrays with one padded row per example).

    Raises:
        KeyError: If an identifier is missing from its ``*_to_int`` map.
        ValueError: Raised by ``np.pad`` when a playlist has more distinct
            entities than the matching ``max_*`` width.
    """
    labels = []
    playlist_tracks = []
    playlist_artists = []
    playlist_albums = []

    target_playlists = []
    target_artists = []
    target_tracks = []
    target_albums = []

    # One pass over the groups instead of re-filtering the whole frame several
    # times per playlist; sort=False keeps first-appearance order.
    for playlist, group in df.groupby('playlist_ids', sort=False):
        n = len(group)

        track_ids = [track_to_int[uri] for uri in group['tracks_uri']]
        target_tracks.extend(track_ids)
        # Every example of the playlist shares the same padded context vector.
        playlist_tracks.extend([_padded_unique(track_ids, max_tracks)] * n)

        artist_ids = [artist_to_int[uri] for uri in group['artists_uri']]
        target_artists.extend(artist_ids)
        playlist_artists.extend([_padded_unique(artist_ids, max_artists)] * n)

        album_ids = [album_to_int[uri] for uri in group['albums_uri']]
        target_albums.extend(album_ids)
        playlist_albums.extend([_padded_unique(album_ids, max_albums)] * n)

        labels.extend(group['labels'].tolist())
        target_playlists.extend([playlist_to_int[playlist]] * n)

    return {
        'labels': labels,
        'playlist_tracks': np.array(playlist_tracks),
        'playlist_artists': np.array(playlist_artists),
        'playlist_albums': np.array(playlist_albums),
        'target_playlists': target_playlists,
        'target_artists': target_artists,
        'target_tracks': target_tracks,
        'target_albums': target_albums
    }

# Build the training arrays and expose each one under its own module-level
# name; input_fn() reads these names directly.
data = prepare_dataset(train_df)
(
    labels,
    playlist_tracks,
    playlist_artists,
    playlist_albums,
    target_playlists,
    target_artists,
    target_tracks,
    target_albums,
) = (
    data['labels'],
    data['playlist_tracks'],
    data['playlist_artists'],
    data['playlist_albums'],
    data['target_playlists'],
    data['target_artists'],
    data['target_tracks'],
    data['target_albums'],
)

In [None]:
# Sanity check: every prepared array should hold one entry per example.
for _title, _values in (
    ('Len playlist_tracks:', playlist_tracks),
    ('Len playlist_artists:', playlist_artists),
    ('Len playlist_albums:', playlist_albums),
    ('Len target_playlists:', target_playlists),
    ('Len target_artists:', target_artists),
    ('Len target_tracks:', target_tracks),
    ('Len target_albums:', target_albums),
    ('Len labels:', labels),
):
    print(_title, len(_values))

## Modelo Wide & Deep

In [None]:
# Embedding size shared by all deep (embedding) columns.
K = 32


def _identity_column(key, num_buckets):
    """Categorical column whose integer feature values are used directly as ids."""
    return tf.feature_column.categorical_column_with_identity(
        key=key,
        num_buckets=num_buckets)


# Define columns input
# Playlist context: the padded id lists of everything in the playlist.
list_track_column = _identity_column('playlist_tracks', n_tracks)
list_artist_column = _identity_column('playlist_artists', n_artist)
list_album_column = _identity_column('playlist_albums', n_albumns)

# Candidate being scored: its artist, playlist, track and album ids.
target_artist_column = _identity_column('target_artist', n_artist)
target_playlist_column = _identity_column('target_playlist', n_playlist)
target_track_column = _identity_column('target_track', n_tracks)
target_album_column = _identity_column('target_album', n_albumns)

# Wide columns
# The linear part sees the playlist context as indicator vectors.
wide_columns = [
    tf.feature_column.indicator_column(column)
    for column in (list_track_column, list_artist_column, list_album_column)
]

# The deep part learns a K-dimensional embedding per target entity.
deep_columns = [
    tf.feature_column.embedding_column(categorical_column=column, dimension=K)
    for column in (
        target_artist_column,
        target_playlist_column,
        target_track_column,
        target_album_column,
    )
]

In [None]:
# Dataset API to consume data
def input_fn(batch_size, sample=None):
    """Build a shuffled, batched ``tf.data.Dataset`` from the module-level arrays.

    The features and labels are read from the module-level names
    (``playlist_tracks``, ``target_artists``, ``labels``, ...), not from
    arguments.

    Args:
        batch_size: Number of examples per batch.
        sample: Shuffle buffer size. When omitted, the buffer covers all
            ``len(labels)`` examples; when given, ``'Subset'`` is printed and
            the value is used as the buffer size. It does not limit how many
            examples the dataset yields.

    Returns:
        Dataset of ``(features_dict, label)`` batches.
    """
    features = {
        'playlist_tracks': playlist_tracks,
        'playlist_artists': playlist_artists,
        'playlist_albums': playlist_albums,
        'target_artist': target_artists,
        'target_playlist': target_playlists,
        'target_track': target_tracks,
        'target_album': target_albums,
    }
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))

    if sample is None:
        sample = len(labels)
    else:
        print('Subset')

    return dataset.shuffle(buffer_size=sample).batch(batch_size)

In [None]:
num_epochs = 20
batch_size = 126

# Wide & Deep regressor: the linear part uses the playlist indicator columns,
# the DNN part the target embeddings; checkpoints are written to tmp/model.
model = tf.estimator.DNNLinearCombinedRegressor(
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50],
    model_dir='tmp/model',
)


def _train_input_fn():
    """Training input using the module-level ``batch_size``."""
    return input_fn(batch_size)


# input_fn() does not repeat its dataset, so each train() call presumably
# amounts to one pass over the training data -- TODO confirm against the
# Estimator.train documentation.
for epoch in range(num_epochs):
    model.train(input_fn=_train_input_fn)
    print('Done epoch', epoch)

## Predict

In [None]:
def dcg(y, y_hat):
    """Discounted cumulative gain of a ranked recommendation list.

    The prediction at rank ``i`` (1-based) contributes ``1 / log2(i + 1)``
    when it is one of the relevant items and nothing otherwise, so hits near
    the top of the ranking count more.

    Args:
        y: Relevant (ground-truth) items, in any order.
        y_hat: Predicted items ordered from best to worst.

    Returns:
        The DCG as a float; ``0.0`` when ``y_hat`` is empty or has no hits.
    """
    # Relevance must follow the order of the *predictions*: the discount
    # belongs to the rank at which the model placed an item, not to the
    # item's position in the ground-truth list.
    rel = np.isin(np.asarray(y_hat), y).astype(float)
    metric = np.sum(rel / np.log2(np.arange(2, rel.size + 2)))
    return metric

In [None]:
# Load model
# Rebuild the estimator with the same columns and architecture, pointing at
# the training `model_dir` (presumably so the checkpoints saved there are
# picked up -- TODO confirm).
model = tf.estimator.DNNLinearCombinedRegressor(
    linear_feature_columns=wide_columns,
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[100, 50],
    model_dir='tmp/model',
)

In [None]:
# Generate target data
def predict(model, df, playlist_id, ground_truth, top_n=10):
    """Rank candidate tracks for one playlist and return the best ``top_n``.

    The candidates are the held-out tracks in ``ground_truth`` plus up to
    three times as many randomly drawn other tracks from ``df``. All of them
    are scored by ``model`` and sorted by descending score.

    Relies on the module-level ``prepare_dataset`` and ``track_to_int``.

    Args:
        model: Trained estimator exposing ``predict(input_fn=...)`` whose
            results carry a ``'predictions'`` entry.
        df: Frame with the track metadata columns of the whole data set.
        playlist_id: Id of the playlist being evaluated.
        ground_truth: Held-out rows of that playlist (same columns as ``df``).
        top_n: Number of predictions to return.

    Returns:
        ``(predictions, ground_truth_ids)``: the integer track ids of the
        ``top_n`` best-scored candidates, best first, and the integer track
        ids of the ground-truth rows.
    """
    track_columns = ['albums_name', 'albums_uri', 'artists_name', 'artists_uri', 'tracks_name', 'tracks_uri']
    candidates = df[track_columns].drop_duplicates()

    # Keep ground-truth tracks out of the random pool; otherwise the same
    # track could be scored twice and take two places in the ranking.
    candidates = candidates[~candidates['tracks_uri'].isin(ground_truth['tracks_uri'])]

    n = ground_truth.shape[0]
    # sample() raises when asked for more rows than are available.
    candidates = candidates.sample(min(3 * n, len(candidates)))
    candidates = candidates.assign(playlist_ids=playlist_id)

    # NOTE(review): the playlist context features are derived from these
    # candidate rows (ground truth included), not from the playlist's
    # training tracks -- confirm that this is intended.
    data = prepare_dataset(pd.concat([candidates, ground_truth]))
    target_tracks = data['target_tracks']

    batch_size = 64

    def _predict_input_fn():
        # The module-level input_fn() reads the *training* arrays and
        # shuffles them, so it can neither score these candidates nor keep
        # results aligned with `target_tracks`. Feed the freshly prepared
        # features instead, unshuffled.
        features = {
            'playlist_tracks': data['playlist_tracks'],
            'playlist_artists': data['playlist_artists'],
            'playlist_albums': data['playlist_albums'],
            'target_artist': data['target_artists'],
            'target_playlist': data['target_playlists'],
            'target_track': data['target_tracks'],
            'target_album': data['target_albums'],
        }
        return tf.data.Dataset.from_tensor_slices(features).batch(batch_size)

    results = model.predict(input_fn=_predict_input_fn)
    scored = [
        (track, result['predictions'][0])
        for track, result in zip(target_tracks, results)
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    predictions = [track for track, _ in scored[:top_n]]

    ground_truth = ground_truth['tracks_uri'].map(lambda x: track_to_int[x]).tolist()

    return predictions, ground_truth

In [None]:
# Test model
max_eval = 10
metric = 0.
N = test_df['playlist_ids'].nunique()
print_every_n = 2

# Only the first `max_eval` test playlists are scored, so every average below
# is taken over the playlists actually evaluated rather than over all N.
eval_playlists = test_df['playlist_ids'].unique()[:max_eval]
n_eval = len(eval_playlists)

for i, playlist_id in enumerate(eval_playlists):
    if i % print_every_n == 0:
        print('Processing playlist {} / {}'.format(i+1, n_eval))
        # `i` playlists have been scored so far; nothing to average before the first.
        print('Mean DCG:', (metric / i) if i else 0.)
    ground_truth = test_df[ test_df['playlist_ids'] == playlist_id ]
    y_hat, y = predict(model, df, playlist_id, ground_truth)
    metric += dcg(y, y_hat)

# Guard against an empty test set.
if n_eval:
    metric /= n_eval

print('Mean DCG:', metric)