In [None]:
#!pip install edward
#!pip install pathos
#!pip install more_itertools

In [2]:
import edward as ed
from tensorflow.python.training.adam import AdamOptimizer
import time, os
from math import sqrt

from gensim.models import Word2Vec
from random import shuffle
import collections
import numpy as np
import random

import tensorflow as tf
import numpy as np
from pathos.multiprocessing import Pool, cpu_count
from more_itertools import chunked
from typing import List, Callable, Union, Any
from math import ceil
from itertools import chain
import logging
from random import shuffle
import matplotlib.pyplot as plt
import pickle

import edward as ed
from edward.models import Normal, Bernoulli
from tensorflow.contrib.tensorboard.plugins import projector

In [3]:
# --- Input artefacts ---------------------------------------------------------
# Playlist/track data (loaded with read_data) and the pre-trained Word2Vec
# model that supplies the song (context) vocabulary and vectors.
in_file = 'MPD_line_sentence/playlists_ntitle_tracks_sentences_id_final'
context_emb_file = 'models/wv_model_mincount5_shuffle_size300_MPD'

# --- Output location ---------------------------------------------------------
# One time-stamped directory per run; checkpoints, summaries and pickled
# results are all written below it.
dir_name = 'uncertain_fits/fit' + time.strftime("%y_%m_%d_%H_%M_%S")

# --- Hyper-parameters --------------------------------------------------------
# ns: negative samples drawn per playlist
# K:  embedding dimensionality
# cs: maximum number of (shuffled) songs used as positives per playlist
ns, K, cs = 10, 300, 20
# mb: minibatch size; n_epochs: number of passes over the sample set
mb, n_epochs = 5000, 50

In [4]:
# Create the run directory. exist_ok=True keeps this cell re-runnable: without
# it, executing the cell twice for the same dir_name raises FileExistsError.
os.makedirs(dir_name, exist_ok=True)
# Session obtained from Edward; reused for model construction and training.
sess = ed.get_session()

In [5]:
def read_data(filename):
    """Load the playlist/track array stored at ``filename`` via ``np.load``.

    The number of loaded entries is reported at DEBUG level on the
    ``logging_songscuncert`` logger, and the loaded object is returned as-is.
    """
    loaded = np.load(filename)
    log = logging.getLogger('logging_songscuncert')
    log.debug('number of loaded playists:'+str(len(loaded)))
    return loaded


def flatten_list(listoflists):
    """Concatenate the items of every inner iterable into one flat list (one level deep)."""
    flat = []
    for inner in listoflists:
        flat.extend(inner)
    return flat


def get_optimal():
    """Return the five reference histograms used by ``is_goog_embedding``.

    Each reference is ``x**a * (1 - x)**b + 1e-3`` evaluated on 100 evenly
    spaced points of [0, 1] with ``a = 1`` and ``b`` in 7, 9, 11, 13, 15,
    normalised so its entries sum to one.
    """
    grid = np.linspace(0, 1, 100)
    alpha = 1

    def _profile(beta):
        # Unnormalised curve with a small floor so no bin is exactly zero.
        weights = np.power(grid, alpha) * np.power((1 - grid), beta) + 1e-3
        return weights / np.sum(weights)

    return [_profile(beta) for beta in range(7, 17, 2)]


def is_goog_embedding(sigmas):
    """Tell whether the distribution of ``sigmas`` is close to the reference shapes.

    ``sigmas`` is binned into a 100-bin histogram (drawn with ``plt.hist``, so
    calling this also plots on the current matplotlib axes). The smoothed,
    normalised bin counts are compared against every reference from
    ``get_optimal()`` using ``sum(ref * log(ref / distr))``; the function
    returns True when the average of those divergences is below 1e-3.
    """
    threshold = 1e-3
    references = get_optimal()
    counts = plt.hist(sigmas, bins=100, color='green', label='sigma values')[0]
    # 1e-5 keeps empty bins away from log(0).
    distr = (counts + 1e-5) / np.sum(counts)
    total = sum(-np.sum(ref * np.log(distr / ref)) for ref in references)
    return total / len(references) < threshold


def process_play_list_constructor(neg_samples:int, dictionary:dict, context_size:int, sampling_table:dict):
    """Build the worker that converts a chunk of playlists into training triples.

    Args:
        neg_samples: number of negative (label 0) triples drawn per kept playlist.
        dictionary: maps song token -> integer index; must contain ``'UNK'``,
            which is used for songs missing from the mapping.
        context_size: at most this many songs (after shuffling) are emitted as
            positive (label 1) triples per playlist.
        sampling_table: maps playlist id -> discard probability; a playlist is
            kept when that value is smaller than a fresh ``random.random()``.

    Returns:
        A function ``process_play_list(play_lists)`` that takes an iterable of
        ``(playlist_id, songs)`` pairs and returns a list of
        ``(int(playlist_id), song_index, label)`` tuples.
    """
    def process_play_list(play_lists):
        samples = []
        dictionary_keys = list(dictionary.keys())
        logger = logging.getLogger('logging_songscuncert')
        try:
            for play_list in play_lists:
                if sampling_table[play_list[0]] < random.random():
                    playlist_id = int(play_list[0])
                    songs = play_list[1]
                    # NOTE: shuffles the playlist's song list in place.
                    shuffle(songs)
                    for song in songs[:context_size]:
                        if song not in dictionary:
                            song = 'UNK'
                        samples.append((playlist_id, dictionary[song], 1))
                    # Negatives are drawn uniformly over the whole dictionary,
                    # so they may coincide with a positive or with 'UNK'.
                    for _ in range(neg_samples):
                        samples.append((playlist_id, dictionary[random.choice(dictionary_keys)], 0))
        except Exception as e:
            # Best effort: report the failure and return what was collected so
            # far. The message is built with %s formatting because
            # concatenating a str with the exception object raises TypeError.
            logger.error('error %s', e)
        return samples

    return process_play_list


def apply_parallel(func: Callable,
                   data: List[Any],
                   cpu_cores: int = None) -> List[Any]:
    """
    Apply ``func`` to chunks of ``data`` in a process pool.

    ``data`` is split into ``cpu_cores`` chunks of (almost) equal size and
    ``func`` is called once per chunk, so ``func`` must accept a list of
    elements. The chunk size is determined automatically.

    Args:
        func: callable applied to each chunk.
        data: sized collection of elements to process.
        cpu_cores: number of worker processes; defaults to ``cpu_count()``.

    Returns:
        A list with one result per chunk, in chunk order. Empty input yields
        an empty list without starting a pool.

    Raises:
        Whatever ``func`` or the pool raises; errors are no longer masked by a
        ``return`` inside ``finally``.
    """
    # len() instead of truthiness because data may be a NumPy array.
    if len(data) == 0:
        return []
    if not cpu_cores:
        cpu_cores = cpu_count()

    chunk_size = ceil(len(data) / cpu_cores)
    # Create the pool before entering try so that `finally` never touches an
    # unbound name when construction itself fails.
    pool = Pool(cpu_cores)
    try:
        return pool.map(func, chunked(data, chunk_size), chunksize=1)
    finally:
        pool.close()
        pool.join()


def variable_summaries(summary_name, var):
    """Register mean, stddev, max and min scalar summaries for ``var``.

    All summary ops are created inside ``tf.name_scope(summary_name)``.
    Nothing is returned; the summaries are registered through
    ``tf.summary.scalar``.
    """
    with tf.name_scope(summary_name):
        mean = tf.reduce_mean(var)
        tf.summary.scalar('mean', mean)
        # Standard deviation computed by hand as sqrt(mean((var - mean)^2)).
        with tf.name_scope('stddev'):
            stddev = tf.sqrt(tf.reduce_mean(tf.square(var - mean)))
        tf.summary.scalar('stddev', stddev)
        tf.summary.scalar('max', tf.reduce_max(var))
        tf.summary.scalar('min', tf.reduce_min(var))


def get_logger():
    """Return the ``logging_songscuncert`` logger configured for console output.

    Sets both the root logger and the named logger to DEBUG and attaches a
    console ``StreamHandler`` that formats records as
    ``asctime;levelname;message``.

    The handler is attached only once: calling this function again (for
    example when a notebook cell is re-executed) returns the same logger
    without adding another handler, so messages are not printed repeatedly.
    """
    logging.getLogger().setLevel(logging.DEBUG)

    logger = logging.getLogger("logging_songscuncert")
    logger.setLevel(logging.DEBUG)

    # Exact type check: FileHandler subclasses StreamHandler and must not be
    # mistaken for the console handler.
    has_console = any(type(handler) is logging.StreamHandler for handler in logger.handlers)
    if not has_console:
        ch = logging.StreamHandler()
        ch.setLevel(logging.DEBUG)
        ch.setFormatter(logging.Formatter("%(asctime)s;%(levelname)s;%(message)s"))
        logger.addHandler(ch)
    return logger

In [6]:
class bayessian_bern_emb_data():
    """Training data for the Bayesian Bernoulli embedding model.

    Loads the song vocabulary and vectors from a Word2Vec model, reads the
    playlist data, converts it into ``(playlist, song, label)`` samples and
    serves them as minibatches / feed dictionaries.

    Attributes set by the constructor:
        ns, K, cs, dir_name: stored as given.
        dictionary: song token -> index, with ``'UNK'`` at index 0.
        L_context: size of ``dictionary``.
        L_target: number of distinct playlist ids in the input.
        pretreained_context_embeddings: zero row for ``'UNK'`` followed by the
            Word2Vec vectors.
        pretreained_target_embeddings: array of target vectors, or None when
            no target embedding file is given.
        sampling_table: playlist id -> discard probability.
        samples, playlists, songs, labels: the shuffled training samples and
            their three columns as NumPy arrays.
        N: number of samples.
        batch: minibatch generator used by ``feed``; created lazily on the
            first ``feed`` call unless the caller assigns one beforehand.
    """

    def __init__(self, input_file, target_emb_file, context_emb_file, ns, K, cs, dir_name):
        self.ns = ns
        self.K = K
        self.cs = cs
        self.dir_name = dir_name
        # Created on first use by feed(); callers may also assign it directly.
        self.batch = None
        if target_emb_file:
            self.load_target_embeddings(target_emb_file)
        else:
            self.pretreained_target_embeddings = None
        self.load_context_embeddings(context_emb_file)
        songs_and_tracks = read_data(input_file)
        self.build_dataset(songs_and_tracks)
        self.N = len(self.playlists)

    def parallel_process_text(self, data):
        """Turn playlists into a flat list of samples using a process pool."""
        process_text = process_play_list_constructor(self.ns, self.dictionary, self.cs, self.sampling_table)
        return flatten_list(apply_parallel(process_text, data))

    def build_dataset(self, songs_and_tracks):
        """Build the sampling table and the shuffled sample arrays.

        ``songs_and_tracks`` is an iterable of ``(playlist_id, songs)`` pairs.
        """
        raw_playlists, _ = zip(*songs_and_tracks)
        count_playlists = collections.Counter(raw_playlists)
        self.L_target = len(count_playlists.keys())
        self.build_sampling_table(count_playlists)
        self.samples = self.parallel_process_text(songs_and_tracks)
        shuffle(self.samples)
        playlists, songs, labels = zip(*self.samples)
        self.playlists = np.array(list(playlists))
        self.songs = np.array(list(songs))
        self.labels = np.array(list(labels))

    def batch_generator(self, n_minibatch):
        """Yield ``(playlists, songs, labels)`` minibatches of size ``n_minibatch`` forever.

        Batches are consecutive slices of the sample arrays; when fewer than
        ``n_minibatch`` samples remain, the full arrays are appended again so
        a batch may straddle two passes over the data.

        Raises:
            ValueError: on the first ``next()`` if there are no samples (the
                refill loop could otherwise never terminate).
        """
        batch_size = n_minibatch
        if self.playlists.shape[0] == 0:
            raise ValueError('cannot build minibatches from an empty sample set')
        data_target = self.playlists
        data_context = self.songs
        data_labels = self.labels
        while True:
            # Refill until one full batch is available (may take several
            # rounds when the data set is smaller than the batch size).
            while data_target.shape[0] < batch_size:
                data_target = np.hstack([data_target, self.playlists])
                data_context = np.hstack([data_context, self.songs])
                data_labels = np.hstack([data_labels, self.labels])
            play_lists = data_target[:batch_size]
            songs = data_context[:batch_size]
            labels = data_labels[:batch_size]
            data_target = data_target[batch_size:]
            data_context = data_context[batch_size:]
            data_labels = data_labels[batch_size:]
            yield play_lists, songs, labels

    def feed(self, n_minibatch, target_placeholder, context_placeholder, labels_placeholder,
             ones_placeholder, zeros_placeholder, shuffling = False):
        """Return the feed dictionary for the next minibatch.

        If no generator has been assigned to ``self.batch`` yet, one is
        created with ``batch_generator(n_minibatch)``; later calls keep using
        that generator regardless of the ``n_minibatch`` passed.

        With ``shuffling=True`` the labels of the batch are randomly permuted
        before being fed.
        """
        if getattr(self, 'batch', None) is None:
            self.batch = self.batch_generator(n_minibatch)
        play_lists, songs, labels = next(self.batch)
        if shuffling:
            labels = np.random.permutation(labels)
        return {target_placeholder: play_lists,
                context_placeholder: songs,
                labels_placeholder: labels,
                ones_placeholder: np.ones((n_minibatch), dtype=np.int32),
                zeros_placeholder: np.zeros((n_minibatch), dtype=np.int32)
                }

    def load_target_embeddings(self, emb_file):
        """Load target vectors from a Word2Vec model, looked up by the keys ``'0'``, ``'1'``, ..."""
        w2v_model = Word2Vec.load(emb_file)
        target_embeddings = []
        for elem in range(len(w2v_model.wv.vectors)):
            target_embeddings.append(w2v_model.wv.word_vec(str(elem)))
        self.pretreained_target_embeddings = np.array(target_embeddings)

    def load_context_embeddings(self, emb_file):
        """Load the song vocabulary and vectors from a Word2Vec model.

        Every Word2Vec index is shifted by one so that index 0 is free for
        ``'UNK'``, whose embedding is a row of zeros.
        """
        w2v_model = Word2Vec.load(emb_file)
        vocabulary = w2v_model.wv.vocab
        self.dictionary = {'UNK': 0}
        for song in vocabulary:
            self.dictionary[song] = vocabulary[song].index + 1
        self.L_context = len(self.dictionary)
        self.pretreained_context_embeddings = np.zeros((1, self.K), dtype=np.float32).tolist()
        self.pretreained_context_embeddings.extend(w2v_model.wv.vectors)

    def build_sampling_table(self, count_playlists):
        """Compute the per-playlist discard probability.

        For relative frequency ``f`` and ``t = 1e-3`` the stored value is
        ``max(0, (f - t) / f - sqrt(t / f))``.
        """
        sampling_factor = 1e-3
        sampling_table = dict()
        total_occurrences = sum(count_playlists.values())
        for playlist in count_playlists:
            playlist_frequency = (1. * count_playlists[playlist]) / total_occurrences
            sampling_table[playlist] = max(0., ((playlist_frequency - sampling_factor) / playlist_frequency) - sqrt(
                sampling_factor / playlist_frequency))
        self.sampling_table = sampling_table

In [7]:
# Build the dataset; no target embedding file is supplied, so the playlist
# (target) embeddings are not pre-trained.
d = bayessian_bern_emb_data(
    input_file=in_file,
    target_emb_file=None,
    context_emb_file=context_emb_file,
    ns=ns,
    K=K,
    cs=cs,
    dir_name=dir_name,
)

In [8]:
class bayesian_emb_model():
    """Bayesian Bernoulli embedding model built with Edward on TensorFlow.

    The constructor builds the whole graph: placeholders, standard-normal
    priors ``U`` (targets) and ``V`` (contexts), Bernoulli likelihoods for the
    positive and negative examples of a minibatch, the variational family
    ``qU`` / ``qV`` and an ``ed.KLqp`` inference object. It also initialises
    all variables and creates the summary writer, saver and projector config.

    Args:
        d: data object providing ``K``, ``L_target``, ``L_context``,
            ``pretreained_target_embeddings`` and
            ``pretreained_context_embeddings``.
        n_minibatch: fixed length of the labels placeholder.
        sess: TensorFlow session used for initialisation and evaluation.
        logdir: directory for TensorBoard summaries and projector config.
    """

    def __init__(self, d, n_minibatch, sess, logdir):
        self.K = d.K
        self.sess = sess
        self.logdir = logdir

        with tf.name_scope('model'):
            # Data Placeholder
            with tf.name_scope('input'):
                self.target_placeholder = tf.placeholder(tf.int32)
                self.context_placeholder = tf.placeholder(tf.int32)
                self.labels_placeholder = tf.placeholder(tf.int32, shape=[n_minibatch])
                self.ones_placeholder = tf.placeholder(tf.int32)
                self.zeros_placeholder = tf.placeholder(tf.int32)

            # Standard-normal priors over target (U) and context (V) embeddings
            with tf.name_scope('priors'):
                self.U = Normal(loc=tf.zeros((d.L_target, self.K), dtype=tf.float32),
                                scale=tf.ones((d.L_target, self.K), dtype=tf.float32))
                self.V = Normal(loc=tf.zeros((d.L_context, self.K), dtype=tf.float32),
                                scale=tf.ones((d.L_context, self.K), dtype=tf.float32))

        with tf.name_scope('natural_param'):
            # Target and context indices of the positive (label == 1) examples
            with tf.name_scope('target_word'):
                pos_indexes = tf.where(
                    tf.equal(self.labels_placeholder, tf.ones(self.labels_placeholder.shape, dtype=tf.int32)))
                pos_words = tf.gather(self.target_placeholder, pos_indexes)
                self.p_rhos = tf.nn.embedding_lookup(self.U, pos_words)
                pos_contexts = tf.gather(self.context_placeholder, pos_indexes)
                self.pos_ctx_alpha = tf.nn.embedding_lookup(self.V, pos_contexts)

            # Same lookups for the negative (label == 0) examples
            with tf.name_scope('negative_samples'):
                neg_indexes = tf.where(
                    tf.equal(self.labels_placeholder, tf.zeros(self.labels_placeholder.shape, dtype=tf.int32)))
                neg_words = tf.gather(self.target_placeholder, neg_indexes)
                self.n_rho = tf.nn.embedding_lookup(self.U, neg_words)
                neg_contexts = tf.gather(self.context_placeholder, neg_indexes)
                self.neg_ctx_alpha = tf.nn.embedding_lookup(self.V, neg_contexts)

            # Natural parameter: inner product of target and context embeddings
            self.p_eta = tf.reduce_sum(tf.multiply(self.p_rhos, self.pos_ctx_alpha), -1)
            self.n_eta = tf.reduce_sum(tf.multiply(self.n_rho, self.neg_ctx_alpha), -1)

        # NOTE(review): the observations fed for y_pos / y_neg are vectors of
        # length n_minibatch (see the data object's feed()), while these logits
        # only cover the positive / negative subset of the batch; confirm the
        # resulting broadcasting in the log-likelihood is intended.
        self.y_pos = Bernoulli(logits=self.p_eta)
        self.y_neg = Bernoulli(logits=self.n_eta)

        # INFERENCE
        # One trainable scale parameter per row, tiled across the K dimensions
        # and passed through softplus to keep it positive.
        self.sigU = tf.nn.softplus(
            tf.matmul(tf.get_variable("sigU", shape=(d.L_target, 1), initializer=tf.ones_initializer()), tf.ones([1, self.K])),
            name="sigmasU")
        self.sigV = tf.nn.softplus(
            tf.matmul(tf.get_variable("sigV", shape=(d.L_context, 1), initializer=tf.ones_initializer()), tf.ones([1, self.K])),
            name="sigmasV")
        self.locU = tf.get_variable("qU/loc", [d.L_target, self.K], initializer=tf.zeros_initializer())
        #self.locV = tf.get_variable("qV/loc", [d.L_context, self.K], initializer=tf.zeros_initializer())

        # Target means are trainable unless pre-trained vectors are supplied;
        # context means are always the fixed pre-trained vectors.
        if d.pretreained_target_embeddings is not None:
            self.qU = Normal(loc=d.pretreained_target_embeddings, scale=self.sigU)
        else:
            self.qU = Normal(loc=self.locU, scale=self.sigU)
        self.qV = Normal(loc=d.pretreained_context_embeddings, scale=self.sigV)

        self.inference = ed.KLqp({self.U: self.qU, self.V: self.qV},
                                 data={self.y_pos: self.ones_placeholder,
                                       self.y_neg: self.zeros_placeholder
                                       })
        with self.sess.as_default():
            tf.global_variables_initializer().run()
        self.summaries = tf.summary.merge_all()
        self.train_writer = tf.summary.FileWriter(self.logdir, self.sess.graph)
        self.saver = tf.train.Saver()
        config = projector.ProjectorConfig()

        # NOTE(review): no variable named 'qV/loc' is created in this
        # constructor (its definition above is commented out), so the second
        # projector entry may not resolve; verify in TensorBoard.
        alpha = config.embeddings.add()
        alpha.tensor_name = 'qU/loc'
        alpha.metadata_path = '../vocab_alpha.tsv'
        rho = config.embeddings.add()
        rho.tensor_name = 'qV/loc'
        rho.metadata_path = '../vocab_rho.tsv'
        projector.visualize_embeddings(self.train_writer, config)

    def dump(self, fname, data):
        """Pickle the variational means and scales to ``fname``.

        The stored dict has the keys ``'rhos'`` (``qU.loc``), ``'alpha'``
        (``qV.loc``), ``'sigma_rhos'`` and ``'sigma_alphas'`` (first column of
        ``sigU`` / ``sigV``). ``data`` is accepted for interface compatibility
        and is not used.
        """
        with self.sess.as_default():
            dat = {'rhos': self.qU.loc.eval(),
                   'alpha': self.qV.loc.eval(),
                   'sigma_rhos': self.sigU.eval()[:, 0],
                   'sigma_alphas': self.sigV.eval()[:, 0]}
        # Context manager guarantees the file is flushed and closed.
        with open(fname, "wb") as handle:
            pickle.dump(dat, handle)

    def build_words_list(self, labels, list_length):
        """Pad ``labels`` in place with empty strings up to ``list_length`` and return it.

        Lists that are already long enough are returned unchanged.
        """
        if len(labels) < list_length:
            empty_list = [''] * (list_length - len(labels))
            labels.extend(empty_list)
        return labels


In [None]:
# Minibatch stream that d.feed() pulls from during training.
d.batch = d.batch_generator(n_minibatch=mb)
# Build the model graph; summaries are written to the run directory.
m = bayesian_emb_model(d=d, n_minibatch=mb, sess=sess, logdir=dir_name)

In [None]:
def get_n_iters(n_samples=None, batch_size=None, epochs=None):
    """Return ``(total_iterations, batches_per_epoch)`` for training.

    Args:
        n_samples: number of training samples; defaults to ``len(d.playlists)``.
        batch_size: minibatch size; defaults to the notebook-level ``mb``.
        epochs: number of passes over the data; defaults to ``n_epochs``.

    Returns:
        A tuple of ints: ``batches_per_epoch * epochs`` and
        ``batches_per_epoch``, where a final partial batch counts as one batch.
    """
    # Fall back to the notebook globals only when a value is not supplied, so
    # the function can also be used (and checked) without that state.
    if n_samples is None:
        n_samples = len(d.playlists)
    if batch_size is None:
        batch_size = mb
    if epochs is None:
        epochs = n_epochs
    # Integer ceiling division; avoids the float round trip of `/` + int().
    n_batches = -(-n_samples // batch_size)
    return n_batches * epochs, n_batches

In [1]:
# TRAINING
# Runs KLqp updates for n_iters minibatches. At iteration 0 and every 10000th
# iteration after it the model is checkpointed, the current target sigmas are
# appended to sigmas.dat, and training stops early once is_goog_embedding()
# accepts their distribution.
sigmas_list = list()
n_iters, n_batches = get_n_iters()

# NOTE(review): kl_scaling is keyed by the observed variables (y_pos / y_neg)
# here; check the Edward KLqp documentation for which variables it expects.
m.inference.initialize(n_samples=1, n_iter=n_iters, logdir=m.logdir,
                       scale={m.y_pos: n_batches, m.y_neg: n_batches / ns},
                       kl_scaling={m.y_pos: n_batches, m.y_neg: n_batches / ns},
                       optimizer=AdamOptimizer(learning_rate=0.001)
                       )
# Re-run the initializer after initialize() has been called.
init = tf.global_variables_initializer()
sess.run(init)
for i in range(m.inference.n_iter):
    # NOTE(review): shuffling=True makes feed() randomly permute the labels of
    # every minibatch; confirm this is intended for the run being produced.
    info_dict = m.inference.update(feed_dict=d.feed(mb, m.target_placeholder,
                                                    m.context_placeholder,
                                                    m.labels_placeholder,
                                                    m.ones_placeholder,
                                                    m.zeros_placeholder,
                                                    True))
    m.inference.print_progress(info_dict)
    if i % 10000 == 0:
        m.saver.save(sess, os.path.join(m.logdir, "model.ckpt"), i)
        sigmas = m.sigU.eval()[:, 0]
        sigmas_list.append(sigmas)
        # Context manager closes the file; the bare open() used previously
        # left one handle open per checkpoint.
        with open(dir_name + "/sigmas.dat", "wb") as sigmas_file:
            pickle.dump(sigmas_list, sigmas_file)
        if is_goog_embedding(sigmas):
            break
# Final checkpoint and dump of the variational parameters.
m.saver.save(sess, os.path.join(m.logdir, "model.ckpt"), i)
m.dump(dir_name + "/variational.dat", d)

NameError: name 'get_n_iters' is not defined