In [2]:
import logging

# Emit INFO-level log records, each prefixed with a timestamp and its level
# (the gensim progress lines shown further down are produced this way).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [5]:
import os
import gensim
import pickle
import pandas as pd
import json
import random

In [6]:
# Load the training playlist names. The context manager closes the file
# handle as soon as the JSON has been parsed instead of leaving it open for
# the lifetime of the kernel; `train` remains bound (to the closed file).
with open(r"playlist_name_train.json") as train:
    train_names = json.load(train)

In [7]:
# Load the test playlist names. The context manager closes the file handle
# as soon as the JSON has been parsed instead of leaving it open for the
# lifetime of the kernel; `test` remains bound (to the closed file).
with open(r"playlist_name_test_1k.json") as test:
    test_names = json.load(test)

In [9]:
# Number of entries loaded from the training JSON file
len(train_names)

92944

In [10]:
# Number of entries loaded from the test JSON file
len(test_names)

862

In [11]:
# Keep only the entries whose string form is not 'nan'
# (presumably NaN placeholders for missing names -- verify against the data).
train_names = [name for name in train_names if str(name) != 'nan']

In [12]:
# Keep only the entries whose string form is not 'nan'
# (presumably NaN placeholders for missing names -- verify against the data).
# The later cells read `test_names`, so it is rebound to the cleaned list;
# `test_data` is kept as an alias for the same list.
test_data = [name for name in test_names if str(name) != 'nan']
test_names = test_data

In [14]:
# Preview the first ten training names
print(train_names[:10])

['Happy happy!!', 'Shameless', 'State Champs', 'Classic rock ', 'TURN UP :$', 'bAda$$', 'Half Marathon', 'settle down', 'HANGOVER', 'gLoBaL ']


In [15]:
# Preview the first ten test names
print(test_names[:10])

['My Jams', 'Neo Soul', 'mix ', 'May Flowers', 'Pool Time', 'Fall 2014', 'dance playlist', 'February 2016', 'cry', 'Basics']


In [16]:
def preprocess(data, tag = True):
    """Lazily tokenise an iterable of playlist names.

    Each name is passed through ``gensim.utils.simple_preprocess``.

    Parameters
    ----------
    data : iterable of str
        Raw names, consumed in order.
    tag : bool, default True
        When true, every token list is wrapped in a
        ``gensim.models.doc2vec.TaggedDocument`` whose single tag is the
        name's zero-based position in ``data`` (the form used for training).
        When false, the bare token list is yielded.

    Yields
    ------
    TaggedDocument or list of str
        One item per input name, in input order.
    """
    position = 0
    for raw_name in data:
        words = gensim.utils.simple_preprocess(raw_name)
        if not tag:
            yield words
        else:
            # The position doubles as the document tag, so a tag can later
            # be used to index back into the original sequence.
            yield gensim.models.doc2vec.TaggedDocument(words, [position])
        position += 1

In [17]:
# Materialise both generators: training names become TaggedDocuments,
# test names stay as plain token lists.
train_corpus = [*preprocess(train_names, tag=True)]
test_corpus = [*preprocess(test_names, tag=False)]

In [20]:
# Inspect the first ten tagged training documents
print(train_corpus[:10])

[TaggedDocument(words=['happy', 'happy'], tags=[0]), TaggedDocument(words=['shameless'], tags=[1]), TaggedDocument(words=['state', 'champs'], tags=[2]), TaggedDocument(words=['classic', 'rock'], tags=[3]), TaggedDocument(words=['turn', 'up'], tags=[4]), TaggedDocument(words=['bada'], tags=[5]), TaggedDocument(words=['half', 'marathon'], tags=[6]), TaggedDocument(words=['settle', 'down'], tags=[7]), TaggedDocument(words=['hangover'], tags=[8]), TaggedDocument(words=['global'], tags=[9])]


In [21]:
# Inspect the first ten tokenised test names
print(test_corpus[:10])

[['my', 'jams'], ['neo', 'soul'], ['mix'], ['may', 'flowers'], ['pool', 'time'], ['fall'], ['dance', 'playlist'], ['february'], ['cry'], ['basics']]


In [22]:
# Doc2Vec model: 100-dimensional vectors, min_count=1 so that no word is
# discarded for being rare, and 100 training epochs.
model = gensim.models.doc2vec.Doc2Vec(
    vector_size=100,
    min_count=1,
    epochs=100,
)

In [23]:
# Scan the tagged training corpus to collect the words and tags the model will learn
model.build_vocab(train_corpus)

2021-01-15 11:46:38,758 : INFO : collecting all words and their counts
2021-01-15 11:46:38,759 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2021-01-15 11:46:38,780 : INFO : PROGRESS: at example #10000, processed 15207 words (747151/s), 4494 word types, 10000 tags
2021-01-15 11:46:38,802 : INFO : PROGRESS: at example #20000, processed 30329 words (715143/s), 6502 word types, 20000 tags
2021-01-15 11:46:38,826 : INFO : PROGRESS: at example #30000, processed 45711 words (661699/s), 7643 word types, 30000 tags
2021-01-15 11:46:38,853 : INFO : PROGRESS: at example #40000, processed 61116 words (608522/s), 8425 word types, 40000 tags
2021-01-15 11:46:38,881 : INFO : PROGRESS: at example #50000, processed 76275 words (570848/s), 8979 word types, 50000 tags
2021-01-15 11:46:38,900 : INFO : PROGRESS: at example #60000, processed 91611 words (812503/s), 9347 word types, 60000 tags
2021-01-15 11:46:38,922 : INFO : PROGRESS: at example #70000, processed 106960 wo

In [24]:
# Train on the tagged corpus, taking the example count and the number of
# epochs from the model itself so they stay in sync with its configuration.
model.train(
    train_corpus,
    total_examples=model.corpus_count,
    epochs=model.epochs,
)

2021-01-15 11:46:48,094 : INFO : training model with 3 workers on 10094 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2021-01-15 11:46:49,285 : INFO : EPOCH 1 - PROGRESS: at 28.32% examples, 51725 words/s, in_qsize 5, out_qsize 0
2021-01-15 11:46:50,436 : INFO : EPOCH 1 - PROGRESS: at 70.47% examples, 65442 words/s, in_qsize 5, out_qsize 0
2021-01-15 11:46:50,598 : INFO : worker thread finished; awaiting finish of 2 more threads
2021-01-15 11:46:50,874 : INFO : worker thread finished; awaiting finish of 1 more threads
2021-01-15 11:46:50,878 : INFO : worker thread finished; awaiting finish of 0 more threads
2021-01-15 11:46:50,879 : INFO : EPOCH - 1 : training on 142016 raw words (216738 effective words) took 2.8s, 78058 effective words/s
2021-01-15 11:46:52,463 : INFO : EPOCH 2 - PROGRESS: at 28.24% examples, 38934 words/s, in_qsize 5, out_qsize 0
2021-01-15 11:46:53,703 : INFO : EPOCH 2 - PROGRESS: at 70.40% examples, 54082 words/s, in_qsize 5, out_qsi

In [25]:
# File name under which the trained model is stored
fname = "doc2vecmodel"

In [26]:
# Persist the trained model to disk so it can be reloaded without retraining
model.save(fname)

2021-01-15 11:53:18,796 : INFO : saving Doc2Vec object under doc2vecmodel, separately None
2021-01-15 11:53:19,371 : INFO : saved doc2vecmodel


In [33]:
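# Optional: reload the saved model from disk instead of retraining.
# Note that this binds `model1`, whereas the cells below use `model`.
#model1 = gensim.models.doc2vec.Doc2Vec.load(fname)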

2021-01-15 12:04:13,247 : INFO : loading Doc2Vec object from doc2vecmodel
2021-01-15 12:04:13,657 : INFO : loading vocabulary recursively from doc2vecmodel.vocabulary.* with mmap=None
2021-01-15 12:04:13,657 : INFO : loading trainables recursively from doc2vecmodel.trainables.* with mmap=None
2021-01-15 12:04:13,658 : INFO : loading wv recursively from doc2vecmodel.wv.* with mmap=None
2021-01-15 12:04:13,658 : INFO : loading docvecs recursively from doc2vecmodel.docvecs.* with mmap=None
2021-01-15 12:04:13,659 : INFO : loaded doc2vecmodel


In [27]:
# Pick a random document from the test corpus and infer a vector from the model
doc_id = random.randint(0, len(test_corpus) - 1)
query_tokens = test_corpus[doc_id]
inferred_vector = model.infer_vector(query_tokens)

# Rank every trained document vector by similarity to the inferred vector;
# each entry of `sims` is a (tag, similarity) pair.
sims = model.docvecs.most_similar([inferred_vector], topn=len(model.docvecs))

# Compare and print the most/median/least similar documents from the train corpus
print('Test Document ({}): «{}»\n'.format(doc_id, ' '.join(query_tokens)))
print(u'SIMILAR/DISSIMILAR DOCS PER MODEL %s:\n' % model)
last_rank = len(sims) - 1
for label, index in (('MOST', 0), ('MEDIAN', len(sims)//2), ('LEAST', last_rank)):
    match = sims[index]
    # Tags are positions in train_corpus, so the tag indexes the matched document.
    matched_words = train_corpus[match[0]].words
    print(u'%s %s: «%s»\n' % (label, match, ' '.join(matched_words)))

2021-01-15 11:55:05,116 : INFO : precomputing L2-norms of doc weight vectors


Test Document (401): «retro»

SIMILAR/DISSIMILAR DOCS PER MODEL Doc2Vec(dm/m,d100,n5,w5,s0.001,t3):

MOST (49423, 0.9465434551239014): «retro»

MEDIAN (36102, 0.4347231388092041): «aug»

LEAST (51360, -0.4998263418674469): «red bull editions edm vegas playlist»

