## Demoing Doc2Vec

### Imports

In [1]:
import ast

import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

### Loading data

In [2]:
# Location of the sample dataset, relative to this notebook
file_path = '../data/sample_data.csv'

# The column names are supplied here, so no line of the file is used as a header
df = pd.read_csv(
    file_path,
    header=None,
    names=['tag', 'stream'],
)

#### Manipulating input dataset

In [3]:
# Casting each tag to string
df['tag'] = df['tag'].astype(str)

# Parsing each serialized stream into a list and casting every element to string.
# ast.literal_eval only accepts Python literals, so (unlike eval) it cannot execute
# code embedded in the CSV. Values that are no longer strings (i.e. already parsed)
# are passed through unchanged, which keeps this cell safe to re-run.
df['stream'] = df['stream'].apply(
    lambda stream: [
        str(el)
        for el in (ast.literal_eval(stream) if isinstance(stream, str) else stream)
    ]
)

In [4]:
# Previewing the first three rows of the prepared dataframe
df.head(n=3)

Unnamed: 0,tag,stream
0,677994,"[219345042, 172703514, 184153266, 56192185, 52..."
1,767275,"[96570421, 26516210, 50903853, 26516210, 27729..."
2,786423,"[187228547, 2791348, 35155700, 2791348, 351557..."


### Preparing training data

In [5]:
# Wrapping every stream in a TaggedDocument labelled with its single tag
tagged_streams = [
    TaggedDocument(words, [doc_tag])
    for doc_tag, words in zip(df['tag'], df['stream'])
]

### Defining the model

In [6]:
# Creating the Doc2Vec model; no corpus is passed here, the vocabulary and
# training steps are run explicitly in the following cells
d2v_model = Doc2Vec(
    vector_size=20,  # length of each embedding (the vectors printed below have 20 values)
    window=5,
    min_count=2,
    negative=10,
)

In [7]:
# Initializing the model vocabulary from the tagged streams
d2v_model.build_vocab(tagged_streams)

### Training

In [8]:
# Training on the tagged streams for 10 epochs; the number of examples is
# taken from the model's own corpus_count
d2v_model.train(
    tagged_streams,
    total_examples=d2v_model.corpus_count,
    epochs=10,
)

### Extracting output

In [9]:
# Collecting every item present in the model's word vocabulary
item_ls = list(d2v_model.wv.vocab)

# Collecting the document tags known to the model
tag_ls = d2v_model.docvecs.offset2doctag

In [10]:
# Looking up the learned embedding of every item
item_vector_ls = [d2v_model[item_key] for item_key in item_ls]

# Looking up the learned embedding of every tag
tag_vector_ls = [d2v_model.docvecs[doc_tag] for doc_tag in tag_ls]

In [11]:
# Showing the first item followed by its embedding, one per line
print(item_ls[0], item_vector_ls[0], sep='\n')

219345042
[-0.16691333  0.01391616  0.08182272  0.22406493 -0.13018888 -0.08907364
 -0.06540466  0.13903038  0.00235399 -0.0282561  -0.07210001  0.18075605
 -0.06638259  0.04872734 -0.30388135  0.17403638 -0.03945316  0.07126762
  0.03873265 -0.07725172]


In [12]:
# Showing the first tag followed by its embedding, one per line
print(tag_ls[0], tag_vector_ls[0], sep='\n')

677994
[-0.03200712 -0.06808201  0.09429213  0.20017318 -0.1556576   0.08129843
  0.01421014  0.02463668 -0.09386549  0.03395402  0.13053413  0.16646215
 -0.01707764  0.06892715 -0.35897225  0.20359874 -0.12954955  0.11253071
  0.01763733  0.10113616]
