# Doc2Vec (Figures 3, 4 and Tables 3, 4)

Adapted (partially) from the gensim Doc2Vec IMDB notebook: https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [1]:
import json
import re
from collections import namedtuple

import gensim

# Stop words chosen by the notebook author; they are removed from every tweet.
# (str.split() with no separator never produces '', so that entry is inert.)
STOPWORDS_ = ['re', '-', '']

# One training example: its tokens plus every tag attached to it.
Document = namedtuple('Document', 'words tags')

alldocs = []  # will hold all docs in original order
# Decode explicitly as UTF-8: the tags contain non-ASCII text, and without an
# explicit encoding open() falls back to the platform's locale default, which
# can mis-decode or raise on some hosts.
with open('../data/c_twitter.json', encoding='utf-8') as f_in:
    for i, line in enumerate(f_in):
        c_twt = json.loads(line)
        # c_twt: {'weeknum': str, 'c_text': str, 'tags': [str]}
        words = [w for w in c_twt['c_text'].split() if w not in STOPWORDS_]
        # The zero-based line number is the tweet's own tag; the remaining
        # tags come straight from the record.
        tags = [i] + c_twt['tags']
        alldocs.append(Document(words, tags))

# Shallow copy that can be shuffled for training while alldocs keeps file order.
shuffle_docs = alldocs[:]


In [2]:
import multiprocessing

from gensim.models import Doc2Vec
import gensim.models.doc2vec


# Number of CPUs reported by the OS; used as the worker count below.
cores = multiprocessing.cpu_count()
# Refuse to continue when gensim reports FAST_VERSION == -1: per the message,
# training would be painfully slow.
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

# PV-DM w/average
model = Doc2Vec(
    dm=1,
    dm_mean=1,
    size=100,
    window=10,
    negative=5,
    hs=0,
    min_count=2,
    workers=cores,
    max_vocab_size=10000,
    seed=1,
)

# NOTE(review): an earlier comment here read "currently running out of memory",
# but the recorded output shows build_vocab completing — confirm whether that
# note is still relevant.
model.build_vocab(alldocs)
print(model)


Doc2Vec(dm/m,d100,n5,w10,mc2,t6)


# Doc2Vec training performance (TODO: could this become a figure?)

In [3]:
from random import shuffle
import random
import time
import numpy as np

# Seed both RNGs so the shuffling and the validation sample are repeatable.
random.seed(1)
np.random.seed(1)

passes = 10
start_time = time.time()

# Note: this cell keeps training the existing `model`, so re-running it without
# rebuilding the model continues training rather than starting over.
# NOTE(review): depending on the gensim version, each train() call may restart
# its own learning-rate decay — confirm against the Doc2Vec documentation.
for epoch in range(passes):
    shuffle(shuffle_docs)
    model.train(shuffle_docs)
    print(epoch, model.most_similar('hiv', topn=10))

    # Validation after each epoch: compare the trained vector of a random
    # sample of tweets with the vector re-inferred from the tweet's words.
    # The sample size is capped so small corpora do not raise ValueError.
    doc_ids = random.sample(range(len(alldocs)), min(1000, len(alldocs)))
    score = 0
    for doc_id in doc_ids:
        inferred_docvec = model.infer_vector(alldocs[doc_id].words)
        # L1 distance, summed in float64 with NumPy instead of the builtin sum.
        score += np.abs(model.docvecs[doc_id] - inferred_docvec).sum(dtype=np.float64)
    print("--- score: ", score)
    print("ELAPSED TIME (seconds): ", time.time() - start_time)

0 [('hivaids', 0.9238401055335999), ('chlamydia', 0.7275433540344238), ('hiv-positive', 0.6795859932899475), ('hiv-', 0.667283833026886), ('gonorrhea', 0.6494301557540894), ('syphilis', 0.636401891708374), ('malaria', 0.6282125115394592), ('aidshiv', 0.6197354793548584), ('dengue', 0.6184828877449036), ('lymedisease', 0.6180868148803711)]
--- score:  10474.5051227
ELAPSED TIME (seconds):  454.8811070919037
1 [('hivaids', 0.9392381906509399), ('hiv-positive', 0.7394733428955078), ('chlamydia', 0.7276412844657898), ('hiv-', 0.7015641331672668), ('aidshiv', 0.7008076310157776), ('dengue', 0.6919741630554199), ('malaria', 0.6719186902046204), ('gonorrhea', 0.6625051498413086), ('lymedisease', 0.6565596461296082), ('rabies', 0.6460838913917542)]
--- score:  12430.3644568
ELAPSED TIME (seconds):  908.7602863311768
2 [('hivaids', 0.9493792057037354), ('hiv-positive', 0.7881236672401428), ('aidshiv', 0.7830793261528015), ('chlamydia', 0.7434079051017761), ('dengue', 0.7270612716674805), ('hiv-

In [4]:
# TODO: do visualization/clustering of users
# TODO: do visualization/clustering of hashtags
# TODO: do visualization/clustering of documents

# Table 3: Related Word-Vectors

In [5]:
import pandas as pd
# Ten nearest neighbours of the word "hivaids"; each entry is a
# (word, similarity) pair.
new_vecs = model.most_similar(positive=['hivaids'], topn=10)
# NOTE(review): the sibling tables label the first column 'Related Word'
# (no hyphen) — confirm which spelling is intended.
df_hivaids = pd.DataFrame(
    new_vecs,
    columns=['Related-Word', 'Cosine Similarity to \"hivaids\"'],
)
df_hivaids

Unnamed: 0,Related-Word,"Cosine Similarity to ""hivaids"""
0,hiv,0.957688
1,aidshiv,0.928883
2,dengue,0.879803
3,hiv-positive,0.875377
4,hiv-aids,0.872453
5,giardia,0.868507
6,malaria,0.852559
7,pertussis,0.846965
8,rabies,0.841204
9,hiv',0.837193


In [6]:
# Ten nearest neighbours of the word "prep"; each entry is a
# (word, similarity) pair.
new_vecs = model.most_similar(positive=['prep'], topn=10)
df_prep = pd.DataFrame(
    new_vecs,
    columns=['Related Word', 'Cosine Similarity to \"prep\"'],
)
df_prep

Unnamed: 0,Related Word,"Cosine Similarity to ""prep"""
0,shiga,0.877325
1,malaria,0.834458
2,giardia,0.788153
3,man-flu,0.776286
4,lyme,0.774702
5,chlamydia,0.758681
6,mumps,0.758419
7,rabies,0.756786
8,meningitis,0.75634
9,hivaids,0.75407


In [7]:
# Ten nearest neighbours of the word "prophylaxis"; each entry is a
# (word, similarity) pair.
new_vecs = model.most_similar(positive=['prophylaxis'], topn=10)
df_prophylaxis = pd.DataFrame(
    new_vecs,
    columns=['Related Word', 'Cosine Similarity to \"prophylaxis\"'],
)
df_prophylaxis

Unnamed: 0,Related Word,"Cosine Similarity to ""prophylaxis"""
0,treatment,0.530189
1,breastfeeding,0.488796
2,transmission,0.474317
3,organ,0.468069
4,mothers,0.466734
5,prevalence,0.466725
6,approved,0.453632
7,equal,0.44759
8,cannabis,0.447496
9,undetectable,0.443953


In [8]:
# Ten nearest neighbours of the word "truvada"; each entry is a
# (word, similarity) pair.
new_vecs = model.most_similar(positive=['truvada'], topn=10)
df_truvada = pd.DataFrame(
    new_vecs,
    columns=['Related Word', 'Cosine Similarity to \"truvada\"'],
)
df_truvada

Unnamed: 0,Related Word,"Cosine Similarity to ""truvada"""
0,charliesheen,0.72783
1,hcv,0.716755
2,malaria,0.702457
3,syphilis,0.675602
4,lymedisease,0.666147
5,icasa2015,0.664901
6,chlamydia,0.663093
7,hepatitis,0.658429
8,zika,0.654923
9,rabies,0.649964


# Figure 3: PCA and t-SNE plots of relevant word vectors

In [9]:
import sys

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# append path to my module
# NOTE(review): this is a machine-specific absolute path, so the import below
# only resolves on hosts where the sm_w2v checkout lives at exactly this
# location — consider making it configurable.
if '/high_spd_work/sm_w2v' not in sys.path:
    sys.path.append('/high_spd_work/sm_w2v')
from sm_w2v.plot_utils import scatter_plot

# Gather every neighbour word listed in the four word tables above (first
# column of each frame). Duplicates are kept; they do not affect the
# membership test below.
related_words = [
    neighbour
    for frame in (df_hivaids, df_prep, df_prophylaxis, df_truvada)
    for neighbour in frame.iloc[:, 0].values
]

# Data matrix: one row per vocabulary word. Only words that appear in the
# tables get a visible label; all other points carry an empty annotation.
text_annotations = [word if word in related_words else "" for word in model.vocab]
X = np.array([model[word] for word in model.vocab])

# k-means on the original (un-reduced) vectors; the labels are handed to the plots.
kmeans = KMeans(n_clusters=3, random_state=1)
kmeans_labels = kmeans.fit_predict(X)

# PCA down to two components
pca = PCA(n_components=2, copy=True)
pca.fit(X)
print("PCA explained variance ratio: ", pca.explained_variance_ratio_)
pca_comps = pca.transform(X)

# tSNE down to two components
tsne = TSNE(n_components=2, random_state=1)
tsne_comps = tsne.fit_transform(X)

# Scatter-plot settings shared by both figures
rand_seed = 0
alpha_high, alpha_low = 1.0, 0.05
down_samp_rate = 0.1
plot_lims = None  # not passed to scatter_plot below; each call gives its own list

# Per the author's note, scatter_plot saves the figure in the 'notebooks'
# directory. The trailing list is presumably the axis limits — confirm against
# sm_w2v.plot_utils.scatter_plot.
scatter_plot(
    pca_comps[:, 0], pca_comps[:, 1],
    alpha_high, alpha_low,
    kmeans_labels, text_annotations,
    down_samp_rate, "Fig3a: PCA of Related Words", rand_seed,
    [-5, 10, -20, 10],
)

# Same plot for the tSNE embedding
scatter_plot(
    tsne_comps[:, 0], tsne_comps[:, 1],
    alpha_high, alpha_low,
    kmeans_labels, text_annotations,
    down_samp_rate, "Fig3b: tSNE of Related Words", rand_seed,
    [-15, 3, -8, 15],
)

PCA explained variance ratio:  [ 0.07563121  0.0579878 ]


# Table 4: Related Hashtags and Tweets

Hashtags begin with "#" and users begin with "user--". Tweets appear as plain numbers: each number is the tweet's zero-based index in the `alldocs` variable, which is also its line position in the cleaned tweets file "../data/c_twitter.json".

In [10]:
# Ten document tags (hashtags, users or tweet indices) closest to "#truvada";
# each entry is a (tag, similarity) pair.
new_vecs = model.docvecs.most_similar(positive=['#truvada'], topn=10)
df_hash_truvada = pd.DataFrame(
    new_vecs,
    columns=['Related Hashtag/User/Tweet', 'Cosine Similarity to \"#truvada\"'],
)
df_hash_truvada

Unnamed: 0,Related Hashtag/User/Tweet,"Cosine Similarity to ""#truvada"""
0,#hiv,0.72656
1,#prep,0.725261
2,#hepatitis,0.704099
3,#letsgist,0.703258
4,#zikavirus,0.690609
5,#egaylity,0.670713
6,#imtesting,0.670385
7,9568,0.667223
8,#セフレ,0.661743
9,#メル友,0.661719


In [11]:
# Ten document tags (hashtags, users or tweet indices) closest to "#sexwork";
# each entry is a (tag, similarity) pair.
new_vecs = model.docvecs.most_similar(positive=['#sexwork'], topn=10)
df_hash_sexwork = pd.DataFrame(
    new_vecs,
    columns=['Related Hashtag/User/Tweet', 'Cosine Similarity to \"#sexwork\"'],
)
df_hash_sexwork

Unnamed: 0,Related Hashtag/User/Tweet,"Cosine Similarity to ""#sexwork"""
0,#hiv,0.743877
1,#メル友,0.708935
2,#セフレ,0.708924
3,#egaylity,0.694771
4,287849,0.675552
5,#harmreduction,0.671927
6,#hepatitisc,0.663863
7,#truvada,0.658381
8,697666,0.654542
9,311241,0.651169


In [12]:
# Ten document tags (hashtags, users or tweet indices) closest to "#prep";
# each entry is a (tag, similarity) pair.
new_vecs = model.docvecs.most_similar(['#prep'], topn=10)
# Header fixed: it previously read 'Related Related Hashtag/User/Tweet'
# (duplicated word), unlike the '#truvada' and '#sexwork' tables.
df_hash_prep = pd.DataFrame(
    data=new_vecs,
    columns=['Related Hashtag/User/Tweet', 'Cosine Similarity to \"#prep\"'],
)
df_hash_prep

Unnamed: 0,Related Related Hashtag/User/Tweet,"Cosine Similarity to ""#prep"""
0,#truvada,0.725261
1,975901,0.681698
2,939037,0.674932
3,1048155,0.672318
4,#anothereffinbreakup,0.669275
5,#studioprep,0.669178
6,1063243,0.665931
7,1021473,0.658816
8,959708,0.657086
9,920222,0.648606


In [13]:
# Note: this is a popular tweet warning about 13 symptoms that indicate you need HIV testing.
# We see evidence of this popular tweet/retweet in both DTM and Doc2Vec:
#
# Document(words=['#', 'krtebireysyle', 'if', 'you', 'see', 'this', '13',
# 'symptoms', '.', 'do', 'hiv', 'test', 'immediately', '.', 'please', 'read'],
# tags=[603177, 'user--#KFB_Mz_Sope', '#kürtçebirşeysöyle'])
#
alldocs[603177]

Document(words=['#', 'krtebireysyle', 'if', 'you', 'see', 'this', '13', 'symptoms', '.', 'do', 'hiv', 'test', 'immediately', '.', 'please', 'read'], tags=[603177, 'user--#KFB_Mz_Sope', '#kürtçebirşeysöyle'])

In [14]:
# Ten document tags (hashtags, users or tweet indices) closest to "#imtesting";
# each entry is a (tag, similarity) pair.
new_vecs = model.docvecs.most_similar(['#imtesting'], topn=10)
# Header fixed: it previously read 'Related Related Hashtag/User/Tweet'
# (duplicated word), unlike the '#truvada' and '#sexwork' tables.
df_hash_imtesting = pd.DataFrame(
    data=new_vecs,
    columns=['Related Hashtag/User/Tweet', 'Cosine Similarity to \"#imtesting\"'],
)
df_hash_imtesting

Unnamed: 0,Related Related Hashtag/User/Tweet,"Cosine Similarity to ""#imtesting"""
0,#hiv,0.833642
1,#hivawareness,0.759361
2,311241,0.697889
3,#iknowkati,0.697414
4,9568,0.676305
5,#nbhaad,0.675332
6,#letsgist,0.673017
7,#truvada,0.670385
8,#worldaidsday2015,0.648545
9,842697,0.648035


# Figure 4: Related Hashtags and Tweets

In [15]:
# Tags listed in the four hashtag tables above (first column of each frame).
related_words = list(df_hash_truvada.iloc[:,0].values) + \
                list(df_hash_sexwork.iloc[:,0].values) + \
                list(df_hash_prep.iloc[:,0].values) + \
                list(df_hash_imtesting.iloc[:,0].values)
# Set used for O(1) membership tests. The original scanned the list for every
# doctag, up to twice per tag.
related_lookup = set(related_words)

# Prepare data matrix: keep every 100th doctag as background, plus every tag
# that appears in the tables; only the latter get a visible label.
# Which background points are kept depends on the iteration order of doctags.
# NOTE(review): the tables also contain integer tweet indices; if this gensim
# version's `docvecs.doctags` holds only string tags, those tweets are never
# visited here and so never plotted — confirm.
X = []
text_annotations = []
for i, word in enumerate(model.docvecs.doctags):
    is_related = word in related_lookup
    if i % 100 == 0 or is_related:
        X.append(model.docvecs[word])
        text_annotations.append(word if is_related else "")
X = np.array(X)

# Do k-means on original data matrix
kmeans = KMeans(n_clusters=3, random_state=1)
kmeans_labels = kmeans.fit_predict(X)

# Do PCA
pca = PCA(n_components=2, copy=True)
pca.fit(X)
print("PCA explained variance ratio: ", pca.explained_variance_ratio_)
pca_comps = pca.transform(X)

# Do tSNE
tsne = TSNE(n_components=2, random_state=1)
tsne_comps = tsne.fit_transform(X)

# Scatter plot settings
rand_seed = 0
alpha_high = 1.0
alpha_low = 0.05
down_samp_rate = 0.1
plot_lims = None  # not passed to scatter_plot below; each call gives its own list

# save PCA plot in 'notebooks' directory (it won't show here in the notebook)
# The trailing list is presumably the axis limits — confirm against scatter_plot.
scatter_plot(pca_comps[:,0], pca_comps[:,1], alpha_high, alpha_low,
            kmeans_labels, text_annotations,
            down_samp_rate, "Fig4a: PCA of Related Hashtags-Users-Tweets", rand_seed, [0,25,-7,3])

# save tSNE plot in 'notebooks' directory (it won't show here in the notebook)
scatter_plot(tsne_comps[:,0], tsne_comps[:,1], alpha_high, alpha_low,
            kmeans_labels, text_annotations,
            down_samp_rate, "Fig4b: tSNE of Hashtags-Users-Tweets", rand_seed, [-10,5,-8,12])

PCA explained variance ratio:  [ 0.05813684  0.03571577]
