# Doc2Vec (Figures 3, 4 and Tables 3, 4)

adapted (partially) from https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

In [1]:
import json
import re
from collections import namedtuple

import gensim

# One training document: `words` is the token list, `tags` the labels attached to it.
Document = namedtuple('Document', 'words tags')

alldocs = []  # will hold all docs in original order
# Decode explicitly as UTF-8 (the encoding JSON is exchanged in) instead of
# relying on the platform's locale default, which varies between machines.
with open('../data/c_twitter.json', encoding='utf-8') as f_in:
    for i, line in enumerate(f_in):
        c_twt = json.loads(line)
        # c_twt: {'weeknum': str, 'c_text': str, 'tags': [str]}
        words = c_twt['c_text'].split()
        # The line index is the first tag, so tag i and alldocs[i] are the same tweet.
        tags = [i] + c_twt['tags']
        alldocs.append(Document(words, tags))

# Shallow copy that the training loop shuffles in place; alldocs keeps file order.
shuffle_docs = alldocs[:]


In [2]:
import multiprocessing

from gensim.models import Doc2Vec
import gensim.models.doc2vec


# One training worker per CPU reported by the OS.
cores = multiprocessing.cpu_count()
# Stop early if gensim's FAST_VERSION check fails (see the assertion message).
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

# PV-DM w/average
model = Doc2Vec(
    dm=1,
    dm_mean=1,
    size=100,
    window=10,
    negative=5,
    hs=0,
    min_count=2,
    workers=cores,
    max_vocab_size=10000,
    seed=1,
)

# Original author's note on this step: "currently running out of memory"
model.build_vocab(alldocs)
print(model)


Doc2Vec(dm/m,d100,n5,w10,mc2,t6)


# TODO: Could the performance of Doc2Vec training be shown as a figure?

In [3]:
from random import shuffle
import random
import time
import numpy as np

# Seed the Python and NumPy global RNGs used below (shuffle / sample).
random.seed(1)
np.random.seed(1)

passes = 10
n_validation_docs = 1000  # documents sampled per epoch for the sanity check
start_time = time.time()

for epoch in range(passes):
    # Present the documents in a different order on every pass.
    shuffle(shuffle_docs)
    model.train(shuffle_docs)
    print(epoch, model.most_similar('hiv', topn=10))

    # do some validation after each epoch
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the sample size for small corpora.
    doc_ids = random.sample(range(len(alldocs)),
                            min(n_validation_docs, len(alldocs)))
    score = 0
    for doc_id in doc_ids:
        inferred_docvec = model.infer_vector(alldocs[doc_id].words)
        # Sum of absolute differences between the trained vector stored under
        # this doc's integer tag and the vector re-inferred from its words.
        score += sum(abs(model.docvecs[doc_id] - inferred_docvec))
    print("--- score: ", score)
    print("ELAPSED TIME (seconds): ", time.time() - start_time)

0 [('hivaids', 0.9269832372665405), ('chlamydia', 0.7341997027397156), ('hiv-positive', 0.718751847743988), ('hiv-', 0.6594427227973938), ('lymedisease', 0.6554089784622192), ('gonorrhea', 0.6451725959777832), ('malaria', 0.6446251273155212), ('aidshiv', 0.6313222050666809), ('syphilis', 0.6240503787994385), ('dengue', 0.6150895357131958)]
--- score:  10440.2962235
ELAPSED TIME (seconds):  480.0964570045471
1 [('hivaids', 0.9461992383003235), ('hiv-positive', 0.7643320560455322), ('chlamydia', 0.7606968879699707), ('aidshiv', 0.7016288638114929), ('hiv-', 0.6937544345855713), ('malaria', 0.6899576783180237), ('dengue', 0.6810131669044495), ('lymedisease', 0.658905029296875), ('gonorrhea', 0.6587939262390137), ('rabies', 0.6378685235977173)]
--- score:  12501.5201024
ELAPSED TIME (seconds):  955.8942174911499
2 [('hivaids', 0.9528703093528748), ('hiv-positive', 0.8068634271621704), ('aidshiv', 0.7836630940437317), ('chlamydia', 0.7714528441429138), ('dengue', 0.7214716076850891), ('hiv-

In [4]:
# TODO: do visualization/clustering of users
# TODO: do visualization/clustering of hashtags
# TODO: do visualization/clustering of documents

# Table 3: Related Word-Vectors

In [5]:
import pandas as pd
# Ten (word, similarity) pairs returned by most_similar for "hivaids".
new_vecs = model.most_similar(['hivaids'], topn=10)
# Header spelled 'Related Word' (no hyphen) to match the prep / prophylaxis /
# truvada tables built in the following cells.
df_hivaids = pd.DataFrame(data=new_vecs, columns=['Related Word', 'Cosine Similarity to \"hivaids\"'])
df_hivaids

Unnamed: 0,Related-Word,"Cosine Similarity to ""hivaids"""
0,hiv,0.958594
1,aidshiv,0.932111
2,hiv-aids,0.895672
3,hiv-positive,0.87881
4,chlamydia,0.867703
5,giardia,0.86426
6,pertussis,0.853045
7,dengue,0.851709
8,hiv',0.8479
9,malaria,0.844412


In [6]:
# Ten (word, similarity) pairs returned by most_similar for "prep", as a table.
new_vecs = model.most_similar(['prep'], topn=10)
df_prep = pd.DataFrame(
    new_vecs,
    columns=[
        'Related Word',
        'Cosine Similarity to \"prep\"',
    ],
)
df_prep

Unnamed: 0,Related Word,"Cosine Similarity to ""prep"""
0,shiga,0.870405
1,malaria,0.847116
2,giardia,0.812757
3,chlamydia,0.809993
4,legionnaires,0.804418
5,meningitis,0.802282
6,lyme,0.801972
7,mumps,0.784868
8,syphilis,0.783558
9,rabies,0.772343


In [7]:
# Ten (word, similarity) pairs returned by most_similar for "prophylaxis", as a table.
new_vecs = model.most_similar(['prophylaxis'], topn=10)
df_prophylaxis = pd.DataFrame(
    new_vecs,
    columns=[
        'Related Word',
        'Cosine Similarity to \"prophylaxis\"',
    ],
)
df_prophylaxis

Unnamed: 0,Related Word,"Cosine Similarity to ""prophylaxis"""
0,cannabis,0.507306
1,approved,0.497378
2,breastfeeding,0.494168
3,transmission,0.485931
4,equal,0.481361
5,bisexual,0.472363
6,undetectable,0.471999
7,integrated,0.467222
8,long-term,0.466649
9,prevalence,0.462871


In [8]:
# Ten (word, similarity) pairs returned by most_similar for "truvada", as a table.
new_vecs = model.most_similar(['truvada'], topn=10)
df_truvada = pd.DataFrame(
    new_vecs,
    columns=[
        'Related Word',
        'Cosine Similarity to \"truvada\"',
    ],
)
df_truvada

Unnamed: 0,Related Word,"Cosine Similarity to ""truvada"""
0,hcv,0.700769
1,charliesheen,0.695903
2,malaria,0.679063
3,zika,0.668699
4,hepc,0.664521
5,martinshkreli,0.663084
6,coldflu,0.656823
7,lymedisease,0.654922
8,rhobh,0.651982
9,flushot,0.647609


# Figure 3: PCA and tSNE plots of relevant word vectors

In [9]:
import sys

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans

# append path to my module
if '/high_spd_work/sm_w2v' not in sys.path:
    sys.path.append('/high_spd_work/sm_w2v')
from sm_w2v.plot_utils import scatter_plot

# related words from above
related_words = list(df_hivaids.iloc[:,0].values) + \
                list(df_prep.iloc[:,0].values) + \
                list(df_prophylaxis.iloc[:,0].values) + \
                list(df_truvada.iloc[:,0].values)
# Set copy so the membership test inside the vocabulary loop is O(1).
related_word_set = set(related_words)

# Prepare data matrix: one row per vocabulary word. Only the related words get
# a non-empty annotation; every other word is annotated with "".
X = []
text_annotations = []
for word in model.vocab:
    X.append(model[word])
    text_annotations.append(word if word in related_word_set else "")
X = np.array(X)

# Do k-means on original data matrix
kmeans = KMeans(n_clusters=3, random_state=1)
kmeans_labels = kmeans.fit_predict(X)

# Do PCA
pca = PCA(n_components=2, copy=True)
pca.fit(X)
print("PCA explained variance ratio: ", pca.explained_variance_ratio_)
pca_comps = pca.transform(X)

# Do tSNE
# t-SNE is stochastic; pin its seed (same value as k-means above) so that
# re-running this cell reproduces the same embedding.
tsne = TSNE(n_components=2, random_state=1)
tsne_comps = tsne.fit_transform(X)

# Scatter plot
# NOTE(review): scatter_plot is a project helper; what these settings control
# is presumed from their names -- confirm against sm_w2v.plot_utils.
rand_seed = 0
alpha_high = 1.0
alpha_low = 0.05
down_samp_rate = 0.1
plot_lims = None

# save PCA plot in 'notebooks' directory
scatter_plot(pca_comps[:,0], pca_comps[:,1], alpha_high, alpha_low,
            kmeans_labels, text_annotations,
            down_samp_rate, "Fig3a: PCA of Related Words.pdf", rand_seed, plot_lims)

# save tSNE plot in 'notebooks' directory
scatter_plot(tsne_comps[:,0], tsne_comps[:,1], alpha_high, alpha_low,
            kmeans_labels, text_annotations,
            down_samp_rate, "Fig3b: tSNE of Related Words.pdf", rand_seed, plot_lims)

PCA explained variance ratio:  [ 0.07594191  0.05806724]


# Table 4: Related Hashtags and Tweets

Hashtags begin with "#" and users begin with "user--". Tweets are identified by a plain number, which is the tweet's index in the "alldocs" variable (equivalently, its line in the cleaned tweets file "../data/c_twitter.json").

In [10]:
# Ten (tag, similarity) pairs returned by docvecs.most_similar for "#truvada", as a table.
new_vecs = model.docvecs.most_similar(['#truvada'], topn=10)
df_hash_truvada = pd.DataFrame(
    new_vecs,
    columns=[
        'Related Hashtag/User/Tweet',
        'Cosine Similarity to \"#truvada\"',
    ],
)
df_hash_truvada

Unnamed: 0,Related Hashtag/User/Tweet,"Cosine Similarity to ""#truvada"""
0,#letsgist,0.732927
1,#hiv,0.721267
2,#prep,0.699828
3,#メル友,0.692808
4,#セフレ,0.692701
5,254943,0.689925
6,#sexwork,0.689036
7,487561,0.677703
8,603653,0.664129
9,#egaylity,0.661812


In [11]:
# Ten (tag, similarity) pairs returned by docvecs.most_similar for "#sexwork", as a table.
new_vecs = model.docvecs.most_similar(['#sexwork'], topn=10)
df_hash_sexwork = pd.DataFrame(
    new_vecs,
    columns=[
        'Related Hashtag/User/Tweet',
        'Cosine Similarity to \"#sexwork\"',
    ],
)
df_hash_sexwork

Unnamed: 0,Related Hashtag/User/Tweet,"Cosine Similarity to ""#sexwork"""
0,#hiv,0.742446
1,#セフレ,0.714212
2,#メル友,0.713868
3,#egaylity,0.692382
4,#truvada,0.689036
5,#hepatitisc,0.681088
6,312301,0.674642
7,290827,0.665787
8,#cc16,0.655487
9,254943,0.653148


In [12]:
# Ten (tag, similarity) pairs returned by docvecs.most_similar for "#prep".
new_vecs = model.docvecs.most_similar(['#prep'], topn=10)
# Header fixed: the original read 'Related Related Hashtag/User/Tweet' (duplicated
# word), unlike the '#truvada' and '#sexwork' tables.
df_hash_prep = pd.DataFrame(data=new_vecs, columns=['Related Hashtag/User/Tweet', 'Cosine Similarity to \"#prep\"'])
df_hash_prep

Unnamed: 0,Related Related Hashtag/User/Tweet,"Cosine Similarity to ""#prep"""
0,#truvada,0.699828
1,975901,0.686178
2,#anothereffinbreakup,0.683982
3,#studioprep,0.68389
4,1060758,0.67403
5,1020142,0.658519
6,user--AnotherEffin'Breakup,0.657873
7,971152,0.655687
8,1048155,0.649939
9,user--Jamie B,0.645216


In [13]:
# Note: this tweet is popular; it warns about 13 signs which indicate that you need HIV testing.
# We see evidence of this popular tweet/retweet in both DTM and Doc2Vec:
#
# Document(words=['#', 'krtebireysyle', 'if', 'you', 'see', 'this', '13',
# 'symptoms', '.', 'do', 'hiv', 'test', 'immediately', '.', 'please', 'read'],
# tags=[603177, 'user--#KFB_Mz_Sope', '#kürtçebirşeysöyle'])
#
# Display the document stored at index 603177 (its first tag is that same index).
alldocs[603177]

Document(words=['#', 'krtebireysyle', 'if', 'you', 'see', 'this', '13', 'symptoms', '.', 'do', 'hiv', 'test', 'immediately', '.', 'please', 'read'], tags=[603177, 'user--#KFB_Mz_Sope', '#kürtçebirşeysöyle'])

In [14]:
# Ten (tag, similarity) pairs returned by docvecs.most_similar for "#imtesting".
new_vecs = model.docvecs.most_similar(['#imtesting'], topn=10)
# Header fixed: the original read 'Related Related Hashtag/User/Tweet' (duplicated
# word), unlike the '#truvada' and '#sexwork' tables.
df_hash_imtesting = pd.DataFrame(data=new_vecs, columns=['Related Hashtag/User/Tweet', 'Cosine Similarity to \"#imtesting\"'])
df_hash_imtesting

Unnamed: 0,Related Related Hashtag/User/Tweet,"Cosine Similarity to ""#imtesting"""
0,#hiv,0.759691
1,#hivawareness,0.707609
2,#nbhaad,0.66922
3,290827,0.652414
4,#eurohivheptestweek,0.647707
5,#clc2015,0.646415
6,#hivtesting,0.640331
7,#letsgist,0.640218
8,#truvada,0.634226
9,696986,0.628146


# Figure 4: Related Hashtags and Tweets

In [16]:
# related words from above
related_words = list(df_hash_truvada.iloc[:,0].values) + \
                list(df_hash_sexwork.iloc[:,0].values) + \
                list(df_hash_prep.iloc[:,0].values) + \
                list(df_hash_imtesting.iloc[:,0].values)
# Set copy so the membership test inside the tag loop is O(1).
related_tag_set = set(related_words)

# Prepare data matrix: keep every 100th tag (by iteration position) plus every
# related tag; only the related tags get a non-empty annotation.
# NOTE(review): related_words also holds integer tweet ids (see the tables
# above). Whether model.docvecs.doctags yields integer tags depends on the
# gensim version -- TODO confirm the related tweets actually end up in X.
X = []
text_annotations = []
for i, word in enumerate(model.docvecs.doctags):
    is_related = word in related_tag_set
    if i % 100 == 0 or is_related:
        X.append(model.docvecs[word])
        text_annotations.append(word if is_related else "")
X = np.array(X)

# Do k-means on original data matrix
kmeans = KMeans(n_clusters=3, random_state=1)
kmeans_labels = kmeans.fit_predict(X)

# Do PCA
pca = PCA(n_components=2, copy=True)
pca.fit(X)
print("PCA explained variance ratio: ", pca.explained_variance_ratio_)
pca_comps = pca.transform(X)

# Do tSNE
# t-SNE is stochastic; pin its seed (same value as k-means above) so that
# re-running this cell reproduces the same embedding.
tsne = TSNE(n_components=2, random_state=1)
tsne_comps = tsne.fit_transform(X)

# Scatter plot
# NOTE(review): scatter_plot is a project helper; what these settings control
# is presumed from their names -- confirm against sm_w2v.plot_utils.
rand_seed = 0
alpha_high = 1.0
alpha_low = 0.05
down_samp_rate = 0.1
plot_lims = None

# save PCA plot in 'notebooks' directory (it won't show here in the notebook)
scatter_plot(pca_comps[:,0], pca_comps[:,1], alpha_high, alpha_low,
            kmeans_labels, text_annotations,
            down_samp_rate, "Fig4a: PCA of Related Hashtags-Users-Tweets.pdf", rand_seed, plot_lims)

# save tSNE plot in 'notebooks' directory (it won't show here in the notebook)
scatter_plot(tsne_comps[:,0], tsne_comps[:,1], alpha_high, alpha_low,
            kmeans_labels, text_annotations,
            down_samp_rate, "Fig4b: tSNE of Hashtags-Users-Tweets.pdf", rand_seed, plot_lims)

PCA explained variance ratio:  [ 0.05724305  0.03899559]
