# Load  clean HIV tweets and sentiment tweets
notebook code adapted from https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
sentiment training data from http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip

In [16]:
import re
import json
from collections import namedtuple

import gensim

train = False

def clean_text(text):
    clean = re.sub(r'http.*$', '', text)
    clean = re.sub(r'[^a-zA-Z,\.?\'#0-9\s\-_]', '', clean)
    
    clean = clean.replace('...', '.')
    clean = clean.replace(',', ' , ')
    clean = clean.replace('#', ' # ')
    clean = clean.replace('?', ' ? ')
    clean = clean.replace('!', ' ! ')
    
    clean = clean.lower()
    word_list = clean.split()
    
    return word_list

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')


# start with sentiment data
sentiment_docs = []
with open('../data/sentiment_analysis_dataset.csv') as sentiment_data:
    for line_no, line in enumerate(sentiment_data):
        tokens = gensim.utils.to_unicode(line).split(',')
        words = clean_text(tokens[3])
        tags = ['sen_' + str(line_no)] # `tags = [tokens[0]]` would also work at extra memory cost
        split = ['train','train','train','valid'][line_no % 4]  # 3/4 training, 1/4 validation
        sentiment = tokens[1]
        sentiment_docs.append(SentimentDocument(words, tags, split, sentiment))
        
# now add our HIV/PrEP related tweets
hiv_docs = []
tweet_ids = set()
with open('../data/c_twitter.json') as hiv_prep_data:
    for line_no, line in enumerate(hiv_prep_data):
        twt = json.loads(line)
        words = twt['text'].split()
        tags = [str(twt['id'])]
        split = 'extra'
        sentiment = None
        if tags[0] not in tweet_ids:
            hiv_docs.append(SentimentDocument(words, tags, split, sentiment))
            tweet_ids.add(tags[0])

train_docs = [doc for doc in sentiment_docs if doc.split == 'train']
valid_docs = [doc for doc in sentiment_docs if doc.split == 'valid']
alldocs = sentiment_docs + hiv_docs

# shuffle for training
shuffledocs = alldocs[:]



# Build model

In [17]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

model= Doc2Vec(dm=1, dm_mean=1, size=100, window=3, negative=5, hs=0, min_count=2, workers=cores,
           max_vocab_size=10000, seed=1)

model.build_vocab(alldocs)
print(model)

Doc2Vec(dm/m,d100,n5,w3,mc2,t6)


In [18]:
from sklearn.linear_model import LogisticRegression
import numpy as np

def fit_model_eval_accuracy(X, y, eval_X, eval_y):
    model = LogisticRegression(random_state=1)
    model.fit(X, y)
    
    mean_accuracy = model.score(eval_X, eval_y)
    
    return model, mean_accuracy

def predict_from_model(model, X):
    pred_y = model.predict(X)
    pred_log_prob = model.predict_log_proba(X)
    
    return (pred_y, pred_log_prob)

# Train

(should add per-epoch shuffling for faster training)

In [19]:
from random import shuffle
import datetime
import pickle

passes = 20


if train:
    for epoch in range(passes):

        # train doc vectors
        shuffle(shuffledocs)
        model.train(shuffledocs)

        # train lr model
        X = np.zeros((len(train_docs), 100))
        y = np.zeros(len(train_docs))
        for i, doc in enumerate(train_docs):
            doc_id_str = doc[1][0]
            X[i,] = model.docvecs[doc_id_str]
            y[i] = int(doc[3])

        # validate lr model
        X_eval = np.zeros((len(valid_docs), 100))
        y_eval = np.zeros(len(valid_docs))
        for i, doc in enumerate(valid_docs):
            doc_id_str = doc[1][0]
            X_eval[i,] = model.docvecs[doc_id_str]
            y_eval[i] = int(doc[3])

        lr_model, mean_accuracy = fit_model_eval_accuracy(X, y, X_eval, y_eval)
        print('epoch:', epoch, 'mean accuracy:', mean_accuracy)
        
        model.save('../models/sentiment_analysis.model')
        
        s = pickle.dumps(lr_model)
        with open('../models/lr_sentiment.model', 'wb') as f_out:
            f_out.write(s)

else:
    Doc2Vec.load('../models/sentiment_analysis.model')
    
    with open('../models/lr_sentiment.model', 'rb') as f_in:
        s = f_in.read()
    lr_model = pickle.loads(s)

epoch: 0 mean accuracy: 0.643623814157
epoch: 1 mean accuracy: 0.655649477013
epoch: 2 mean accuracy: 0.662151341928
epoch: 3 mean accuracy: 0.665549237817
epoch: 4 mean accuracy: 0.668541717344
epoch: 5 mean accuracy: 0.671298548609
epoch: 6 mean accuracy: 0.673310427309
epoch: 7 mean accuracy: 0.674123793886
epoch: 8 mean accuracy: 0.675375516906
epoch: 9 mean accuracy: 0.676761534096
epoch: 10 mean accuracy: 0.677481148139
epoch: 11 mean accuracy: 0.678570704614
epoch: 12 mean accuracy: 0.678808886727
epoch: 13 mean accuracy: 0.679295386362
epoch: 14 mean accuracy: 0.68017716695
epoch: 15 mean accuracy: 0.68080302846
epoch: 16 mean accuracy: 0.681426356118
epoch: 17 mean accuracy: 0.681857111003
epoch: 18 mean accuracy: 0.682439897024
epoch: 19 mean accuracy: 0.682779433228


In [20]:
# evaluate HIV/PrEP sentiment
X = np.zeros((len(hiv_docs), 100))
words = []
for i, doc in enumerate(hiv_docs):
    doc_id_str = doc[1][0]
    X[i,] = model.docvecs[doc_id_str]
    words.append(' '.join(doc[0]))
pred_y, pred_log_prob = predict_from_model(lr_model, X)

In [21]:
import pandas as pd

pred_log_prob_neg = [e[0] for e in pred_log_prob]
pred_log_prob_pos = [e[1] for e in pred_log_prob]

index = [doc[1][0] for doc in hiv_docs]
data = np.array([pred_y, pred_log_prob_neg, pred_log_prob_pos, words])
df = pd.DataFrame(data=data.transpose(),
                  columns=['pred_y', 'pred_log_prob_neg', 'pred_log_prob_pos', 'words'],
                  index=index)
# add tweet id to this dataframe

In [22]:
# example HIV/PrEP tweets
# (positive at the top and negative at the bottom)
df = df.sort(columns='pred_log_prob_pos', ascending=True)
df

  app.launch_new_instance()


Unnamed: 0,pred_y,pred_log_prob_neg,pred_log_prob_pos,words
693942712303800320,1.0,-6.60020484995,-0.00136101515585,"greg0wen hi greg , i'm a hiv doctor and health..."
678768814775885824,1.0,-6.37789993886,-0.00170013096302,"rt maroon_bottom thanks pia , btw , for bringi..."
698705569604116480,1.0,-6.01417722982,-0.00244684936714,"being single is great ! , no fighting , arguin..."
678768091778560000,1.0,-5.98859824049,-0.00251032427509,"rt maroon_bottom thanks pia , btw , for bringi..."
668317312190164992,1.0,-5.979011674,-0.00253453605712,# godis great by faith i will build a mansion ...
668025918158454784,1.0,-5.85590713783,-0.00286704338792,amazing to talk to claire_goose this morning o...
668327933589782528,1.0,-5.84055399088,-0.00291146580614,rt mmsyengo # godis great by faith i will buil...
693337586660696064,1.0,-5.82805956119,-0.00294812512722,"sex tip 3 if you're pozphobic , ensure you scr..."
668317383182807040,1.0,-5.82143649721,-0.00296774464964,rt mmsyengo # godis great by faith i will buil...
685438800454115328,1.0,-5.78449452093,-0.0030796013709,karenmilford thats brilliant . my wife did an ...


In [23]:
# lets restrict to prep tweets and truvada tweets only:
prep_tweets = []
truvada_tweets = []

for words in df['words']:
    if 'prep' in words:
        prep_tweets.append(True)
    else:
        prep_tweets.append(False)
        
    if 'truvada' in words:
        truvada_tweets.append(True)
    else:
        truvada_tweets.append(False)


In [24]:
# most positive tweet text, prep tweet text, truvada tweet text
print(df['words'][0], '\n')
print(df['words'][1], '\n')
print(df['words'][2], '\n')

print(df[prep_tweets]['words'][0], '\n')
print(df[prep_tweets]['words'][1], '\n')
print(df[prep_tweets]['words'][2], '\n')

print(df[truvada_tweets]['words'][0], '\n')
print(df[truvada_tweets]['words'][1], '\n')
print(df[truvada_tweets]['words'][3], '\n')

greg0wen hi greg , i'm a hiv doctor and health writer doing a piece on hivprep could i poss contact you for some info ? thanks , verity x 

rt maroon_bottom thanks pia , btw , for bringing the issue of hiv-aids ! that's a big slap for the conservatives and bigots who hinder our p 

being single is great ! , no fighting , arguing ! , having 2 share pizza amp treats amp no std's amp hiv ! ampi'm always happy everyday hehe ! , lol ! 

greg0wen hi greg , i'm a hiv doctor and health writer doing a piece on hivprep could i poss contact you for some info ? thanks , verity x 

beat's 5k run prep in your step-huge success ! thank you to all who participated , runners , volunteers we love you ! 

meet scruff man , matt ! profile matt . prep from columbus , ohio matt says scruff makes travel a lot more fun ! 

great skypeing the john grant just now about bowie , icelandic sagas and truvada ! you're right elliot_rose about the song disappointing ! 

br_webb thinking of you and your friend today , 

In [25]:
# most negative tweet text, prep tweet text, truvada tweet text

print(df['words'][-1], '\n')
print(df['words'][-2], '\n')
print(df['words'][-3], '\n')

print(df[prep_tweets]['words'][-1], '\n')
print(df[prep_tweets]['words'][-2], '\n')
print(df[prep_tweets]['words'][-3], '\n')

print(df[truvada_tweets]['words'][-1], '\n')
print(df[truvada_tweets]['words'][-2], '\n')
print(df[truvada_tweets]['words'][-3], '\n')

hiv sucks big time ! ! and i f # n hate it ! ! we're almost there until this one stupid infection brought him back to where he started . shit ! ! ! 

rt yeseniatwigs girl was wrong my man aint shit wat happened girl he beat me rape me got me prego gave me aids . . . . leave his ass but 

damn flu bug . no energy , shivering . doc says ill b admitted 2 hospital if i get any worse . i hate bloody hospitals ! ! wish it would go 

rt thomaswikjr between glaad hrc i'm not sure wtf to think anymore . also fb deleted london prep # hiv activist greg0wen fb page 

rt samnyembe can we attribute hivaids high rate in kzn to many men having not removed their prepuce ? 

rt lauraagustin unethical drug trial # tenofovir ended in cambodia because of # sexworker activism . relevant given current prep . 

rt megtirrell gild jan 1 price increases per cowenresearch atripla 8 complera 6 . 9 stribild 4 . 9 truvada 6 . 9 tybost 6 . 9 vitekta 6 . 

this # truvada convo has me catching feelings coz majority of

# Topic modeling on positive and negative prep tweets

In [26]:
# topic modeling on top 10,000 (negative) PrEP tweets
corpus_pos = df[prep_tweets]['words'][0:10000]

# topic modeling on top 10,000 (negative) HIV tweets
corpus_neg = df[prep_tweets]['words'][-10000:-1]


In [27]:
from gensim.parsing.preprocessing import STOPWORDS
import re


STOPWORDS_ = ['rt', '', 'httpst', 'amp', 'https\'']

def clean_text(text):
    clean = re.sub(r'http.*$', '', text)
    clean = re.sub(r'[^a-zA-Z\s]', '', clean)

    clean = clean.lower()
    word_list = clean.split()
    word_list = [w for w in word_list if (w not in STOPWORDS_ and w not in STOPWORDS)]

    return word_list

corpus_pos = [clean_text(doc) for doc in corpus_pos]
corpus_neg = [clean_text(doc) for doc in corpus_neg]

In [28]:
from gensim.models.ldamodel import LdaModel
from gensim.models import TfidfModel
from gensim import corpora

def make_tfidf_corpra(texts):
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=10, no_above=0.3, keep_n=100000)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = TfidfModel(corpus)
    corpus = tfidf[corpus]
    
    return dictionary, corpus

dict_pos, corpus_pos = make_tfidf_corpra(corpus_pos)
dict_neg, corpus_neg = make_tfidf_corpra(corpus_neg)

if train:
    model_pos = LdaModel(corpus_pos, id2word=dict_pos, num_topics=10, iterations=50, passes=10)
    model_pos.save('../models/lda_sentiment_pos.model')
    
    model_neg = LdaModel(corpus_pos, id2word=dict_pos, num_topics=10, iterations=50, passes=10)
    model_neg.save('../models/lda_sentiment_neg.model')

else:
    model_pos = LdaModel.load('../models/lda_sentiment_pos.model')
    model_neg = LdaModel.load('../models/lda_sentiment_neg.model')

In [29]:
model_pos.print_topics(20)

[(0,
  '0.087*day + 0.085*world + 0.079*options + 0.079*aids + 0.077*restingplatypus + 0.067*learn + 0.055*prevention + 0.024*drugs + 0.010*word + 0.009*dec'),
 (1,
  '0.017*gay + 0.017*clinic + 0.017*women + 0.017*talk + 0.015*guys + 0.015*cure + 0.015*doctor + 0.015*black + 0.015*ways + 0.014*video'),
 (2,
  '0.014*stop + 0.014*end + 0.014*community + 0.013*epidemic + 0.011*fight + 0.011*stops + 0.011*poweringprep + 0.010*hivandhepatitis + 0.010*australia + 0.010*thebodydotcom'),
 (3,
  '0.029*risk + 0.028*tool + 0.027*getting + 0.027*latest + 0.025*reduce + 0.024*hrc + 0.021*help + 0.020*stay + 0.019*hivnegative + 0.019*contracting'),
 (4,
  '0.030*prophylaxis + 0.026*preexposure + 0.018*free + 0.018*approves + 0.017*france + 0.016*increase + 0.015*risk + 0.014*government + 0.013*africa + 0.013*young'),
 (5,
  '0.030*daily + 0.029*pill + 0.027*high + 0.027*cdcgov + 0.027*prevent + 0.022*risk + 0.022*benefit + 0.021*americans + 0.020*gilead + 0.020*men'),
 (6,
  '0.025*askthehivdoc +

In [30]:
model_neg.print_topics(20)

[(0,
  '0.102*day + 0.099*world + 0.094*options + 0.092*aids + 0.090*restingplatypus + 0.078*learn + 0.059*prevention + 0.039*americans + 0.018*reveals + 0.012*private'),
 (1,
  '0.019*epidemic + 0.018*undetectable + 0.017*free + 0.017*approves + 0.017*necessary + 0.017*hivpositive + 0.016*france + 0.016*therapy + 0.015*taken + 0.014*hivandhepatitis'),
 (2,
  '0.036*pill + 0.036*risk + 0.035*prevents + 0.032*daily + 0.027*prophylaxis + 0.027*high + 0.026*preexposure + 0.026*benefit + 0.025*know + 0.020*cdcgov'),
 (3,
  '0.021*infections + 0.018*stop + 0.017*black + 0.016*know + 0.013*lets + 0.012*women + 0.012*available + 0.012*gets + 0.012*stops + 0.011*drug'),
 (4,
  '0.020*projectinform + 0.014*petition + 0.014*asking + 0.014*stigma + 0.014*big + 0.013*preparing + 0.013*sexually + 0.013*active + 0.012*equal + 0.012*provides'),
 (5,
  '0.018*help + 0.017*clinic + 0.016*doctor + 0.016*stay + 0.016*right + 0.016*increase + 0.015*queensland + 0.014*says + 0.014*preppies + 0.013*video'),