# Load  clean HIV tweets and sentiment tweets
notebook code adapted from https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
sentiment training data from http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip

In [1]:
import re
import json
from collections import namedtuple

import gensim

train = True

def clean_text(text):
    clean = re.sub(r'http.*$', '', text)
    clean = re.sub(r'[^a-zA-Z,\.?\'#0-9\s\-_]', '', clean)
    
    clean = clean.replace('...', '.')
    clean = clean.replace(',', ' , ')
    clean = clean.replace('#', ' # ')
    clean = clean.replace('?', ' ? ')
    clean = clean.replace('!', ' ! ')
    
    clean = clean.lower()
    word_list = clean.split()
    
    return word_list

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')


# start with sentiment data
sentiment_docs = []
with open('../data/sentiment_analysis_dataset.csv') as sentiment_data:
    for line_no, line in enumerate(sentiment_data):
        tokens = gensim.utils.to_unicode(line).split(',')
        words = clean_text(tokens[3])
        tags = ['sen_' + str(line_no)] # `tags = [tokens[0]]` would also work at extra memory cost
        split = ['train','train','train','valid'][line_no % 4]  # 3/4 training, 1/4 validation
        sentiment = tokens[1]
        sentiment_docs.append(SentimentDocument(words, tags, split, sentiment))
        
# now add our HIV/PrEP related tweets
hiv_docs = []
tweet_ids = set()
with open('../data/c_twitter.json') as hiv_prep_data:
    for line_no, line in enumerate(hiv_prep_data):
        twt = json.loads(line)
        words = twt['text'].split()
        tags = [str(twt['id'])]
        split = 'extra'
        sentiment = None
        if tags[0] not in tweet_ids:
            hiv_docs.append(SentimentDocument(words, tags, split, sentiment))
            tweet_ids.add(tags[0])

train_docs = [doc for doc in sentiment_docs if doc.split == 'train']
valid_docs = [doc for doc in sentiment_docs if doc.split == 'valid']
alldocs = sentiment_docs + hiv_docs

# shuffle for training
shuffledocs = alldocs[:]



# Build model

In [2]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

model= Doc2Vec(dm=1, dm_mean=1, size=100, window=3, negative=5, hs=0, min_count=2, workers=cores,
           max_vocab_size=10000, seed=1)

model.build_vocab(alldocs)
print(model)

Doc2Vec(dm/m,d100,n5,w3,mc2,t6)


In [3]:
from sklearn.linear_model import LogisticRegression
import numpy as np

def fit_model_eval_accuracy(X, y, eval_X, eval_y):
    model = LogisticRegression(random_state=1)
    model.fit(X, y)
    
    mean_accuracy = model.score(eval_X, eval_y)
    
    return model, mean_accuracy

def predict_from_model(model, X):
    pred_y = model.predict(X)
    pred_log_prob = model.predict_log_proba(X)
    
    return (pred_y, pred_log_prob)

# Train

(should add per-epoch shuffling for faster training)

In [4]:
from random import shuffle
import datetime
import pickle

passes = 20


if train:
    for epoch in range(passes):

        # train doc vectors
        shuffle(shuffledocs)
        model.train(shuffledocs)

        # train lr model
        X = np.zeros((len(train_docs), 100))
        y = np.zeros(len(train_docs))
        for i, doc in enumerate(train_docs):
            doc_id_str = doc[1][0]
            X[i,] = model.docvecs[doc_id_str]
            y[i] = int(doc[3])

        # validate lr model
        X_eval = np.zeros((len(valid_docs), 100))
        y_eval = np.zeros(len(valid_docs))
        for i, doc in enumerate(valid_docs):
            doc_id_str = doc[1][0]
            X_eval[i,] = model.docvecs[doc_id_str]
            y_eval[i] = int(doc[3])

        lr_model, mean_accuracy = fit_model_eval_accuracy(X, y, X_eval, y_eval)
        print('epoch:', epoch, 'mean accuracy:', mean_accuracy)
        
        model.save('../models/sentiment_analysis.model')
        
        s = pickle.dumps(lr_model)
        with open('../models/lr_sentiment.model', 'wb') as f_out:
            f_out.write(s)

else:
    Doc2Vec.load('../models/sentiment_analysis.model')
    
    with open('../models/lr_sentiment.model', 'rb') as f_in:
        s = f_in.read()
    lr_model = pickle.loads(s)

epoch: 0 mean accuracy: 0.648694052542
epoch: 1 mean accuracy: 0.659625091219
epoch: 2 mean accuracy: 0.66738628071
epoch: 3 mean accuracy: 0.671838259142
epoch: 4 mean accuracy: 0.674724316873
epoch: 5 mean accuracy: 0.677481148139
epoch: 6 mean accuracy: 0.678669524852
epoch: 7 mean accuracy: 0.68018730236
epoch: 8 mean accuracy: 0.6814060853
epoch: 9 mean accuracy: 0.681988871321
epoch: 10 mean accuracy: 0.68278196708
epoch: 11 mean accuracy: 0.682825042569
epoch: 12 mean accuracy: 0.683924734452
epoch: 13 mean accuracy: 0.684180653531
epoch: 14 mean accuracy: 0.684644348496
epoch: 15 mean accuracy: 0.684672220871
epoch: 16 mean accuracy: 0.68549825671
epoch: 17 mean accuracy: 0.685797251277
epoch: 18 mean accuracy: 0.686344563367
epoch: 19 mean accuracy: 0.686342029514


In [5]:
# evaluate HIV/PrEP sentiment
X = np.zeros((len(hiv_docs), 100))
words = []
for i, doc in enumerate(hiv_docs):
    doc_id_str = doc[1][0]
    X[i,] = model.docvecs[doc_id_str]
    words.append(' '.join(doc[0]))
pred_y, pred_log_prob = predict_from_model(lr_model, X)

In [6]:
import pandas as pd

pred_log_prob_neg = [e[0] for e in pred_log_prob]
pred_log_prob_pos = [e[1] for e in pred_log_prob]

index = [doc[1][0] for doc in hiv_docs]
data = np.array([pred_y, pred_log_prob_neg, pred_log_prob_pos, words])
df = pd.DataFrame(data=data.transpose(),
                  columns=['pred_y', 'pred_log_prob_neg', 'pred_log_prob_pos', 'words'],
                  index=index)
# add tweet id to this dataframe

In [7]:
# example HIV/PrEP tweets
# (negative at the top and positive at the bottom)
df = df.sort(columns='pred_log_prob_pos', ascending=True)
df

  app.launch_new_instance()


Unnamed: 0,pred_y,pred_log_prob_neg,pred_log_prob_pos,words
689891786706309125,1.0,-6.71231784144,-0.00121658246186,They'll find a positive result AmaraGoddess As...
698091697947803649,1.0,-6.07640823164,-0.00229905082396,I'm # single and # HIV I've found some dating ...
708384700969127936,1.0,-5.78869758164,-0.00306666495726,JDFoster83 MSNBC To keep the old capital hill ...
708783689795174400,1.0,-5.67446881662,-0.00343839628996,RT ianharvie If you ever find yourself running...
671590580263194624,1.0,-5.66485332755,-0.00347167534889,RT MAISCEF It's # WorldAIDSDay . Pls as you go...
714349130743681024,1.0,-5.5910801509,-0.00373797317417,"Man , that's a BEAUTIFUL cock -- and a nice se..."
705301932835999744,1.0,-5.58279953728,-0.00376911304159,RT symon_atwyne To all those people that are t...
693570790776795137,1.0,-5.44323289195,-0.00433485878201,Get an anal pap smear with your HIV test to fi...
689944288982126593,1.0,-5.34423107117,-0.0047870616253,"Stay safe out there tonight , and every night ..."
708566225383600128,1.0,-5.28227988054,-0.00509378520776,If you ever find yourself running for presiden...


In [8]:
# lets restrict to prep tweets and truvada tweets only:
prep_tweets = []
truvada_tweets = []

for words in df['words']:
    if 'PrEP' in words:
        prep_tweets.append(True)
    else:
        prep_tweets.append(False)
        
    if 'truvada' in words:
        truvada_tweets.append(True)
    else:
        truvada_tweets.append(False)


In [9]:
# most negative tweet text, prep tweet text, truvada tweet text
df[['words']][0:3].append(df[prep_tweets][['words']][0:3]).append(df[truvada_tweets][['words']][0:3])

Unnamed: 0,words
689891786706309125,They'll find a positive result AmaraGoddess As...
698091697947803649,I'm # single and # HIV I've found some dating ...
708384700969127936,JDFoster83 MSNBC To keep the old capital hill ...
686771263336001537,Another good reason to talk to your doctor abo...
699601641364807681,"When you're on # PrEP , we'll see you every 3 ..."
691667283886874624,"# Gay , bi menlt Watch this great explainer vi..."
686771263336001537,Another good reason to talk to your doctor abo...
686405228414005250,"RussianPharaoh lol . Yeah , you do ! The truva..."
707250115996741632,"my truvada amp Diet Coke diet is going great ,..."


In [10]:
df[['words']][0:3].append(df[prep_tweets][['words']][0:3]).append(df[truvada_tweets][['words']][0:3])['words'].values

array([ "They'll find a positive result AmaraGoddess As for those of you who go for HIV test every 3months , you'll soon find what you re looking 4",
       "I'm # single and # HIV I've found some dating sites specifically for those with # STIs - are they safe ?",
       'JDFoster83 MSNBC To keep the old capital hill cronies happy . . If u listen to this video you will cry amp how they joked about AIDS -',
       "Another good reason to talk to your doctor about PrEP . If you're doing any butt stuff , boys , truvada and condoms !",
       "When you're on # PrEP , we'll see you every 3 months ! Why ? To keep you and your partners healthy . Call 716 . 541 . 0676 .",
       '# Gay , bi menlt Watch this great explainer vid on # PrEP amp see if its right for you !',
       "Another good reason to talk to your doctor about PrEP . If you're doing any butt stuff , boys , truvada and condoms !",
       "RussianPharaoh lol . Yeah , you do ! The truvada is filtered through the kidneys , so you'll

In [11]:
# most negative tweet text, prep tweet text, truvada tweet text
df[['words']][-4:-1].append(df[prep_tweets][['words']][-4:-1]).append(df[truvada_tweets][['words']][-4:-1])

Unnamed: 0,words
708411785922154499,"Also , how fucking vile of Hillary to say . Re..."
691498394553028608,I wonder why he beat her ass when she was tryn...
708468377770180610,Aaannd . Hillary Clinton breathes a sigh of re...
684991376723869697,RT gaston_croupier # Truvada patent's not expi...
669565035866386433,Equality_MI Syph amp Hep C have gone up 550 in...
715914320530505728,Xaviom8 in interviews he says he was adherent ...
703263887131590656,not surprised at all that someone got HIV on t...
707401264708132865,Now reading that truvada does not protect agai...
704783434670014466,I think I have conjunctivitis unless truvada c...


In [12]:
df[['words']][-4:-1].append(df[prep_tweets][['words']][-4:-1]).append(df[truvada_tweets][['words']][-4:-1])['words'].values

array([ 'Also , how fucking vile of Hillary to say . Reagan did fucking NOTHING during the AIDS epidemic until it was too late . What a stupid old hag .',
       "I wonder why he beat her ass when she was tryna leave like she wasn't gone be running back when she found out she had HIV amp nobody want her",
       'Aaannd . Hillary Clinton breathes a sigh of relief that Twitter has left its outrage of her AIDS comments behind to tend to Drumpf debacle .',
       "RT gaston_croupier # Truvada patent's not expired yet but it is sold online as a generic drug ? There's something rotten in internet # PrEP h",
       "Equality_MI Syph amp Hep C have gone up 550 in Gay Men bc many feel tht bc they're on PrEP , they don't need condoms . HIV isn't the only STI .",
       "Xaviom8 in interviews he says he was adherent . strain was highly resistant , and Truvada wouldn't have blocked it anyways . PrEP didn't fail .",
       'not surprised at all that someone got HIV on truvada . people get pregnant

# Topic modeling on positive and negative prep tweets

In [13]:
# topic modeling on top 10,000 (negative) PrEP tweets
corpus_pos = df[prep_tweets]['words'][0:10000]

# topic modeling on top 10,000 (negative) HIV tweets
corpus_neg = df[prep_tweets]['words'][-10000:-1]


In [14]:
from gensim.parsing.preprocessing import STOPWORDS
import re


STOPWORDS_ = ['rt', '', 'httpst', 'amp', 'https\'']

def clean_text(text):
    clean = re.sub(r'http.*$', '', text)
    clean = re.sub(r'[^a-zA-Z\s]', '', clean)

    clean = clean.lower()
    word_list = clean.split()
    word_list = [w for w in word_list if (w not in STOPWORDS_ and w not in STOPWORDS)]

    return word_list

corpus_pos = [clean_text(doc) for doc in corpus_pos]
corpus_neg = [clean_text(doc) for doc in corpus_neg]

In [15]:
from gensim.models.ldamodel import LdaModel
from gensim.models import TfidfModel
from gensim import corpora

def make_tfidf_corpra(texts):
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=10, no_above=0.3, keep_n=100000)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = TfidfModel(corpus)
    corpus = tfidf[corpus]
    
    return dictionary, corpus

dict_pos, corpus_pos = make_tfidf_corpra(corpus_pos)
dict_neg, corpus_neg = make_tfidf_corpra(corpus_neg)

if train:
    model_pos = LdaModel(corpus_pos, id2word=dict_pos, num_topics=10, iterations=50, passes=10)
    model_pos.save('../models/lda_sentiment_pos.model')
    
    model_neg = LdaModel(corpus_pos, id2word=dict_pos, num_topics=10, iterations=50, passes=10)
    model_neg.save('../models/lda_sentiment_neg.model')

else:
    model_pos = LdaModel.load('../models/lda_sentiment_pos.model')
    model_neg = LdaModel.load('../models/lda_sentiment_neg.model')

In [16]:
pd.DataFrame([row[1].split(' + ') for row in model_pos.print_topics(20)]).iloc[:,0:5]

Unnamed: 0,0,1,2,3,4
0,0.026*safe,0.025*aspirin,0.022*men,0.021*latest,0.020*gay
1,0.050*aids,0.049*options,0.049*day,0.048*world,0.047*restingplatypus
2,0.027*arent,0.023*prevention,0.023*tool,0.016*need,0.016*men
3,0.035*canada,0.022*video,0.022*approve,0.021*askthehivdoc,0.019*approved
4,0.034*reduce,0.025*percent,0.024*help,0.023*risk,0.022*exception
5,0.031*regimen,0.027*nhs,0.021*clinic,0.021*latino,0.018*available
6,0.030*got,0.022*daily,0.019*prevent,0.018*story,0.018*meet
7,0.053*man,0.038*contracts,0.038*daily,0.031*gay,0.027*truvada
8,0.031*uk,0.023*new,0.018*women,0.018*treatment,0.016*transgender
9,0.036*hivpositive,0.033*rare,0.017*man,0.015*stigma,0.014*important


In [17]:
pd.DataFrame([row[1].split(' + ') for row in model_neg.print_topics(20)]).iloc[:,0:5]

Unnamed: 0,0,1,2,3,4
0,0.023*prevents,0.022*know,0.021*pill,0.015*like,0.015*prevention
1,0.042*man,0.042*aids,0.039*day,0.037*world,0.035*restingplatypus
2,0.046*contracts,0.039*gay,0.033*truvada,0.029*arent,0.026*drugresistant
3,0.027*story,0.026*approve,0.021*canada,0.018*tests,0.018*country
4,0.030*reduce,0.030*help,0.027*percent,0.024*exception,0.022*infections
5,0.024*contracted,0.021*doctor,0.021*speaks,0.020*man,0.016*findings
6,0.021*latino,0.018*dont,0.017*canada,0.017*effective,0.017*risk
7,0.027*askthehivdoc,0.019*transgender,0.017*getprepla,0.017*greaterthanaids,0.016*hivplusmag
8,0.016*need,0.016*women,0.015*men,0.014*program,0.014*black
9,0.029*safe,0.027*new,0.025*aspirin,0.023*study,0.021*infections
