# Load  clean HIV tweets and sentiment tweets
notebook code adapted from https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
sentiment training data from http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip

In [36]:
import re
import json
from collections import namedtuple

import gensim

train = False

def clean_text(text):
    clean = re.sub(r'http.*$', '', text)
    clean = re.sub(r'[^a-zA-Z,\.?\'#0-9\s\-_]', '', clean)
    
    clean = clean.replace('...', '.')
    clean = clean.replace(',', ' , ')
    clean = clean.replace('#', ' # ')
    clean = clean.replace('?', ' ? ')
    clean = clean.replace('!', ' ! ')
    
    clean = clean.lower()
    word_list = clean.split()
    
    return word_list

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')


# start with sentiment data
sentiment_docs = []
with open('../data/sentiment_analysis_dataset.csv') as sentiment_data:
    for line_no, line in enumerate(sentiment_data):
        tokens = gensim.utils.to_unicode(line).split(',')
        words = clean_text(tokens[3])
        tags = ['sen_' + str(line_no)] # `tags = [tokens[0]]` would also work at extra memory cost
        split = ['train','train','train','valid'][line_no % 4]  # 3/4 training, 1/4 validation
        sentiment = tokens[1]
        sentiment_docs.append(SentimentDocument(words, tags, split, sentiment))
        
# now add our HIV/PrEP related tweets
hiv_docs = []
tweet_ids = set()
with open('../data/c_twitter.json') as hiv_prep_data:
    for line_no, line in enumerate(hiv_prep_data):
        twt = json.loads(line)
        words = twt['text'].split()
        tags = [str(twt['id'])]
        split = 'extra'
        sentiment = None
        if tags[0] not in tweet_ids:
            hiv_docs.append(SentimentDocument(words, tags, split, sentiment))
            tweet_ids.add(tags[0])

train_docs = [doc for doc in sentiment_docs if doc.split == 'train']
valid_docs = [doc for doc in sentiment_docs if doc.split == 'valid']
alldocs = sentiment_docs + hiv_docs

# shuffle for training
shuffledocs = alldocs[:]



# Build model

In [37]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

model= Doc2Vec(dm=1, dm_mean=1, size=100, window=3, negative=5, hs=0, min_count=2, workers=cores,
           max_vocab_size=10000, seed=1)

model.build_vocab(alldocs)
print(model)

Doc2Vec(dm/m,d100,n5,w3,mc2,t6)


In [38]:
from sklearn.linear_model import LogisticRegression
import numpy as np

def fit_model_eval_accuracy(X, y, eval_X, eval_y):
    model = LogisticRegression(random_state=1)
    model.fit(X, y)
    
    mean_accuracy = model.score(eval_X, eval_y)
    
    return model, mean_accuracy

def predict_from_model(model, X):
    pred_y = model.predict(X)
    pred_log_prob = model.predict_log_proba(X)
    
    return (pred_y, pred_log_prob)

# Train

(should add per-epoch shuffling for faster training)

In [39]:
from random import shuffle
import datetime
import pickle

passes = 20


if train:
    for epoch in range(passes):

        # train doc vectors
        shuffle(shuffledocs)
        model.train(shuffledocs)

        # train lr model
        X = np.zeros((len(train_docs), 100))
        y = np.zeros(len(train_docs))
        for i, doc in enumerate(train_docs):
            doc_id_str = doc[1][0]
            X[i,] = model.docvecs[doc_id_str]
            y[i] = int(doc[3])

        # validate lr model
        X_eval = np.zeros((len(valid_docs), 100))
        y_eval = np.zeros(len(valid_docs))
        for i, doc in enumerate(valid_docs):
            doc_id_str = doc[1][0]
            X_eval[i,] = model.docvecs[doc_id_str]
            y_eval[i] = int(doc[3])

        lr_model, mean_accuracy = fit_model_eval_accuracy(X, y, X_eval, y_eval)
        print('epoch:', epoch, 'mean accuracy:', mean_accuracy)
        
        model.save('../models/sentiment_analysis.model')
        
        s = pickle.dumps(lr_model)
        with open('../models/lr_sentiment.model', 'wb') as f_out:
            f_out.write(s)

else:
    Doc2Vec.load('../models/sentiment_analysis.model')
    
    with open('../models/lr_sentiment.model', 'rb') as f_in:
        s = f_in.read()
    lr_model = pickle.loads(s)

In [40]:
# evaluate HIV/PrEP sentiment
X = np.zeros((len(hiv_docs), 100))
words = []
for i, doc in enumerate(hiv_docs):
    doc_id_str = doc[1][0]
    X[i,] = model.docvecs[doc_id_str]
    words.append(' '.join(doc[0]))
pred_y, pred_log_prob = predict_from_model(lr_model, X)

In [41]:
import pandas as pd

pred_log_prob_neg = [e[0] for e in pred_log_prob]
pred_log_prob_pos = [e[1] for e in pred_log_prob]

index = [doc[1][0] for doc in hiv_docs]
data = np.array([pred_y, pred_log_prob_neg, pred_log_prob_pos, words])
df = pd.DataFrame(data=data.transpose(),
                  columns=['pred_y', 'pred_log_prob_neg', 'pred_log_prob_pos', 'words'],
                  index=index)
# add tweet id to this dataframe

In [42]:
# example HIV/PrEP tweets
# (negative at the top and positive at the bottom)
df = df.sort(columns='pred_log_prob_pos', ascending=True)
df

  app.launch_new_instance()


Unnamed: 0,pred_y,pred_log_prob_neg,pred_log_prob_pos,words
669383541994590208,1.0,-1.03505633107,-0.438824968739,Aunt you got pregnant too young Me you got HIV...
708406257154002944,1.0,-1.03474266259,-0.438997805667,RT stevesilberman This was what AIDS was reall...
704090070970568704,1.0,-1.03431063144,-0.43923600026,RT GOP__Patriot Poll Should open-borders anti-...
687389193081872386,1.0,-1.03393337817,-0.439444124423,RT TOPublicHealth The Works provides testing f...
670760277575532544,1.0,-1.03379560268,-0.439520163177,RT crazyaction2 Charlie Sheen Never Told Porns...
714951302040719360,1.0,-1.03367002146,-0.439589486033,The Girl He Met On Facebook Infects Him With H...
690339288865730561,1.0,-1.03310386291,-0.439902182532,RT Wonkette Shaming gays for having the HIV ? ...
692459705243930624,1.0,-1.03301091912,-0.439953542828,RT FCAA ejaforg announced 5 . 4 million in gra...
709767795056087040,1.0,-1.03293961688,-0.439992949128,RT KoffGodd How you got Postive Vibes In yo bi...
685784775265763328,1.0,-1.03292425226,-0.440001441199,RT AJEdinho # DearAfricanParents if You see th...


In [43]:
# lets restrict to prep tweets and truvada tweets only:
prep_tweets = []
truvada_tweets = []

for words in df['words']:
    if 'PrEP' in words:
        prep_tweets.append(True)
    else:
        prep_tweets.append(False)
        
    if 'truvada' in words:
        truvada_tweets.append(True)
    else:
        truvada_tweets.append(False)


In [44]:
# most positive tweet text, prep tweet text, truvada tweet text
df[['words']][0:3].append(df[prep_tweets][['words']][0:3]).append(df[truvada_tweets][['words']][0:3])

Unnamed: 0,words
669383541994590208,Aunt you got pregnant too young Me you got HIV...
708406257154002944,RT stevesilberman This was what AIDS was reall...
704090070970568704,RT GOP__Patriot Poll Should open-borders anti-...
671844912611106817,He won't use condoms because intimacy means mo...
702564069899829248,"PrEP Queensland Aids Council , # HIV Foundatio..."
678066220244054016,RT JDatTheBody At the core of our programs is ...
670452952033796096,"RT PrEPped4AIDS If you're a Poz top , find a n..."
697146971971137536,Another reason 4 # Ireland amp # UK 2 immediat...
671766588094197760,RT EvanJPeterson For # worldAIDSday my early #...


In [58]:
df[['words']][0:12].append(df[prep_tweets][['words']][0:4]).append(df[truvada_tweets][['words']][0:4])['words'].values

array([ 'Aunt you got pregnant too young Me you got HIV though # ThanksgivingClapBack',
       'RT stevesilberman This was what AIDS was really like under the Reagans . via reluctantlyjoe',
       'RT GOP__Patriot Poll Should open-borders anti-white # Rubio be tested for drug use amp HIV ? # Marcomentum # MarcoRubio # NeverRubio',
       'RT TOPublicHealth The Works provides testing for HIV anonymous amp rapid test available . Call 416-392-0520 for more info',
       'RT crazyaction2 Charlie Sheen Never Told Pornstar Girlfriend That He Had HIV',
       'The Girl He Met On Facebook Infects Him With HIV Screenshots',
       'RT Wonkette Shaming gays for having the HIV ? HOW RUUUUUUUUUUDE ! # FullerHouse',
       'RT FCAA ejaforg announced 5 . 4 million in grants to support orgs addressing # HIV in new amp innovative ways !',
       'RT KoffGodd How you got Postive Vibes In yo bio when the only thing positive bout you is HIV ?',
       'RT AJEdinho # DearAfricanParents if You see this 13 

In [46]:
# most negative tweet text, prep tweet text, truvada tweet text
df[['words']][-4:-1].append(df[prep_tweets][['words']][-4:-1]).append(df[truvada_tweets][['words']][-4:-1])

Unnamed: 0,words
667876353434218496,RT virginiakimba What Jenny McCarthy Gets Wron...
674228857428406272,"RT UNICEF Martha , 19 of Malawi , was born wit..."
708419352198979588,RT frankrichny Insulting to imply that advocac...
718151675907805187,If ur partner is # poz and ur considering havi...
717410321338503168,"RT thotjawn I've got PrEP , I've got poppers ,..."
705438295489175553,Doctor of Man Who Contracted HIV on PrEP Discu...
708049461151457282,Why aren't # latino men using # truvada as # P...
676401572994265089,Why aren't # latino men using # truvada as # P...
678600769264750592,The debate and stigma about # truvada and # HI...


In [47]:
df[['words']][-4:-1].append(df[prep_tweets][['words']][-4:-1]).append(df[truvada_tweets][['words']][-4:-1])['words'].values

array(['RT virginiakimba What Jenny McCarthy Gets Wrong Again HIV Isnt .',
       'RT UNICEF Martha , 19 of Malawi , was born with # HIV . Through antiretroviral treatment her son was born HIV-free',
       'RT frankrichny Insulting to imply that advocacy for Alzheimers long after out of office in any way balances Reagan AIDS record .',
       'If ur partner is # poz and ur considering having a baby , look at # PrEP . # Serodiscordant',
       "RT thotjawn I've got PrEP , I've got poppers , I've got my husband from Grindr , who could ask for anything more ? # GaysIn2016",
       'Doctor of Man Who Contracted HIV on PrEP Discusses His Findings , and What They Mean',
       "Why aren't # latino men using # truvada as # PrEP ?",
       "Why aren't # latino men using # truvada as # PrEP ?",
       'The debate and stigma about # truvada and # HIV prevention'], dtype=object)

# Topic modeling on positive and negative prep tweets

In [48]:
# topic modeling on top 10,000 (postive) PrEP tweets
corpus_pos = df[prep_tweets]['words'][0:10000]

# topic modeling on top 10,000 (negative) HIV tweets
corpus_neg = df[prep_tweets]['words'][-10000:-1]


In [49]:
from gensim.parsing.preprocessing import STOPWORDS
import re


STOPWORDS_ = ['rt', '', 'httpst', 'amp', 'https\'']

def clean_text(text):
    clean = re.sub(r'http.*$', '', text)
    clean = re.sub(r'[^a-zA-Z\s]', '', clean)

    clean = clean.lower()
    word_list = clean.split()
    word_list = [w for w in word_list if (w not in STOPWORDS_ and w not in STOPWORDS)]

    return word_list

corpus_pos = [clean_text(doc) for doc in corpus_pos]
corpus_neg = [clean_text(doc) for doc in corpus_neg]

In [50]:
from gensim.models.ldamodel import LdaModel
from gensim.models import TfidfModel
from gensim import corpora

def make_tfidf_corpra(texts):
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=10, no_above=0.3, keep_n=100000)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = TfidfModel(corpus)
    corpus = tfidf[corpus]
    
    return dictionary, corpus

dict_pos, corpus_pos = make_tfidf_corpra(corpus_pos)
dict_neg, corpus_neg = make_tfidf_corpra(corpus_neg)

if train:
    model_pos = LdaModel(corpus_pos, id2word=dict_pos, num_topics=10, iterations=50, passes=10)
    model_pos.save('../models/lda_sentiment_pos.model')
    
    model_neg = LdaModel(corpus_pos, id2word=dict_pos, num_topics=10, iterations=50, passes=10)
    model_neg.save('../models/lda_sentiment_neg.model')

else:
    model_pos = LdaModel.load('../models/lda_sentiment_pos.model')
    model_neg = LdaModel.load('../models/lda_sentiment_neg.model')

In [51]:
pd.DataFrame([row[1].split(' + ') for row in model_pos.print_topics(20)]).iloc[:,0:5]

Unnamed: 0,0,1,2,3,4
0,0.026*safe,0.025*aspirin,0.022*men,0.021*latest,0.020*gay
1,0.050*aids,0.049*options,0.049*day,0.048*world,0.047*restingplatypus
2,0.027*arent,0.023*prevention,0.023*tool,0.016*need,0.016*men
3,0.035*canada,0.022*video,0.022*approve,0.021*askthehivdoc,0.019*approved
4,0.034*reduce,0.025*percent,0.024*help,0.023*risk,0.022*exception
5,0.031*regimen,0.027*nhs,0.021*clinic,0.021*latino,0.018*available
6,0.030*got,0.022*daily,0.019*prevent,0.018*story,0.018*meet
7,0.053*man,0.038*contracts,0.038*daily,0.031*gay,0.027*truvada
8,0.031*uk,0.023*new,0.018*women,0.018*treatment,0.016*transgender
9,0.036*hivpositive,0.033*rare,0.017*man,0.015*stigma,0.014*important


In [52]:
pd.DataFrame([row[1].split(' + ') for row in model_neg.print_topics(20)]).iloc[:,0:5]

Unnamed: 0,0,1,2,3,4
0,0.023*prevents,0.022*know,0.021*pill,0.015*like,0.015*prevention
1,0.042*man,0.042*aids,0.039*day,0.037*world,0.035*restingplatypus
2,0.046*contracts,0.039*gay,0.033*truvada,0.029*arent,0.026*drugresistant
3,0.027*story,0.026*approve,0.021*canada,0.018*tests,0.018*country
4,0.030*reduce,0.030*help,0.027*percent,0.024*exception,0.022*infections
5,0.024*contracted,0.021*doctor,0.021*speaks,0.020*man,0.016*findings
6,0.021*latino,0.018*dont,0.017*canada,0.017*effective,0.017*risk
7,0.027*askthehivdoc,0.019*transgender,0.017*getprepla,0.017*greaterthanaids,0.016*hivplusmag
8,0.016*need,0.016*women,0.015*men,0.014*program,0.014*black
9,0.029*safe,0.027*new,0.025*aspirin,0.023*study,0.021*infections
