# Load  clean HIV tweets and sentiment tweets
notebook code adapted from https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
sentiment training data from http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip

In [1]:
import re
import json
from collections import namedtuple

import gensim

train = False

def clean_text(text):
    clean = re.sub(r'http.*$', '', text)
    clean = re.sub(r'[^a-zA-Z,\.?\'#0-9\s\-_]', '', clean)
    
    clean = clean.replace('...', '.')
    clean = clean.replace(',', ' , ')
    clean = clean.replace('#', ' # ')
    clean = clean.replace('?', ' ? ')
    clean = clean.replace('!', ' ! ')
    
    clean = clean.lower()
    word_list = clean.split()
    
    return word_list

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')


# start with sentiment data
sentiment_docs = []
with open('../data/sentiment_analysis_dataset.csv') as sentiment_data:
    for line_no, line in enumerate(sentiment_data):
        tokens = gensim.utils.to_unicode(line).split(',')
        words = clean_text(tokens[3])
        tags = ['sen_' + str(line_no)] # `tags = [tokens[0]]` would also work at extra memory cost
        split = ['train','train','train','valid'][line_no % 4]  # 3/4 training, 1/4 validation
        sentiment = tokens[1]
        sentiment_docs.append(SentimentDocument(words, tags, split, sentiment))
        
# now add our HIV/PrEP related tweets
hiv_docs = []
tweet_ids = set()
with open('../data/c_twitter.json') as hiv_prep_data:
    for line_no, line in enumerate(hiv_prep_data):
        twt = json.loads(line)
        words = twt['text'].split()
        tags = [str(twt['id'])]
        split = 'extra'
        sentiment = None
        if tags[0] not in tweet_ids:
            hiv_docs.append(SentimentDocument(words, tags, split, sentiment))
            tweet_ids.add(tags[0])

train_docs = [doc for doc in sentiment_docs if doc.split == 'train']
valid_docs = [doc for doc in sentiment_docs if doc.split == 'valid']
alldocs = sentiment_docs + hiv_docs

# shuffle for training
shuffledocs = alldocs[:]



# Build model

In [2]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

model= Doc2Vec(dm=1, dm_mean=1, size=100, window=3, negative=5, hs=0, min_count=2, workers=cores,
           max_vocab_size=10000, seed=1)

model.build_vocab(alldocs)
print(model)

Doc2Vec(dm/m,d100,n5,w3,mc2,t6)


In [3]:
from sklearn.linear_model import LogisticRegression
import numpy as np

def fit_model_eval_accuracy(X, y, eval_X, eval_y):
    model = LogisticRegression(random_state=1)
    model.fit(X, y)
    
    mean_accuracy = model.score(eval_X, eval_y)
    
    return model, mean_accuracy

def predict_from_model(model, X):
    pred_y = model.predict(X)
    pred_log_prob = model.predict_log_proba(X)
    
    return (pred_y, pred_log_prob)

# Train

(should add per-epoch shuffling for faster training)

In [4]:
from random import shuffle
import datetime
import pickle

passes = 3


if train:
    for epoch in range(passes):

        # train doc vectors
        shuffle(shuffledocs)
        model.train(shuffledocs)

        # train lr model
        X = np.zeros((len(train_docs), 100))
        y = np.zeros(len(train_docs))
        for i, doc in enumerate(train_docs):
            doc_id_str = doc[1][0]
            X[i,] = model.docvecs[doc_id_str]
            y[i] = int(doc[3])

        # validate lr model
        X_eval = np.zeros((len(valid_docs), 100))
        y_eval = np.zeros(len(valid_docs))
        for i, doc in enumerate(valid_docs):
            doc_id_str = doc[1][0]
            X_eval[i,] = model.docvecs[doc_id_str]
            y_eval[i] = int(doc[3])

        lr_model, mean_accuracy = fit_model_eval_accuracy(X, y, X_eval, y_eval)
        print('epoch:', epoch, 'mean accuracy:', mean_accuracy)
        
        model.save('../models/sentiment_analysis.model')
        
        s = pickle.dumps(lr_model)
        with open('../models/lr_sentiment.model', 'wb') as f_out:
            f_out.write(s)

else:
    Doc2Vec.load('../models/sentiment_analysis.model')
    
    with open('../models/lr_sentiment.model', 'rb') as f_in:
        s = f_in.read()
    lr_model = pickle.loads(s)

epoch: 0 mean accuracy: 0.643405902862
epoch: 1 mean accuracy: 0.655003344685
epoch: 2 mean accuracy: 0.660484067137


In [5]:
# evaluate HIV/PrEP sentiment
X = np.zeros((len(hiv_docs), 100))
words = []
for i, doc in enumerate(hiv_docs):
    doc_id_str = doc[1][0]
    X[i,] = model.docvecs[doc_id_str]
    words.append(' '.join(doc[0]))
pred_y, pred_log_prob = predict_from_model(lr_model, X)

In [6]:
import pandas as pd

pred_log_prob_neg = [e[0] for e in pred_log_prob]
pred_log_prob_pos = [e[1] for e in pred_log_prob]

index = [doc[1][0] for doc in hiv_docs]
data = np.array([pred_y, pred_log_prob_neg, pred_log_prob_pos, words])
df = pd.DataFrame(data=data.transpose(),
                  columns=['pred_y', 'pred_log_prob_neg', 'pred_log_prob_pos', 'words'],
                  index=index)
# add tweet id to this dataframe

In [7]:
# example HIV/PrEP tweets
# (positive at the top and negative at the bottom)
df = df.sort(columns='pred_log_prob_pos', ascending=True)
df

  app.launch_new_instance()


Unnamed: 0,pred_y,pred_log_prob_neg,pred_log_prob_pos,words
671633859272974336,1.0,-7.61052503556,-0.000495334444267,"rt nnenna # worldaidsday when , like me , you ..."
671631381542395904,1.0,-6.97999700344,-0.000930738994106,"rt nnenna # worldaidsday when , like me , you ..."
677613948762324992,1.0,-6.84803582579,-0.00106210252473,"johnqabc hey john , recorded the shows to catc..."
671627969195044864,1.0,-6.47689961974,-0.00153975827466,"rt nnenna # worldaidsday when , like me , you ..."
698705569604116480,1.0,-6.20903615668,-0.00201320011589,"being single is great ! , no fighting , arguin..."
671637358819831808,1.0,-6.17407567194,-0.00208490205255,"rt nnenna # worldaidsday when , like me , you ..."
672196831061876736,1.0,-5.90563004802,-0.00272778196493,"# apple hey , apple , with your new headphone ..."
699917225948487680,1.0,-5.80318616094,-0.00302248693484,acts for aids last night was a success ! seunj...
669485269012189188,1.0,-5.79649065499,-0.00304282283778,hey charlie sheen u got hiv u matter no perfec...
699917367388872704,1.0,-5.75672457696,-0.00316645768357,acts for aids last night was a success ! seunj...


In [8]:
# lets restrict to prep tweets and truvada tweets only:
prep_tweets = []
truvada_tweets = []

for words in df['words']:
    if 'prep' in words:
        prep_tweets.append(True)
    else:
        prep_tweets.append(False)
        
    if 'truvada' in words:
        truvada_tweets.append(True)
    else:
        truvada_tweets.append(False)


In [9]:
# most positive tweet text, prep tweet text, truvada tweet text
print(df['words'][0], '\n')
print(df[prep_tweets]['words'][0], '\n')
print(df[truvada_tweets]['words'][0], '\n')

rt nnenna # worldaidsday when , like me , you have family living with hiv you learn you fear less you plan you obey you live right you edu 

greg0wen hi greg , i'm a hiv doctor and health writer doing a piece on hivprep could i poss contact you for some info ? thanks , verity x 

great skypeing the john grant just now about bowie , icelandic sagas and truvada ! you're right elliot_rose about the song disappointing ! 



In [10]:
# most negative tweet text, prep tweet text, truvada tweet text

print(df['words'][-1], '\n')
print(df[prep_tweets]['words'][-1], '\n')
print(df[truvada_tweets]['words'][-1], '\n')

its not about his hiv status its about the shower . . shows he stupid , no not unaware not misinformed no . stupid , people dont want a stupid leader 

i'm so glad not to be on fb or ig fb owned anymore . # hiv # truvada # prep folks if you can't find greg here is why 

i'm so glad not to be on fb or ig fb owned anymore . # hiv # truvada # prep folks if you can't find greg here is why 



# Topic modeling on positive and negative prep tweets

In [11]:
# topic modeling on top 10,000 (negative) PrEP tweets
corpus_pos = df[prep_tweets]['words'][0:10000]

# topic modeling on top 10,000 (negative) HIV tweets
corpus_neg = df[prep_tweets]['words'][-10000:-1]


In [12]:
from gensim.parsing.preprocessing import STOPWORDS
import re


STOPWORDS_ = ['rt', '', 'httpst', 'amp', 'https\'']

def clean_text(text):
    clean = re.sub(r'http.*$', '', text)
    clean = re.sub(r'[^a-zA-Z\s]', '', clean)

    clean = clean.lower()
    word_list = clean.split()
    word_list = [w for w in word_list if (w not in STOPWORDS_ and w not in STOPWORDS)]

    return word_list

corpus_pos = [clean_text(doc) for doc in corpus_pos]
corpus_neg = [clean_text(doc) for doc in corpus_neg]

In [13]:
from gensim.models.ldamodel import LdaModel
from gensim.models import TfidfModel
from gensim import corpora

def make_tfidf_corpra(texts):
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=10, no_above=0.3, keep_n=100000)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = TfidfModel(corpus)
    corpus = tfidf[corpus]
    
    return dictionary, corpus

dict_pos, corpus_pos = make_tfidf_corpra(corpus_pos)
dict_neg, corpus_neg = make_tfidf_corpra(corpus_neg)

if train:
    model_pos = LdaModel(corpus_pos, id2word=dict_pos, num_topics=10, iterations=50, passes=10)
    model_pos.save('../models/lda_sentiment_pos.model')
    
    model_neg = LdaModel(corpus_pos, id2word=dict_pos, num_topics=10, iterations=50, passes=10)
    model_neg.save('../models/lda_sentiment_neg.model')

else:
    model_positive = LdaModel.load('../models/lda_sentiment_pos.model')
    model_positive = LdaModel.load('../models/lda_sentiment_neg.model')

In [14]:
model_pos.print_topics(20)

[(0,
  '0.019*clinic + 0.018*new + 0.017*condoms + 0.017*infections + 0.016*use + 0.016*prescription + 0.015*spread + 0.015*cdc + 0.015*negative + 0.013*testing'),
 (1,
  '0.104*world + 0.102*day + 0.097*options + 0.097*restingplatypus + 0.090*aids + 0.079*learn + 0.062*prevention + 0.009*individuals + 0.009*queer + 0.008*washingtonpost'),
 (2,
  '0.028*high + 0.027*risk + 0.019*france + 0.019*benefit + 0.016*cdcgov + 0.015*approves + 0.015*free + 0.013*gre + 0.013*study + 0.013*year'),
 (3,
  '0.018*nhpc + 0.017*gilead + 0.015*support + 0.013*conversation + 0.012*community + 0.012*prevention + 0.012*europe + 0.012*hivandhepatitis + 0.011*gets + 0.011*heated'),
 (4,
  '0.029*option + 0.023*prevention + 0.021*cjpatruno + 0.016*party + 0.015*thebodydotcom + 0.013*equal + 0.012*approval + 0.012*tasp + 0.012*potential + 0.012*gilead'),
 (5,
  '0.021*prophylaxis + 0.021*prevent + 0.020*preexposure + 0.017*askthehivdoc + 0.017*daily + 0.016*greaterthanaids + 0.015*taking + 0.015*pill + 0.014

In [15]:
model_neg.print_topics(20)

[(0,
  '0.019*finds + 0.017*cases + 0.016*infection + 0.015*clinic + 0.015*prevent + 0.012*fulton + 0.012*gregwen + 0.012*nocost + 0.012*stigma + 0.011*fights'),
 (1,
  '0.026*option + 0.019*cjpatruno + 0.018*prevention + 0.014*equal + 0.013*effective + 0.012*south + 0.012*drug + 0.012*discuss + 0.012*preventing + 0.010*big'),
 (2,
  '0.015*askthehivdoc + 0.015*americans + 0.013*right + 0.013*dont + 0.013*greaterthanaids + 0.013*like + 0.012*women + 0.012*help + 0.011*trial + 0.011*stay'),
 (3,
  '0.031*reduce + 0.028*risk + 0.022*gilead + 0.021*daily + 0.020*video + 0.019*physicians + 0.019*need + 0.019*cdc + 0.017*medscape + 0.017*getting'),
 (4,
  '0.026*available + 0.021*prescription + 0.020*drugs + 0.014*stop + 0.011*doctors + 0.011*drug + 0.011*new + 0.011*victoria + 0.011*lgbt + 0.011*k'),
 (5,
  '0.018*new + 0.016*zero + 0.015*community + 0.014*queensland + 0.013*potential + 0.013*w + 0.013*thebodydotcom + 0.013*prevention + 0.012*san + 0.012*year'),
 (6,
  '0.083*world + 0.079