# Load  clean HIV tweets and sentiment tweets
notebook code adapted from https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
sentiment training data from http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip

In [1]:
import re
import json
from collections import namedtuple

import gensim

train = False

def clean_text(text):
    clean = re.sub(r'http.*$', '', text)
    clean = re.sub(r'[^a-zA-Z,\.?\'#0-9\s\-_]', '', clean)
    
    clean = clean.replace('...', '.')
    clean = clean.replace(',', ' , ')
    clean = clean.replace('#', ' # ')
    clean = clean.replace('?', ' ? ')
    clean = clean.replace('!', ' ! ')
    
    clean = clean.lower()
    word_list = clean.split()
    
    return word_list

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')


# start with sentiment data
sentiment_docs = []
with open('../data/sentiment_analysis_dataset.csv') as sentiment_data:
    for line_no, line in enumerate(sentiment_data):
        tokens = gensim.utils.to_unicode(line).split(',')
        words = clean_text(tokens[3])
        tags = ['sen_' + str(line_no)] # `tags = [tokens[0]]` would also work at extra memory cost
        split = ['train','train','train','valid'][line_no % 4]  # 3/4 training, 1/4 validation
        sentiment = tokens[1]
        sentiment_docs.append(SentimentDocument(words, tags, split, sentiment))
        
# now add our HIV/PrEP related tweets
hiv_docs = []
tweet_ids = set()
with open('../data/c_twitter.json') as hiv_prep_data:
    for line_no, line in enumerate(hiv_prep_data):
        twt = json.loads(line)
        words = twt['text'].split()
        tags = [str(twt['id'])]
        split = 'extra'
        sentiment = None
        if tags[0] not in tweet_ids:
            hiv_docs.append(SentimentDocument(words, tags, split, sentiment))
            tweet_ids.add(tags[0])

train_docs = [doc for doc in sentiment_docs if doc.split == 'train']
valid_docs = [doc for doc in sentiment_docs if doc.split == 'valid']
alldocs = sentiment_docs + hiv_docs

# shuffle for training
shuffledocs = alldocs[:]



# Build model

In [2]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

model= Doc2Vec(dm=1, dm_mean=1, size=100, window=3, negative=5, hs=0, min_count=2, workers=cores,
           max_vocab_size=10000, seed=1)

model.build_vocab(alldocs)
print(model)

Doc2Vec(dm/m,d100,n5,w3,mc2,t6)


In [3]:
from sklearn.linear_model import LogisticRegression
import numpy as np

def fit_model_eval_accuracy(X, y, eval_X, eval_y):
    model = LogisticRegression(random_state=1)
    model.fit(X, y)
    
    mean_accuracy = model.score(eval_X, eval_y)
    
    return model, mean_accuracy

def predict_from_model(model, X):
    pred_y = model.predict(X)
    pred_log_prob = model.predict_log_proba(X)
    
    return (pred_y, pred_log_prob)

# Train

(should add per-epoch shuffling for faster training)

In [4]:
from random import shuffle
import datetime
import pickle

passes = 20


if train:
    for epoch in range(passes):

        # train doc vectors
        shuffle(shuffledocs)
        model.train(shuffledocs)

        # train lr model
        X = np.zeros((len(train_docs), 100))
        y = np.zeros(len(train_docs))
        for i, doc in enumerate(train_docs):
            doc_id_str = doc[1][0]
            X[i,] = model.docvecs[doc_id_str]
            y[i] = int(doc[3])

        # validate lr model
        X_eval = np.zeros((len(valid_docs), 100))
        y_eval = np.zeros(len(valid_docs))
        for i, doc in enumerate(valid_docs):
            doc_id_str = doc[1][0]
            X_eval[i,] = model.docvecs[doc_id_str]
            y_eval[i] = int(doc[3])

        lr_model, mean_accuracy = fit_model_eval_accuracy(X, y, X_eval, y_eval)
        print('epoch:', epoch, 'mean accuracy:', mean_accuracy)
        
        model.save('../models/sentiment_analysis.model')
        
        s = pickle.dumps(lr_model)
        with open('../models/lr_sentiment.model', 'wb') as f_out:
            f_out.write(s)

else:
    Doc2Vec.load('../models/sentiment_analysis.model')
    
    with open('../models/lr_sentiment.model', 'rb') as f_in:
        s = f_in.read()
    lr_model = pickle.loads(s)

epoch: 0 mean accuracy: 0.647624766886
epoch: 1 mean accuracy: 0.660826137193
epoch: 2 mean accuracy: 0.666965661234
epoch: 3 mean accuracy: 0.671328954837
epoch: 4 mean accuracy: 0.674323968215
epoch: 5 mean accuracy: 0.67649801346
epoch: 6 mean accuracy: 0.679161092192
epoch: 7 mean accuracy: 0.679655193384
epoch: 8 mean accuracy: 0.681028541312
epoch: 9 mean accuracy: 0.682148504014
epoch: 10 mean accuracy: 0.68278196708
epoch: 11 mean accuracy: 0.683531987351
epoch: 12 mean accuracy: 0.684038757804
epoch: 13 mean accuracy: 0.684948410768
epoch: 14 mean accuracy: 0.685277811562
epoch: 15 mean accuracy: 0.685503324414
epoch: 16 mean accuracy: 0.685888469959
epoch: 17 mean accuracy: 0.686547271548
epoch: 18 mean accuracy: 0.68696282332
epoch: 19 mean accuracy: 0.687236479364


In [5]:
# evaluate HIV/PrEP sentiment
X = np.zeros((len(hiv_docs), 100))
words = []
for i, doc in enumerate(hiv_docs):
    doc_id_str = doc[1][0]
    X[i,] = model.docvecs[doc_id_str]
    words.append(' '.join(doc[0]))
pred_y, pred_log_prob = predict_from_model(lr_model, X)

In [6]:
import pandas as pd

pred_log_prob_neg = [e[0] for e in pred_log_prob]
pred_log_prob_pos = [e[1] for e in pred_log_prob]

index = [doc[1][0] for doc in hiv_docs]
data = np.array([pred_y, pred_log_prob_neg, pred_log_prob_pos, words])
df = pd.DataFrame(data=data.transpose(),
                  columns=['pred_y', 'pred_log_prob_neg', 'pred_log_prob_pos', 'words'],
                  index=index)
# add tweet id to this dataframe

In [7]:
# example HIV/PrEP tweets
# (positive at the top and negative at the bottom)
df = df.sort(columns='pred_log_prob_pos', ascending=True)
df

  app.launch_new_instance()


Unnamed: 0,pred_y,pred_log_prob_neg,pred_log_prob_pos,words
689891786706309125,1.0,-6.54515378747,-0.00143809662873,They'll find a positive result AmaraGoddess As...
714349130743681024,1.0,-6.31467834778,-0.00181118692129,"Man , that's a BEAUTIFUL cock -- and a nice se..."
699917225948487680,1.0,-5.6486322204,-0.00352854926873,Acts for AIDS last night was a success ! Seunj...
699917367388872704,1.0,-5.52215821151,-0.00400522200148,Acts for AIDS last night was a success ! Seunj...
710814387435651072,1.0,-5.50580928917,-0.00407137584204,Make a philanthropic gift to Casey House by Ma...
713627741988454400,1.0,-5.50513011989,-0.00407414757708,RT OddeOMontle Lately you can't even tell if a...
712310532305526784,1.0,-5.49297153723,-0.00412408871347,Make a philanthropic gift to Casey House by Ma...
716995028405657600,1.0,-5.45502233486,-0.00428394430416,Hmm . . Perhaps if you add # HIV and # AID has...
708566225383600128,1.0,-5.44290718452,-0.00433627397378,If you ever find yourself running for presiden...
699601637485072385,1.0,-5.43753475285,-0.00435968399348,"When you're on # PrEP , we'll see you every 3 ..."


In [8]:
# lets restrict to prep tweets and truvada tweets only:
prep_tweets = []
truvada_tweets = []

for words in df['words']:
    if 'PrEP' in words:
        prep_tweets.append(True)
    else:
        prep_tweets.append(False)
        
    if 'truvada' in words:
        truvada_tweets.append(True)
    else:
        truvada_tweets.append(False)


In [35]:
# most positive tweet text, prep tweet text, truvada tweet text
df[['words']][0:3].append(df[prep_tweets][['words']][0:3]).append(df[truvada_tweets][['words']][0:3])

Unnamed: 0,words
689891786706309125,They'll find a positive result AmaraGoddess As...
714349130743681024,"Man , that's a BEAUTIFUL cock -- and a nice se..."
699917225948487680,Acts for AIDS last night was a success ! Seunj...
699601637485072385,"When you're on # PrEP , we'll see you every 3 ..."
686771263336001537,Another good reason to talk to your doctor abo...
684487022732558336,RT startswith_me The amazing MikeRizzi talks H...
686771263336001537,Another good reason to talk to your doctor abo...
702554476822052864,"RT ThaDozierBoi D0wJ0nEs the truvada , you sur..."
702554117420535808,"D0wJ0nEs the truvada , you sure it wasn't the ..."


In [33]:
# most negative tweet text, prep tweet text, truvada tweet text
df[['words']][-4:-1].append(df[prep_tweets][['words']][-4:-1]).append(df[truvada_tweets][['words']][-4:-1])

Unnamed: 0,words
697170039984758784,I miss AIDS I miss cancer I miss earthquakes I...
709567201288851456,like boo boo g ? and he with this man ? that s...
691498394553028608,I wonder why he beat her ass when she was tryn...
669565035866386433,Equality_MI Syph amp Hep C have gone up 550 in...
702295297221787648,RT STAB Long-acting # PrEP delivery system did...
669564889644605441,Equality_MI Syph amp Hep C have gone up 550 in...
686408367653412867,NastyAssFvck RussianPharaoh truvada doesn't no...
703260706427297793,For a while it was total heterodoxy to suggest...
706713533224857601,I took my truvada today and Nancy Reagan died ...


# Topic modeling on positive and negative prep tweets

In [11]:
# topic modeling on top 10,000 (negative) PrEP tweets
corpus_pos = df[prep_tweets]['words'][0:10000]

# topic modeling on top 10,000 (negative) HIV tweets
corpus_neg = df[prep_tweets]['words'][-10000:-1]


In [12]:
from gensim.parsing.preprocessing import STOPWORDS
import re


STOPWORDS_ = ['rt', '', 'httpst', 'amp', 'https\'']

def clean_text(text):
    clean = re.sub(r'http.*$', '', text)
    clean = re.sub(r'[^a-zA-Z\s]', '', clean)

    clean = clean.lower()
    word_list = clean.split()
    word_list = [w for w in word_list if (w not in STOPWORDS_ and w not in STOPWORDS)]

    return word_list

corpus_pos = [clean_text(doc) for doc in corpus_pos]
corpus_neg = [clean_text(doc) for doc in corpus_neg]

In [13]:
from gensim.models.ldamodel import LdaModel
from gensim.models import TfidfModel
from gensim import corpora

def make_tfidf_corpra(texts):
    dictionary = corpora.Dictionary(texts)
    dictionary.filter_extremes(no_below=10, no_above=0.3, keep_n=100000)
    corpus = [dictionary.doc2bow(text) for text in texts]
    tfidf = TfidfModel(corpus)
    corpus = tfidf[corpus]
    
    return dictionary, corpus

dict_pos, corpus_pos = make_tfidf_corpra(corpus_pos)
dict_neg, corpus_neg = make_tfidf_corpra(corpus_neg)

if train:
    model_pos = LdaModel(corpus_pos, id2word=dict_pos, num_topics=10, iterations=50, passes=10)
    model_pos.save('../models/lda_sentiment_pos.model')
    
    model_neg = LdaModel(corpus_pos, id2word=dict_pos, num_topics=10, iterations=50, passes=10)
    model_neg.save('../models/lda_sentiment_neg.model')

else:
    model_pos = LdaModel.load('../models/lda_sentiment_pos.model')
    model_neg = LdaModel.load('../models/lda_sentiment_neg.model')

In [44]:
pd.DataFrame([row[1].split(' + ') for row in model_pos.print_topics(20)]).iloc[:,0:5]

Unnamed: 0,0,1,2,3,4
0,0.055*world,0.050*day,0.049*options,0.048*restingplatypus,0.041*aids
1,0.025*exception,0.013*tested,0.013*sexual,0.012*health,0.011*aids
2,0.036*hivpositive,0.035*rare,0.026*man,0.024*treatment,0.020*got
3,0.033*arent,0.027*positive,0.024*latino,0.024*daily,0.023*taking
4,0.050*contracts,0.038*man,0.034*daily,0.032*truvada,0.028*gay
5,0.024*reported,0.023*contract,0.018*user,0.017*gay,0.016*transgender
6,0.017*toolbox,0.016*bareback,0.016*projectinform,0.016*contracting,0.016*great
7,0.025*preexposure,0.024*prophylaxis,0.019*uk,0.018*tool,0.016*sex
8,0.039*study,0.025*aspirin,0.025*safe,0.019*according,0.018*askthehivdoc
9,0.025*doctor,0.024*contracted,0.020*know,0.020*prevents,0.020*pill


In [45]:
pd.DataFrame([row[1].split(' + ') for row in model_neg.print_topics(20)]).iloc[:,0:5]

Unnamed: 0,0,1,2,3,4
0,0.046*rare,0.024*life,0.021*bareback,0.020*bb,0.019*words
1,0.039*truvada,0.037*man,0.028*safe,0.027*daily,0.026*aspirin
2,0.033*prevents,0.027*pill,0.025*know,0.018*risk,0.016*nhs
3,0.030*exception,0.030*hivpositive,0.023*uk,0.022*man,0.020*proud
4,0.040*infections,0.028*new,0.027*cdc,0.027*reduce,0.026*percent
5,0.026*preexposure,0.023*risk,0.023*prophylaxis,0.018*women,0.016*reduce
6,0.045*world,0.041*day,0.040*options,0.040*restingplatypus,0.039*learn
7,0.019*dont,0.019*askthehivdoc,0.017*greaterthanaids,0.017*sex,0.016*getprepla
8,0.033*arent,0.026*contract,0.017*men,0.015*truvada,0.012*epidemic
9,0.028*video,0.024*latino,0.022*doctor,0.019*results,0.019*shows
