# Load  clean HIV tweets and sentiment tweets
notebook code adapted from https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
sentiment training data from http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip

In [1]:
import re
import json
from collections import namedtuple

import gensim

def clean_text(text):
    clean = re.sub(r'http.*$', '', text)
    clean = re.sub(r'[^a-zA-Z,\.?\'#0-9\s\-_]', '', clean)
    
    clean = clean.replace('...', '.')
    clean = clean.replace(',', ' , ')
    clean = clean.replace('#', ' # ')
    clean = clean.replace('?', ' ? ')
    clean = clean.replace('!', ' ! ')
    
    clean = clean.lower()
    word_list = clean.split()
    
    return word_list

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')


# start with sentiment data
sentiment_docs = []
with open('../data/sentiment_analysis_dataset.csv') as sentiment_data:
    for line_no, line in enumerate(sentiment_data):
        tokens = gensim.utils.to_unicode(line).split(',')
        words = clean_text(tokens[3])
        tags = ['sen_' + str(line_no)] # `tags = [tokens[0]]` would also work at extra memory cost
        split = ['train','train','train','valid'][line_no % 4]  # 3/4 training, 1/4 validation
        sentiment = tokens[1]
        sentiment_docs.append(SentimentDocument(words, tags, split, sentiment))
        
# now add our HIV/PrEP related tweets
hiv_docs = []
tweet_ids = set()
with open('../data/c_twitter.json') as hiv_prep_data:
    for line_no, line in enumerate(hiv_prep_data):
        twt = json.loads(line)
        words = twt['text'].split()
        tags = [str(twt['id'])]
        split = 'extra'
        sentiment = None
        if tags[0] not in tweet_ids:
            hiv_docs.append(SentimentDocument(words, tags, split, sentiment))
            tweet_ids.add(tags[0])

train_docs = [doc for doc in sentiment_docs if doc.split == 'train']
valid_docs = [doc for doc in sentiment_docs if doc.split == 'valid']
alldocs = sentiment_docs + hiv_docs

# shuffle for training
shuffledocs = alldocs[:]



# Build model

In [2]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

model= Doc2Vec(dm=1, dm_mean=1, size=100, window=3, negative=5, hs=0, min_count=2, workers=cores,
           max_vocab_size=10000, seed=1)

model.build_vocab(alldocs)
print(model)

Doc2Vec(dm/m,d100,n5,w3,mc2,t6)


In [3]:
from sklearn.linear_model import LogisticRegression
import numpy as np

def fit_model_eval_accuracy(X, y, eval_X, eval_y):
    model = LogisticRegression()
    model.fit(X, y)
    
    mean_accuracy = model.score(eval_X, eval_y)
    
    return model, mean_accuracy

def predict_from_model(model, X):
    pred_y = model.predict(X)
    pred_log_prob = model.predict_log_proba(X)
    
    return (pred_y, pred_log_prob)

# Train

(should add per-epoch shuffling for faster training)

In [4]:
from random import shuffle
import datetime

passes = 3

for epoch in range(passes):
    
    # train doc vectors
    shuffle(shuffledocs)
    model.train(shuffledocs)

    # train lr model
    X = np.zeros((len(train_docs), 100))
    y = np.zeros(len(train_docs))
    for i, doc in enumerate(train_docs):
        doc_id_str = doc[1][0]
        X[i,] = model.docvecs[doc_id_str]
        y[i] = int(doc[3])
    
    # validate lr model
    X_eval = np.zeros((len(valid_docs), 100))
    y_eval = np.zeros(len(valid_docs))
    for i, doc in enumerate(valid_docs):
        doc_id_str = doc[1][0]
        X_eval[i,] = model.docvecs[doc_id_str]
        y_eval[i] = int(doc[3])
    
    lr_model, mean_accuracy = fit_model_eval_accuracy(X, y, X_eval, y_eval)
    print('epoch:', epoch, 'mean accuracy:', mean_accuracy)

epoch: 0 mean accuracy: 0.650163686856
epoch: 1 mean accuracy: 0.663058461039
epoch: 2 mean accuracy: 0.669210654342


In [5]:
# evaluate HIV/PrEP sentiment
X = np.zeros((len(hiv_docs), 100))
words = []
for i, doc in enumerate(hiv_docs):
    doc_id_str = doc[1][0]
    X[i,] = model.docvecs[doc_id_str]
    words.append(' '.join(doc[0]))
pred_y, pred_log_prob = predict_from_model(lr_model, X)

In [6]:
import pandas as pd

pred_log_prob_neg = [e[0] for e in pred_log_prob]
pred_log_prob_pos = [e[1] for e in pred_log_prob]

index = [doc[1][0] for doc in hiv_docs]
data = np.array([pred_y, pred_log_prob_neg, pred_log_prob_pos, words])
df = pd.DataFrame(data=data.transpose(),
                  columns=['pred_y', 'pred_log_prob_neg', 'pred_log_prob_pos', 'words'],
                  index=index)
# need to fill this array

In [10]:
# example HIV/PrEP tweets
# (positive at the top and negative at the bottom)
df = df.sort(columns='pred_log_prob_pos', ascending=True)
df

  app.launch_new_instance()


Unnamed: 0,pred_y,pred_log_prob_neg,pred_log_prob_pos,words
692436204294422529,1.0,-5.92488787486,-0.00267568372999,alurajenson hey amazingly cute alura ! shall l...
674123081405730818,1.0,-5.88489037633,-0.00278502567484,lambdalegal great to see your support on hiv a...
697319095293898752,1.0,-5.80961099404,-0.00300310111488,rt featherssa well done you guys . doing a gre...
697900253387890688,1.0,-5.78522483129,-0.0030773496643,mmatos11 that's so awesome ! good for you ! i'...
669807939281096704,1.0,-5.55589222822,-0.00387210574383,rt thtscotland hiv amp hepatitis testing at ou...
671814882300329984,1.0,-5.54220750006,-0.00392556354241,rt pepperxo_ pepperxo_ oh but hiv turns into a...
678768026162884608,1.0,-5.4829058357,-0.00416589720911,"rt maroon_bottom thanks pia , btw , for bringi..."
686189903600848896,1.0,-5.48194189792,-0.00416992319905,hi hivblogger i follow your blog and you share...
668078375110594560,1.0,-5.36398258482,-0.00469321751785,jennymccarthy begging you to stop talking abou...
671830137558720513,1.0,-5.34992460102,-0.0047598190055,rt mlgconline if you have a lovely singing voi...


In [13]:
# most positive tweet text
df['words'][0]

"alurajenson hey amazingly cute alura ! shall let you know that i'll passing an alcohol , drugs and hiv test via gp as you requested my dear ."

In [14]:
# most negative tweet text
df['words'][-1]

"hiv sucks big time ! ! and i f # n hate it ! ! we're almost there until this one stupid infection brought him back to where he started . shit ! ! !"

In [9]:
# topic modeling on PrEP tweets
# topic modeling on HIV tweets