# Load  clean HIV tweets and sentiment tweets
notebook code adapted from https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
sentiment training data from http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip

In [6]:
import re
import json
from collections import namedtuple

import gensim

def clean_text(text):
    clean = re.sub(r'http.*$', '', text)
    clean = re.sub(r'[^a-zA-Z,\.?\'#0-9\s\-_]', '', clean)
    
    clean = clean.replace('...', '.')
    clean = clean.replace(',', ' , ')
    clean = clean.replace('#', ' # ')
    clean = clean.replace('?', ' ? ')
    clean = clean.replace('!', ' ! ')
    
    clean = clean.lower()
    word_list = clean.split()
    
    return word_list

SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')


# start with sentiment data
sentiment_docs = []
with open('../data/sentiment_analysis_dataset.csv') as sentiment_data:
    for line_no, line in enumerate(sentiment_data):
        tokens = gensim.utils.to_unicode(line).split(',')
        words = clean_text(tokens[3])
        tags = ['sen_' + str(line_no)] # `tags = [tokens[0]]` would also work at extra memory cost
        split = ['train','train','train','valid'][line_no % 4]  # 3/4 training, 1/4 validation
        sentiment = tokens[1]
        sentiment_docs.append(SentimentDocument(words, tags, split, sentiment))
        
# now add our HIV/PrEP related tweets
hiv_docs = []
with open('../data/c_twitter.json') as hiv_prep_data:
    for line_no, line in enumerate(hiv_prep_data):
        twt = json.loads(line)
        words = twt['text'].split()
        tags = [str(twt['id'])]
        split = 'extra'
        sentiment = None
        hiv_docs.append(SentimentDocument(words, tags, split, sentiment))

train_docs = [doc for doc in sentiment_docs if doc.split == 'train']
valid_docs = [doc for doc in sentiment_docs if doc.split == 'valid']
alldocs = sentiment_docs + hiv_docs

# shuffle for training
shuffledocs = alldocs[:]



In [7]:
# u_tags = [" ".join(doc.words) for doc in hiv_docs]

# print(len(set(u_tags)), len(hiv_docs))
twt['id']

701209839582756864

# Build model

In [8]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

model= Doc2Vec(dm=1, dm_mean=1, size=100, window=3, negative=5, hs=0, min_count=2, workers=cores,
           max_vocab_size=10000, seed=1)

model.build_vocab(alldocs)
print(model)

Doc2Vec(dm/m,d100,n5,w3,mc2,t6)


In [9]:
from sklearn.linear_model import LogisticRegression
import numpy as np

def fit_model_eval_accuracy(X, y, eval_X, eval_y):
    model = LogisticRegression()
    model.fit(X, y)
    
    mean_accuracy = model.score(eval_X, eval_y)
    
    return model, mean_accuracy

def predict_from_model(model, X):
    pred_y = model.predict(X)
    pred_log_prob = model.predict_log_proba(X)
    
    return (pred_y, pred_log_prob)

# Train

(should add per-epoch shuffling for faster training)

In [10]:
from random import shuffle
import datetime

passes = 3

for epoch in range(passes):
    
    # train doc vectors
    shuffle(shuffledocs)
    model.train(shuffledocs)

    # train lr model
    X = np.zeros((len(train_docs), 100))
    y = np.zeros(len(train_docs))
    for i, doc in enumerate(train_docs):
        doc_id_str = doc[1][0]
        X[i,] = model.docvecs[doc_id_str]
        y[i] = int(doc[3])
    
    # validate lr model
    X_eval = np.zeros((len(valid_docs), 100))
    y_eval = np.zeros(len(valid_docs))
    for i, doc in enumerate(valid_docs):
        doc_id_str = doc[1][0]
        X_eval[i,] = model.docvecs[doc_id_str]
        y_eval[i] = int(doc[3])
    
    lr_model, mean_accuracy = fit_model_eval_accuracy(X, y, X_eval, y_eval)
    print('epoch:', epoch, 'mean accuracy:', mean_accuracy)

epoch: 0 mean accuracy: 0.648749797292
epoch: 1 mean accuracy: 0.660060913808
epoch: 2 mean accuracy: 0.666266318009


In [11]:
# evaluate HIV/PrEP sentiment
X = np.zeros((len(hiv_docs), 100))
words = []
for i, doc in enumerate(hiv_docs):
    doc_id_str = doc[1][0]
    X[i,] = model.docvecs[doc_id_str]
    words.append(' '.join(doc[0]))
pred_y, pred_log_prob = predict_from_model(lr_model, X)

In [12]:
import pandas as pd

pred_log_prob_neg = [e[0] for e in pred_log_prob]
pred_log_prob_pos = [e[1] for e in pred_log_prob]

index = [doc[1][0] for doc in hiv_docs]
data = np.array([pred_y, pred_log_prob_neg, pred_log_prob_pos, words])
df = pd.DataFrame(data=data.transpose(),
                  columns=['pred_y', 'pred_log_prob_neg', 'pred_log_prob_pos', 'words'],
                  index=index)
# need to fill this array

In [13]:
# example positive HIV/PrEP tweets
df.sort(columns='pred_log_prob_pos', ascending=True)

  from ipykernel import kernelapp as app


Unnamed: 0,pred_y,pred_log_prob_neg,pred_log_prob_pos,words
674303273692696577,1.0,-9.06658729541,-0.000115466563585,"# truvada you have unprotected sex , you need ..."
674303273692696577,1.0,-9.06658729541,-0.000115466563585,"# truvada you have unprotected sex , you need ..."
668081058261303297,1.0,-8.90132988924,-0.000136216941978,would be kind of funny if she argued that all ...
668081058261303297,1.0,-8.90132988924,-0.000136216941978,would be kind of funny if she argued that all ...
668081058261303297,1.0,-8.90132988924,-0.000136216941978,would be kind of funny if she argued that all ...
668279022468640768,1.0,-8.85214050146,-0.000143085391922,"alanroden easy . i'll hiv the badge , thanks ...."
668279022468640768,1.0,-8.85214050146,-0.000143085391922,"alanroden easy . i'll hiv the badge , thanks ...."
668279022468640768,1.0,-8.85214050146,-0.000143085391922,"alanroden easy . i'll hiv the badge , thanks ...."
667484532770275329,1.0,-8.79616954571,-0.000151323007218,rt mridmorris hope you all saw positivelad on ...
667484532770275329,1.0,-8.79616954571,-0.000151323007218,rt mridmorris hope you all saw positivelad on ...


In [14]:
# example negative HIV/PrEP tweets
df.sort(columns='pred_log_prob_neg', ascending=True)

  from ipykernel import kernelapp as app


Unnamed: 0,pred_y,pred_log_prob_neg,pred_log_prob_pos,words
667483792345550849,0.0,-0.000101867908143,-9.19188453566,"not only is jenny mccarthy stupid , so's evry1..."
667483792345550849,0.0,-0.000101867908143,-9.19188453566,"not only is jenny mccarthy stupid , so's evry1..."
667483792345550849,0.0,-0.000101867908143,-9.19188453566,"not only is jenny mccarthy stupid , so's evry1..."
667547605388296192,0.0,-0.000106233417006,-9.14992495382,"rt momswhovax skepticpedi ugh , it's scary how..."
667547605388296192,0.0,-0.000106233417006,-9.14992495382,"rt momswhovax skepticpedi ugh , it's scary how..."
667547605388296192,0.0,-0.000106233417006,-9.14992495382,"rt momswhovax skepticpedi ugh , it's scary how..."
668407896032616448,0.0,-0.000107727647657,-9.13595816018,rt aaronlwittnebel retweeted josh robbins imst...
668407896032616448,0.0,-0.000107727647657,-9.13595816018,rt aaronlwittnebel retweeted josh robbins imst...
668407896032616448,0.0,-0.000107727647657,-9.13595816018,rt aaronlwittnebel retweeted josh robbins imst...
668142181220261889,0.0,-0.000128262345076,-8.96149695113,rt blu1x i think it's pure ignorance to call s...


In [15]:
# topic modeling on PrEP tweets
# topic modeling on HIV tweets