# Load clean HIV tweets and sentiment tweets
notebook code adapted from https://github.com/piskvorky/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb
sentiment training data from http://thinknook.com/wp-content/uploads/2012/09/Sentiment-Analysis-Dataset.zip

In [None]:
import re
import json
from collections import namedtuple

import gensim

def clean_text(text):
    """Normalise a raw tweet and split it into lowercase tokens.

    Steps, in order:

    1. Remove URLs (every run of non-whitespace characters starting with
       ``http``).
    2. Drop every character that is not an ASCII letter, a digit,
       whitespace, or one of ``, . ? ! ' # - _``.
    3. Collapse ``...`` to ``.`` and pad ``,``, ``#``, ``?`` and ``!`` with
       spaces so that each of them becomes a token of its own.
    4. Lowercase the text and split it on whitespace.

    Args:
        text: the tweet text as a string.

    Returns:
        A list of token strings (empty when nothing survives cleaning).
    """
    # Strip only the URL itself. The earlier pattern (``http.*$``) also threw
    # away every word that followed the first URL in the tweet.
    clean = re.sub(r'http\S*', '', text)
    # '!' has to survive this whitelist, otherwise the '!' padding below can
    # never match anything.
    clean = re.sub(r'[^a-zA-Z,\.?!\'#0-9\s\-_]', '', clean)

    clean = clean.replace('...', '.')
    # Surround these marks with spaces so split() yields them as separate tokens.
    clean = clean.replace(',', ' , ')
    clean = clean.replace('#', ' # ')
    clean = clean.replace('?', ' ? ')
    clean = clean.replace('!', ' ! ')

    clean = clean.lower()
    word_list = clean.split()

    return word_list

# One record per tweet: the token list, a list of document tags, the name of
# the data split the tweet belongs to, and its sentiment label (None when
# the tweet is unlabelled).
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')


# start with sentiment data
# Only fields 1 (sentiment label) and 3 (tweet text) of each row are used.
sentiment_docs = []
with open('../data/sentiment_analysis_dataset.csv') as sentiment_data:
    for line_no, line in enumerate(sentiment_data):
        # Split on the first three commas only: the tweet text is the last
        # field and may itself contain commas, which a plain split(',')
        # would cut off.
        tokens = gensim.utils.to_unicode(line).split(',', 3)
        # Skip a header row or any malformed line instead of storing a
        # non-numeric label (or crashing on a missing field).
        if len(tokens) < 4 or not tokens[1].strip().isdigit():
            continue
        words = clean_text(tokens[3])
        tags = ['sen_' + str(line_no)] # `tags = [tokens[0]]` would also work at extra memory cost
        split = ['train','train','train','valid'][line_no % 4]  # 3/4 training, 1/4 validation
        sentiment = tokens[1].strip()
        sentiment_docs.append(SentimentDocument(words, tags, split, sentiment))
        
# Unlabelled HIV/PrEP tweets, read as one JSON object per line. They carry
# no sentiment label and are all placed in the 'extra' split; each one is
# tagged with the id of the user who posted it.
with open('../data/c_twitter.json') as hiv_prep_data:
    hiv_docs = [
        SentimentDocument(
            words=tweet['text'].split(),
            tags=[tweet['user_id_str']],
            split='extra',
            sentiment=None,
        )
        for tweet in map(json.loads, hiv_prep_data)
    ]

# Partition the labelled tweets by their split marker in a single pass.
train_docs, valid_docs = [], []
for labelled in sentiment_docs:
    if labelled.split == 'train':
        train_docs.append(labelled)
    elif labelled.split == 'valid':
        valid_docs.append(labelled)

# Full corpus: labelled sentiment tweets followed by the HIV/PrEP tweets.
alldocs = sentiment_docs + hiv_docs


# Build model

In [None]:
from gensim.models import Doc2Vec
import gensim.models.doc2vec
from collections import OrderedDict
import multiprocessing

# One worker per CPU core reported by the OS.
cores = multiprocessing.cpu_count()
# NOTE(review): presumably FAST_VERSION > -1 means gensim's compiled training
# routines are available -- confirm against the installed gensim version.
assert gensim.models.doc2vec.FAST_VERSION > -1, "this will be painfully slow otherwise"

# Hyperparameters are handed straight to gensim; see the Doc2Vec API
# documentation for their exact meaning.
model = Doc2Vec(
    dm=1,
    dm_mean=1,
    size=100,
    window=3,
    negative=5,
    hs=0,
    min_count=2,
    workers=cores,
    max_vocab_size=10000,
    seed=1,
)

# The vocabulary is built from the labelled and the unlabelled tweets together.
model.build_vocab(alldocs)
print(model)

In [None]:
from sklearn.linear_model import LogisticRegression
import numpy as np

def fit_model_eval_accuracy(X, y, eval_X, eval_y):
    """Fit a default LogisticRegression and score it on held-out data.

    Args:
        X: training feature matrix.
        y: training labels.
        eval_X: evaluation feature matrix.
        eval_y: evaluation labels.

    Returns:
        A ``(classifier, mean_accuracy)`` tuple: the fitted classifier and
        the value its ``score`` method returns for the evaluation data.
    """
    classifier = LogisticRegression()
    classifier.fit(X, y)
    return classifier, classifier.score(eval_X, eval_y)

def predict_from_model(model, X):
    """Run a fitted classifier over ``X``.

    Args:
        model: object exposing ``predict`` and ``predict_log_proba``.
        X: feature matrix to classify.

    Returns:
        A ``(predictions, log_probabilities)`` tuple holding the results of
        ``model.predict(X)`` and ``model.predict_log_proba(X)``.
    """
    return model.predict(X), model.predict_log_proba(X)

# Train

(should add per-epoch shuffling for faster training)

In [None]:
from random import shuffle
import datetime

# Number of passes over the corpus.
# NOTE(review): `passes` was referenced but never defined in this notebook;
# 10 is a placeholder -- tune as needed.
passes = 10


def _doc_vectors_and_labels(d2v_model, docs):
    """Collect the learned document vector and sentiment label of each doc.

    Args:
        d2v_model: Doc2Vec model; the vector of each document is looked up
            by the document's first tag.
        docs: sequence of SentimentDocument records.

    Returns:
        A ``(features, labels)`` tuple: a ``(len(docs), vector_size)`` float
        array and a 1-d array holding each document's ``sentiment`` value.
    """
    features = np.zeros((len(docs), d2v_model.vector_size))
    for row, doc in enumerate(docs):
        features[row] = d2v_model.docvecs[doc.tags[0]]
    # Labels are kept exactly as stored on the documents.
    labels = np.array([doc.sentiment for doc in docs])
    return features, labels


# Shuffle a copy so that `alldocs` itself keeps its original order.
shuffled_docs = alldocs[:]

for epoch in range(passes):

    # train doc vectors (documents are presented in a fresh order each epoch)
    shuffle(shuffled_docs)
    # NOTE(review): newer gensim releases require explicit total_examples /
    # epochs arguments here -- confirm against the installed version.
    model.train(shuffled_docs)

    # train lr model on the training split, validate on the validation split
    X, y = _doc_vectors_and_labels(model, train_docs)
    X_eval, y_eval = _doc_vectors_and_labels(model, valid_docs)

    lr_model, mean_accuracy = fit_model_eval_accuracy(X, y, X_eval, y_eval)
    print('epoch:', epoch, 'mean accuracy:', mean_accuracy)

In [None]:
# do doc2vec training on sentiment tweets:
# tweet -> doc-vec -> classification

In [None]:
# example negative PrEP tweets
# example negative HIV tweets

In [None]:
# topic modeling on PrEP tweets
# topic modeling on HIV tweets