In [None]:
from helper_functions import *

In [None]:
import multiprocessing

import gensim.models.word2vec as w2v
import numpy as np
from gensim import corpora, models, similarities

def make_gram_vec(vec_slice):
    """Return the position-weighted average of the vectors in ``vec_slice``.

    The vector at 0-based position ``i`` is scaled by ``i + 1``, so later
    entries count more. The scaled vectors are summed element-wise and the
    result is divided by the sum of the weights (1 + 2 + ... + n).

    Args:
        vec_slice: sequence of equally shaped numeric vectors.

    Returns:
        The weighted mean vector.
    """
    # Weight k (1-based) is applied to the k-th vector of the slice.
    weighted = [vec * weight for weight, vec in enumerate(vec_slice, start=1)]
    total_weight = sum(range(1, len(weighted) + 1))
    return np.sum(np.array(weighted), axis=0) / total_weight

def sentence_vector_ngram(vecs, num_grams):
    """Combine word vectors into one sentence vector using sliding n-grams.

    Every window of ``num_grams`` consecutive vectors is reduced with
    ``make_gram_vec`` and the resulting n-gram vectors are summed
    element-wise.

    A sentence with fewer than ``num_grams`` vectors is treated as a single
    n-gram spanning the whole sentence. Previously such sentences produced no
    windows at all, so the function returned the scalar ``0.0`` and silently
    discarded every word vector.

    Args:
        vecs: sequence of equally shaped word vectors, in sentence order.
        num_grams: window size; ``1`` yields the plain sum of ``vecs``.

    Returns:
        The sentence vector. For empty ``vecs`` the result is the scalar
        ``0.0`` (the dimensionality is unknown here), as before.
    """
    # Never ask for a window wider than the sentence itself.
    window = min(num_grams, len(vecs))
    if window <= 1:
        # Covers num_grams == 1, one-word sentences and empty input: a window
        # of one is just the vector itself, so the result is the plain sum.
        return np.sum(np.array(vecs), axis=0)
    gram_vecs = [
        make_gram_vec(vecs[start:start + window])
        for start in range(len(vecs) - window + 1)
    ]
    return np.sum(np.array(gram_vecs), axis=0)

def calculate_sentence_vector(vecs, num_grams=4):
    """Build a sentence vector from word vectors using n-gram weighting.

    Thin wrapper around ``sentence_vector_ngram``.

    Args:
        vecs: sequence of word vectors for one sentence, in order.
        num_grams: n-gram window size. Defaults to 4, the value that used to
            be hard-coded, so existing single-argument calls are unaffected.

    Returns:
        Whatever ``sentence_vector_ngram(vecs, num_grams)`` returns.
    """
    return sentence_vector_ngram(vecs, num_grams)

def get_sentence_vectors(tokens, model):
    """Convert tokenised sentences into sentence vectors with a word2vec model.

    For each sentence, the vectors of its in-vocabulary tokens are looked up
    and combined by ``calculate_sentence_vector``.

    Args:
        tokens: iterable of token lists, one list per sentence.
        model: trained model exposing ``model.wv`` with ``in`` membership,
            item lookup and ``vector_size``.

    Returns:
        List with one vector per input sentence, in input order. Sentences
        for which no vector could be built (for example, no token is in the
        vocabulary) get a zero vector of length ``model.wv.vector_size``
        instead of a bare scalar, so every entry has the same shape.
    """
    word_vectors = model.wv
    dim = word_vectors.vector_size
    vecsindexed = []
    for clean_tokens in tokens:
        # Membership is tested on the keyed vectors directly; there is no need
        # to copy the whole vocabulary into a separate set first.
        vecs = [word_vectors[token] for token in clean_tokens if token in word_vectors]
        final = calculate_sentence_vector(vecs)
        if np.ndim(final) == 0:
            # Summing an empty collection collapses to the scalar 0.0, which
            # cannot be combined with real embeddings later on.
            final = np.zeros(dim, dtype=np.float32)
        vecsindexed.append(final)
    return vecsindexed

def make_word2vec_model(sentences, params=None, epoch_count=10):
    """Build and train a word2vec model on tokenised sentences.

    Args:
        sentences: sized collection of token lists (``len()`` is taken and it
            is iterated more than once).
        params: optional dict overriding any of the default hyperparameters
            ``min_count``, ``window``, ``sample``, ``sg``, ``negative`` and
            ``num_features``.
        epoch_count: number of training epochs (default 10, as before).

    Returns:
        The trained ``Word2Vec`` model.

    NOTE(review): ``size=`` and ``wv.vocab`` are gensim 3.x spellings; gensim 4
    renamed them (``vector_size=``, ``wv.key_to_index``). Confirm the pinned
    gensim version before upgrading.
    """
    hyper = {'min_count': 3, 'window': 5, 'sample': 0.001, 'sg': 0, 'negative': 5, 'num_features': 768}
    if params:
        hyper.update(params)

    sentence_count = len(sentences)
    print("w2v training data contained: " + str(sentence_count) + " sentences.")
    num_workers = multiprocessing.cpu_count()

    # 'negative' used to be declared in the hyperparameter dict but never
    # forwarded, so editing it had no effect. It is now passed through.
    word2vec = w2v.Word2Vec(sg=hyper["sg"],
                            seed=1,
                            workers=num_workers,
                            size=hyper["num_features"],
                            min_count=hyper["min_count"],
                            window=hyper["window"],
                            sample=hyper["sample"],
                            negative=hyper["negative"])

    word2vec.build_vocab(sentences)
    print("Training model with vocabulary length:", len(word2vec.wv.vocab))
    word2vec.train(sentences, total_examples=sentence_count, epochs=epoch_count)
    return word2vec

In [None]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument
import multiprocessing

def make_doc2vec_model(sentences):
    """Build and train a doc2vec (distributed memory) model.

    Each sentence becomes a ``TaggedDocument`` tagged with its index in
    ``sentences``.

    The model is created without a corpus, then the vocabulary is built and
    training runs exactly once. Previously the corpus was passed to the
    ``Doc2Vec`` constructor, which already builds the vocabulary and trains,
    and ``train()`` was then called again, so the model was trained twice.

    Args:
        sentences: iterable of token lists, one per document.

    Returns:
        The trained ``Doc2Vec`` model.
    """
    num_workers = multiprocessing.cpu_count()

    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(sentences)]
    alpha = 0.025
    # No corpus here on purpose: supplying it would trigger an implicit
    # training pass inside the constructor.
    doc2vec = Doc2Vec(dm=1,
                      alpha=alpha,
                      min_alpha=0.00025,
                      window=2,
                      min_count=1,
                      vector_size=768,
                      workers=num_workers)
    doc2vec.build_vocab(documents)
    print("Training model with vocabulary length:", len(doc2vec.wv.vocab))

    epoch_count = 10
    print("Training doc2vec")
    doc2vec.train(documents, total_examples=len(documents), epochs=epoch_count)
    return doc2vec

In [None]:
# Read the token lists and the raw texts written by
# preprocess_and_tokenize_tweets.ipynb. Later cells pair the vector built from
# tokens[i] with texts[i], so the two files are expected to line up by index.
prefix = ""  # optional filename prefix shared by every file under preprocessed/
print("Loading tokens")
tokens = load_json("preprocessed/{}tokens.json".format(prefix))
print(len(tokens))
print("Loading texts")
texts = load_json("preprocessed/{}texts.json".format(prefix))
print(len(texts))

In [None]:
# Train word2vec on the token lists, turn every sentence into a vector and
# store a text -> vector mapping on disk.
print("Building w2v model")
w2v_model = make_word2vec_model(tokens)
print("Built")
print("Converting texts to vectors")
w2v_vectors = get_sentence_vectors(tokens, w2v_model)
print(len(w2v_vectors))
# The i-th text gets the i-th sentence vector; a repeated text keeps the
# vector of its last occurrence.
w2v_text_vec = {t: w2v_vectors[i] for i, t in enumerate(texts)}
print("Saving")
save_bin(w2v_text_vec, "preprocessed/{}word2vec_text_vec.pkl".format(prefix))
print("Done")

In [None]:
# Train doc2vec on the token lists and store a text -> document vector
# mapping on disk.
print("Building d2v model")
d2v_model = make_doc2vec_model(tokens)
print("Model built")
# Document i was tagged with i when the model was built, so docvecs[i] is the
# vector belonging to texts[i].
d2v_text_vec = {text: d2v_model.docvecs[i] for i, text in enumerate(texts)}
print("Saving")
save_bin(d2v_text_vec, "preprocessed/{}doc2vec_text_vec.pkl".format(prefix))
print("Done")

In [None]:
from sentence_transformers import SentenceTransformer
print("Instantiating BERT model")
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')

# Converting sentences to BERT vectors is time-consuming (about an hour per
# 100,000 sentences), so previously saved vectors are loaded first and only
# the texts that have not been converted yet are encoded.
#
# One path is used for the existence check, the load and the save. The check
# used to ignore `prefix`, so with a non-empty prefix it looked at the wrong
# file.
bert_cache_path = "preprocessed/" + prefix + "bert_text_vec.pkl"
old_bert_text_vec = {}
print("Loading existing vectors")
if os.path.exists(bert_cache_path):
    old_bert_text_vec = load_bin(bert_cache_path)
    print(len(old_bert_text_vec))
already_processed = set(old_bert_text_vec)
not_processed = list(set(texts).difference(already_processed))
print("Not processed: " + str(len(not_processed)))

# This encodes the vectors. There is no output during the process, so just be
# patient. The call is skipped when every text is already cached.
if not_processed:
    bert_vectors = bert_model.encode(not_processed)
else:
    bert_vectors = []
print("Vectors encoded")

# Combine newly created BERT vectors with those that were already saved
for i, t in enumerate(not_processed):
    old_bert_text_vec[t] = bert_vectors[i]
# Keep only the texts of the current run, in their current order.
bert_text_vec = {}
for t in texts:
    bert_text_vec[t] = old_bert_text_vec[t]

# Save the new set
print("Saving vectors")
save_bin(bert_text_vec, bert_cache_path)
print("Done")

In [None]:
# Build one meta embedding per text by summing its BERT, doc2vec and word2vec
# sentence vectors element-wise, then save the list for the next notebook.
combined_vecs = []
for i, text in enumerate(texts):
    # Progress marker every 100,000 texts.
    if not i % 100000:
        print(i)
    parts = [bert_text_vec[text], d2v_text_vec[text], w2v_text_vec[text]]
    combined_vecs.append(np.sum(parts, axis=0))
print("Saving")
save_bin(combined_vecs, "preprocessed/{}combined_vecs.pkl".format(prefix))
print("Done. Now execute tweet_graph_analysis.ipynb")