In [1]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [2]:
# Load the tweet data through the project helper, keeping only the columns
# used below (created_at for ordering/binning, text for the Doc2Vec corpus).
# NOTE(review): presumably `eids` are event ids paired one-to-one with the
# DataFrames in `dfs` -- confirm against utils.get_twitter_from_dir.
eids, dfs = utils.get_twitter_from_dir(
    "data/Twitter/",
    columns=["author_id", "created_at", "text"],
)

In [3]:
def timedelta(df):
    """Sort tweets chronologically and add the seconds elapsed since the first.

    The frame is modified in place: ``created_at`` is parsed into datetimes,
    the rows are sorted by it, a ``timedelta`` column (float seconds relative
    to the first row after sorting) is added, and the index is renumbered
    from 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``created_at`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after the in-place changes.
    """
    df["created_at"] = pd.to_datetime(df["created_at"])
    df.sort_values(by="created_at", inplace=True)

    # After sorting, the first row carries the reference timestamp.
    first_timestamp = df["created_at"].iloc[0]
    elapsed = df["created_at"] - first_timestamp
    df["timedelta"] = elapsed.dt.total_seconds()

    df.reset_index(drop=True, inplace=True)
    return df

def add_bins(df, bin_size):
    """Assign each row to a fixed-width time bin and record the gap to the
    previous non-empty bin.

    Adds two columns to ``df`` in place:

    * ``bin`` -- index of the ``bin_size``-wide interval containing the row's
      ``timedelta``. Intervals are right-closed; the first one also includes
      its left edge (0). Rows whose ``timedelta`` is missing or negative get
      NaN.
    * ``timedelta_previous_bin`` -- for every row, the number of bins between
      its bin and the previous occupied bin (0 for the first occupied bin,
      NaN where ``bin`` is NaN).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric ``timedelta`` column (seconds).
    bin_size : int
        Width of one bin, in the same unit as ``timedelta``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the two columns added.
    """
    max_delta = df["timedelta"].max()

    # Always build at least one full bin, and round the bin count *up* so the
    # last edge is >= the largest timedelta. Truncating the maximum with int()
    # dropped fractional maxima just past an edge, and a maximum of 0 (single
    # row / identical timestamps) produced a lone edge and therefore no bins.
    if pd.isna(max_delta):
        n_bins = 1
    else:
        n_bins = max(1, int(np.ceil(max_delta / bin_size)))
    edges = np.arange(n_bins + 1) * bin_size

    df["bin"] = pd.cut(df["timedelta"], edges,
                       include_lowest=True, right=True, labels=False)

    # Sort explicitly so the gaps are correct even if the rows are not already
    # in chronological order; drop NaN so unbinned rows do not pollute the map.
    non_empty_bins = np.sort(df["bin"].dropna().unique())
    bin_timedelta_map = dict(zip(non_empty_bins[1:], np.diff(non_empty_bins)))
    if len(non_empty_bins):
        # The first occupied bin has no predecessor (it need not be bin 0).
        bin_timedelta_map[non_empty_bins[0]] = 0

    # Series.map yields NaN for rows without a bin instead of raising KeyError.
    df["timedelta_previous_bin"] = df["bin"].map(bin_timedelta_map)
    return df

def cut_bins(df, threshold):
    """Keep only the rows whose ``bin`` index does not exceed ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``bin`` column.
    threshold : int or None
        Largest bin index to keep (inclusive). ``None`` disables the cut.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself when ``threshold`` is ``None``; otherwise the ``.loc``
        selection of rows with ``bin <= threshold``.
    """
    if threshold is not None:
        within_limit = df["bin"] <= threshold
        return df.loc[within_limit]
    return df

# Width of one time bin in seconds (60 * 60 = one hour).
bin_size = 60 * 60
# Largest bin index to keep; None keeps every bin.
# A value such as 2 * 90 would keep bins 0..180, i.e. roughly 7.5 days of
# one-hour bins.
threshold = None

# Replace each frame with its sorted, binned and (optionally) truncated version.
for i in range(len(dfs)):
    df = dfs[i]
    dfs[i] = cut_bins(add_bins(timedelta(df), bin_size), threshold)


In [9]:
# Stack the text column of every frame into one Series with a fresh 0..n-1
# index; that position is later used as the document tag.
text = pd.concat(
    [frame["text"] for frame in dfs],
    ignore_index=True,
)

In [14]:
def get_document(texts, tokens_only=False):
    """Lazily tokenise ``texts`` for Doc2Vec.

    Parameters
    ----------
    texts : iterable of str
        Raw documents, tokenised with ``gensim.utils.simple_preprocess``.
    tokens_only : bool, default False
        When True, yield the bare token lists; otherwise wrap each list in a
        ``TaggedDocument`` tagged with the document's position in ``texts``.

    Yields
    ------
    list of str or TaggedDocument
        One item per input document, in input order.
    """
    for position, raw in enumerate(texts):
        words = gensim.utils.simple_preprocess(raw)
        yield words if tokens_only else TaggedDocument(words, [position])

# Materialise the generator: the corpus is consumed more than once below
# (vocabulary building and training).
corpus = [*get_document(text)]

In [19]:
# Sanity check: number of tagged documents (one per row of `text`).
len(corpus)

787785

In [15]:
# Doc2Vec with 100-dimensional document vectors, trained for 20 epochs.
model = Doc2Vec(
    vector_size=100,
    min_count=1,
    epochs=20,
)
# Scan the corpus once to build the vocabulary before training.
model.build_vocab(corpus)

In [16]:
# `total_examples` must be the number of training documents. build_vocab()
# stores that count on the model as `corpus_count`; Doc2Vec has no `corpus`
# attribute, so referencing it raises AttributeError.
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [20]:
# Smoke test: infer a vector from the first document's tokens and confirm it
# has the configured dimensionality.
model.infer_vector(corpus[0].words).shape

(100,)

In [21]:
# Persist the trained model to disk so it can be reloaded without retraining.
# NOTE(review): presumably the "models/" directory must already exist -- confirm.
model.save("models/doc2vec_gensim_100.model")