In [2]:
from pathlib import Path

import gensim
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

import utils



In [3]:
# Load the Twitter data from data/Twitter/ through the project helper.
# Later cells index `dfs` for DataFrames with a "timedelta" column and use the
# entries of `eids` as CSV file stems; presumably one event id per frame —
# TODO confirm against utils.get_twitter_from_dir.
eids, dfs = utils.get_twitter_from_dir("data/Twitter/")

In [8]:
def get_bins(time, bin_size, threshold):
    """Histogram ``time`` over fixed-width bins starting at 0.

    The bin edges are ``0, bin_size, 2 * bin_size, ...`` and stop strictly
    below ``threshold`` (``range`` excludes its stop value), so values at or
    above ``threshold`` -- and any value beyond the last generated edge -- are
    not counted.

    Returns the ``(counts, edges)`` pair produced by ``np.histogram``.
    """
    edges = range(0, threshold, bin_size)
    counts, bin_edges = np.histogram(time, bins=edges)
    return counts, bin_edges

# Histogram the first frame's timedeltas: 12-hour-wide bins, edges generated
# below a 90-day cutoff (timedelta appears to be in seconds — see the sample rows).
hist, bins = get_bins(
    dfs[0]["timedelta"],
    bin_size=60 * 60 * 12,
    threshold=60 * 60 * 24 * 90,
)

In [11]:
# Positions of the histogram bins whose count is non-zero.
nonzero_bin_idx = np.flatnonzero(hist)
nonzero_bin_idx

array([ 0,  1,  2,  3,  4,  5,  6,  8,  9, 16, 40, 44], dtype=int64)

In [13]:
# Gap (in bins) between consecutive non-empty bins, with a leading 0 for the first one.
np.insert(np.diff(nonzero_bin_idx), 0, 0)

array([ 0,  1,  1,  1,  1,  1,  1,  2,  1,  7, 24,  4], dtype=int64)

In [23]:
# Left edges of the bins that hold at least one observation.
bins[np.flatnonzero(hist)]

array([      0,   43200,   86400,  129600,  172800,  216000,  259200,
        345600,  388800,  691200, 1728000, 1900800])

In [4]:
def add_bins(dfs, bin_size, threshold):
    """Annotate every frame in ``dfs`` in place with time-bin columns.

    Two columns are written to each DataFrame:

    * ``bin`` -- zero-based index of the left-closed interval
      ``[k * bin_size, (k + 1) * bin_size)`` that contains the row's
      ``timedelta``. Edges are only generated strictly below ``threshold``,
      so rows at or beyond the last generated edge get NaN.
    * ``timedelta_previous_bin`` -- ``bin_size`` times the difference between
      the row's bin index and the previous row's bin index; 0 for the first
      row, which has no predecessor.

    Parameters
    ----------
    dfs : iterable of pandas.DataFrame
        Each frame must have a numeric ``timedelta`` column.
    bin_size : int
        Width of one bin, in the same unit as ``timedelta``.
    threshold : int
        Exclusive upper limit for the generated bin edges.

    Returns
    -------
    The same ``dfs`` object that was passed in (frames are modified in place).
    """
    for df in dfs:
        if df.empty:
            # Nothing to bin; max() would be NaN and int() would raise.
            df["bin"] = pd.Series(dtype="float64")
            df["timedelta_previous_bin"] = pd.Series(dtype="float64")
            continue

        # range() excludes its stop value, so add 1: this guarantees an edge
        # strictly above the largest timedelta even when that value is an exact
        # multiple of bin_size (e.g. a single-row frame with timedelta 0).
        # Without it the last row falls outside every bin and becomes NaN.
        stop = min(int(df["timedelta"].max()) + bin_size + 1, threshold)
        edges = list(range(0, stop, bin_size))
        df["bin"] = pd.cut(df["timedelta"], edges,
                           include_lowest=True, right=False, labels=False)

        gap = df["bin"].diff(periods=1) * bin_size
        # Set the first row by position, not by index label 0: the frame's
        # index is not guaranteed to start at (or even contain) 0.
        gap.iloc[0] = 0
        df["timedelta_previous_bin"] = gap
    return dfs
# Add the bin columns to every frame: 12-hour-wide bins, edges generated below
# a 90-day cutoff. add_bins modifies the frames in place and returns the same list.
dfs = add_bins(
    dfs,
    bin_size=60 * 60 * 12,
    threshold=60 * 60 * 24 * 90,
)

In [7]:
# Peek at the first two rows of the first frame to check the new columns.
dfs[0].iloc[:2]

Unnamed: 0,author_id,created_at,id,text,timedelta,bin,timedelta_previous_bin
0,14294848,2015-11-29 19:14:12+00:00,671044506154893312,Walmart has been accused of forcing a Marine t...,0.0,0,0.0
1,2896866728,2015-11-29 19:16:09+00:00,671044995600613376,Toys for Nots https://t.co/XGAnfjbRLM https://...,117.0,0,0.0


In [None]:
# Re-read the per-event CSVs, dropping the saved index column.
# NOTE(review): 22 is hard-coded — presumably len(eids); confirm before relying on it.
# NOTE(review): this rebinds `dfs`, so the frames no longer carry the columns
# written by add_bins unless that cell is run again afterwards.
dfs = [
    pd.read_csv(f"data/Twitter/{eids[i]}.csv").drop(columns=["Unnamed: 0"])
    for i in range(22)
]

In [27]:
# Stack the "text" column of every frame into one Series with a fresh 0..n-1 index.
text = pd.concat((frame["text"] for frame in dfs), ignore_index=True)

In [36]:
def get_document(texts, tokens_only=False):
    """Lazily tokenize ``texts`` with ``gensim.utils.simple_preprocess``.

    With ``tokens_only=True`` each item yielded is the bare token list.
    Otherwise each token list is wrapped in a ``TaggedDocument`` whose single
    tag is the item's position in ``texts``.
    """
    tokenized = map(gensim.utils.simple_preprocess, texts)
    if tokens_only:
        yield from tokenized
    else:
        for position, words in enumerate(tokenized):
            yield TaggedDocument(words, [position])

# Materialize the generator: the corpus is iterated more than once (vocab build, training).
corpus = [*get_document(text)]

In [37]:
# Configure a Doc2Vec model: 100-dimensional vectors, 20 epochs, and min_count=1.
# No seed is passed here, so results may differ between runs.
model = Doc2Vec(min_count=1, vector_size=100, epochs=20)
# Build the vocabulary from the tagged corpus before training.
model.build_vocab(corpus)

In [39]:
# Train on the tagged corpus, reusing the document count and epoch setting
# already stored on the model.
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [40]:
# Sanity check: infer a vector for the token list of the first corpus document.
model.infer_vector(corpus[0].words)

array([-0.17115064, -0.05264604, -0.02527478, -0.07649111,  0.08997355,
       -0.02585767,  0.07933673, -0.00032029, -0.03986941, -0.1816277 ,
        0.04672395,  0.01782438, -0.15455325, -0.04994693,  0.147877  ,
        0.05942931, -0.17510912, -0.08842883, -0.09098212,  0.12263536,
       -0.14447547,  0.06814875, -0.07894375, -0.00067529,  0.02474921,
        0.01541242, -0.1718468 , -0.0121068 , -0.0084113 , -0.08377558,
        0.00078501, -0.02814474,  0.01598635,  0.19416256, -0.06209496,
        0.19483422,  0.07029165, -0.16450347, -0.10841694,  0.0283405 ,
        0.09851441,  0.03884224,  0.03381258, -0.16646175, -0.0633166 ,
       -0.02029075, -0.09742698, -0.0827698 , -0.03703212, -0.00772073,
        0.09910872, -0.12129635,  0.15699853, -0.12124533, -0.06546462,
       -0.00630968, -0.0254756 ,  0.08579536, -0.00882947, -0.00762816,
        0.03582111,  0.06079223,  0.03300074, -0.1500538 ,  0.01795505,
       -0.09092139, -0.04543098, -0.0754821 ,  0.01237744, -0.19

In [42]:
# Persist the trained model. Create the target directory first so that saving
# does not fail on a fresh checkout where models/ does not exist yet.
model_path = Path("models/doc2vec_gensim_100.model")
model_path.parent.mkdir(parents=True, exist_ok=True)
model.save(str(model_path))