In [1]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [68]:
# Load the ids and the matching list of tweet frames from the Twitter data directory.
# NOTE(review): the meaning of the second argument (5) is defined in utils — confirm there.
eids, dfs = utils.get_twitter_from_dir("data/Twitter/", 5)

In [69]:
def timedelta(df):
    """Order the rows chronologically and add the seconds elapsed since the first one.

    The frame is modified in place: ``created_at`` is parsed to datetimes, the
    rows are sorted by it, a ``timedelta`` column (seconds since the earliest
    ``created_at``) is added, and the index is renumbered from 0.

    Returns:
        The same ``df`` object that was passed in.
    """
    parsed = pd.to_datetime(df["created_at"])
    df["created_at"] = parsed
    df.sort_values(by="created_at", inplace=True)
    # After sorting, the first positional row holds the earliest timestamp.
    earliest = df["created_at"].iloc[0]
    elapsed = df["created_at"] - earliest
    df["timedelta"] = elapsed.dt.total_seconds()
    df.reset_index(drop=True, inplace=True)
    return df

def add_bins(df, bin_size):
    """Assign each row to a fixed-width time bin and record the gap to the previous occupied bin.

    Adds two columns to ``df`` in place and returns the same frame:

    * ``bin`` -- zero-based index of the ``bin_size``-wide interval that
      contains the row's ``timedelta``. Intervals are right-closed,
      ``(k * bin_size, (k + 1) * bin_size]``, except that bin 0 also
      contains 0.
    * ``timedelta_previous_bin`` -- difference between the row's bin index and
      the closest lower bin index that holds at least one row (0 for the
      lowest occupied bin).

    Args:
        df: frame with a numeric ``timedelta`` column in the same unit as
            ``bin_size``.
        bin_size: width of one bin; a positive number.

    Returns:
        ``df`` itself, with the two columns added.
    """
    latest = df["timedelta"].max()
    # Round the number of bins up and never go below one interval. This keeps
    # a frame whose timedeltas are all 0 (e.g. a single row) binnable, and
    # guarantees the last edge is not smaller than a fractional maximum.
    # A missing maximum (empty or all-NaN column) falls back to one interval.
    n_bins = 1 if pd.isna(latest) else max(int(np.ceil(latest / bin_size)), 1)
    edges = np.arange(n_bins + 1) * bin_size
    df["bin"] = pd.cut(df["timedelta"], edges,
                       include_lowest=True, right=True, labels=False)
    # Sort the occupied bins so the gaps do not depend on row order, and give
    # the lowest occupied bin a gap of 0 whatever its index is.
    occupied = np.sort(df["bin"].dropna().unique())
    gaps = pd.Series(np.diff(occupied, prepend=occupied[:1]), index=occupied)
    df["timedelta_previous_bin"] = df["bin"].map(gaps)
    return df

def cut_bins(df, threshold):
    """Return the rows of ``df`` whose ``bin`` value is at most ``threshold``."""
    within_limit = df["bin"] <= threshold
    return df.loc[within_limit]

bin_size = 12 * 60 * 60  # width of one bin: 12 hours, in seconds
threshold = 90 * 2  # highest bin index kept: two 12-hour bins per day over 90 days
# Replace every frame in the list with its time-ordered, binned and truncated version.
for i, df in enumerate(dfs):
    with_deltas = timedelta(df)
    with_bins = add_bins(with_deltas, bin_size)
    dfs[i] = cut_bins(with_bins, threshold)

In [70]:
# Print the number of rows in each frame, one per line.
for frame in dfs:
    n_rows = len(frame)
    print(n_rows)

38
79
161
3
34
29


In [64]:
# Display the frame at position 3 of the list for a visual check.
dfs[3]

Unnamed: 0,author_id,created_at,id,text,timedelta,bin,timedelta_previous_bin
0,6428152,2008-03-05 02:17:04+00:00,766826971,McCain is the predicted GOP winner (what a com...,0.0,0,0
1,11546552,2008-03-05 20:05:46+00:00,767201559,George W. Bush Dances On The Grave Of McCain's...,64122.0,1,1
2,2260601,2008-05-23 05:18:26+00:00,818031867,«Il est toujours plus convenable d'avoir 20 an...,6836482.0,158,157


In [None]:
# Reload the raw CSVs for the first `max_events` ids, dropping the saved index column.
# Slicing `eids` (instead of indexing it with a fixed range) avoids an IndexError
# when fewer than `max_events` ids are available.
max_events = 22
dfs = [
    pd.read_csv(f"data/Twitter/{eid}.csv").drop(["Unnamed: 0"], axis=1)
    for eid in eids[:max_events]
]

In [27]:
# Stack the `text` column of every frame into one Series with a fresh 0..n-1 index.
text_columns = [frame["text"] for frame in dfs]
text = pd.concat(text_columns, ignore_index=True)

In [36]:
def get_document(texts, tokens_only=False):
    """Lazily tokenize ``texts`` with gensim's ``simple_preprocess``.

    Yields the bare token list for each text when ``tokens_only`` is true;
    otherwise yields a ``TaggedDocument`` whose single tag is the text's
    position in ``texts``.
    """
    for position, raw in enumerate(texts):
        words = gensim.utils.simple_preprocess(raw)
        yield words if tokens_only else TaggedDocument(words, [position])

# Materialise the generator into a list so the corpus can be iterated more than once.
corpus = list(get_document(text))

In [37]:
# Doc2Vec with 100-dimensional vectors and 20 training epochs; min_count=1 keeps
# every token, including those that occur only once.
model = Doc2Vec(min_count=1, vector_size=100, epochs=20)
# The vocabulary has to be built from the tagged documents before training.
model.build_vocab(corpus)

In [39]:
# Train on the tagged corpus, passing the document count and epoch count stored on the model.
model.train(corpus, total_examples=model.corpus_count, epochs=model.epochs)

In [40]:
# Sanity check: infer a vector for the token list of the first tagged document.
first_doc = corpus[0]
model.infer_vector(first_doc.words)

array([-0.17115064, -0.05264604, -0.02527478, -0.07649111,  0.08997355,
       -0.02585767,  0.07933673, -0.00032029, -0.03986941, -0.1816277 ,
        0.04672395,  0.01782438, -0.15455325, -0.04994693,  0.147877  ,
        0.05942931, -0.17510912, -0.08842883, -0.09098212,  0.12263536,
       -0.14447547,  0.06814875, -0.07894375, -0.00067529,  0.02474921,
        0.01541242, -0.1718468 , -0.0121068 , -0.0084113 , -0.08377558,
        0.00078501, -0.02814474,  0.01598635,  0.19416256, -0.06209496,
        0.19483422,  0.07029165, -0.16450347, -0.10841694,  0.0283405 ,
        0.09851441,  0.03884224,  0.03381258, -0.16646175, -0.0633166 ,
       -0.02029075, -0.09742698, -0.0827698 , -0.03703212, -0.00772073,
        0.09910872, -0.12129635,  0.15699853, -0.12124533, -0.06546462,
       -0.00630968, -0.0254756 ,  0.08579536, -0.00882947, -0.00762816,
        0.03582111,  0.06079223,  0.03300074, -0.1500538 ,  0.01795505,
       -0.09092139, -0.04543098, -0.0754821 ,  0.01237744, -0.19

In [42]:
# Persist the trained model under models/ so it can be loaded again later.
model.save("models/doc2vec_gensim_100.model")