In [1]:
import gensim
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import utils
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json



In [2]:
eids, dfs = utils.get_twitter_from_dir("data/Twitter/", columns=["author_id", "created_at", "text"])

In [3]:
eids_, labels, _ = utils.get_twitter_conversations("data/Twitter.txt")
eid_label = {}
for eid, label in zip(eids_, labels):
    eid_label[eid] = label
with open("models/eid_label_dict.json", "w") as file:
    json.dump(eid_label, file, indent=4)

In [4]:
model = Doc2Vec.load("models/doc2vec_gensim_100.model")
with open("models/user_dict.json", "r") as file:
    user_dict = json.load(file)
with open("models/eid_label_dict.json", "r") as file:
    eid_label_dict = json.load(file)
with np.load("models/user_matrices.npz") as mats:
    user_mat = mats["user_mat"]
    user_bin_mat = mats["user_bin_mat"]

In [5]:
def get_document(texts, tokens_only=False):
    for i, text in enumerate(texts):
        tokens = gensim.utils.simple_preprocess(text)
        if tokens_only:
            yield tokens
        else:
            yield TaggedDocument(tokens, [i])

def add_embedding(df, doc2word_model):
    tokens = list(get_document(df["text"].to_list(), True))
    embeddings = [doc2word_model.infer_vector(tweet_tokens) for tweet_tokens in tokens]
    df["embedding"] = embeddings
    return df

def add_author(df, user_dict):
    df["author"] = df["author_id"].apply(lambda x: user_dict[str(x)])
    return df

def add_user_information(df, user_bin_mat):
    df["author_info"] = df["author"].apply(lambda x: user_bin_mat[x])
    return df

def to_model_input(df):
    foo = df[["bin", "timedelta_previous_bin", "embedding", "author_info"]]\
        .groupby(by="bin")\
        .agg(count=("bin", "size"),
             timedelta_previous_bin=("timedelta_previous_bin", "max"),
             embedding=("embedding", "mean"),
             author_info=("author_info", "mean"))\
        .reset_index()
    return np.vstack(foo.drop("bin", axis=1).apply(lambda x: np.hstack(x.values), axis=1).values)

def to_model_user_input(df, user_mat):
    return np.vstack(df["author"].apply(lambda x: user_mat[x]).values)


model_data = []
bin_size = 60 * 60 * 12 #  12 hours
threshold = 2 * 90 #  180 bins = 180 * 12 hours = 90 days
for i, (eid, df) in enumerate(zip(eids, dfs)):
    df = utils.cut_bins(utils.add_bins(utils.timedelta(df), bin_size), threshold)
    df = add_embedding(df, model)
    df = add_author(df, user_dict)
    df = add_user_information(df, user_bin_mat)
    model_data.append([eid, eid_label_dict[eid], to_model_input(df), to_model_user_input(df, user_mat)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["embedding"] = embeddings
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["author"] = df["author_id"].apply(lambda x: user_dict[str(x)])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["author_info"] = df["author"].apply(lambda x: user_bin_mat[x])


In [6]:
len(model_data)

992

In [7]:
dataset_df = pd.DataFrame(model_data, columns=["eid", "label", "x", "y"])

In [29]:
data_np = dataset_df.to_numpy()

In [27]:
dataset_df.to_csv("data/processed/data_bin12h_cut180_100w_20xu_50yu.npz", index=False)

In [8]:
np.save("data/processed/data_bin12h_cut180_100w_20xu_50yu.npy", dataset_df.to_numpy())

In [39]:
pd.DataFrame(np.load("data/processed/data_bin12h_cut180_100w_20xu_50yu.npy", allow_pickle=True))

Unnamed: 0,0,1,2,3
0,Airfrance,0,"[[31.0, 0.0, -0.04933199658989906, 0.100782543...","[[5.728417000232961e-07, 7.497009531531812e-12..."
1,Airliner,0,"[[35.0, 0.0, -0.09998174756765366, 0.010489981...","[[2.8766544422301893e-07, 6.536015973668471e-1..."
2,Amanda,0,"[[2.0, 0.0, -0.022407064214348793, -0.11954444...","[[0.0002461661780663931, 3.0744367038198896e-0..."
3,AnnieLe,0,"[[1.0, 0.0, 0.06573211401700974, 0.16498757898...","[[1.6659823040681458e-10, 2.4046260842342657e-..."
4,BarnesNobleObamaMonkey,0,"[[2.0, 0.0, -0.08572602272033691, 0.1293545961...","[[3.3745972849026395e-06, 2.2055798996612257e-..."
...,...,...,...,...
987,Turkishcrash,0,"[[596.0, 0.0, -0.018301477655768394, 0.0502989...","[[3.328299035345699e-06, 8.289281424581946e-11..."
988,twittersummize,0,"[[45.0, 0.0, -0.003588458988815546, 0.11465315...","[[4.780339256276481e-07, 8.799779182761624e-10..."
989,Vanessa,0,"[[22.0, 0.0, 0.03274635970592499, 0.1191900223...","[[0.003943954130925907, 1.7515275933934912e-06..."
990,WesternSpaghetti,0,"[[1.0, 0.0, -0.0018492384115234017, 0.08881287...","[[1.3914953201835611e-10, 1.5261085385096275e-..."
