In [1]:
import numpy as np
import pandas as pd
import utils
from sklearn.utils.extmath import randomized_svd
from pathlib import Path
from collections import defaultdict
import json

In [2]:
def get_user_engagement(eids, dfs):
    """Count, per event, how many rows each author contributed.

    Every distinct ``author_id`` value is assigned a dense integer id
    (0, 1, 2, ...) the first time it is encountered while walking each
    frame's ``value_counts()`` in turn; the same id is reused for that
    author in later frames.

    Args:
        eids: Iterable of event identifiers, paired positionally with ``dfs``.
        dfs: Iterable of DataFrames, each having an ``author_id`` column.

    Returns:
        A ``(user_id, eid_engagement)`` tuple where ``user_id`` maps each
        raw ``author_id`` value to its dense integer id, and
        ``eid_engagement`` maps each event id to a dict of
        ``{dense user id: number of rows by that author}``.
        If an event id occurs more than once, the last frame wins.
    """
    user_id = {}
    eid_engagement = {}
    for eid, df in zip(eids, dfs):
        counts = df["author_id"].value_counts()
        engagement = {}
        # Series.items() replaces Series.iteritems(), which was removed in
        # pandas 2.0; items() is available on older versions as well.
        for author, n_rows in counts.items():
            # len(user_id) is evaluated before the insert, so a new author
            # receives the next unused id; a known author keeps theirs.
            dense_id = user_id.setdefault(author, len(user_id))
            engagement[dense_id] = n_rows
        eid_engagement[eid] = engagement

    return user_id, eid_engagement

def get_matrix_from_engagement(user_id, engagement):
    """Build dense user-by-event matrices from per-event engagement counts.

    Args:
        user_id: Mapping whose length gives the number of rows (users).
        engagement: Mapping of event -> ``{row index: count}``; events become
            columns in the mapping's iteration order.

    Returns:
        A ``(counts, binary)`` tuple of float arrays with shape
        ``(len(user_id), len(engagement))``. ``counts`` holds the raw counts;
        ``binary`` is ``counts`` clipped to the range [0, 1].
    """
    n_users = len(user_id)
    n_events = len(engagement)
    counts = np.zeros((n_users, n_events))

    for col, per_user in enumerate(engagement.values()):
        rows = list(per_user.keys())
        # Scatter this event's counts into its column in one assignment.
        counts[rows, col] = list(per_user.values())

    binary = np.clip(counts, 0, 1)
    return counts, binary

def get_reduced_svm_mat(mat, k_dim, random_state=42):
    """Return a reduced representation of ``mat`` via truncated randomized SVD.

    Despite the "svm" in the name, this performs a singular value
    decomposition: it computes ``U, S, Vt`` with ``randomized_svd`` and
    returns ``U @ diag(S)``; ``Vt`` is discarded.

    Args:
        mat: Matrix to decompose.
        k_dim: Passed to ``randomized_svd`` as ``n_components``.
        random_state: Seed forwarded to ``randomized_svd``.

    Returns:
        The product of the left singular vectors and the diagonal matrix of
        singular values.
    """
    left_vecs, sing_vals, _ = randomized_svd(
        mat, n_components=k_dim, random_state=random_state
    )
    scale = np.diag(sing_vals)
    return left_vecs.dot(scale)


In [13]:
# Scratch check of the pipeline pieces on a small load.
# NOTE(review): the second positional argument (5) is interpreted by
# utils.get_twitter_from_dir; presumably a limit on what is loaded -- confirm.
eids, dfs = utils.get_twitter_from_dir("data/Twitter/", 5, ["author_id"])
user_dict, engagement = get_user_engagement(eids, dfs)
# user_mat holds raw counts; user_article_mat is the same matrix clipped to [0, 1].
user_mat, user_article_mat = get_matrix_from_engagement(user_dict, engagement)

# Cell output: shape of the SVD-reduced binary matrix when 10 components are requested.
get_reduced_svm_mat(user_article_mat, 10).shape

In [14]:
# Compare row 0 of the raw-count matrix with row 0 of its clipped (0/1) counterpart.
print(user_mat[0])
print(user_article_mat[0])

[2. 0. 0. 0. 0. 0.]
[1. 0. 0. 0. 0. 0.]


In [3]:
def get_user_matrices(dir, n_components, random_state=42, return_user_dict=True):
    """Load author data from ``dir`` and return SVD-reduced user matrices.

    Args:
        dir: Location handed to ``utils.get_twitter_from_dir``.
        n_components: Indexable pair; element 0 is the component count for
            the raw-count matrix, element 1 for the binary (clipped) matrix.
        random_state: Seed forwarded to both SVD reductions.
        return_user_dict: When true, also return the author_id -> dense id
            mapping produced by ``get_user_engagement``.

    Returns:
        ``(user_mat_reduced, user_bin_mat_reduced, user_dict)`` when
        ``return_user_dict`` is true, otherwise
        ``(user_mat_reduced, user_bin_mat_reduced)``.
    """
    event_ids, frames = utils.get_twitter_from_dir(dir, columns=["author_id"])
    user_dict, per_event_counts = get_user_engagement(event_ids, frames)
    count_mat, binary_mat = get_matrix_from_engagement(user_dict, per_event_counts)

    # Reduce the count matrix first, then the binary one.
    reduced = (
        get_reduced_svm_mat(count_mat, n_components[0], random_state),
        get_reduced_svm_mat(binary_mat, n_components[1], random_state),
    )

    if not return_user_dict:
        return reduced
    return (*reduced, user_dict)

# Run the pipeline: 50 components for the count matrix, 20 for the binary one.
# NOTE: this rebinds `user_mat` and `user_dict`, which the In [13] scratch cell
# also assigns; after this line `user_mat` is the SVD-reduced matrix, not raw counts.
user_mat, user_bin_mat, user_dict = get_user_matrices("data/Twitter/", (50, 20))

In [4]:
# Persist both reduced matrices in one archive under the keys "user_mat" and
# "user_bin_mat". np.savez appends ".npz" to the filename when it is missing;
# the "models/" directory must already exist.
np.savez("models/user_matrices", user_mat=user_mat, user_bin_mat=user_bin_mat)

In [5]:
# Persist the author_id -> dense id mapping next to the matrices.
# JSON object keys are always strings, so any non-string keys in user_dict are
# written as strings and will be read back as strings.
with open("models/user_dict.json", "w") as file:
    json.dump(user_dict, file, indent=4)