In [88]:
import numpy as np
import pandas as pd
import utils
from sklearn.utils.extmath import randomized_svd
from pathlib import Path
from collections import defaultdict

In [2]:
# Load the conversation index through the project helper; the returned `eids`
# are used below as the CSV file stems under data/Twitter/.
eids, labels, twitter_ids = utils.get_twitter_conversations("data/Twitter.txt")

In [3]:
# Read the first 22 conversation CSVs (one per eid) and discard the
# "Unnamed: 0" column that each file carries.
dfs = [
    pd.read_csv(f"data/Twitter/{eids[i]}.csv").drop(columns=["Unnamed: 0"])
    for i in range(22)
]

In [5]:
# Peek at the author_id column of the first conversation frame.
dfs[0].author_id

0       18235216
1       18235216
2      799246688
3       13719342
4      204428239
         ...    
74      10796562
75     477066816
76    2484679886
77    2331758420
78     552221011
Name: author_id, Length: 79, dtype: int64

In [58]:
# Stack the author_id column of every frame, then reduce it to the sorted set
# of distinct authors together with how many rows each author contributed.
all_author_ids = pd.concat([frame["author_id"] for frame in dfs], ignore_index=True)
authors, count = np.unique(all_author_ids.to_numpy(), return_counts=True)
n_authors = authors.shape[0]

In [61]:
# Row counts for authors that occur more than once across all loaded frames
# (rows are counted, so repeated posts inside one conversation are included).
count[count > 1]

array([ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2, 17, 24,  3,
        2,  9,  2,  2, 11,  2,  2,  2,  2,  2,  2,  2,  3, 18,  3,  2,  2,
        2,  4, 17,  2,  2,  2,  3,  2,  2,  2,  2,  3,  2,  2,  4,  2,  2,
        2,  3,  2, 18,  3,  2,  2,  4,  3,  2,  2,  2,  3,  3,  3,  3,  2,
        3,  3,  2,  3,  2,  3,  2,  2,  2,  3,  3,  2,  3,  3,  2,  2,  2,
        2,  2,  3,  2,  2,  3,  7,  2,  2,  2,  5,  2,  9,  2,  2,  4,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  4,  3,  3,  5,  2,  2,  2,  3,  2,
        2,  2,  2,  2,  2,  2,  2,  2,  3,  2,  2,  2,  2,  2,  3,  3,  2,
        6,  2,  4,  3, 10, 12,  2,  2,  4,  2,  2,  9,  2,  2,  2,  3,  2,
        3,  6,  4,  2,  4,  2,  2,  2,  3,  3,  2,  3,  5,  2,  2,  2,  3,
        7,  2,  2,  2,  2, 11,  2,  2,  2,  2, 10,  2,  3,  2,  3,  2,  2,
        3,  2,  4,  2,  2,  2,  3,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,
        4,  2,  2,  3,  2,  2,  3,  2,  2,  2,  2,  2,  2,  2,  3,  3,  2,
        2,  2,  2,  5,  2

In [14]:
# Map each distinct author id to its row position in `authors`.
author_id_dict = {author: position for position, author in enumerate(authors)}

In [43]:
# One column per loaded conversation frame. Deriving the count from `dfs`
# (instead of repeating the literal 22) keeps the matrix width in step with
# the fill loop, which writes column i for the i-th frame.
n_doc = len(dfs)
# Author x document incidence matrix, filled in by the next cell.
m = np.zeros((n_authors, n_doc))

In [44]:
# Flag every (author, conversation) pair that occurs at least once.
for i, df in enumerate(dfs):
    # Translate this frame's author ids into matrix row positions.
    author_idx = list(map(author_id_dict.__getitem__, df["author_id"]))
    m[author_idx, i] = 1

In [56]:
# 10-component randomized SVD of the author x document matrix; the fixed
# random_state keeps the factorization repeatable between runs.
u,s,v = randomized_svd(m, n_components=10, n_iter=7, random_state=42)

In [57]:
# Shape of the left factor scaled by the singular values, U @ diag(s).
np.dot(u, np.diag(s)).shape

(4521, 10)

In [114]:
def get_twitter_from_dir(dir):
    """Load every ``*.csv`` file located directly inside ``dir``.

    Files are visited in sorted path order so that the returned lists (and
    anything derived from their order, such as matrix columns or user
    indices) are the same on every run and every filesystem.

    Parameters
    ----------
    dir : str or path-like
        Directory to search (non-recursive) for ``*.csv`` files.

    Returns
    -------
    eids : list of str
        File stems (file name without the ``.csv`` suffix), one per file.
    dfs : list of pandas.DataFrame
        Result of ``pd.read_csv`` for each file, in the same order as ``eids``.
    """
    # Path.glob yields entries in filesystem-dependent order; sort to make the
    # output deterministic.
    files = sorted(Path(dir).glob("*.csv"))
    eids = [f.stem for f in files]
    dfs = [pd.read_csv(f) for f in files]
    return eids, dfs

#eids, dfs = get_twitter_from_dir("data/Twitter/")

In [98]:
def get_user_engagement(eids, dfs):
    """Count how many rows each author contributed to each conversation.

    Parameters
    ----------
    eids : iterable
        Conversation identifiers, paired positionally with ``dfs``.
    dfs : iterable of pandas.DataFrame
        One frame per conversation; each must have an ``author_id`` column.

    Returns
    -------
    user_id : dict
        Maps each distinct ``author_id`` value to a dense integer index.
        Indices are assigned in order of first encounter, iterating each
        frame's ``value_counts()`` result in turn.
    eid_engagement : dict
        Maps each eid to a ``{user index: row count}`` dict for that frame.
    """
    user_id = {}
    eid_engagement = {}
    for eid, df in zip(eids, dfs):
        engagement = {}
        # Series.iteritems() was removed in pandas 2.0; items() yields the
        # same (index, value) pairs on old and new versions.
        for author, n_rows in df["author_id"].value_counts().items():
            # Reuse the author's index if already known, otherwise assign the
            # next free one (len is evaluated before the insert).
            u_id = user_id.setdefault(author, len(user_id))
            engagement[u_id] = n_rows
        eid_engagement[eid] = engagement

    return user_id, eid_engagement

# Build the author -> index mapping and the per-conversation row counts for
# the frames currently bound to `dfs`.
user_dict, engagement = get_user_engagement(eids, dfs)

In [109]:
def get_matrix_from_engagement(user_id, engagement):
    """Turn per-conversation engagement dicts into user x conversation matrices.

    Parameters
    ----------
    user_id : dict
        Only its length is used: it fixes the number of rows.
    engagement : dict
        Maps each conversation to a ``{row index: count}`` dict; conversations
        become columns in the dict's iteration order.

    Returns
    -------
    tuple of numpy.ndarray
        ``(counts, clipped)`` where ``counts`` holds the raw values and
        ``clipped`` is ``counts`` limited to the range [0, 1].
    """
    counts = np.zeros((len(user_id), len(engagement)))
    for col, per_user in enumerate(engagement.values()):
        rows = list(per_user)
        counts[rows, col] = [per_user[row] for row in rows]
    clipped = counts.clip(0, 1)
    return counts, clipped

# user_mat holds the raw counts; user_article_mat is the same matrix clipped
# to the range [0, 1].
user_mat, user_article_mat = get_matrix_from_engagement(user_dict, engagement)

In [112]:
def get_reduced_svm_mat(mat, k_dim, random_state=42):
    """Return the rows of ``mat`` expressed in ``k_dim`` SVD components.

    Runs ``randomized_svd`` with ``n_components=k_dim`` and returns the left
    factor multiplied by the diagonal matrix of singular values.

    Parameters
    ----------
    mat : array-like
        Matrix to factorize.
    k_dim : int
        Number of components to keep.
    random_state : int, default 42
        Seed forwarded to ``randomized_svd``.
    """
    left, singular_values, _ = randomized_svd(
        mat, n_components=k_dim, random_state=random_state
    )
    return left @ np.diag(singular_values)

# Preview the 10-component reduction of the clipped user x conversation matrix.
get_reduced_svm_mat(user_article_mat, 10)

array([[2.71331346e-03, 4.14388831e-04, 4.91648326e-02, ...,
        2.32631888e-01, 2.16046345e-01, 5.17130943e-01],
       [2.71331346e-03, 4.14388831e-04, 4.91648326e-02, ...,
        2.32631888e-01, 2.16046345e-01, 5.17130943e-01],
       [2.28862688e-02, 1.22755569e-02, 6.55109011e-01, ...,
        7.07769406e-01, 6.49125403e-01, 1.24303116e+00],
       ...,
       [7.41105841e-04, 3.51695693e-04, 3.97647564e-02, ...,
        1.20563391e-01, 1.10140572e-01, 1.85311725e-01],
       [7.41105841e-04, 3.51695693e-04, 3.97647564e-02, ...,
        1.20563391e-01, 1.10140572e-01, 1.85311725e-01],
       [7.41105841e-04, 3.51695693e-04, 3.97647564e-02, ...,
        1.20563391e-01, 1.10140572e-01, 1.85311725e-01]])

In [115]:
def get_user_matrices(dir, n_components, random_state=42, return_user_dict=True):
    """Run the full pipeline: CSV directory -> reduced user matrices.

    Loads every CSV in ``dir``, builds the user x conversation count matrix
    and its [0, 1]-clipped counterpart, and reduces each with a truncated SVD.

    Parameters
    ----------
    dir : str or path-like
        Directory passed to ``get_twitter_from_dir``.
    n_components : sequence of two ints
        ``n_components[0]`` is the number of components kept for the count
        matrix, ``n_components[1]`` for the clipped matrix.
    random_state : int, default 42
        Seed forwarded to both SVD reductions.
    return_user_dict : bool, default True
        Currently has no effect: two matrices are always returned.
        NOTE(review): the default of True suggests the user mapping was meant
        to be returned as well, but honoring it would change the default
        return arity for existing callers. Confirm the intended contract.

    Returns
    -------
    tuple of numpy.ndarray
        ``(user_mat_reduced, user_article_mat_reduced)``.
    """
    eids, dfs = get_twitter_from_dir(dir)
    user_dict, engagement = get_user_engagement(eids, dfs)
    user_mat, user_article_mat = get_matrix_from_engagement(user_dict, engagement)
    # Forward the caller's seed; previously it was accepted but silently
    # dropped, so the helper always ran with its own default.
    user_mat_reduced = get_reduced_svm_mat(
        user_mat, n_components[0], random_state=random_state
    )
    user_article_mat_reduced = get_reduced_svm_mat(
        user_article_mat, n_components[1], random_state=random_state
    )
    return user_mat_reduced, user_article_mat_reduced

# Run the whole pipeline on the CSV directory, keeping 10 components for each
# of the two matrices.
get_user_matrices("data/Twitter/", (10, 10))

(array([[ 1.26640704e-02,  5.56844689e-03, -8.21037344e-05, ...,
          5.72974196e-02,  1.40018374e-01,  1.07380362e+00],
        [ 1.26640704e-02,  5.56844689e-03, -8.21037343e-05, ...,
          5.72974196e-02,  1.40018374e-01,  1.07380362e+00],
        [ 2.05533950e+00,  1.79618333e-03, -2.29091239e-03, ...,
          2.54822749e-01,  4.99712253e-01,  3.49806976e+00],
        ...,
        [ 4.90517858e-03,  7.44784183e-04, -2.80940497e-06, ...,
          2.50154138e-02,  4.46212818e-02,  2.38419088e-01],
        [ 4.90517858e-03,  7.44784183e-04, -2.80940497e-06, ...,
          2.50154138e-02,  4.46212818e-02,  2.38419088e-01],
        [ 4.90517858e-03,  7.44784183e-04, -2.80940497e-06, ...,
          2.50154138e-02,  4.46212818e-02,  2.38419088e-01]]),
 array([[2.71331346e-03, 4.14388831e-04, 4.91648326e-02, ...,
         2.32631888e-01, 2.16046345e-01, 5.17130943e-01],
        [2.71331346e-03, 4.14388831e-04, 4.91648326e-02, ...,
         2.32631888e-01, 2.16046345e-01, 5.1713