In [1]:
import os
# Move the working directory two levels up; the relative chdir and paths in later cells are resolved from here.
# NOTE(review): not idempotent — re-running this cell without restarting the kernel moves up two more levels.
os.chdir('../..')

In [2]:
import convokit
import pickle
import json

In [3]:
# Directory containing the pickled inputs loaded below. It is a relative path, so it is
# resolved against whatever the working directory is when each file is opened.
DATA_DIR = "data_sliding_fixed"

In [4]:
# Switch into convokit/tensors (relative to the current working directory) so that DATA_DIR
# and the corpus path used in later cells resolve from there.
# NOTE(review): relative chdir — re-running this cell in the same kernel session will not land in the same place.
os.chdir('convokit/tensors')

In [5]:
# Load the pickled `rank_to_factors` object; a later cell indexes it as rank_to_factors[9][0..2].
# NOTE(review): pickle.load can execute arbitrary code from the file — only load trusted data.
with open(os.path.join(DATA_DIR, 'rank_to_factors.p'), 'rb') as f:
    rank_to_factors = pickle.load(f)

In [6]:
# Load two more pickled objects from DATA_DIR.
# NOTE(review): presumably `hg_features` names the feature axis of the tensor — TODO confirm.
with open(os.path.join(DATA_DIR, 'hg_features.p'), 'rb') as f:
    hg_features = pickle.load(f)

# NOTE(review): presumably the subreddits covered by the data — TODO confirm.
with open(os.path.join(DATA_DIR, 'subreddits.p'), 'rb') as f:
    subreddits = pickle.load(f)

In [7]:
from sklearn.preprocessing import StandardScaler
import numpy as np
# Notebook-level StandardScaler instance (default settings), shared by any cell that references `scaler`.
scaler = StandardScaler()

In [8]:
def get_anomalous_points(factor_full, idx, threshold=1.5):
    """Find rows whose loading on one factor is unusually high or low.

    Column ``idx`` of ``factor_full`` is standardised to zero mean and unit
    (population) standard deviation, i.e. the same z-score transformation that
    ``sklearn.preprocessing.StandardScaler`` applies. Rows whose z-score lies
    strictly above ``threshold`` or strictly below ``-threshold`` are reported.

    The z-score is computed locally with numpy, so the function neither
    depends on nor mutates any notebook-level scaler object.

    Parameters
    ----------
    factor_full : 2-D array-like
        Factor matrix; rows are items, columns are factors.
    idx : int
        Index of the column (factor) to inspect.
    threshold : float, optional
        Z-score cutoff. Defaults to 1.5.

    Returns
    -------
    (pos_pts, neg_pts) : tuple of 1-D integer arrays
        Row indices with z-score > threshold and z-score < -threshold
        respectively. Both are empty for an empty or constant column.
    """
    factor = np.asarray(factor_full)[:, idx]

    # Nothing to standardise: report no anomalies instead of failing on mean/std of an empty array.
    if factor.shape[0] == 0:
        return np.array([], dtype=int), np.array([], dtype=int)

    std = factor.std()  # population standard deviation (ddof=0)
    # A (numerically) constant column has no spread; dividing by ~0 would yield NaN/inf,
    # so fall back to a unit scale, which makes every z-score zero.
    if std <= 10 * np.finfo(float).eps:
        std = 1.0
    scaled = (factor - factor.mean()) / std

    pos_pts = np.flatnonzero(scaled > threshold)
    neg_pts = np.flatnonzero(scaled < -threshold)
    return pos_pts, neg_pts

In [10]:
# Pull the three factor matrices out of the entry keyed by 9 (presumably the decomposition rank).
# The trailing comments are the author's recorded shapes for each matrix.
time_factor = rank_to_factors[9][0] # (9, 9)
thread_factor = rank_to_factors[9][1] # (10000, 9)
feature_factor = rank_to_factors[9][2] # (140, 9)

In [11]:
# Load the pickled conversation IDs.
# NOTE(review): presumably these label the rows of thread_factor — confirm that their order
# matches the order in which conversations are later looked up.
with open(os.path.join(DATA_DIR, 'convo_ids.p'), 'rb') as f:
    thread_ids = pickle.load(f)

In [12]:
from convokit import Corpus, download
# Build the corpus from a path relative to the current working directory (changed by the chdir cells above).
corpus = Corpus(filename="longreddit_construction/long-reddit-corpus")

In [13]:
# Tag every utterance with its 0-based position in its conversation's chronological
# utterance list, stored under meta['order'] (read later when printing conversation structure).
for convo in corpus.iter_conversations():
    for idx, utt in enumerate(convo.get_chronological_utterance_list()):
        utt.meta['order'] = idx

In [14]:
def get_convo_details(convo):
    """Print a conversation's subreddit followed by its reply structure.

    The subreddit is read from the metadata of the utterance whose id equals
    the conversation id. Each utterance in the printed structure is labelled
    "<order>. <user id>", where <order> comes from utt.meta['order'].
    The structure printer is called with limit=20.
    """
    def _label(utt):
        # Label format: "<order>. <user id>"
        return str(utt.meta['order']) + ". " + utt.user.id

    head_utt = convo.get_utterance(convo.id)
    print("Subreddit: {}".format(head_utt.meta['subreddit']))
    convo.print_conversation_structure(_label, limit=20)

In [15]:
# Materialise the corpus's conversations as a list so they can be looked up by integer position.
convos = list(corpus.iter_conversations())

In [16]:
from convokit import HyperConvo

In [17]:
import random

## Inspecting threads from each factor

In [18]:
class color:
    """ANSI escape sequences for styling text printed to a terminal or notebook output."""

    # Foreground colours
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset: append after styled text to return to default rendering
    END = '\033[0m'

In [19]:
# Number of factors, taken as the number of columns of time_factor.
rank = time_factor.shape[1]
# How many positive and how many negative example threads to print per factor.
num_examples = 3

In [20]:
# For every factor (column of thread_factor), print a few randomly chosen threads whose
# loading on that factor is anomalously high ("positive") or anomalously low ("negative").
# NOTE(review): thread indices are used to index `convos` directly, which assumes the rows of
# thread_factor follow the corpus iteration order — confirm against thread_ids.
for idx in range(rank):
    print("#########################################################")
    print(color.BOLD + "Inspecting threads from factor {}".format(idx+1) + color.END)
    print()
    pos_threads, neg_threads = get_anomalous_points(thread_factor, idx)

    print(color.BOLD + color.GREEN + "Positive examples" + color.END)
    print()
    # random.sample raises ValueError when asked for more items than the population holds,
    # so cap the sample size at the number of anomalous threads found for this factor.
    for thread_idx in random.sample(list(pos_threads), min(num_examples, len(pos_threads))):
        get_convo_details(convos[thread_idx])
        print()

    print()
    print(color.BOLD + color.RED + "Negative examples" + color.END)
    for thread_idx in random.sample(list(neg_threads), min(num_examples, len(neg_threads))):
        get_convo_details(convos[thread_idx])
        print()

    print("#########################################################")
    print()

#########################################################
[1mInspecting threads from factor 1[0m

[1m[92mPositive examples[0m

Subreddit: Random_Acts_Of_Amazon
1. browniebiznatch
    2. waxler9311
        3. browniebiznatch
            5. waxler9311
                6. browniebiznatch
                    9. waxler9311
                        13. browniebiznatch
                            14. waxler9311
    4. InThisHouse19
        7. browniebiznatch
            8. InThisHouse19
                10. browniebiznatch
                    11. InThisHouse19
                        12. browniebiznatch
                            15. InThisHouse19
                                16. browniebiznatch
                                    17. InThisHouse19
                                        18. browniebiznatch
                                            19. InThisHouse19
                                        20. wine-inacan

Subreddit: Libertarian
1. Spelchek860
    2. GenitalDiddler
   