In [1]:
import os
# Move the working directory three levels up so that the relative
# "convokit/..." paths used when loading the corpus resolve
# (presumably this lands in the repository root — confirm).
# NOTE: the path is relative, so re-running this cell moves up three more levels.
os.chdir('../../..')

In [2]:
import convokit

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [4]:
from convokit import Corpus, HyperConvo, TensorDecomposer

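First we load the corpus from a local path (by default the annotated synthetic trajectory corpus; the Reddit trajectory subset is available as a commented-out alternative):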

In [5]:
# Alternative corpora (swap one of these in for the active line to use it):
# corpus = Corpus(filename="convokit/thread_generator/fake-corpus-trajectory-40")
# corpus = Corpus(filename="convokit/tensor_decomposer/experiments/reddit-trajectory-subset-annotated")

# Active choice: the annotated fake-trajectory experiment corpus, loaded from a
# path relative to the current working directory.
corpus = Corpus(
    filename="convokit/thread_generator/annotated-fake-trajectory-experiment"
)

In [6]:
# Sanity check: print the corpus-level counts (speakers, utterances, conversations).
corpus.print_summary_stats()

Number of Speakers: 40
Number of Utterances: 375000
Number of Conversations: 15000


In [43]:
# Prefix lengths 2 through 20 (inclusive). Only referenced by the commented-out
# multi_hyperconv_transform cell, which builds one HyperConvo per prefix length.
hyperconv_range = range(2, 21)

In [8]:
# Disabled one-off preprocessing, kept for reference: runs one HyperConvo transformer per
# prefix length in hyperconv_range (longest prefix first) and then dumps the annotated corpus.
# def multi_hyperconv_transform(corpus, hyperconv_range):
#     hc_transformers = [HyperConvo(prefix_len=i, feat_name="hyperconvo-{}".format(i), invalid_val=-1) for i in hyperconv_range]
#     for idx, hc in enumerate(list(reversed(hc_transformers))):
#         print(hyperconv_range[-1]-idx)
#         hc.transform(corpus)
# multi_hyperconv_transform(corpus, hyperconv_range)
# corpus.dump('annotated-fake-trajectory-40', base_path="convokit/thread_generator")

In [44]:
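# Disabled alternative labelling, kept for reference: a conversation counts as "grown" when the
# number of distinct speakers over all of its utterances is at least `thresh` times the number
# of distinct speakers among its first 20 utterances.
# grown = 0
# thresh = 1.5
# for convo in corpus.iter_conversations():
#     utts40 = convo.get_chronological_utterance_list()
#     utts20 = utts40[:20]
#     num_spkrs_20 = len(set(utt.speaker.id for utt in utts20))
#     num_spkrs_40 = len(set(utt.speaker.id for utt in utts40))
#     convo.meta['grown'] = (num_spkrs_40 / num_spkrs_20) >= thresh
#     grown += (num_spkrs_40 / num_spkrs_20) >= thresh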

In [8]:
# Label every conversation with a boolean 'grown' flag and count the positives.
# The flag is True when the reciprocity-motif count stored under 'hyperconvo-25'
# is at least `thresh` times the count stored under 'hyperconvo-20'.
thresh = 1.5
grown = 0
for convo in corpus.iter_conversations():
    try:
        later_count = convo.meta['hyperconvo-25']['count[reciprocity motif]']
        earlier_count = convo.meta['hyperconvo-20']['count[reciprocity motif]']
        recip_div = later_count / earlier_count
    except ZeroDivisionError:
        # The 'hyperconvo-20' count is zero, so the ratio is undefined.
        # NOTE(review): the fallback 2 only means "grown" while thresh <= 2, and it
        # also labels the 0 / 0 case as grown — confirm both are intended.
        recip_div = 2
    is_grown = recip_div >= thresh
    convo.meta['grown'] = is_grown
    grown += is_grown

In [9]:
# Number of conversations labelled 'grown' by the labelling loop.
grown

6233

In [11]:
from convokit import Classifier

In [10]:
# Disabled duplicate of the classifier definition in the next cell; kept only for reference.
# clf = Classifier(obj_type="conversation", pred_feats=['hyperconvo-20'], labeller=lambda convo: convo.meta['grown'],
#                 clf_feat_name='hyperconv-pred', clf_prob_feat_name='hyperconv-pred-score'
#                 )

In [12]:
# Baseline classifier: predict the 'grown' label of a conversation from the
# features stored under its 'hyperconvo-20' metadata key.
clf = Classifier(
    obj_type="conversation",
    pred_feats=['hyperconvo-20'],
    labeller=lambda convo: convo.meta['grown'],
    clf_feat_name='hyperconv-pred',
    clf_prob_feat_name='hyperconv-pred-score',
)

In [13]:
# Cross-validated evaluation of the hyperconvo-20 classifier; the result is
# averaged in the next cell (presumably one score per fold — confirm).
res = clf.evaluate_with_cv(corpus)

Using corpus objects...
Running a cross-validated evaluation...
Done.


In [14]:
# Mean of the values returned by the cross-validated evaluation.
np.mean(res)

0.8206

## Tensor features (rank 3)

In [15]:
# Rank-3 decomposer over the 'hyperconvo-2' ... 'hyperconvo-20' feature sets.
# Conversations are grouped by the 'subreddit' metadata of the utterance whose
# id equals the conversation id (presumably the root utterance — confirm).
td = TensorDecomposer(
    obj_type="conversation",
    feature_set=["hyperconvo-{}".format(prefix_len) for prefix_len in range(2, 21)],
    group_func=lambda convo: convo.get_utterance(convo.id).meta['subreddit'],
    rank=3,
    tensor_func='tensorly',
)

In [16]:
# Fit the rank-3 decomposer on the corpus (the log reports tensor construction,
# then decomposition).
td.fit(corpus)

Constructing tensor...Done.
Decomposing tensor...Done.


<convokit.tensor_decomposer.tensorDecomposer.TensorDecomposer at 0x12868f690>

In [17]:
# Apply the fitted rank-3 decomposer to the corpus. Presumably this stores each
# conversation's factor vector under convo.meta['tensor_factor'], the key read
# by later cells — confirm against the TensorDecomposer implementation.
td.transform(corpus)

<convokit.model.corpus.Corpus at 0x1283f4dd0>

In [18]:
from convokit import BoWClassifier

In [19]:
# Classifier that predicts the 'grown' label from the vector stored under
# 'tensor_factor' on each conversation.
clf_tensor = BoWClassifier(
    obj_type="conversation",
    vector_name='tensor_factor',
    labeller=lambda convo: convo.meta['grown'],
    clf_feat_name='tensor-pred',
    clf_prob_feat_name='tensor-pred-score',
)

Initializing default classification model (standard scaled logistic regression)


In [20]:
# Cross-validated evaluation of the tensor-factor classifier (rank-3 factors at this point).
res = clf_tensor.evaluate_with_cv(corpus)

Using corpus objects...

Running a cross-validated evaluation...
Done.


In [21]:
# Mean of the values returned by the cross-validated evaluation.
np.mean(res)

0.7932

In [63]:
# Keep a copy of the current 'tensor_factor' entry under 'tensor_rank3'.
# NOTE(review): this only captures the rank-3 factors if it runs after the
# rank-3 transform and before any other decomposer is applied (presumably a
# later transform rewrites 'tensor_factor' — confirm).
for convo in corpus.iter_conversations():
    rank3_factor = convo.meta['tensor_factor']
    convo.meta['tensor_rank3'] = rank3_factor

### Rank 9 decomposition

In [28]:
# Rank-9 decomposer over the same 'hyperconvo-2' ... 'hyperconvo-20' feature
# sets and the same 'subreddit' grouping as the rank-3 decomposer.
td_9 = TensorDecomposer(
    obj_type="conversation",
    feature_set=["hyperconvo-{}".format(prefix_len) for prefix_len in range(2, 21)],
    group_func=lambda convo: convo.get_utterance(convo.id).meta['subreddit'],
    rank=9,
    tensor_func='tensorly',
)

In [29]:
# Fit the rank-9 decomposer on the corpus.
td_9.fit(corpus)

Constructing tensor...Done.
Decomposing tensor...Done.


<convokit.tensor_decomposer.tensorDecomposer.TensorDecomposer at 0x1286aae50>

In [30]:
# Apply the fitted rank-9 decomposer to the corpus.
# NOTE(review): presumably this rewrites convo.meta['tensor_factor'] with the
# rank-9 factors; the rank-3 factors were saved under 'tensor_rank3' beforehand.
td_9.transform(corpus)

<convokit.model.corpus.Corpus at 0x1283f4dd0>

In [31]:
# Classifier that predicts the 'grown' label from the vector stored under
# 'tensor_factor' (read after the rank-9 transform in this section).
clf_tensor_9 = BoWClassifier(
    obj_type="conversation",
    vector_name='tensor_factor',
    labeller=lambda convo: convo.meta['grown'],
    clf_feat_name='tensor-pred',
    clf_prob_feat_name='tensor-pred-score',
)

Initializing default classification model (standard scaled logistic regression)


In [32]:
# Cross-validated evaluation of the rank-9 tensor-factor classifier.
res = clf_tensor_9.evaluate_with_cv(corpus)

Using corpus objects...

Running a cross-validated evaluation...
Done.


In [33]:
# Mean of the values returned by the cross-validated evaluation.
np.mean(res)

0.8355333333333335

### Baseline: class information

In [25]:
# for generated data
for idx, convo in enumerate(corpus.iter_conversations()):
    convo.meta['class1'] = int((idx / 5000) < 1)
    convo.meta['class2'] = int(1 <= (idx / 5000) < 2)
    convo.meta['class3'] = int(idx/5000 >= 2)

In [26]:
# Baseline classifier: predict the 'grown' label from the three class
# indicator features only.
clf_base = Classifier(
    obj_type="conversation",
    pred_feats=['class1', 'class2', 'class3'],
    labeller=lambda convo: convo.meta['grown'],
    clf_feat_name='base-pred',
    clf_prob_feat_name='base-pred-score',
)

In [27]:
# Mean cross-validated result of the class-indicator baseline.
np.mean(clf_base.evaluate_with_cv(corpus))

Using corpus objects...
Running a cross-validated evaluation...
Done.


0.8785999999999999

## Combined feature sets

In [73]:
# Number of conversations labelled 'grown'.
# NOTE(review): the recorded output of this cell (788) differs from the count
# recorded earlier in the notebook (6233), so it appears to come from a run on
# a different corpus; re-run the notebook top to bottom to refresh it.
grown

788

In [74]:
# Corpus-level counts for the corpus used in this section.
# NOTE(review): the recorded output (1500 conversations) does not match the
# summary recorded at the top of the notebook (15000 conversations); the
# outputs in this section look stale relative to the corpus loaded above.
corpus.print_summary_stats()

Number of Speakers: 25994
Number of Utterances: 53135
Number of Conversations: 1500


### Hyperconvo-15 + TCA (rank 9)

In [75]:
# Re-express each conversation's 'tensor_factor' vector as a
# {position: value} dict under 'tensor_dict'.
# NOTE(review): 'tensor_factor' holds whichever decomposition was applied most
# recently; this section assumes that was the rank-9 one.
for convo in corpus.iter_conversations():
    convo.meta['tensor_dict'] = dict(enumerate(convo.meta['tensor_factor']))

In [78]:
# Classifier over the combined feature sets: the 'hyperconvo-15' features plus
# the tensor factors stored under 'tensor_dict'.
# NOTE(review): the standalone hyperconvo baseline earlier in this notebook
# uses 'hyperconvo-20'; confirm that 'hyperconvo-15' is intended here.
clf_all = Classifier(
    obj_type="conversation",
    pred_feats=['hyperconvo-15', 'tensor_dict'],
    labeller=lambda convo: convo.meta['grown'],
)

In [79]:
# Cross-validated evaluation of the combined (hyperconvo + 'tensor_dict') classifier.
res = clf_all.evaluate_with_cv(corpus)

Using corpus objects...
Running a cross-validated evaluation...
Done.


In [80]:
# Mean of the values returned by the cross-validated evaluation.
np.mean(res)

0.7333333333333334

### Hyperconvo-15 + TCA (rank 3)

In [81]:
# Re-express the vector saved under 'tensor_rank3' as a {position: value} dict
# under 'tensor_dict3'.
for convo in corpus.iter_conversations():
    convo.meta['tensor_dict3'] = dict(enumerate(convo.meta['tensor_rank3']))

In [82]:
# Classifier over the combined feature sets: the 'hyperconvo-15' features plus
# the factors stored under 'tensor_dict3'.
# NOTE(review): the standalone hyperconvo baseline earlier in this notebook
# uses 'hyperconvo-20'; confirm that 'hyperconvo-15' is intended here.
clf_all_3 = Classifier(
    obj_type="conversation",
    pred_feats=['hyperconvo-15', 'tensor_dict3'],
    labeller=lambda convo: convo.meta['grown'],
)

In [83]:
# Cross-validated evaluation of the combined (hyperconvo + 'tensor_dict3')
# classifier, followed by the mean of the returned values.
# NOTE(review): the recorded mean is identical to the one recorded for the
# 'tensor_dict' classifier above; check that 'tensor_rank3' really holds the
# rank-3 factors (it depends on the order in which the cells were run).
res = clf_all_3.evaluate_with_cv(corpus)
np.mean(res)

Using corpus objects...
Running a cross-validated evaluation...
Done.


0.7333333333333334