In [1]:
import os
# Move the working directory three levels up; the relative corpus path used
# when loading the corpus below is resolved against this directory.
# NOTE: this is a relative chdir, so re-running the cell climbs another three levels.
os.chdir('../../..')

In [2]:
import convokit

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [4]:
from convokit import Corpus, HyperConvo, TensorDecomposer

First we load the annotated fake-trajectory corpus from disk:

In [5]:
# Load the annotated fake-trajectory corpus from a path relative to the working directory.
# Alternative corpus kept for reference: "convokit/thread_generator/fake-corpus-trajectory-40"
corpus_path = "convokit/thread_generator/annotated-fake-trajectory-40"
corpus = Corpus(filename=corpus_path)

In [6]:
# Sanity check: print the speaker / utterance / conversation counts of the loaded corpus.
corpus.print_summary_stats()

Number of Speakers: 200
Number of Utterances: 240000
Number of Conversations: 6000


In [7]:
# Prefix lengths 2 through 20 (inclusive); the preprocessing cell below uses one
# HyperConvo transformer per value in this range.
min_prefix_len, max_prefix_len = 2, 20
hyperconv_range = range(min_prefix_len, max_prefix_len + 1)

In [8]:
# One-off preprocessing, kept commented out for reference. It runs one HyperConvo
# transformer per prefix length in `hyperconv_range` (largest prefix first, printing the
# prefix length as it goes), storing each result under "hyperconvo-<prefix_len>", and then
# dumps the corpus as 'annotated-fake-trajectory-40' -- the name of the corpus loaded above.
# def multi_hyperconv_transform(corpus, hyperconv_range):
#     hc_transformers = [HyperConvo(prefix_len=i, feat_name="hyperconvo-{}".format(i), invalid_val=-1) for i in hyperconv_range]
#     for idx, hc in enumerate(list(reversed(hc_transformers))):
#         print(hyperconv_range[-1]-idx)
#         hc.transform(corpus)
# multi_hyperconv_transform(corpus, hyperconv_range)
# corpus.dump('annotated-fake-trajectory-40', base_path="convokit/thread_generator")

In [8]:
# Label every conversation with whether its speaker pool "grew": the number of distinct
# speakers over the whole conversation, divided by the number of distinct speakers in its
# first 20 utterances, must be at least `thresh`. The label is stored in
# convo.meta['grown']; `grown` counts the conversations labelled True.
grown = 0
thresh = 1.5
for convo in corpus.iter_conversations():
    utts_all = convo.get_chronological_utterance_list()
    spkrs_first20 = {utt.speaker.id for utt in utts_all[:20]}
    spkrs_all = {utt.speaker.id for utt in utts_all}
    # Evaluate the ratio test once so the stored label and the running count cannot diverge.
    has_grown = len(spkrs_all) / len(spkrs_first20) >= thresh
    convo.meta['grown'] = has_grown
    grown += has_grown

In [10]:
from convokit import Classifier

In [11]:
# Baseline classifier: predict each conversation's 'grown' label from its
# 'hyperconvo-20' features (prefix length 20 per the preprocessing cell).
clf = Classifier(
    obj_type="conversation",
    pred_feats=['hyperconvo-20'],
    labeller=lambda convo: convo.meta['grown'],
    clf_feat_name='hyperconv-pred',
    clf_prob_feat_name='hyperconv-pred-score',
)

In [12]:
# Cross-validated evaluation of the hyperconvo-20 baseline; `res` holds the returned
# results, which the next cell averages (presumably per-fold accuracies -- TODO confirm).
res = clf.evaluate_with_cv(corpus)

Using corpus objects...
Running a cross-validated evaluation...
Done.


In [13]:
# Mean of the cross-validation results for the hyperconvo-20 baseline.
np.mean(res)

0.8323333333333334

## Tensor features (rank 3)

In [14]:
# Rank-3 tensor decomposition over the per-prefix HyperConvo feature sets.
# Conversations are grouped by the 'subreddit' metadata of the utterance whose id
# equals the conversation id.
td = TensorDecomposer(
    obj_type="conversation",
    # Reuse the notebook-level prefix range instead of a second hard-coded range(2, 21),
    # so the decomposed feature sets always match the ones listed in `hyperconv_range`.
    feature_set=["hyperconvo-{}".format(i) for i in hyperconv_range],
    group_func=lambda convo: convo.get_utterance(convo.id).meta['subreddit'],
    rank=3,
    tensor_func='tensorly',
)

In [15]:
# Build the tensor from the corpus and decompose it (see the progress messages below).
td.fit(corpus)

Constructing tensor...Done.
Decomposing tensor...Done.


<convokit.tensor_decomposer.tensorDecomposer.TensorDecomposer at 0x14a2e9bd0>

In [16]:
# Annotate the corpus with the rank-3 decomposition; presumably this is what populates
# convo.meta['tensor_factor'], which later cells read -- TODO confirm.
td.transform(corpus)

<convokit.model.corpus.Corpus at 0x13480f2d0>

In [17]:
from convokit import BoWClassifier

In [18]:
# Classifier that predicts the 'grown' label from the 'tensor_factor' vector
# (the rank-3 factors at this point in the notebook).
clf_tensor = BoWClassifier(
    obj_type="conversation",
    vector_name='tensor_factor',
    labeller=lambda convo: convo.meta['grown'],
    clf_feat_name='tensor-pred',
    clf_prob_feat_name='tensor-pred-score',
)

Initializing default classification model (standard scaled logistic regression)


In [20]:
# Cross-validated evaluation of the rank-3 tensor classifier; overwrites `res`.
res = clf_tensor.evaluate_with_cv(corpus)

Using corpus objects...

Running a cross-validated evaluation...
Done.


In [21]:
# Mean of the cross-validation results for the rank-3 tensor classifier.
np.mean(res)

0.8030000000000002

In [22]:
# Keep the rank-3 factors under their own key before the rank-9 decomposition is
# applied to the same corpus below.
for conversation in corpus.iter_conversations():
    rank3_factor = conversation.meta['tensor_factor']
    conversation.meta['tensor_rank3'] = rank3_factor

### Rank 9 decomposition

In [23]:
# Rank-9 tensor decomposition over the same per-prefix HyperConvo feature sets,
# with conversations grouped by the 'subreddit' metadata of the utterance whose id
# equals the conversation id.
td_9 = TensorDecomposer(
    obj_type="conversation",
    # Reuse the notebook-level prefix range instead of a second hard-coded range(2, 21),
    # so the decomposed feature sets always match the ones listed in `hyperconv_range`.
    feature_set=["hyperconvo-{}".format(i) for i in hyperconv_range],
    group_func=lambda convo: convo.get_utterance(convo.id).meta['subreddit'],
    rank=9,
    tensor_func='tensorly',
)

In [24]:
# Build the tensor from the corpus and decompose it at rank 9.
td_9.fit(corpus)

Constructing tensor...Done.
Decomposing tensor...Done.


<convokit.tensor_decomposer.tensorDecomposer.TensorDecomposer at 0x134833410>

In [25]:
# Annotate the corpus with the rank-9 decomposition; presumably this replaces the
# rank-3 values in convo.meta['tensor_factor'] (saved earlier as 'tensor_rank3') -- TODO confirm.
td_9.transform(corpus)

<convokit.model.corpus.Corpus at 0x13480f2d0>

In [26]:
# Classifier over the 'tensor_factor' vector again, now evaluated after the rank-9
# decomposition has been applied. It reuses the same output feature names as `clf_tensor`.
clf_tensor_9 = BoWClassifier(
    obj_type="conversation",
    vector_name='tensor_factor',
    labeller=lambda convo: convo.meta['grown'],
    clf_feat_name='tensor-pred',
    clf_prob_feat_name='tensor-pred-score',
)

Initializing default classification model (standard scaled logistic regression)


In [28]:
# Cross-validated evaluation of the rank-9 tensor classifier; overwrites `res`.
res = clf_tensor_9.evaluate_with_cv(corpus)

Using corpus objects...

Running a cross-validated evaluation...
Done.


In [29]:
# Mean of the cross-validation results for the rank-9 tensor classifier.
np.mean(res)

0.8046666666666666

## Combined feature sets

In [35]:
# Number of conversations labelled 'grown' by the labelling loop (class balance check
# against the conversation count printed in the next cell).
grown

2720

In [36]:
# Print the corpus counts again, for comparison with the `grown` count above.
corpus.print_summary_stats()

Number of Speakers: 200
Number of Utterances: 240000
Number of Conversations: 6000


### Hyperconvo-20 + TCA (rank 9)

In [38]:
# Re-express each conversation's 'tensor_factor' sequence as an {index: value} dict
# so it can be listed in `pred_feats` next to 'hyperconvo-20'.
for conversation in corpus.iter_conversations():
    factor_values = conversation.meta['tensor_factor']
    conversation.meta['tensor_dict'] = dict(enumerate(factor_values))

In [None]:
# Combined model: 'hyperconvo-20' features plus the tensor factors stored in 'tensor_dict'.
# NOTE(review): this cell has no recorded execution count; it must run before the
# evaluation cell that follows.
clf_all = Classifier(
    obj_type="conversation",
    pred_feats=['hyperconvo-20', 'tensor_dict'],
    labeller=lambda convo: convo.meta['grown'],
)

In [33]:
# Cross-validated evaluation of the combined hyperconvo-20 + rank-9 tensor model.
# NOTE(review): the NameError recorded below arose because the cell defining `clf_all`
# had not been executed; run that cell first and re-run this one to refresh the output.
res = clf_all.evaluate_with_cv(corpus)

NameError: name 'clf_all' is not defined

In [None]:
# Mean of the cross-validation results for the combined hyperconvo-20 + rank-9 model.
# NOTE(review): if the evaluation cell above failed, `res` still holds the results of an
# earlier classifier, so this value would be stale -- re-run after fixing that cell.
np.mean(res)

### Hyperconvo-20 + TCA (rank 3)

In [30]:
# Re-express each conversation's saved rank-3 factors ('tensor_rank3') as an
# {index: value} dict so they can be listed in `pred_feats`.
for conversation in corpus.iter_conversations():
    rank3_values = conversation.meta['tensor_rank3']
    conversation.meta['tensor_dict3'] = dict(enumerate(rank3_values))

In [34]:
# Combined model: 'hyperconvo-20' features plus the rank-3 tensor factors in 'tensor_dict3'.
clf_all_3 = Classifier(
    obj_type="conversation",
    pred_feats=['hyperconvo-20', 'tensor_dict3'],
    labeller=lambda convo: convo.meta['grown'],
)

In [35]:
# Cross-validated evaluation of the combined hyperconvo-20 + rank-3 tensor model,
# followed by the mean of the results.
# NOTE(review): the recorded mean below is identical to the hyperconvo-20-only result
# earlier in the notebook -- confirm that 'tensor_dict3' actually contributes features.
res = clf_all_3.evaluate_with_cv(corpus)
np.mean(res)

Using corpus objects...
Running a cross-validated evaluation...
Done.


0.8323333333333334