In [1]:
import os
os.chdir('../../..')

In [2]:
import convokit

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [4]:
from convokit import Corpus, HyperConvo, TensorDecomposer, download

First we download the reddit corpus:

In [12]:
# corpus = Corpus(filename="convokit/thread_generator/fake-corpus-trajectory-40")
# corpus = Corpus(filename="convokit/thread_generator/annotated-fake-trajectory-40")
corpus = Corpus(download('reddit-corpus'))
# corpus = Corpus(filename="convokit/tensor_decomposer/experiments/reddit-trajectory-subset-annotated")

Dataset already exists at /Users/calebchiam/.convokit/downloads/reddit-corpus


In [13]:
corpus.print_summary_stats()

Number of Speakers: 521777
Number of Utterances: 2004262
Number of Conversations: 84979


### Data filtering

In [15]:
convo_ids = set(corpus.get_conversation_ids())

In [16]:
from convokit import Utterance, Speaker
filler_utts = [Utterance(id=cid, conversation_id=cid, speaker=Speaker(id='534rehwh3h')) for cid in convo_ids]

In [19]:
for utt in filler_utts:
    utt.timestamp = 0

In [17]:
corpus.add_utterances(filler_utts, warnings=True, with_checks=False)

<convokit.model.corpus.Corpus at 0x1354b46d0>

In [18]:
full_convos = {c.id for c in corpus.iter_conversations() if c.check_integrity(verbose=False)}
len(full_convos)

84979

In [21]:
top_level_comment_ids = [utt.id for utt in corpus.iter_utterances() if utt.reply_to in convo_ids]
corpus = corpus.reindex_conversations(top_level_comment_ids)

In [22]:
corpus.print_summary_statsary_stats()

Number of Speakers: 521777
Number of Utterances: 2004262
Number of Conversations: 100000


In [24]:
import random
longest_path_ids = []
for convo in corpus.iter_conversations():
    longest_path = random.choice(convo.get_longest_paths())
    convo.meta['longest_path'] = len(longest_path)
    longest_path_ids.extend([utt.id for utt in longest_path])

In [26]:
longest_path_ids = set(longest_path_ids)

In [28]:
path_lengths = [convo.meta['longest_path'] for convo in corpus.iter_conversations()]

In [30]:
%matplotlib qt
import seaborn as sns
sns.countplot(path_lengths)

<matplotlib.axes._subplots.AxesSubplot at 0x1b7f41910>

In [31]:
corpus = corpus.filter_utterances_by(lambda utt: utt.id in longest_path_ids and 
                                     utt.get_conversation().meta['longest_path'] >= 8)

In [32]:
corpus.print_summary_stats()

Number of Speakers: 129207
Number of Utterances: 508962
Number of Conversations: 45881


In [34]:
corpus.dump('reddit-focused-8', base_path='convokit/tensor_decomposer/experiments')

In [5]:
corpus = Corpus('convokit/tensor_decomposer/experiments/reddit-focused-8')

In [6]:
corpus.random_conversation().print_conversation_structure()

RomanWillNeverReign
    waltdanger
        The_EA_Nazi
            waltdanger
                The_EA_Nazi
                    trumpetspieler
                        sock_lover
                            Mercwithapen
                                The_EA_Nazi


In [7]:
hyperconv_range = range(2, 14+1)

In [8]:
def multi_hyperconv_transform(corpus, hyperconv_range):
    hc_transformers = [HyperConvo(prefix_len=i, feat_name="hyperconvo-{}".format(i), invalid_val=-1) for i in hyperconv_range]
    for idx, hc in enumerate(list(reversed(hc_transformers))):
        print(hyperconv_range[-1]-idx)
        hc.transform(corpus)
multi_hyperconv_transform(corpus, hyperconv_range)
# corpus.dump('annotated-fake-trajectory-40', base_path="convokit/thread_generator")

14
13
12
11
10
9
8
7
6
5
4
3
2


In [9]:
grown = 0
thresh = 1.5
for convo in corpus.iter_conversations(lambda convo: convo.meta['longest_path'] >= 14):
    utts16 = convo.get_chronological_utterance_list()[:14]
    utts8 = utts16[:7]
    num_spkrs_8 = len(set(utt.speaker.id for utt in utts8))
    num_spkrs_16 = len(set(utt.speaker.id for utt in utts16))
    convo.meta['grown'] = (num_spkrs_16 / num_spkrs_8) >= thresh
    grown += (num_spkrs_16 / num_spkrs_8) >= thresh

In [10]:
len(list(corpus.iter_conversations(lambda convo: convo.meta['longest_path'] >= 14)))

7304

In [11]:
grown

730

## Hyperconvo classifier

In [12]:
from convokit import Classifier

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
model = Pipeline([("standardScaler", StandardScaler(with_mean=False)),
                            ("logreg", LogisticRegression(solver='liblinear'))])
svm_model = svm.SVC(C=0.02, kernel='linear', probability=True)

In [15]:
clf = Classifier(obj_type="conversation", pred_feats=['hyperconvo-7'], labeller=lambda convo: convo.meta['grown'],
                clf_feat_name='hyperconv-pred', clf_prob_feat_name='hyperconv-pred-score',  clf=model
                )

In [19]:
long14 = lambda convo: convo.meta['longest_path'] >= 14

In [16]:
hc = HyperConvo(prefix_len=14, min_thread_len=14, feat_name='hyperconvo-7', invalid_val=-1)

In [20]:
res = clf.evaluate_with_cv(corpus, selector=long14)

Using corpus objects...
Running a cross-validated evaluation...
Done.


In [21]:
np.mean(res)

0.9000551320637957

In [24]:
clf.fit(corpus, selector=long14)

<convokit.classifier.classifier.Classifier at 0x17fe6ab10>

In [25]:
from sklearn.metrics import roc_auc_score

In [30]:
from convokit import extract_feats_dict

In [31]:
obj_id_to_feats = extract_feats_dict(corpus, "conversation", ["hyperconvo-7"], long14)

In [34]:
feats_df = pd.DataFrame.from_dict(obj_id_to_feats, orient='index').reindex(index = list(obj_id_to_feats))

In [37]:
from scipy.sparse import csr_matrix

In [51]:
pred_proba = clf.get_model().predict_proba(csr_matrix(feats_df.values.astype('float64')))

[0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699822656,
 0.9000539699

In [55]:
y_true = [int(convo.meta['grown']) for convo in corpus.iter_conversations(long14)]

In [82]:
roc_auc_score(y_true=y_true, y_score=[x[1] for x in pred_proba])

0.5

In [39]:
csr_matrix(feats_df.values)

TypeError: no supported conversion for types: (dtype('O'),)

In [29]:
clf.fit_transform(corpus, selector=long14)

TypeError: no supported conversion for types: (dtype('O'),)

## Tensor feats (rank 20)

In [57]:
from sklearn.preprocessing import scale
def scale_by_comment_idx(tensor):
    tensor = tensor.copy()
    for i in range(tensor.shape[0]):
        tensor[0, :, :] = scale(tensor[0, :, :])
    return tensor

In [58]:
td = TensorDecomposer(obj_type="conversation",
                      feature_set=["hyperconvo-{}".format(i) for i in range(2, 7)],
                      group_func=lambda convo: convo.get_utterance(convo.id).meta['subreddit'],
                      rank=20, tensor_func='tensortools-ncp-bcd', normalize_func=scale_by_comment_idx
                     )

In [59]:
td.fit(corpus, selector=lambda convo: convo.meta['longest_path'] >= 14)

Constructing tensor...Done.
Decomposing tensor...NCP_BCD: iteration 1, objective 0.44282763932820807, improvement inf.
NCP_BCD: iteration 2, objective 0.4260414028868913, improvement 0.01678623644131677.
NCP_BCD: iteration 3, objective 0.4161910664826495, improvement 0.009850336404241822.
NCP_BCD: iteration 4, objective 0.40563875109138603, improvement 0.010552315391263445.
NCP_BCD: iteration 5, objective 0.39293784579740865, improvement 0.012700905293977383.
NCP_BCD: iteration 6, objective 0.3793400349338271, improvement 0.013597810863581528.
NCP_BCD: iteration 7, objective 0.36597763904385683, improvement 0.013362395889970291.
NCP_BCD: iteration 8, objective 0.3539849134825516, improvement 0.011992725561305206.
NCP_BCD: iteration 9, objective 0.34229220468190075, improvement 0.011692708800650875.
NCP_BCD: iteration 10, objective 0.3288377804258168, improvement 0.013454424256083952.
NCP_BCD: iteration 11, objective 0.31529143957075806, improvement 0.013546340855058736.
NCP_BCD: iterat

NCP_BCD: iteration 94, objective 0.2259586892574092, improvement 6.471855397657822e-05.
NCP_BCD: iteration 95, objective 0.225896000008996, improvement 6.268924841318357e-05.
NCP_BCD: iteration 96, objective 0.22583443941954545, improvement 6.156058945055998e-05.
NCP_BCD: iteration 97, objective 0.22577321198143052, improvement 6.122743811493025e-05.
NCP_BCD: iteration 98, objective 0.22571191602169677, improvement 6.129595973375368e-05.
NCP_BCD: iteration 99, objective 0.22565103685399177, improvement 6.087916770500246e-05.
NCP_BCD: iteration 100, objective 0.2255902759453301, improvement 6.0760908661666724e-05.
NCP_BCD: iteration 101, objective 0.22553001425171082, improvement 6.026169361927902e-05.
NCP_BCD: iteration 102, objective 0.22547028058605983, improvement 5.9733665650985035e-05.
NCP_BCD: iteration 103, objective 0.22541050527278264, improvement 5.9775313277193876e-05.
NCP_BCD: iteration 104, objective 0.22535083901652875, improvement 5.966625625389299e-05.
NCP_BCD: iteratio

<convokit.tensor_decomposer.tensorDecomposer.TensorDecomposer at 0x128080210>

In [60]:
td.transform(corpus, selector=lambda convo: convo.meta['longest_path'] >= 14)

<convokit.model.corpus.Corpus at 0x1279dc650>

In [61]:
from convokit import BoWClassifier

In [62]:
clf_tensor = BoWClassifier(obj_type="conversation", vector_name='tensor_factor', labeller=lambda convo: convo.meta['grown'],
                clf_feat_name='tensor-pred', clf_prob_feat_name='tensor-pred-score', clf=model
                )

In [63]:
from sklearn.model_selection import KFold

In [74]:
res = clf_tensor.evaluate_with_cv(corpus, selector=lambda convo: convo.meta['longest_path'] >= 14, cv=KFold(n_splits=5, shuffle=True))

Using corpus objects...

Running a cross-validated evaluation...
Done.


In [75]:
np.mean(res)

0.9000547570157427

In [66]:
clf_tensor.fit(corpus, selector=long14)

<convokit.classifier.bow_classifier.BoWClassifier at 0x128094590>

In [69]:
from scipy.sparse import vstack, issparse
X = []
y = []
for obj in corpus.iter_objs('conversation', long14):
    X.append(obj.meta['tensor_factor'])
    y.append(obj.meta['grown'])
if issparse(X[0]): # for csr_matrix
    X = vstack(X)
else: # for non-compressed numpy arrays
    X = np.vstack(X)

In [76]:
predict_proba = clf_tensor.get_model().predict_proba(X)

In [78]:
predict_proba

array([[0.87302093, 0.12697907],
       [0.95869484, 0.04130516],
       [0.87260513, 0.12739487],
       ...,
       [0.87276994, 0.12723006],
       [0.87252147, 0.12747853],
       [0.87299799, 0.12700201]])

In [83]:
roc_auc_score(y_true=y_true, y_score=[x[1] for x in predict_proba])

0.6211686969422923

In [87]:
min([x[0] for x in predict_proba])

0.5163512331332524

In [67]:
clf_tensor.transform(corpus, selector=long14)

ValueError: blocks must be 2-D

In [95]:
for convo in corpus.iter_conversations(selector=lambda convo: convo.meta['longest_path'] >= 14):
    convo.meta['tensor_rank20'] = convo.meta['tensor_factor']

## Combined feature sets

In [83]:
grown

730

### Hyperconvo-7 + TCA (rank 20)

In [49]:
for convo in corpus.iter_conversations(selector=lambda convo: convo.meta['longest_path'] >= 14):
    convo.meta['tensor_dict'] = {idx: v for idx, v in enumerate(convo.meta['tensor_factor'])}

In [50]:
clf_all = Classifier(obj_type="conversation", pred_feats=['hyperconvo-7', 'tensor_dict'], labeller=lambda convo: convo.meta['grown'],
                )

In [51]:
res = clf_all.evaluate_with_cv(corpus, selector=lambda convo: convo.meta['longest_path'] >= 14)

Using corpus objects...
Running a cross-validated evaluation...
Done.


In [52]:
np.mean(res)

0.9000540069196366