In [1]:
import os
os.chdir('../../..')

In [2]:
import convokit

In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [4]:
from convokit import Corpus, HyperConvo, TensorDecomposer

First we download the reddit corpus:

In [5]:
# corpus = Corpus(filename="convokit/thread_generator/fake-corpus-trajectory-40")
corpus = Corpus(filename="convokit/thread_generator/annotated-fake-trajectory-experiment-2")
# corpus = Corpus(filename="convokit/tensor_decomposer/experiments/reddit-trajectory-subset-annotated")

In [6]:
corpus.print_summary_stats()

Number of Speakers: 40
Number of Utterances: 75000
Number of Conversations: 3000


In [7]:
hyperconv_range = range(2, 20+1)

In [8]:
grown = 0
thresh = 1.5
for convo in corpus.iter_conversations():
    try:
        recip_div = convo.meta['hyperconvo-25']['count[reciprocity motif]'] / convo.meta['hyperconvo-20']['count[reciprocity motif]']
    except ZeroDivisionError:
        if convo.meta['hyperconvo-20']['count[reciprocity motif]'] == 0:
            recip_div = 0
        else:
            recip_div = 10
    convo.meta['grown'] = recip_div >= thresh
    grown += convo.meta['grown']

In [9]:
grown2 = 0
thresh = 3.0
for convo in corpus.iter_conversations():
    recip_diff = convo.meta['hyperconvo-25']['count[reciprocity motif]'] - convo.meta['hyperconvo-20']['count[reciprocity motif]']
    convo.meta['grown2'] = recip_diff >= thresh
    grown2 += convo.meta['grown2']

In [10]:
grown

1173

In [11]:
grown2

937

In [12]:
from convokit import Classifier
from sklearn.model_selection import KFold

In [13]:
# clf = Classifier(obj_type="conversation", pred_feats=['hyperconvo-20'], labeller=lambda convo: convo.meta['grown'],
#                 clf_feat_name='hyperconv-pred', clf_prob_feat_name='hyperconv-pred-score'
#                 )

In [95]:
clf = Classifier(obj_type="conversation", pred_feats=['hyperconvo-20'], labeller=lambda convo: convo.meta['grown'],
                clf_feat_name='hyperconv-pred', clf_prob_feat_name='hyperconv-pred-score'
                )

In [59]:
res = clf.evaluate_with_cv(corpus, cv=KFold(n_splits=5, shuffle=True))

Using corpus objects...
Running a cross-validated evaluation...
Done.


In [96]:
clf.fit(corpus)

<convokit.classifier.classifier.Classifier at 0x131274f90>

In [97]:
clf_feats = list(corpus.random_conversation().meta['hyperconvo-20'])
clf.get_coefs(clf_feats, lambda model: model.coef_.toarray()[0])

Unnamed: 0_level_0,coef
feat_name,Unnamed: 1_level_1
is-present[reciprocity motif],0.308368
is-present[dyadic interaction motif],0.288368
max[indegree over C->C mid-thread responses],0.218516
count[reciprocity motif over mid-thread],0.210676
mean-nonzero[outdegree over C->c mid-thread responses],0.151885
mean-nonzero[outdegree over C->C responses],0.118984
max[outdegree over C->c responses],0.110646
is-present[reciprocity motif over mid-thread],0.109935
is-present[dyadic interaction motif over mid-thread],0.109935
argmax[outdegree over C->c mid-thread responses],0.109028


In [60]:
np.mean(res)

0.8973333333333333

## Tensor feats (rank 3)

In [14]:
td = TensorDecomposer(obj_type="conversation",
                      feature_set=["hyperconvo-{}".format(i) for i in range(2, 21)],
                      group_func=lambda convo: convo.get_utterance(convo.id).meta['subreddit'],
                      rank=3, tensor_func='tensortools-ncp-hals'
                     )

In [15]:
td.fit(corpus)

Constructing tensor...Done.
Decomposing tensor...NCP_HALS: iteration 1, objective 0.4705557696201128, improvement inf.
NCP_HALS: iteration 2, objective 0.38718167779788215, improvement 0.08337409182223066.
NCP_HALS: iteration 3, objective 0.37909679479507163, improvement 0.008084883002810517.
NCP_HALS: iteration 4, objective 0.3756453415165847, improvement 0.003451453278486949.
NCP_HALS: iteration 5, objective 0.37381467358895404, improvement 0.001830667927630647.
NCP_HALS: iteration 6, objective 0.37286048661276505, improvement 0.0009541869761889865.
NCP_HALS: iteration 7, objective 0.37233013239138185, improvement 0.0005303542213832002.
NCP_HALS: iteration 8, objective 0.37202999361193284, improvement 0.0003001387794490107.
NCP_HALS: iteration 9, objective 0.37186436976239035, improvement 0.00016562384954249554.
NCP_HALS: iteration 10, objective 0.3717697571236038, improvement 9.461263878657311e-05.
NCP_HALS: iteration 11, objective 0.37171019342378886, improvement 5.956369981491294e

<convokit.tensor_decomposer.tensorDecomposer.TensorDecomposer at 0x127e94210>

In [16]:
td.transform(corpus)

<convokit.model.corpus.Corpus at 0x127e8c690>

In [17]:
from convokit import BoWClassifier

In [18]:
clf_tensor = BoWClassifier(obj_type="conversation", vector_name='tensor_factor', 
                           labeller=lambda convo: convo.meta['grown2'],
                clf_feat_name='tensor-pred', clf_prob_feat_name='tensor-pred-score'
                )

Initializing default classification model (standard scaled logistic regression)


In [19]:
res = clf_tensor.evaluate_with_cv(corpus, cv=KFold(n_splits=5, shuffle=True))

Using corpus objects...

Running a cross-validated evaluation...
Done.


In [20]:
np.mean(res)

0.7183333333333334

In [24]:
for convo in corpus.iter_conversations():
    convo.meta['tensor_rank3'] = convo.meta['tensor_factor']

### Rank 9 decomp

In [25]:
td_9 = TensorDecomposer(obj_type="conversation",
                      feature_set=["hyperconvo-{}".format(i) for i in range(2, 21)],
                      group_func=lambda convo: convo.get_utterance(convo.id).meta['subreddit'],
                      rank=9, tensor_func='tensorly'
                     )

In [74]:
td_9.fit(corpus)

Constructing tensor...Done.
Decomposing tensor...Done.


<convokit.tensor_decomposer.tensorDecomposer.TensorDecomposer at 0x131263390>

In [75]:
td_9.transform(corpus)

<convokit.model.corpus.Corpus at 0x131238c50>

In [79]:
clf_tensor_9 = BoWClassifier(obj_type="conversation", vector_name='tensor_factor', 
                             labeller=lambda convo: convo.meta['grown2'],
                clf_feat_name='tensor-pred', clf_prob_feat_name='tensor-pred-score'
                )

Initializing default classification model (standard scaled logistic regression)


In [80]:
res = clf_tensor_9.evaluate_with_cv(corpus, cv=KFold(n_splits=5, shuffle=True))
print(res)

Using corpus objects...

Running a cross-validated evaluation...
Done.
[0.84       0.85333333 0.85       0.86166667 0.845     ]


In [81]:
np.mean(res)

0.85

### Baseline: class information

In [55]:
# for generated data
for idx, convo in enumerate(corpus.iter_conversations()):
    convo.meta['class1'] = int((idx / 1000) < 1)
    convo.meta['class2'] = int(1 <= (idx / 1000) < 2)
    convo.meta['class3'] = int(idx/1000 >= 2)

In [72]:
clf_base = Classifier(obj_type="conversation", pred_feats=['class1', 'class2', 'class3'], 
                 labeller=lambda convo: convo.meta['grown2'],
                clf_feat_name='base-pred', clf_prob_feat_name='base-pred-score'
                )

In [73]:
np.mean(clf_base.evaluate_with_cv(corpus, cv=KFold(n_splits=5, shuffle=True)))

Using corpus objects...
Running a cross-validated evaluation...
Done.


0.9690000000000001

## Benchmarks

In [34]:
for convo in corpus.iter_conversations():
    convo.meta['concat'] = dict()
    for idx in range(2, 20+1):
        d = convo.meta['hyperconvo-{}'.format(idx)].copy()
        convo.meta['concat'].update({k+"_"+str(idx): v for k, v in d.items()})

In [35]:
len(corpus.random_conversation().meta['concat'])

2660

In [36]:
concat_data = dict()
for convo in corpus.iter_conversations():
    concat_data[convo.id] = convo.meta['concat']
concat_df = pd.DataFrame(concat_data).T

In [37]:
np.sum(np.sum(pd.isnull(concat_df))) # NaN values

0

In [38]:
from sklearn.preprocessing import StandardScaler

In [39]:
concat_vals = concat_df.values.astype('float64')
concat_vals_std = StandardScaler().fit_transform(concat_vals)

### Benchmark: PCA-3[Hyperconvo-2 to Hyperconvo-20]

In [40]:
from sklearn.model_selection import KFold

In [82]:
y = []
for convo in corpus.iter_conversations():
    y.append(int(convo.meta['grown2']))

In [42]:
from sklearn.decomposition import PCA

In [43]:
X = concat_vals_std

In [83]:
y = np.array(y)

In [84]:
from sklearn import svm

In [85]:
pca_3 = PCA(n_components=3)
kf = KFold(n_splits=5, shuffle=True)
acc = []
for train_index, test_index in kf.split(X):
    print(train_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train_pca = pca_3.fit_transform(X_train)
    X_test_pca = pca_3.transform(X_test)
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    clf.fit(X_train_pca, y_train)
    y_pred = clf.predict(X_test_pca)
    acc.append(np.mean(y_pred == y_test))

[   0    1    2 ... 2996 2997 2999]
[   0    1    2 ... 2997 2998 2999]
[   0    1    2 ... 2997 2998 2999]
[   1    3    4 ... 2995 2996 2998]
[   0    2    3 ... 2997 2998 2999]


In [86]:
print(np.mean(acc))

0.6876666666666666


### Benchmark: PCA-9[Hyperconvo-2 to Hyperconvo-20]

In [87]:
pca_9 = PCA(n_components=9)
kf = KFold(n_splits=5, shuffle=True)
acc_9 = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    X_train_pca = pca_9.fit_transform(X_train)
    X_test_pca = pca_9.transform(X_test)
    clf = svm.SVC(C=0.02, kernel='linear', probability=True)
    clf.fit(X_train_pca, y_train)
    y_pred = clf.predict(X_test_pca)
    acc_9.append(np.mean(y_pred == y_test))

In [88]:
print(np.mean(acc_9))

0.7603333333333333


### Benchmark: average[Hyperconvo-2 to Hyperconvo-20]

In [50]:
from collections import defaultdict

convo_to_avg = dict()
for convo in corpus.iter_conversations():
    avg_dict = defaultdict(int)
    for idx in range(2, 20+1):
        d = convo.meta['hyperconvo-{}'.format(idx)]
        for k, v in d.items():
            avg_dict[k] += v
    convo_to_avg[convo.id] = avg_dict

In [51]:
for convo_id, avg_dict in convo_to_avg.items():
    corpus.get_conversation(convo_id).meta['avg'] = dict(avg_dict)

In [89]:
clf_avg = Classifier(obj_type="conversation", pred_feats=['avg'], 
                 labeller=lambda convo: convo.meta['grown2']
                )

In [90]:
res = clf_avg.evaluate_with_cv(corpus, cv=KFold(n_splits=5, shuffle=True))

Using corpus objects...
Running a cross-validated evaluation...
Done.


In [91]:
print(res)
print(np.mean(res))

[0.83833333 0.77833333 0.83       0.79833333 0.8       ]
0.8089999999999999


## Combined feature sets

In [92]:
grown

6233

In [91]:
corpus.print_summary_stats()

Number of Speakers: 40
Number of Utterances: 375000
Number of Conversations: 15000


### Hyperconvo-20 + TCA (rank 9)

In [93]:
for convo in corpus.iter_conversations():
    convo.meta['tensor_dict'] = {idx: v for idx, v in enumerate(convo.meta['tensor_factor'])}

In [94]:
clf_all = Classifier(obj_type="conversation", pred_feats=['hyperconvo-20', 'tensor_dict'], labeller=lambda convo: convo.meta['grown'],
                )

In [95]:
res = clf_all.evaluate_with_cv(corpus)

Using corpus objects...
Running a cross-validated evaluation...
Done.


In [96]:
np.mean(res)

0.8207333333333333

### Hyperconvo-20 + TCA (rank 3)

In [81]:
for convo in corpus.iter_conversations():
    convo.meta['tensor_dict3'] = {idx: v for idx, v in enumerate(convo.meta['tensor_rank3'])}

In [82]:
clf_all_3 = Classifier(obj_type="conversation", pred_feats=['hyperconvo-15', 'tensor_dict3'], labeller=lambda convo: convo.meta['grown'],
                )

In [83]:
res = clf_all_3.evaluate_with_cv(corpus)
np.mean(res)

Using corpus objects...
Running a cross-validated evaluation...
Done.


0.7333333333333334