1. Focused -> expansionary
2. Expansionary, non-reciprocal -> Focused, reciprocal
3. Dyadic chain

In [1]:
import os
# Move the working directory up three levels so that the relative paths used
# later in this notebook (e.g. 'convokit/tensor_decomposer/experiments') resolve.
# NOTE(review): presumably this lands on the repository root, which would also
# make `import convokit` pick up the local checkout - confirm.
# The path is relative, so re-running this cell moves up another three levels.
os.chdir('../../..')
import convokit

In [17]:
from convokit import Corpus, download, HyperConvo

In [148]:
# Load the 'reddit-corpus' dataset as a Corpus; download() supplies the dataset
# location (the output below reports an already-downloaded local copy).
corpus = Corpus(download('reddit-corpus'))

Dataset already exists at /Users/calebchiam/.convokit/downloads/reddit-corpus


In [150]:
corpus.print_summary_stats()

Number of Speakers: 521777
Number of Utterances: 2004262
Number of Conversations: 84979


In [155]:
convo = corpus.random_conversation()

In [158]:
# Check the reply-to chain of the randomly drawn conversation; it returned
# False for the conversation sampled in this run (see output).
# NOTE(review): presumably the failure is due to the conversation root not being
# present as an utterance - confirm.
convo.check_integrity()

Checking reply-to chain of Conversation 8v9iko


False

In [159]:
convo = corpus.get_conversation('7r2x7x')

In [161]:
# Collect all current conversation ids into a set; later cells use it for
# membership tests and to build one placeholder utterance per conversation.
convo_ids = set(corpus.get_conversation_ids())

In [168]:
from convokit import Utterance, Speaker

In [169]:
# One placeholder utterance per conversation: its utterance id and its
# conversation id are both the conversation id. Every placeholder gets its own
# Speaker object, all carrying the same dummy speaker id.
filler_utts = [
    Utterance(
        id=conv_id,
        conversation_id=conv_id,
        speaker=Speaker(id='534rehwh3h'),
    )
    for conv_id in convo_ids
]

In [170]:
# Add the placeholder utterances to the corpus (the summary printed afterwards
# shows the utterance count growing by exactly the number of conversations).
# NOTE(review): with_checks=False presumably skips per-utterance conflict
# checks - confirm against the ConvoKit version in use.
corpus.add_utterances(filler_utts, warnings=True, with_checks=False)

<convokit.model.corpus.Corpus at 0x1651b79d0>

In [171]:
corpus.print_summary_stats()

Number of Speakers: 521778
Number of Utterances: 2089241
Number of Conversations: 84979


In [172]:
# Ids of every utterance that replies directly to an utterance whose id is a
# conversation id, i.e. the direct children of the conversation roots.
top_level_comment_ids = [
    utterance.id
    for utterance in corpus.iter_utterances()
    if utterance.reply_to in convo_ids
]

In [173]:
# Re-index the corpus around the top-level comments; `corpus` is rebound to the
# returned object, so the original corpus is no longer reachable by this name.
# NOTE(review): presumably each listed utterance becomes the root of its own
# conversation. The summary printed next shows the utterance and speaker counts
# back at their pre-placeholder values, so the placeholder roots appear to be
# dropped by this step - confirm.
corpus = corpus.reindex_conversations(top_level_comment_ids)

In [174]:
corpus.print_summary_stats()

Number of Speakers: 521777
Number of Utterances: 2004262
Number of Conversations: 100000


In [175]:
# Keep only conversations that contain at least 20 utterances.
corpus.filter_conversations_by(
    lambda conversation: sum(1 for _ in conversation.iter_utterances()) >= 20
)

In [176]:
corpus.print_summary_stats()

Number of Speakers: 344223
Number of Utterances: 1085877
Number of Conversations: 29487


In [177]:
# Tag every utterance with its 1-based chronological position inside its
# conversation, stored under utt.meta['order'].
for convo in corpus.iter_conversations():
    chronological = convo.get_chronological_utterance_list()
    for position, utt in enumerate(chronological, start=1):
        utt.meta['order'] = position

In [178]:
# Two HyperConvo transformers that differ only in prefix_len (10 vs 20) and in
# the feat_name under which their output is stored.
# NOTE(review): prefix_len presumably restricts feature extraction to the first
# N utterances of each conversation - confirm.
hc10 = HyperConvo(prefix_len=10, feat_name='hyperconvo-10')
hc20 = HyperConvo(prefix_len=20, feat_name='hyperconvo-20')

In [179]:
# Run both transformers over the corpus. Later cells read the results from
# convo.meta['hyperconvo-10'] and convo.meta['hyperconvo-20'].
# Only the second call's return value is displayed as the cell output.
hc10.fit_transform(corpus)
hc20.fit_transform(corpus)

<convokit.model.corpus.Corpus at 0x193828950>

In [180]:
corpus.random_conversation().meta['hyperconvo-10']

{'max[indegree over c->c responses]': 2,
 'argmax[indegree over c->c responses]': 1,
 'norm.max[indegree over c->c responses]': 0.2222222222222222,
 '2nd-largest[indegree over c->c responses]': 2,
 '2nd-argmax[indegree over c->c responses]': 3,
 'norm.2nd-largest[indegree over c->c responses]': 0.2222222222222222,
 'mean[indegree over c->c responses]': 0.9,
 'mean-nonzero[indegree over c->c responses]': 1.2857142857142858,
 'prop-nonzero[indegree over c->c responses]': 0.7,
 'prop-multiple[indegree over c->c responses]': 0.2857142857142857,
 'entropy[indegree over c->c responses]': 1.8891591637540217,
 '2nd-largest / max[indegree over c->c responses]': 1.0,
 'max[outdegree over C->c responses]': 4,
 'max[indegree over C->c responses]': 2,
 'argmax[outdegree over C->c responses]': 1,
 'argmax[indegree over C->c responses]': 1,
 'norm.max[outdegree over C->c responses]': 0.4444444444444444,
 'norm.max[indegree over C->c responses]': 0.2222222222222222,
 '2nd-largest[outdegree over C->c r

In [181]:
# Write the annotated corpus to disk as 'annotated-reddit-corpus'; base_path is
# relative, so the location depends on the current working directory.
corpus.dump('annotated-reddit-corpus', base_path='convokit/tensor_decomposer/experiments')

## Group 1: focused -> expansionary

In [380]:
def group1(convo):
    """Selector for group 1 (focused -> expansionary).

    Reads the 'max[indegree over C->c responses]' feature from the
    conversation's 'hyperconvo-10' and 'hyperconvo-20' metadata.

    :param convo: object exposing a ``meta`` mapping with those two entries
    :return: truthy when the feature is at most 3 under 'hyperconvo-10' and at
        least 7 under 'hyperconvo-20'
    """
    feature = 'max[indegree over C->c responses]'
    early_ok = convo.meta['hyperconvo-10'][feature] <= 3
    # 'hyperconvo-20' is only consulted when the early condition holds.
    return early_ok and convo.meta['hyperconvo-20'][feature] >= 7

In [382]:
# Materialize the conversations accepted by the group1 selector and print how
# many were found.
group1_convos = list(corpus.iter_conversations(group1))
print(len(group1_convos))

585


In [385]:
group1_convos[50].print_conversation_structure(lambda utt: str(utt.meta['order']) + ". " + utt.speaker.id, limit=20)

1. kittylovesblog
    2. blubeeds
    3. pookskii
        4. kittylovesblog
            5. pookskii
                6. kittylovesblog
            7. 4b3ats
                8. kittylovesblog
                    9. 4b3ats
            13. ashley_the_otter
            15. scorpiohkg
    10. BrandNewSidewalk
        11. kittylovesblog
            19. memebigboiii
    12. Saynotoshityouhate
        14. pottymouthgrl
    16. [deleted]
    17. cheu
    18. memebigboiii
    20. kittymeowmixi


## Group 2: expansionary, non-reciprocal -> focused, reciprocal

In [280]:
def group2(convo):
    """Selector for group 2 (expansionary -> focused).

    For each of the 'hyperconvo-10' and 'hyperconvo-20' feature sets, adds
    'count[external reciprocity motif]' and 'count[reciprocity motif]'.

    :param convo: object exposing a ``meta`` mapping with both feature sets
    :return: truthy when the combined count is at most 3 under 'hyperconvo-10'
        and at least 11 under 'hyperconvo-20'
    """
    # Both feature sets are fetched up front, before any comparison is made.
    early_feats = convo.meta['hyperconvo-10']
    late_feats = convo.meta['hyperconvo-20']

    def reciprocity_total(feats):
        # Combined number of external and plain reciprocity motifs.
        return feats['count[external reciprocity motif]'] + feats['count[reciprocity motif]']

    early_ok = reciprocity_total(early_feats) <= 3
    return early_ok and reciprocity_total(late_feats) >= 11

In [283]:
# Materialize the conversations accepted by the group2 selector.
group2_convos = list(corpus.iter_conversations(group2))

In [284]:
len(group2_convos)

986

In [285]:
group2_convos[10].print_conversation_structure(lambda utt: str(utt.meta['order']) + ". " + utt.speaker.id, limit=20)

1. xThe-Legend-Killerx
    2. Shamrock5
    3. LukeNeverShaves
    4. Mister_Jay_Peg
        8. sirtinykins
            9. Mister_Jay_Peg
            12. awakins
                13. Jasmith85
                    14. awakins
                        15. Jasmith85
                            16. awakins
                                17. Jasmith85
                                    18. awakins
            19. HurricaneErickson
            20. Falconinati
    5. fronkensteen
    6. IrishFuryHD
    7. Catdaddypanther97
    10. BukkakeKing69
    11. Bigforsumthin


## Group 3: dyadic chain

In [267]:
def group3(convo):
    """Selector for group 3.

    Reads the 'count[reciprocity motif]' feature from the conversation's
    'hyperconvo-10' and 'hyperconvo-20' metadata.

    :param convo: object exposing a ``meta`` mapping with those two entries
    :return: truthy when the count is at least 7 under 'hyperconvo-10' and at
        least 16 under 'hyperconvo-20'
    """
    feature = 'count[reciprocity motif]'
    early_ok = convo.meta['hyperconvo-10'][feature] >= 7
    # 'hyperconvo-20' is only consulted when the early condition holds.
    return early_ok and convo.meta['hyperconvo-20'][feature] >= 16

In [268]:
# Materialize the conversations accepted by the group3 selector and print how
# many were found.
group3_convos = list(corpus.iter_conversations(group3))
print(len(group3_convos))

1262


In [269]:
group3_convos[100].print_conversation_structure(lambda utt: str(utt.meta['order']) + ". " + utt.speaker.id, limit=20)

1. Agent_Phantom
    2. phillwilk
        3. Agent_Phantom
            4. phillwilk
                5. Agent_Phantom
                    6. phillwilk
                        7. Agent_Phantom
                            8. phillwilk
                                9. Agent_Phantom
                                    10. phillwilk
                                        11. Agent_Phantom
                                            12. phillwilk
                                                14. Agent_Phantom
                                            13. phillwilk
                                                15. Agent_Phantom
                                                    16. phillwilk
                                                    17. phillwilk
                                                        18. Agent_Phantom
                                                            19. phillwilk
                                                            20. phillwilk


## Constructing corpus

In [387]:
len(group1_convos)

585

In [388]:
len(group2_convos)

986

In [389]:
len(group3_convos)

1262

In [392]:
# Conversation ids of each group, as sets, for the overlap checks and
# membership tests that follow.
group1_ids = {conversation.id for conversation in group1_convos}
group2_ids = {conversation.id for conversation in group2_convos}
group3_ids = {conversation.id for conversation in group3_convos}

In [393]:
group1_ids.intersection(group2_ids)

set()

In [394]:
group1_ids.intersection(group3_ids)

set()

In [395]:
group2_ids.intersection(group3_ids)

set()

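The three groups are pairwise disjoint, which is a good sign: no conversation matches more than one trajectory type.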

Randomly sample 500 conversations from each group.

In [399]:
import random

In [404]:
# Draw 500 conversation ids from each group, concatenated as group 1, 2, 3.
#
# random.sample() needs a sequence: passing a set was deprecated in Python 3.9
# and raises TypeError from 3.11, so each id set is sorted into a list first.
# Sorting also fixes the population order, which together with a dedicated
# seeded generator makes the draw reproducible without touching the global
# random state.
sample_rng = random.Random(0)
subset = (
    sample_rng.sample(sorted(group1_ids), 500)
    + sample_rng.sample(sorted(group2_ids), 500)
    + sample_rng.sample(sorted(group3_ids), 500)
)

In [405]:
len(subset)

1500

In [415]:
# Scratch inspection of the first 500 ids in `subset`.
# NOTE(review): the execution counts show this cell (In [415]) ran after
# `subset = set(subset)` (In [406]), so the order at that point was arbitrary
# and these are not necessarily the group-1 samples.
list(subset)[:500]

['e52fu0v',
 'e6r170g',
 'e4cm58e',
 'dxsbfq9',
 'dxh8854',
 'e4y96kr',
 'dsefjnn',
 'dwb23hx',
 'drc2ksp',
 'dy1rgnx',
 'dxu23iw',
 'dtv9az1',
 'dsmsvnm',
 'dwc91r8',
 'drocyz9',
 'dr0ax0m',
 'dujz8rf',
 'e11uho6',
 'dvvmuin',
 'dxobldc',
 'dt9rqjd',
 'dxbictg',
 'dyl9bqb',
 'duemcw2',
 'dycl6m1',
 'dut3hlq',
 'e0lse3v',
 'du495kg',
 'dptxn0g',
 'e0u7dic',
 'dum4yjh',
 'dyf6bq7',
 'dzr9tp1',
 'e2weero',
 'dpeif8u',
 'e0e7lv7',
 'dr889x6',
 'dzjictc',
 'e69zt8p',
 'e6v7ky2',
 'e2v73dk',
 'drlg81q',
 'e1l1i35',
 'dtk0vvh',
 'dzca4kq',
 'dtoalms',
 'e5livao',
 'dw3qe5c',
 'dss2gre',
 'dtomh1s',
 'e1ccuqb',
 'dsvtmn6',
 'dydw9s1',
 'e29lu91',
 'e0ts652',
 'e38uef0',
 'dsrzn3d',
 'dw2ht9e',
 'dv91ssg',
 'e0pg1jb',
 'e6wmhxx',
 'dq8l354',
 'e0zubyt',
 'e64xnrb',
 'doet1o7',
 'do6c1aa',
 'e170jcl',
 'dp538uy',
 'e6vh7fe',
 'dqnbdgr',
 'dnsp5ci',
 'dyjuxox',
 'dto2di9',
 'dv56jpf',
 'ds4g49j',
 'e5qwney',
 'e3v20et',
 'e576gqe',
 'e0xxbhd',
 'dvvr86n',
 'dpdzhk7',
 'dr6oz31',
 'e3o2j2z',
 'du

In [416]:
group1_ids

{'dnr6ggh',
 'dnrjwvc',
 'dnrpl12',
 'dnsks1t',
 'dnthzsj',
 'dntjcdw',
 'dntvatf',
 'dnurgxw',
 'dnvkdiz',
 'dnw0av2',
 'dnwql4a',
 'dnxsq99',
 'dnxw04f',
 'dnzy2nx',
 'do09ju8',
 'do170c7',
 'do1gaz8',
 'do3dley',
 'do3w4z8',
 'do40096',
 'do4i0os',
 'do4qdq7',
 'do6rzmu',
 'do7sxpz',
 'do8fd2x',
 'doanp28',
 'dobnm5q',
 'dobntas',
 'docr045',
 'dodzqpc',
 'dofaw5b',
 'dofqsue',
 'dog0xh0',
 'dog8pa7',
 'dogsbnq',
 'dokl1os',
 'doob601',
 'doqofd8',
 'dorxa9d',
 'dot3uvf',
 'douc90e',
 'dowdmyx',
 'dox4ecm',
 'doxbla5',
 'doxl9yv',
 'doxmj58',
 'doz2ed3',
 'dp0zu7s',
 'dp1qs91',
 'dp3p457',
 'dp7c4y1',
 'dp8xy12',
 'dp97kxy',
 'dpalmoj',
 'dpapw9b',
 'dpdzhk7',
 'dpgombj',
 'dpgsp76',
 'dpi64db',
 'dpiy9dq',
 'dpj4sid',
 'dpjd805',
 'dpkmcbd',
 'dpks1wj',
 'dplwhp2',
 'dplwq40',
 'dpm3uyt',
 'dpmor4i',
 'dpo0axv',
 'dpo274i',
 'dpo2f3m',
 'dppernd',
 'dpqsbsw',
 'dptwfoi',
 'dpu19f8',
 'dpuu2g4',
 'dpvn7vg',
 'dpvnec4',
 'dpwfjfk',
 'dpx5itk',
 'dq1ikcn',
 'dq2pm5b',
 'dq2t8o7',
 'dq

In [406]:
# Convert the sampled ids to a set for the membership test in the filter that
# follows; note that the name `subset` is rebound from a list to a set.
subset = set(subset)

In [407]:
# Keep only the sampled conversations. The call's result is not assigned; the
# summary printed afterwards (1500 conversations) indicates the corpus is
# filtered in place.
corpus.filter_conversations_by(lambda convo: convo.id in subset)

In [417]:
corpus.print_summary_stats()

Number of Speakers: 25994
Number of Utterances: 53135
Number of Conversations: 1500


In [419]:
corpus.print_summary_stats()

Number of Speakers: 25994
Number of Utterances: 53135
Number of Conversations: 1500


In [420]:
# Effectively a no-op: group1_ids/group2_ids/group3_ids are already built as
# sets where they are first defined, so set() here only makes shallow copies.
group1_ids = set(group1_ids)
group2_ids = set(group2_ids)
group3_ids = set(group3_ids)

In [421]:
# Per-group accumulators for utterances; three distinct empty lists.
group1_utts = []
group2_utts = []
group3_utts = []

In [422]:
# Append each conversation's utterances, in chronological order, to the list
# of the first group (checked in the order 1, 2, 3) whose id set contains the
# conversation. A conversation belonging to no group raises ValueError.
# The lists are extended in place, so re-running this cell without resetting
# them first appends every utterance a second time.
group_buckets = (
    (group1_ids, group1_utts),
    (group2_ids, group2_utts),
    (group3_ids, group3_utts),
)
for convo in corpus.iter_conversations():
    for member_ids, bucket in group_buckets:
        if convo.id in member_ids:
            bucket.extend(convo.get_chronological_utterance_list())
            break
    else:
        # No group claimed this conversation.
        raise ValueError(convo.id)

In [423]:
len(group1_utts)

19320

In [424]:
len(group2_utts)

18594

In [425]:
len(group3_utts)

15221

In [426]:
corpus.print_summary_stats()

Number of Speakers: 25994
Number of Utterances: 53135
Number of Conversations: 1500


In [427]:
# Build a fresh corpus from the collected utterances, ordered group 1, then
# group 2, then group 3.
# NOTE(review): only utterances are passed in, so conversation-level metadata
# (e.g. the hyperconvo features) may not carry over - confirm before relying on
# it downstream.
new_corpus = Corpus(utterances=group1_utts + group2_utts + group3_utts)

In [432]:
# Write the 1500-conversation subset to disk as 'reddit-trajectory-subset';
# base_path is relative, so the location depends on the current working directory.
new_corpus.dump('reddit-trajectory-subset', base_path='convokit/tensor_decomposer/experiments')