In [1]:
import convokit

In [39]:
# Directory containing the corpus files that are loaded and re-loaded below.
# NOTE(review): this is an absolute, machine-specific path — point it at your own
# copy of the corpus. This cell must run before the cell that builds `corpus`.
ROOT_DIR = '/kitchen/convokit_corpora/tennis-mini/'

In [2]:
# Build the corpus object from the files under ROOT_DIR; every later cell works on it.
corpus = convokit.Corpus(ROOT_DIR)

In [3]:
ls $ROOT_DIR

conversations.json  corpus.json  index.json  users.json  utterances.jsonl


In [4]:
corpus.processed_text

{}

In [50]:
# Id of the utterance used as the running example when inspecting each processing step.
test_utt_id = '1681_14.a'

In [51]:
corpus.get_utterance(test_utt_id).text

"Yeah, but many friends went with me, Japanese guy. So I wasn't -- I wasn't like homesick. But now sometimes I get homesick."

In [7]:
def preprocess_text(text):
    """Return `text` with each ' -- ' (space, double hyphen, space) collapsed to a single space."""
    pieces = text.split(' -- ')
    return ' '.join(pieces)

In [8]:
from convokit.text_processing import TextProcessor

In [9]:
# Wrap preprocess_text in a TextProcessor and run it over the corpus.
# 'text' is the name the results are stored under: it appears as a key of
# corpus.processed_text once this cell has run.
text_prep = TextProcessor(preprocess_text, 'text')
corpus = text_prep.transform(corpus)

In [10]:
corpus.processed_text.keys()

dict_keys(['text'])

In [11]:
corpus.get_processed_text(test_utt_id, 'text')

"Yeah, but many friends went with me, Japanese guy. So I wasn't I wasn't like homesick. But now sometimes I get homesick."

In [12]:
from convokit.text_processing import TextParser

In [13]:
# Parse the preprocessed 'text' field and store the result under 'parsed'.
# With verbosity=50 the run prints a progress line every 50 utterances (see output).
textparser = TextParser('parsed', input_field='text', verbosity=50)
corpus = textparser.transform(corpus)

050/200 utterances processed
100/200 utterances processed
150/200 utterances processed


In [14]:
corpus.processed_text.keys()

dict_keys(['text', 'parsed'])

In [15]:
# Keep the example utterance's 'parsed' value around; it is indexed by position in the next cell.
test_parse = corpus.get_processed_text(test_utt_id, 'parsed')

In [16]:
test_parse[0]

{'rt': 5,
 'toks': [{'dep': 'intj', 'dn': [], 'tag': 'UH', 'tok': 'Yeah', 'up': 5},
  {'dep': 'punct', 'dn': [], 'tag': ',', 'tok': ',', 'up': 5},
  {'dep': 'cc', 'dn': [], 'tag': 'CC', 'tok': 'but', 'up': 5},
  {'dep': 'amod', 'dn': [], 'tag': 'JJ', 'tok': 'many', 'up': 4},
  {'dep': 'nsubj', 'dn': [3, 10], 'tag': 'NNS', 'tok': 'friends', 'up': 5},
  {'dep': 'ROOT', 'dn': [0, 1, 2, 4, 6, 8, 11], 'tag': 'VBD', 'tok': 'went'},
  {'dep': 'prep', 'dn': [7], 'tag': 'IN', 'tok': 'with', 'up': 5},
  {'dep': 'pobj', 'dn': [], 'tag': 'PRP', 'tok': 'me', 'up': 6},
  {'dep': 'punct', 'dn': [], 'tag': ',', 'tok': ',', 'up': 5},
  {'dep': 'amod', 'dn': [], 'tag': 'JJ', 'tok': 'Japanese', 'up': 10},
  {'dep': 'appos', 'dn': [9], 'tag': 'NN', 'tok': 'guy', 'up': 4},
  {'dep': 'punct', 'dn': [], 'tag': '.', 'tok': '.', 'up': 5}]}

In [17]:
# Second parser pass over 'text', stored under 'tagged'.
# NOTE(review): the positional 'tag' argument presumably selects a tagging-only mode —
# the output below carries only 'tok' and 'tag' per token, with no dependency fields.
# TODO confirm against TextParser's signature.
texttagger = TextParser('tagged', 'tag', input_field='text')
corpus = texttagger.transform(corpus)

In [18]:
corpus.get_processed_text(test_utt_id, 'tagged')

[{'toks': [{'tag': 'UH', 'tok': 'Yeah'},
   {'tag': ',', 'tok': ','},
   {'tag': 'CC', 'tok': 'but'},
   {'tag': 'JJ', 'tok': 'many'},
   {'tag': 'NNS', 'tok': 'friends'},
   {'tag': 'VBD', 'tok': 'went'},
   {'tag': 'IN', 'tok': 'with'},
   {'tag': 'PRP', 'tok': 'me'},
   {'tag': ',', 'tok': ','},
   {'tag': 'JJ', 'tok': 'Japanese'},
   {'tag': 'NN', 'tok': 'guy'},
   {'tag': '.', 'tok': '.'}]},
 {'toks': [{'tag': 'RB', 'tok': 'So'},
   {'tag': 'PRP', 'tok': 'I'},
   {'tag': 'VBD', 'tok': 'was'},
   {'tag': 'RB', 'tok': "n't"},
   {'tag': 'PRP', 'tok': 'I'},
   {'tag': 'VBD', 'tok': 'was'},
   {'tag': 'RB', 'tok': "n't"},
   {'tag': 'UH', 'tok': 'like'},
   {'tag': 'NN', 'tok': 'homesick'},
   {'tag': '.', 'tok': '.'}]},
 {'toks': [{'tag': 'CC', 'tok': 'But'},
   {'tag': 'RB', 'tok': 'now'},
   {'tag': 'RB', 'tok': 'sometimes'},
   {'tag': 'PRP', 'tok': 'I'},
   {'tag': 'VBP', 'tok': 'get'},
   {'tag': 'NN', 'tok': 'homesick'},
   {'tag': '.', 'tok': '.'}]}]

In [19]:
from convokit.text_processing import TokensToString

In [20]:
# Turn token-level output back into plain strings, stored under 'tok_str'.
# No input_field is given, so the transformer's default input is used
# (presumably the 'parsed' field — TODO confirm).
tok_to_str = TokensToString('tok_str')
corpus = tok_to_str.transform(corpus)

In [21]:
print(corpus.get_processed_text(test_utt_id, 'tok_str'))

Yeah , but many friends went with me , Japanese guy .
So I was n't
I was n't like homesick .
But now sometimes I get homesick .


In [22]:
# Render every kept token as "<lowercased token>_<tag>" and store the result under
# 'tok_tag'. Tokens without any alphabetic character (e.g. punctuation) are dropped.
tag_to_str = TokensToString(
    'tok_tag',
    token_formatter=lambda x: '%s_%s' % (x['tok'].lower(), x['tag']),
    # any() stops at the first alphabetic character instead of counting all of them.
    token_filter=lambda x: any(ch.isalpha() for ch in x['tok']),
)
corpus = tag_to_str.transform(corpus)

In [23]:
print(corpus.get_processed_text(test_utt_id, 'tok_tag'))

yeah_UH but_CC many_JJ friends_NNS went_VBD with_IN me_PRP japanese_JJ guy_NN
so_RB i_PRP was_VBD n't_RB
i_PRP was_VBD n't_RB like_UH homesick_NN
but_CC now_RB sometimes_RB i_PRP get_VBP homesick_NN


In [24]:
from convokit.text_processing import TextToArcs

In [25]:
# Derive arc strings and store them under 'arcs'. As the output below shows, the result
# is one list of strings per sentence, with entries such as 'went_friends' and 'friends_*'.
# No input_field is given, so the default input is used (presumably 'parsed' — TODO confirm).
text_to_arc = TextToArcs('arcs')
corpus = text_to_arc.transform(corpus)

In [26]:
corpus.get_processed_text(test_utt_id, 'arcs')

[['friends_*',
  'friends_guy',
  'friends_many',
  'guy_*',
  'guy_japanese',
  'japanese_*',
  'many_*',
  'me_*',
  'went_*',
  'went_friends',
  'went_with',
  'went_yeah',
  'with_*',
  'with_me',
  'yeah_*'],
 ['i_*', 'so>i', 'so_*', 'was_*', 'was_i', 'was_so'],
 ['homesick_*', 'i_*', 'like_*', 'was_*', 'was_homesick', 'was_i', 'was_like'],
 ['but>now',
  'get_*',
  'get_homesick',
  'get_i',
  'get_now',
  'get_sometimes',
  'homesick_*',
  'i_*',
  'now_*',
  'sometimes_*']]

In [27]:
# Same extraction with root_only=True, stored under 'arcs_mini'. In the example output
# the per-sentence lists shrink to the entries headed by the sentence's root word
# (plus the 'so>i' / 'but>now' style entries).
text_to_arc_mini = TextToArcs('arcs_mini', root_only=True)
corpus = text_to_arc_mini.transform(corpus)

In [28]:
corpus.get_processed_text(test_utt_id, 'arcs_mini')

[['went_*', 'went_friends', 'went_with', 'went_yeah'],
 ['so>i', 'was_*', 'was_i', 'was_so'],
 ['was_*', 'was_homesick', 'was_i', 'was_like'],
 ['but>now', 'get_*', 'get_homesick', 'get_i', 'get_now', 'get_sometimes']]

In [29]:
from convokit.phrasing_motifs import CensorNouns

In [30]:
# Produce a censored copy of the parses, stored under 'parsed_censored'. In the example
# output, tokens tagged NN/NNS/PRP have their 'tok' replaced by 'NN~' and the remaining
# tokens are lowercased. No input_field is given (presumably reads 'parsed' — TODO confirm).
censor_nouns = CensorNouns('parsed_censored')
corpus = censor_nouns.transform(corpus)

In [31]:
corpus.get_processed_text(test_utt_id, 'parsed_censored')

[{'rt': 5,
  'toks': [{'dep': 'intj', 'dn': [], 'tag': 'UH', 'tok': 'yeah', 'up': 5},
   {'dep': 'punct', 'dn': [], 'tag': ',', 'tok': ',', 'up': 5},
   {'dep': 'cc', 'dn': [], 'tag': 'CC', 'tok': 'but', 'up': 5},
   {'dep': 'amod', 'dn': [], 'tag': 'JJ', 'tok': 'many', 'up': 4},
   {'dep': 'nsubj', 'dn': [3, 10], 'tag': 'NNS', 'tok': 'NN~', 'up': 5},
   {'dep': 'ROOT', 'dn': [0, 1, 2, 4, 6, 8, 11], 'tag': 'VBD', 'tok': 'went'},
   {'dep': 'prep', 'dn': [7], 'tag': 'IN', 'tok': 'with', 'up': 5},
   {'dep': 'pobj', 'dn': [], 'tag': 'PRP', 'tok': 'NN~', 'up': 6},
   {'dep': 'punct', 'dn': [], 'tag': ',', 'tok': ',', 'up': 5},
   {'dep': 'amod', 'dn': [], 'tag': 'JJ', 'tok': 'japanese', 'up': 10},
   {'dep': 'appos', 'dn': [9], 'tag': 'NN', 'tok': 'NN~', 'up': 4},
   {'dep': 'punct', 'dn': [], 'tag': '.', 'tok': '.', 'up': 5}]},
 {'rt': 2,
  'toks': [{'dep': 'advmod', 'dn': [], 'tag': 'RB', 'tok': 'so', 'up': 2},
   {'dep': 'nsubj', 'dn': [], 'tag': 'PRP', 'tok': 'NN~', 'up': 2},
   {'dep

In [32]:
# Root-only arc extraction again, this time reading the censored parses; stored under
# 'arcs_censored'. In the example output the entries that involved a censored token
# (e.g. 'went_friends', 'was_i') no longer appear.
text_to_arc_mini_censored = TextToArcs('arcs_censored', input_field='parsed_censored', root_only=True)
corpus = text_to_arc_mini_censored.transform(corpus)

In [33]:
corpus.get_processed_text(test_utt_id, 'arcs_censored')

[['went_*', 'went_with', 'went_yeah'],
 ['was_*', 'was_so'],
 ['was_*', 'was_like'],
 ['but>now', 'get_*', 'get_now', 'get_sometimes']]

In [34]:
def join_tokens_and_sentences(sents, aux_input=None):
    """Flatten a list of token lists into a single string.

    Args:
        sents: iterable of sentences, each an iterable of string tokens.
        aux_input: optional dict of separators. 'tok_sep' (default: one space)
            joins the tokens of a sentence; 'sent_sep' (default: newline) joins
            the sentences. Missing keys, or None, fall back to the defaults.

    Returns:
        The joined string; an empty string when `sents` is empty.
    """
    # A None default avoids sharing one mutable dict object across calls and
    # lets callers pass aux_input=None explicitly.
    options = aux_input if aux_input is not None else {}
    sent_sep = options.get('sent_sep', '\n')
    tok_sep = options.get('tok_sep', ' ')
    return sent_sep.join(tok_sep.join(sent) for sent in sents)

In [35]:
# Collapse the 'arcs' field into one printable string per utterance ('arc_string'):
# entries within a sentence are joined with ', ' and sentences with a newline.
arc_to_string = TextProcessor(
    join_tokens_and_sentences,
    input_field='arcs',
    output_field='arc_string',
    aux_input={'sent_sep': '\n', 'tok_sep': ', '},
)
corpus = arc_to_string.transform(corpus)

In [36]:
print(corpus.get_processed_text(test_utt_id, 'arc_string'))

friends_*, friends_guy, friends_many, guy_*, guy_japanese, japanese_*, many_*, me_*, went_*, went_friends, went_with, went_yeah, with_*, with_me, yeah_*
i_*, so>i, so_*, was_*, was_i, was_so
homesick_*, i_*, like_*, was_*, was_homesick, was_i, was_like
but>now, get_*, get_homesick, get_i, get_now, get_sometimes, homesick_*, i_*, now_*, sometimes_*


In [37]:
corpus.processed_text.keys()

dict_keys(['text', 'parsed', 'tagged', 'tok_str', 'tok_tag', 'arcs', 'arcs_mini', 'parsed_censored', 'arcs_censored', 'arc_string'])

In [38]:
# Persist the processed-text fields to disk. The directory listing in the next cell
# shows one processed_text.<field>.json file per field under ROOT_DIR afterwards.
corpus.dump_processed_text()

In [41]:
ls $ROOT_DIR

conversations.json                   processed_text.parsed.json
corpus.json                          processed_text.tagged.json
index.json                           processed_text.text.json
processed_text.arcs_censored.json    processed_text.tok_str.json
processed_text.arcs.json             processed_text.tok_tag.json
processed_text.arcs_mini.json        users.json
processed_text.arc_string.json       utterances.jsonl
processed_text.parsed_censored.json


In [42]:
# Load a fresh Corpus from the same directory. Its processed_text starts out empty
# (see the next cell), i.e. the dumped fields are not picked up automatically.
new_corpus = convokit.Corpus(ROOT_DIR)

In [43]:
new_corpus.processed_text

{}

In [45]:
# Read back only the listed fields; afterwards processed_text holds exactly these keys.
new_corpus.load_processed_text(['arcs', 'tok_str', 'parsed_censored'])

In [46]:
new_corpus.processed_text.keys()

dict_keys(['arcs', 'tok_str', 'parsed_censored'])

In [47]:
new_corpus.get_processed_text(test_utt_id, 'parsed_censored')

[{'rt': 5,
  'toks': [{'dep': 'intj', 'dn': [], 'tag': 'UH', 'tok': 'yeah', 'up': 5},
   {'dep': 'punct', 'dn': [], 'tag': ',', 'tok': ',', 'up': 5},
   {'dep': 'cc', 'dn': [], 'tag': 'CC', 'tok': 'but', 'up': 5},
   {'dep': 'amod', 'dn': [], 'tag': 'JJ', 'tok': 'many', 'up': 4},
   {'dep': 'nsubj', 'dn': [3, 10], 'tag': 'NNS', 'tok': 'NN~', 'up': 5},
   {'dep': 'ROOT', 'dn': [0, 1, 2, 4, 6, 8, 11], 'tag': 'VBD', 'tok': 'went'},
   {'dep': 'prep', 'dn': [7], 'tag': 'IN', 'tok': 'with', 'up': 5},
   {'dep': 'pobj', 'dn': [], 'tag': 'PRP', 'tok': 'NN~', 'up': 6},
   {'dep': 'punct', 'dn': [], 'tag': ',', 'tok': ',', 'up': 5},
   {'dep': 'amod', 'dn': [], 'tag': 'JJ', 'tok': 'japanese', 'up': 10},
   {'dep': 'appos', 'dn': [9], 'tag': 'NN', 'tok': 'NN~', 'up': 4},
   {'dep': 'punct', 'dn': [], 'tag': '.', 'tok': '.', 'up': 5}]},
 {'rt': 2,
  'toks': [{'dep': 'advmod', 'dn': [], 'tag': 'RB', 'tok': 'so', 'up': 2},
   {'dep': 'nsubj', 'dn': [], 'tag': 'PRP', 'tok': 'NN~', 'up': 2},
   {'dep