In [1]:
import convokit

In [2]:
import os

import numpy as np
import pandas as pd
from scipy import stats

In [3]:
### uncomment to download the corpus
# filename = convokit.download('subreddit-changemyview')

### uncomment to load your own copy
# filename = '<YOUR FILENAME>'
# corpus = convokit.Corpus(filename)

In [None]:
### uncomment to tokenize the corpus.
# tokenizer = convokit.Tokenizer(verbosity=10000)
# corpus = tokenizer.fit_transform(corpus)

### load your pre-tokenized copy (comment these two lines out if you tokenized above).
### Set the CMV_CORPUS_PATH environment variable to point at your own copy;
### when it is unset, the original author's local path is used as the default.
filename = os.environ.get('CMV_CORPUS_PATH', '/kitchen/experimental_justine/convokit_dev/cmv-test')
corpus = convokit.Corpus(filename)

In [None]:
# Show the corpus-level metadata as this cell's output.
corpus.meta

In [None]:
# Accounts whose utterances are excluded: deleted users plus the DeltaBot and
# AutoModerator accounts.
USER_BLACKLIST = ['[deleted]', 'DeltaBot','AutoModerator']


def utterance_is_valid(utterance):
    """Return True when an utterance should be kept.

    An utterance is rejected if its id equals its root id, or if its user's
    name appears in USER_BLACKLIST; everything else is accepted.
    """
    # Utterances whose id is the conversation root are dropped first.
    if utterance.id == utterance.root:
        return False
    return utterance.user.name not in USER_BLACKLIST

In [None]:
# Build a UserConvoHistory transformer with utterance_is_valid as its utterance
# filter, run it over the corpus, and rebind `corpus` to the result.
uchistory = convokit.user_convo_helpers.user_convo_history.UserConvoHistory(utterance_filter=utterance_is_valid)
corpus = uchistory.fit_transform(corpus)

In [None]:
# Spot-check one user: show the 'conversations' entry of holosport's metadata.
# NOTE(review): presumably written by the UserConvoHistory step — confirm.
corpus.get_user('holosport').meta['conversations']

In [None]:
# Show the 'n_convos' entry of the same user's metadata.
corpus.get_user('holosport').meta['n_convos']

In [None]:
# Show the 'start_time' entry of the same user's metadata.
corpus.get_user('holosport').meta['start_time']

In [None]:
# Run convokit's WordCount transformer over the corpus and rebind `corpus`
# to the result.
wordcount = convokit.WordCount()
corpus = wordcount.fit_transform(corpus)

In [None]:
# Run a UserConvoAttrs transformer configured with the 'wordcount' attribute
# and np.mean as its aggregation function.
# NOTE(review): presumably this averages utterance word counts per user per
# conversation — confirm against the convokit implementation.
uc_wordcount = convokit.user_convo_helpers.user_convo_attrs.UserConvoAttrs('wordcount', agg_fn=np.mean)
corpus = uc_wordcount.fit_transform(corpus)

In [None]:
# Re-inspect the same user's 'conversations' metadata after the word-count steps.
corpus.get_user('holosport').meta['conversations']

In [None]:
# Pull the 'wordcount' and 'n_utterances' attributes out of the corpus into a
# table, then preview its first rows.
# NOTE(review): 'n_utterances' is not set anywhere in this notebook; presumably
# it comes from one of the earlier transformers — confirm.
user_convo_len_df = convokit.user_convo_helpers.user_convo_utils.get_user_convo_attribute_table(corpus, ['wordcount', 'n_utterances'])
user_convo_len_df.head()

In [None]:
# Build the per-stage table for 'wordcount' and preview its first rows.
# NOTE(review): the trailing 10 and 20 look like a stage size and a
# conversation-count cutoff — confirm against get_lifestage_attributes.
stage_wc_df = convokit.user_convo_helpers.user_convo_utils.get_lifestage_attributes(user_convo_len_df, 'wordcount', 10, 20)
stage_wc_df.head()

In [None]:
# Mean of each column of the word-count stage table.
stage_wc_df.mean()

In [None]:
def print_lifestage_comparisons(stage_df):
    """Print a sign-test comparison for every pair of consecutive stages.

    Parameters
    ----------
    stage_df : pandas.DataFrame
        One column per stage, labelled with consecutive integers starting at 0.
        Missing values mark rows that have no value for that stage.

    For each pair (i, i + 1) only rows with a value in both columns are
    compared. Two lines are printed per pair: the number of rows compared,
    then the proportion of non-tied rows whose later-stage value is larger
    together with the two-sided binomial p-value (null proportion 0.5).
    When every compared row is tied (or there are no rows), both numbers are
    printed as ``nan`` instead of raising.
    """
    for i in range(stage_df.columns.max()):
        # Keep only rows that have a value in both stages.
        mask = stage_df[i + 1].notnull() & stage_df[i].notnull()
        later = stage_df[i + 1][mask]
        earlier = stage_df[i][mask]

        print('stages %d vs %d (%d users)' % (i + 1, i, mask.sum()))

        # Ties carry no information for a sign test, so they are excluded from n.
        n_more = int((later > earlier).sum())
        n = int((later != earlier).sum())
        if n == 0:
            # Nothing to test; avoid dividing by zero / a zero-trial binomial test.
            print('\tprop more: %.3f, binom_p=%.2f' % (float('nan'), float('nan')))
            continue

        # stats.binom_test was removed in SciPy 1.12; binomtest is its replacement.
        # Fall back to the old name on SciPy versions that predate binomtest.
        if hasattr(stats, 'binomtest'):
            p_value = stats.binomtest(n_more, n).pvalue
        else:
            p_value = stats.binom_test(n_more, n)
        print('\tprop more: %.3f, binom_p=%.2f' % (n_more / n, p_value))

In [None]:
# Compare consecutive stages of the word-count table.
print_lifestage_comparisons(stage_wc_df)

In [None]:
# Same per-stage table as for word count, but for the 'n_utterances' attribute.
stage_convo_len_df = convokit.user_convo_helpers.user_convo_utils.get_lifestage_attributes(user_convo_len_df, 'n_utterances', 10, 20)

In [None]:
# Mean of each column of the n_utterances stage table.
stage_convo_len_df.mean()

In [None]:
# Compare consecutive stages of the n_utterances table.
print_lifestage_comparisons(stage_convo_len_df)

In [None]:
# Configure the UserConvoDiversity transformer.
# NOTE(review): the positional 10 and 20 match the values passed to
# get_lifestage_attributes elsewhere in this notebook, so they are presumably
# the same stage parameters; the meaning of n_iters and test is not visible
# here — confirm against the convokit documentation.
dt = convokit.UserConvoDiversity(10, 20, n_iters=10, test=False)

In [None]:
# Run the diversity transformer over the corpus and rebind `corpus` to the result.
corpus = dt.fit_transform(corpus)

In [None]:
# Collect the diversity-related attributes into one table, then drop the rows
# that have no self_div value.
div_df = convokit.user_convo_helpers.user_convo_utils.get_user_convo_attribute_table(
    corpus,
    ['n_utterances', 'self_div', 'other_div', 'adj_other_div', 'tokens', 'wordcount'],
)
div_df = div_df[div_df['self_div'].notna()]

In [None]:
# Preview the first rows of the diversity table.
div_df.head()

In [None]:
# (rows, columns) of the diversity table after dropping rows without self_div.
div_df.shape

In [None]:
# Bucket each row by integer-dividing its convo_idx by 10
# (convo_idx 0-9 -> stage 0, 10-19 -> stage 1, ...).
# assign() builds a new frame instead of writing a column into div_df, which is
# a boolean-filtered slice of a larger table at this point; this avoids pandas'
# SettingWithCopyWarning and keeps the cell safe to re-run.
div_df = div_df.assign(stage_idx=div_df.convo_idx // 10)

In [None]:
# Keep rows whose user name contains neither 'bot' nor 'subredditreports'
# (case-insensitive), then keep only (stage_idx, user) groups with at least
# three rows.
test_subset = (
    div_df[div_df.user.apply(
        lambda name: not any(marker in name.lower() for marker in ('bot', 'subredditreports'))
    )]
    .groupby(['stage_idx', 'user'])
    .filter(lambda group: len(group) >= 3)
)

In [None]:
# Per-stage table of 'self_div' built from the filtered subset.
# NOTE(review): the last argument is 40 here, versus 20 for the word-count and
# n_utterances tables — confirm that the difference is intended.
stage_self_div_df = convokit.user_convo_helpers.user_convo_utils.get_lifestage_attributes(test_subset, 'self_div', 10,40)

In [None]:
# Compare consecutive stages of the self_div table.
print_lifestage_comparisons(stage_self_div_df)

In [None]:
# Per-stage table of 'other_div' built from the filtered subset (same 10, 40
# arguments as the self_div table).
stage_other_div_df = convokit.user_convo_helpers.user_convo_utils.get_lifestage_attributes(test_subset, 'other_div', 10,40)

In [None]:
# Compare consecutive stages of the other_div table.
print_lifestage_comparisons(stage_other_div_df)

In [None]:
# Per-stage table of 'adj_other_div' built from the filtered subset (same 10, 40
# arguments as the self_div table).
stage_adj_other_div_df = convokit.user_convo_helpers.user_convo_utils.get_lifestage_attributes(test_subset, 'adj_other_div', 10,40)

In [None]:
# Compare consecutive stages of the adj_other_div table.
print_lifestage_comparisons(stage_adj_other_div_df)

In [None]:
# Per-user mean adj_other_div within stage 0, sorted ascending.
sorted_divs = (
    test_subset[test_subset.stage_idx == 0]
    .groupby('user')['adj_other_div']
    .mean()
    .sort_values()
)

# Last five of the ten lowest users (ranks 6-10 from the bottom when at least
# ten users exist); the five most extreme are skipped.
# NOTE(review): presumably this avoids outliers — confirm intent.
least_div = sorted_divs.head(10).tail(5).index

# First five of the ten highest users, mirroring the selection above.
most_div = sorted_divs.tail(10).head(5).index