In [1]:
import convokit
import numpy as np
from convokit import Corpus, download, Surprise

In [2]:
# Resolve the 'reddit-corpus-small' dataset via download(); its result is handed
# to Corpus as the filename to load from.
reddit_small_path = download('reddit-corpus-small')
corpus = Corpus(filename=reddit_small_path)

# Print the speaker / utterance / conversation counts as a sanity check.
corpus.print_summary_stats()

Dataset already exists at C:\Users\rgang\.convokit\downloads\reddit-corpus-small
Number of Speakers: 119889
Number of Utterances: 297132
Number of Conversations: 8286


## Example 1: How surprising is an utterance compared to the rest of the conversation it belongs to?

Let's first take a look at how long typical utterances are to get a sense of what we should set `target_sample_size` and `context_sample_size` to.

Let's look at the maximum, minimum, and average number of tokens per utterance across the whole corpus.

In [7]:
# Flatten the corpus into a DataFrame of utterances.
utterances = corpus.get_utterances_dataframe()

# Rough token count per utterance: the number of pieces obtained by splitting
# the text on single spaces.
utterances['num_tokens'] = [len(text.split(' ')) for text in utterances['text']]

# Report the longest, shortest, and mean utterance length, in that order.
token_counts = utterances['num_tokens']
print(token_counts.max())
print(token_counts.min())
print(token_counts.mean())

5203
1
34.141226794825194


We see that the utterances have an average of around 34 tokens.

To get an idea of what to set `context_sample_size` to, let's take a look at the total number of tokens in conversations.

In [8]:
# Total tokens per conversation: add up the 'num_tokens' column (computed in the
# previous cell) over all utterances sharing a conversation_id.
per_conversation = utterances.groupby('conversation_id')
convo_lengths = per_conversation['num_tokens'].sum()

# Report the largest, smallest, and mean conversation size, in that order.
for summarize in (convo_lengths.max, convo_lengths.min, convo_lengths.mean):
    print(summarize())

68498
11
1224.288076273232


On average, a conversation contains around 1200 tokens.

To speed up the demo, we'll only look at utterances from the 10 conversations that have the most utterances.

In [9]:
# Count utterances per conversation with the built-in GroupBy.size() rather than
# apply(len): same counts, but no Python-level call per group.
utterance_counts = utterances.groupby('conversation_id').size()

# Keep the ids of the 10 conversations with the most utterances; a set gives
# fast membership checks when filtering the corpus.
top_convos = set(utterance_counts.sort_values(ascending=False).index[:10])

In [10]:
# Keep only utterances that have non-empty text and belong to one of the
# selected conversations in `top_convos`.
subset_corpus = corpus.filter_utterances_by(
    lambda utterance: utterance.text and utterance.conversation_id in top_convos
)

# Confirm how much of the corpus survived the filter.
subset_corpus.print_summary_stats()

Number of Speakers: 6346
Number of Utterances: 12264
Number of Conversations: 10


In [11]:
# Sample sizes follow the statistics measured above: a 30-token target sits just
# under the ~34-token average utterance, and a 200-token context is well below
# the ~1200-token average conversation.
# n_samples is presumably the number of random samples drawn per target — TODO confirm.
transformer = Surprise(
    target_sample_size=30,
    context_sample_size=200,
    n_samples=50,
)

In [12]:
# Fit the transformer with models grouped per conversation.
# NOTE(review): this fits on the full `corpus`, not `subset_corpus` — confirm
# that is intended given the goal of keeping the demo fast.
transformer.fit(
    corpus,
    group_models_by=['conversation'],
)

<convokit.surprise.surprise.Surprise at 0x1f6ebff7108>

In [13]:
# Score every utterance in the subset against the rest of its own conversation.
#
# group_target_by=['conversation', 'utterance'] makes each target key `t` a
# (conversation id, utterance id) pair — presumably in that order; the selectors
# below rely on t[0] being the conversation and t[1] the utterance.
#
# context_selector builds an element-wise boolean mask over `s`, whose index has
# 'conversation_id' and 'id' levels: same conversation as the target, but not the
# target utterance itself. The two comparisons each yield an array, so they must
# be combined with `&` (with parentheses, since `&` binds tighter than `==`/`!=`);
# Python's `and` would try to reduce each array to a single bool and raise
# ValueError for any index with more than one entry.
#
# model_selector picks the model keyed by the first component of the index
# (the conversation), matching the per-conversation grouping used in fit().
transformed_corpus = transformer.transform(
    subset_corpus,
    'utterance',
    group_target_by=['conversation', 'utterance'],
    context_selector=lambda s, t: (
        (s.index.get_level_values('conversation_id') == t[0])
        & (s.index.get_level_values('id') != t[1])
    ),
    model_selector=lambda ind: ind[0],
)

The most surprising utterances are below.

In [14]:
# Ten utterances with the highest surprise score, highest first.
(
    transformed_corpus
    .get_utterances_dataframe()
    .sort_values(by='meta.surprise', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,timestamp,text,speaker,reply_to,conversation_id,meta.score,meta.top_level_comment,meta.retrieved_on,meta.gilded,meta.gildings,meta.subreddit,meta.stickied,meta.permalink,meta.author_flair_text,meta.surprise
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
e6mskll,1537908515,10.0 feet ≈ 3.0 metres ^(1 foot ≈ 0.3m)\n\n\n\...,Bot_Metric,e6msk4j,9itezj,2,e6m98ca,1539527904,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",cringe,False,/r/cringe/comments/9itezj/un_audience_laughs_a...,,4.94314
e6t39yd,1538155461,**Direct link**: https://www.theguardian.com/w...,amp-is-watching-you,e6t38zb,9jgr31,1,e6rkv1a,1539633737,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",pics,False,/r/pics/comments/9jgr31/brett_kavanaugh_the_wo...,,4.93701
e6wfu7x,1538300161,[Local cop killed a 15 year old because he was...,DeathMonkey6969,e6w4jnl,9k1ydy,19,e6w2jru,1539690040,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",videos,False,/r/videos/comments/9k1ydy/tesla_model_3_tricks...,,4.92551
e6mnmbs,1537904561,Turns out importing a surplus of low skilled w...,politicusmaximus,e6mgooz,9itezj,4,e6m98ca,1539525639,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",cringe,False,/r/cringe/comments/9itezj/un_audience_laughs_a...,,4.5337
e6ms2sc,1537908112,I read the_dumpsterfire a lot. Wanna really kn...,PMme_slave_leia_pics,e6mnn9n,9itezj,19,e6mbat3,1539527694,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",cringe,False,/r/cringe/comments/9itezj/un_audience_laughs_a...,,4.48032
e6mekpq,1537897256,"Federally, the minimum wage was last raised ab...",hexqueen,e6mebo1,9itezj,45,e6m98ca,1539521315,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",cringe,False,/r/cringe/comments/9itezj/un_audience_laughs_a...,,4.43135
e6mm6xi,1537903433,"That's because it's Vox, and Vox is fucking jo...",politicusmaximus,e6miz8i,9itezj,7,e6m98ca,1539524977,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",cringe,False,/r/cringe/comments/9itezj/un_audience_laughs_a...,,4.42079
e5bu0sb,1535994776,Well it certainly works better than what my an...,OzzieBloke777,e5bp7cl,9cmazx,3,e5bo75z,1538624701,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",pics,False,/r/pics/comments/9cmazx/they_noticed_there_was...,,4.3292
e5cbb0z,1536010875,"Nope, Doing something right. It’s lazy, dated ...",fringerella,e5c97x2,9cmazx,2,e5bo75z,1538632738,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",pics,False,/r/pics/comments/9cmazx/they_noticed_there_was...,,4.32511
e5rq1lj,1536643606,Not to mention counterfeit drugs (and other it...,JardinSurLeToit,e5qm5zf,9enmff,2,e5q8q50,1538928677,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",todayilearned,False,/r/todayilearned/comments/9enmff/til_that_chea...,,4.29813


These are the least surprising utterances.

In [15]:
# Ten utterances with the lowest surprise score, lowest first.
(
    transformed_corpus
    .get_utterances_dataframe()
    .sort_values(by='meta.surprise')
    .head(10)
)

Unnamed: 0_level_0,timestamp,text,speaker,reply_to,conversation_id,meta.score,meta.top_level_comment,meta.retrieved_on,meta.gilded,meta.gildings,meta.subreddit,meta.stickied,meta.permalink,meta.author_flair_text,meta.surprise
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
e589li3,1535832300,She isn't the one making it about her. He is. ...,CaptainHarleyStorm,e5890a1,9c4wc2,22,e581pbs,1538564694,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",cringepics,False,/r/cringepics/comments/9c4wc2/mr_hands_over_he...,,1.54767
e5bs1ii,1535992938,I promise you the branding is good enough. And...,Larry-Man,e5bry78,9cmazx,1,e5bo75z,1538623805,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",pics,False,/r/pics/comments/9cmazx/they_noticed_there_was...,,1.75255
e58iauu,1535840680,"This is not AMBIGUOUS, you lubed up dildo. \n\...",Eboo143,e58gybu,9c4wc2,8,e58as0b,1538568769,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",cringepics,False,/r/cringepics/comments/9c4wc2/mr_hands_over_he...,,1.80216
e58dvq6,1535836384,It amazes me that he can look her in the eye w...,Challa6,e583fp8,9c4wc2,2,e581pbs,1538566678,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",cringepics,False,/r/cringepics/comments/9c4wc2/mr_hands_over_he...,,1.81582
e5m3kzc,1536418562,Censorship of speech. You can paint it however...,ppc127,e5lbfu6,9e0nal,1,e5lbfu6,1538834049,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",technology,False,/r/technology/comments/9e0nal/apple_just_perma...,,1.8177
e58rzs0,1535850627,"I'll make it easier for you, then: you did it ...",SabbathViper,e58kigr,9c4wc2,-1,e5844mv,1538573263,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",cringepics,False,/r/cringepics/comments/9c4wc2/mr_hands_over_he...,,1.81774
e5jmfq9,1536306421,The thing is youtube claims system is not base...,Ekint,e5iivyl,9dixh1,1,e5ibybt,1538770435,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",Music,False,/r/Music/comments/9dixh1/the_future_is_here_to...,,1.82878
e5lyy7b,1536412777,Is there something for me to drink but only ha...,parrot_in_hell,e5k5np8,9durlp,1,e5k5np8,1538831861,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",explainlikeimfive,False,/r/explainlikeimfive/comments/9durlp/eli5_caff...,,1.83889
e5kg6go,1536342368,Adding to this; the crash you feel after the c...,Spider-Ian,e5k5np8,9durlp,1,e5k5np8,1538798451,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",explainlikeimfive,False,/r/explainlikeimfive/comments/9durlp/eli5_caff...,,1.86629
e5qm7tu,1536606025,We had international students from China when ...,TheProphecyIsNigh,e5q8q50,9enmff,1,e5q8q50,1538910016,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",todayilearned,False,/r/todayilearned/comments/9enmff/til_that_chea...,,1.87036


## Example 2: How surprising is a conversation compared to other conversations in a corpus?

To speed up this demo, let's look at just the 100 conversations that have the most utterances.

In [3]:
# Re-read the utterances from the full corpus for this example.
utterances = corpus.get_utterances_dataframe()

# Count utterances per conversation with the built-in GroupBy.size() rather than
# apply(len): same counts, but no Python-level call per group.
utterance_counts = utterances.groupby('conversation_id').size()

# Keep the ids of the 100 conversations with the most utterances.
top_convos = set(utterance_counts.sort_values(ascending=False).index[:100])

In [4]:
# Restrict the corpus to utterances with non-empty text that belong to one of
# the conversations in `top_convos`.
subset_corpus = corpus.filter_utterances_by(
    lambda utterance: utterance.text and utterance.conversation_id in top_convos
)

# Confirm the size of the filtered corpus.
subset_corpus.print_summary_stats()

Number of Speakers: 25309
Number of Utterances: 51704
Number of Conversations: 100


In [5]:
# Targets in this example are whole conversations rather than single utterances,
# so both sample sizes are larger than in Example 1.
# n_samples is presumably the number of random samples drawn per target — TODO confirm.
transformer = Surprise(
    target_sample_size=200,
    context_sample_size=1000,
    n_samples=50,
)

In [6]:
# Fit on the full corpus with no grouping keys (empty group_models_by);
# presumably this yields a single shared model — TODO confirm against ConvoKit docs.
transformer.fit(
    corpus,
    group_models_by=[],
)

<convokit.surprise.surprise.Surprise at 0x1f1e625eac8>

Let's add the surprise score to conversations instead of utterances for this example.

In [7]:
# Score each conversation in the subset against all *other* conversations.
# The context mask keeps rows whose 'conversation_id' index level differs from
# the target conversation t[0]; model_selector always returns key 0.
transformed_corpus = transformer.transform(
    subset_corpus,
    'conversation',
    group_target_by=['conversation'],
    context_selector=lambda s, t: s.index.get_level_values('conversation_id') != t[0],
    model_selector=lambda ind: 0,
)

In [8]:
# Show the conversation-level table; the surprise score appears in the
# 'meta.surprise' column.
transformed_corpus.get_conversations_dataframe()

Unnamed: 0_level_0,vectors,meta.title,meta.num_comments,meta.domain,meta.timestamp,meta.subreddit,meta.gilded,meta.gildings,meta.stickied,meta.author_flair_text,meta.surprise
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9euhs0,[],TIFU by uploading my old exams on the internet.,2174,self.tifu,1536640834,tifu,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,1.9154
9guiia,[],TIFU by making a joke and losing the right to ...,2578,self.tifu,1537274818,tifu,5,"{'gid_1': 0, 'gid_2': 5, 'gid_3': 0}",False,FUOTW 9/16/2018,1.98383
9c4wc2,[],Mr. Hands over here. On a NATIONALLY televised...,3945,i.redd.it,1535823104,cringepics,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,1.958
9frcxa,[],A doctor who raped a sedated female patient ge...,1661,abcnews.go.com,1536926142,TwoXChromosomes,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,1.6775
9fsct8,[],A new political party emerges in Canada!,2724,peoplespartyofcanada.ca,1536934540,canada,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,2.06906
...,...,...,...,...,...,...,...,...,...,...,...
9k1ydy,[],Tesla Model 3 Tricks Cop,3102,youtu.be,1538267436,videos,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,2.02866
9hqq07,[],The difference between my rice cooking skills ...,1150,v.redd.it,1537541836,funny,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,2.30393
9clbf7,[],What common item has a feature that most peopl...,21181,self.AskReddit,1535979914,AskReddit,1,"{'gid_1': 0, 'gid_2': 1, 'gid_3': 0}",False,,2.21341
9fhecm,[],What main character didn't deserve a happy end...,19197,self.AskReddit,1536840424,AskReddit,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,1.94239


In [9]:
# Ten conversations with the highest surprise score, highest first.
(
    transformed_corpus
    .get_conversations_dataframe()
    .sort_values(by='meta.surprise', ascending=False)
    .head(10)
)

Unnamed: 0_level_0,vectors,meta.title,meta.num_comments,meta.domain,meta.timestamp,meta.subreddit,meta.gilded,meta.gildings,meta.stickied,meta.author_flair_text,meta.surprise
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9ee5jm,[],One of the unspoken (and worst) parts of flood...,2296,i.redd.it,1536509657,WTF,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,2.6405
9fbcix,[],Reddit bans QAnon subreddit,3250,thehill.com,1536783754,technology,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,2.49026
9k5lj5,[],A perfect salt crystal I grew,1276,imgur.com,1538306149,pics,1,"{'gid_1': 0, 'gid_2': 1, 'gid_3': 0}",False,,2.46831
9itdw9,[],ELI5: How do we know counting rings in a tree ...,391,self.explainlikeimfive,1537891192,explainlikeimfive,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,2.44398
9fwk4s,[],"TIL: ""He never married"" was a commonly used by...",1778,en.wikipedia.org,1536964885,todayilearned,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,2.44116
9fr0z3,[],"In Deadpool 2 Wade tells the Juggernaut ""It ha...",608,i.redd.it,1536922840,MovieDetails,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,2.40241
9cjvjq,[],"ELI5: Why do flies ""like"" us? What do they fin...",1365,self.explainlikeimfive,1535964567,explainlikeimfive,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,2.40004
9gj8zs,[],Daily exposure to BPA in amounts that are FDA-...,672,munews.missouri.edu,1537181188,science,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,MD-PhD-MBA | Clinical Professor/Medicine,2.38835
9jv6hs,[],Evidence that increased BMI causes lower menta...,2759,bristol.ac.uk,1538212546,science,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,2.38065
9iwh3d,[],Do (fighter) airplanes really have an onboard ...,849,self.askscience,1537912286,askscience,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,2.37034


In [10]:
# Ten conversations with the lowest surprise score, lowest first.
(
    transformed_corpus
    .get_conversations_dataframe()
    .sort_values(by='meta.surprise')
    .head(10)
)

Unnamed: 0_level_0,vectors,meta.title,meta.num_comments,meta.domain,meta.timestamp,meta.subreddit,meta.gilded,meta.gildings,meta.stickied,meta.author_flair_text,meta.surprise
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
9h71xy,[],Comment the name of a book that you've been on...,3025,self.books,1537377232,books,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,1.65027
9ijdc4,[],To all theists: What would it take to change y...,656,self.DebateReligion,1537806262,DebateReligion,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,1.67559
9frcxa,[],A doctor who raped a sedated female patient ge...,1661,abcnews.go.com,1536926142,TwoXChromosomes,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,1.6775
9k3jbf,[],Voters Are Trading Honesty for Loyalty - Novel...,1609,bloomberg.com,1538282407,science,1,"{'gid_1': 0, 'gid_2': 1, 'gid_3': 0}",False,MD-PhD-MBA | Clinical Professor/Medicine,1.80085
9h5oi0,[],I'm a Catholic Bishop and Philosopher Who Love...,11925,self.IAmA,1537367736,IAmA,2,"{'gid_1': 0, 'gid_2': 2, 'gid_3': 0}",False,,1.81484
9ijtbj,[],"I am Alexandria Brown, a dominatrix who recent...",2501,self.IAmA,1537809195,IAmA,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,1.88119
9jmdvl,[],Rape Allegations: American Woman Files Complai...,3603,spiegel.de,1538136446,soccer,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,Borussia Dortmund,1.88558
9jka5q,[],Sen. Klobuchar asks if Kavanaugh has a drinkin...,1733,youtube.com,1538112540,cringe,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,1.88843
9c2qhd,[],Have you ever read the first page or two of a ...,3326,self.books,1535805464,books,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,1.89343
9g0vnj,[],"If numbers can be infinitely large, can they a...",913,self.askscience,1537012141,askscience,0,"{'gid_1': 0, 'gid_2': 0, 'gid_3': 0}",False,,1.90237
