In [1]:
import os
os.getcwd()  # value is discarded: only the last expression of a cell is displayed
# Move the working directory three levels up.
# NOTE(review): presumably this is the repository root so the local convokit package is importable — confirm.
os.chdir("../../..")
os.getcwd()  # displayed as the cell output, showing the directory after the chdir

'/Users/calebchiam/Documents/GitHub/Cornell-Conversational-Analysis-Toolkit'

In [2]:
import convokit

In [3]:
from convokit.hyperconvo.threadRandomizer import randomize_thread

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

First we download the reddit corpus:

In [5]:
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus-small"))

Dataset already exists at /Users/calebchiam/.convokit/downloads/reddit-corpus-small


We will compute features over only the first 10 comments that occur in a thread, for threads which are at least 10 comments long (controlling for thread length in this way):

In [6]:
threads = corpus.utterance_threads(prefix_len=10, include_root=False)

In [7]:
from convokit.util import display_thread

The following are threads that come from the /r/canada subreddit. 

In [8]:
# Ids of three example threads (used as keys into `threads` by the demo cells).
# NOTE(review): the URLs in the comments below end in different comment ids
# (c32bcq8, c322oa7, c32hv0h) than the ids actually listed — presumably left over
# from an earlier corpus; confirm and update the links.
demo_threads = [
    # https://www.reddit.com/r/canada/comments/mmyld/polygamypolyamoury_laws_upheld_in_supreme_court/c32bcq8/
                'e5n2rm0', 
    # https://www.reddit.com/r/canada/comments/mmd20/look_what_i_found_today_yes_and_it_is_in_canada/c322oa7/
                'e60alv2',
    # https://www.reddit.com/r/canada/comments/mo0lt/happy_thursday/c32hv0h/
                'e6fc787'
               ]

In [9]:
# Print each demo thread's id, a separator, and its reply tree, followed by a blank line.
for thread_id in demo_threads:
    print(thread_id, '---', sep='\n')
    display_thread(threads, thread_id)
    print()

e5n2rm0
---
macyonachos
    flirtiesers
        macyonachos
            flirtiesers
                macyonachos
    Grex27
        macyonachos
    nicktanisok
        macyonachos
    thexynapse

e60alv2
---
microtek789
    altobng
        Burnz2p
            rockythebalboa1990
        DavidGraaskov
            JJJaso
            mee_sua
    WalfAkaiTsuki
        ChillySaus
    DrMrMadmanSr

e6fc787
---
Kukantiz
    gently_into_the_dark
        Juvyn00b
        [deleted]
    626c6f775f6d65
        KAWAII_SATAN_666
            Steffi128
    3dgemaster
    The_RTV
    jiaaa



Let's randomize these threads and see the (random) results:

In [10]:
# For each demo thread: randomize its reply structure, then print the id, a separator,
# and the randomized tree, followed by a blank line.
for thread_id in demo_threads:
    shuffled = {thread_id: randomize_thread(thread_id, threads[thread_id])}
    print(thread_id, '---', sep='\n')
    display_thread(shuffled, thread_id)
    print()

e5n2rm0
---
macyonachos
    flirtiesers
        macyonachos
            flirtiesers
                macyonachos
                    nicktanisok
                        macyonachos
                            thexynapse
            Grex27
                macyonachos

e60alv2
---
microtek789
    altobng
        Burnz2p
            DavidGraaskov
                rockythebalboa1990
        JJJaso
        WalfAkaiTsuki
            mee_sua
        ChillySaus
    DrMrMadmanSr

e6fc787
---
Kukantiz
    gently_into_the_dark
        The_RTV
    626c6f775f6d65
        Juvyn00b
            3dgemaster
                [deleted]
    KAWAII_SATAN_666
    Steffi128
        jiaaa



Let's randomize all threads to create a random baseline:

In [11]:
# Map every thread root to a randomized version of that thread.
randomized = {
    root: randomize_thread(root, thread_utts)
    for root, thread_utts in threads.items()
}

Construct a new Corpus using randomized threads:

In [12]:
# Flatten the randomized threads into a single list of utterances.
utts = [
    utt
    for thread_utts in randomized.values()
    for utt in thread_utts.values()
]

In [13]:
utts[:1]

[Utterance({'id': 'e58slx0', 'user': User([('name', 'RedFaceGeneral')]), 'root': 'e58slx0', 'reply_to': None, 'timestamp': 0, 'text': None, 'meta': {}})]

In [14]:
random_corpus = convokit.Corpus(utterances=utts)

We extract hypergraph features for this new Corpus, as well as for the original Corpus.

In [15]:
random_hc = convokit.HyperConvo()

In [18]:
# Compute per-thread triad motif counts on the randomized corpus and display them.
# NOTE(review): the result is only displayed, never bound to a name, yet later cells read
# `random_feats`, which no visible cell assigns — confirm what is meant to populate it.
random_hc.retrieve_motif_counts(random_corpus, include_root=True)

{'e58slx0': {'NO_EDGE_TRIADS': 34,
  'SINGLE_EDGE_TRIADS': 38,
  'INCOMING_TRIADS': 4,
  'OUTGOING_TRIADS': 1,
  'DYADIC_TRIADS': 0,
  'UNIDIRECTIONAL_TRIADS': 8,
  'INCOMING_2TO3_TRIADS': 1,
  'INCOMING_1TO3_TRIADS': 0,
  'DIRECTED_CYCLE_TRIADS': 0,
  'OUTGOING_3TO1_TRIADS': 0,
  'INCOMING_RECIPROCAL_TRIADS': 0,
  'OUTGOING_RECIPROCAL_TRIADS': 0,
  'DIRECTED_CYCLE_1TO3_TRIADS': 0,
  'DIRECIPROCAL_TRIADS': 0,
  'DIRECIPROCAL_2TO3_TRIADS': 0,
  'TRIRECIPROCAL_TRIADS': 0},
 'e594ur8': {'NO_EDGE_TRIADS': 13,
  'SINGLE_EDGE_TRIADS': 9,
  'INCOMING_TRIADS': 3,
  'OUTGOING_TRIADS': 0,
  'DYADIC_TRIADS': 1,
  'UNIDIRECTIONAL_TRIADS': 1,
  'INCOMING_2TO3_TRIADS': 0,
  'INCOMING_1TO3_TRIADS': 0,
  'DIRECTED_CYCLE_TRIADS': 0,
  'OUTGOING_3TO1_TRIADS': 0,
  'INCOMING_RECIPROCAL_TRIADS': 0,
  'OUTGOING_RECIPROCAL_TRIADS': 0,
  'DIRECTED_CYCLE_1TO3_TRIADS': 0,
  'DIRECIPROCAL_TRIADS': 0,
  'DIRECIPROCAL_2TO3_TRIADS': 1,
  'TRIRECIPROCAL_TRIADS': 0},
 'e5988ip': {'NO_EDGE_TRIADS': 34,
  'SINGLE_EDGE

In [None]:
# Inspect the randomized features of the first demo thread. The previous hard-coded key
# ('t1_c32bcq8') uses a 't1_' prefix that none of the thread ids in this notebook carry,
# so the lookup is keyed on an id taken from `demo_threads` instead.
random_feats[demo_threads[0]]

In [None]:
type(random_feats)

original corpus:

In [None]:
# create a hyperconvo object and use it to extract features
hc = convokit.HyperConvo(corpus)
threads_feats = hc.fit_transform()

In [None]:
def clean_value(x):
    """Return `x` unchanged, or the sentinel -1 when `x` is infinite or NaN."""
    return x if np.isfinite(x) else -1

In [None]:
feat_names = list(next(iter(threads_feats.values())).keys())

In [None]:
feat_names[:10]

In [None]:
# Thread ids of the randomized features, in dict order, and one cleaned feature row per
# thread with values ordered as in `feat_names`.
random_thread_ids = list(random_feats.keys())
r_feats = [
    [clean_value(thread_feats[name]) for name in feat_names]
    for thread_feats in random_feats.values()
]

In [None]:
len(r_feats)

In [None]:
# Thread ids of the original features, in dict order, and one cleaned feature row per
# thread with values ordered as in `feat_names`.
thread_ids = list(threads_feats.keys())
feats = [
    [clean_value(thread_feats[name]) for name in feat_names]
    for thread_feats in threads_feats.values()
]

For later convenience we will store feature values in a dataframe:

In [None]:
random_feat_df = pd.DataFrame(data=r_feats, index=random_thread_ids, columns=feat_names)

In [None]:
feat_df = pd.DataFrame(data=feats, index=thread_ids, columns=feat_names)

Here are some examples of features computed over the three example threads from before:

In [None]:
# Feature names containing 'count' but neither 'mid' nor 'present'.
motif_count_feats = [
    name for name in feat_names
    if 'count' in name and not ('mid' in name or 'present' in name)
]
# Feature names containing 'trans'.
prob_feats = [name for name in feat_names if 'trans' in name]

Let's get an aggregate statistic:

In [None]:
# NOTE(review): 't1_c0odlio' carries a 't1_' prefix, unlike the thread ids shown earlier in
# this notebook (e.g. 'e5n2rm0') — presumably a leftover id from another corpus; confirm it
# is actually a key of `threads`.
display_thread(threads, 't1_c0odlio')

# Deviations

In [None]:
overall_mean_diff = feat_df.mean() - random_feat_df.mean()
overall_mean_diff[:10]

In [None]:
mean_deviation = overall_mean_diff / random_feat_df.std()
mean_deviation[:10]

In [None]:
mean_deviation[motif_count_feats].sort_values()

- All the paths leading up to DIRECIPROCAL TRIADS, except for UNIDIRECTIONAL TRIADS, occur in the real dataset at rates higher than chance. 

- All the triads with closure (and OUTGOING TRIADS) occur at rates less than chance.

In fact, if you think about it, UNIDIRECTIONAL and OUTGOING triads (the two types without closure) are still triads that represent an interaction 'beyond the dyadic relationship'.  And they both occur at rates less than chance.

## Specific subreddits

At the aggregate level, it might seem that triad motifs with closure simply do not happen. But perhaps at the level of specific subreddits, we see a different trend unfold.

Let's group by subreddit then use cosine similarity to find the subreddit that is most unlike the aggregate means.

In [None]:
motif_feat_df = feat_df[motif_count_feats]
motif_feat_df_mean = motif_feat_df.mean()
motif_feat_df_sd = motif_feat_df.std()

Getting subreddit labels:

In [None]:
def get_subreddit(threads, thread_id):
    """
    Look up the subreddit recorded on the utterance stored under `thread_id`
    inside its own thread.

    :param threads: mapping of thread id -> mapping of utterance id -> utterance
    :param thread_id: id of the thread to look up
    :return: the value at other["user-info"]["subreddit"] of that utterance,
        or None when `thread_id` is not a key of `threads`
    """
    if thread_id in threads:
        root_utt = threads[thread_id][thread_id]
        return root_utt.other["user-info"]["subreddit"]
    return None

subreddits = [get_subreddit(threads, thread_id) for thread_id in threads]
# 99145 threads, from 100 subreddits. Roughly 1000 threads per subreddit

In [None]:
# Label every row by looking up its own thread id rather than attaching a list positionally:
# the rows follow the order of the feature dict, which is not guaranteed to match the
# iteration order (or length) of `threads`. `assign` returns a new frame, so no column is
# written into a frame that was itself selected out of `feat_df`.
motif_feat_df = motif_feat_df.assign(
    subreddit=[get_subreddit(threads, thread_id) for thread_id in motif_feat_df.index]
)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
subreddit_means = motif_feat_df.groupby('subreddit').mean()

In [None]:
subreddit_means.loc['AdviceAnimals'].shape

In [None]:
import numpy as np

In [None]:
def cosine_sim(X, Y):
    """
    Cosine similarity of two equal-length 1-D sequences.

    :param X: first vector (any array-like)
    :param Y: second vector (any array-like of the same length)
    :return: dot(X, Y) / (||X|| * ||Y||)
    :raises AssertionError: if the lengths differ
    """
    assert len(X) == len(Y)
    x_vec, y_vec = np.array(X), np.array(Y)
    norm_product = np.linalg.norm(X) * np.linalg.norm(Y)
    return x_vec.dot(y_vec) / norm_product

In [None]:
# Cosine similarity between each subreddit's mean motif vector and the overall mean vector.
# The feature columns are selected explicitly by the labels of `motif_feat_df_mean` instead
# of slicing off the last column with [:-1]: groupby('subreddit') moves the subreddit labels
# into the index, so on a fresh run the last column is a real feature, and [:-1] only
# worked once a later cell had appended the 'sim' column. This version gives the same result
# whether or not 'sim' is already present.
cosine_sims = [
    cosine_sim(subreddit_means.loc[subreddit_name, motif_feat_df_mean.index], motif_feat_df_mean)
    for subreddit_name in subreddit_means.index
]
cosine_sims[:10]

In [None]:
subreddit_means['sim'] = cosine_sims

Top 15 most dissimilar (compared to mean) subreddits:

In [None]:
subreddit_means.sort_values(by='sim')[:15]

Let's ignore subreddits based on exchanges / swaps in favour of those that are discussion-based.

In the above list, this would be MLPLounge, DebateReligion, electronic_cigarette, POLITIC (smaller sample), MensRights, conspiracy, teenagers. We exclude POLITIC because it has a smaller sample size than the rest.

### MLPLounge

In [None]:
((subreddit_means.loc['MLPLounge'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

Interestingly, the features that occur more commonly in MLPLounge are mainly those involving triadic closure (direciprocal triads being an exception). That said, some of the features involving triadic closure occur less commonly as well.

### DebateReligion

In [None]:
((subreddit_means.loc['DebateReligion'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

### electronic_cigarette

In [None]:
((subreddit_means.loc['electronic_cigarette'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

### MensRights

In [None]:
((subreddit_means.loc['MensRights'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

### conspiracy

In [None]:
((subreddit_means.loc['conspiracy'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

### teenagers

In [None]:
((subreddit_means.loc['teenagers'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

These subreddits are all **similarly** different from the average subreddit. They all have:
- Slightly higher inclusion of direciprocal, direciprocal_2to3, trireciprocal triads, suggesting high discussion engagement for particular users.
- Much lower no_edge, single_edge triads, implying fewer drive-by commenters, i.e. these are not expansionary high-activity threads but threads for a small group of participants.

In [None]:
tgts = ["DebateReligion", "electronic_cigarette", "MensRights", "conspiracy", "teenagers"]

In [None]:

# For each target subreddit, the sorted per-feature difference between its mean and the
# overall mean. The list is created here; previously `a` was appended to without ever being
# defined, so the cell raised NameError on a fresh kernel.
a = [(subreddit_means.loc[tgt] - motif_feat_df_mean).sort_values() for tgt in tgts]

In [None]:
a

In [None]:
subreddit_means.index

In [None]:
# Label each row via its own thread id (the frame index) instead of attaching the `subreddits`
# list positionally: both frames are ordered by their feature dicts, which need not match the
# iteration order or length of `threads`. Ids absent from `threads` are labelled None.
random_feat_df['subreddit'] = [get_subreddit(threads, thread_id) for thread_id in random_feat_df.index]
feat_df['subreddit'] = [get_subreddit(threads, thread_id) for thread_id in feat_df.index]

In [None]:
motif_prob_feats = motif_count_feats + prob_feats

In [None]:
random_subreddit_means = random_feat_df.groupby('subreddit').mean()


In [None]:
#random_subreddit_means[motif_prob_feats].to_csv("subreddits_x_motifs_random.csv")

In [None]:
#random_subreddit_means[motif_prob_feats].T.to_csv("motifs_x_subreddits_random.csv")

In [None]:
random_subreddit_sd = random_feat_df.groupby('subreddit').std()

In [None]:
random_subreddit_sd.loc['AdviceAnimals'][motif_count_feats]

In [None]:
random_subreddit_sd[motif_count_feats].T.sort_index()

In [None]:
random_subreddit_sd[prob_feats].T.sort_index()

Let's examine how the empirical values deviate from the random thread statistics:

In [None]:
def get_deviations(threads, thread_id, feats):
    """
    Standardise one thread's features against the randomized baseline of its subreddit.

    For every feature in `feats` this computes
        (observed value - randomized subreddit mean) / randomized subreddit SD
    i.e. the deviation in units of standard deviations (a z-score, not a percentage).

    Reads the notebook-level frames `feat_df`, `random_subreddit_means` and
    `random_subreddit_sd`.

    :param threads: thread mapping, passed to get_subreddit to find the thread's subreddit
    :param thread_id: row label of the thread in `feat_df`
    :param feats: list of feature (column) names to evaluate
    :return: pandas Series indexed by `feats`; a feature whose baseline SD is zero
        yields inf or NaN
    """
    subreddit = get_subreddit(threads, thread_id)

    # Select row and columns in one .loc call. Going through the full row first
    # (feat_df.loc[thread_id][feats]) produces an object-dtype Series whenever feat_df
    # also holds a non-numeric column such as 'subreddit', which then forces callers
    # to convert the result back to numbers.
    feat_stats = feat_df.loc[thread_id, feats]
    random_means = random_subreddit_means.loc[subreddit, feats]
    random_sds = random_subreddit_sd.loc[subreddit, feats]

    return (feat_stats - random_means) / random_sds

As an example:

In [None]:
# Use the first demo thread as the example. The previous hard-coded id ('t1_c32bcq8') has a
# 't1_' prefix that none of the thread ids in this notebook carry.
get_deviations(threads, demo_threads[0], motif_prob_feats)

With this, let's create a dataframe where each thread is represented as a series of deviations / non-deviations. We can then group by subreddit and see which subreddit has the greatest percentage of deviations for various feats.

In [None]:
deviation_df = pd.DataFrame(dtype=float)

In [None]:
# Compute every thread's deviation Series first and build the frame in one step (one column
# per thread, in `threads` order). Inserting columns one at a time into an existing frame
# gets slower as the frame grows.
deviation_df = pd.DataFrame(
    {thread: get_deviations(threads, thread, motif_prob_feats) for thread in threads}
)

In [None]:
deviation_df.dtypes

In [None]:
deviation_df

In [None]:
deviation_df_T = deviation_df.T

In [None]:
# Convert every column of the transposed deviation frame to a numeric dtype.
deviation_df_T = deviation_df_T.apply(pd.to_numeric)

In [None]:
deviation_df_T['subreddit'] = subreddits

In [None]:
dev_means = deviation_df_T.groupby('subreddit').mean()

In [None]:
#dev_means.to_csv("deviation_means.csv")

In [None]:
#random_feat_df.mean(axis=0).to_csv("total_avg_feats.csv")