In [1]:
import os
os.getcwd()
os.chdir("..")
os.chdir("..")
os.getcwd()

'/Users/calebchiam/Documents/GitHub/Cornell-Conversational-Analysis-Toolkit'

In [2]:
import convokit

In [3]:
from convokit import randomize_thread

In [4]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

First we download the reddit corpus:

In [5]:
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus"))

Dataset already exists at /Users/calebchiam/.convokit/downloads/reddit-corpus


In [6]:
corpus.print_summary_stats()

Number of Users: 521777
Number of Utterances: 2004262
Number of Conversations: 84979


In [7]:
threads = corpus.utterance_threads(prefix_len=10, include_root=False)

The following are threads that come from the /r/canada subreddit. 

Let's randomize all threads to create a random baseline:

In [8]:
randomized = {root:randomize_thread(root, threads[root]) for root in threads}  

Construct a new Corpus using randomized threads:

In [9]:
utts = [utt for t in randomized for utt in randomized[t].values()]

In [10]:
corpus_randomized = convokit.model.Corpus(utterances=utts)

In [12]:
random_threads = corpus_randomized.utterance_threads(prefix_len=10, include_root=False)

In [13]:
from convokit import display_thread

In [14]:
from random import sample

In [25]:
for thread_id in sample(list(random_threads), 10):
    print("==Actual==")
    display_thread(threads, thread_id)
    print()
    print("==Randomized==")
    display_thread(random_threads, thread_id)
    print()

==Actual==
0-3ahab
    Carlbye
        0-3ahab
            -Kern-
            Carlbye
            Carlbye
                Mike_davvy
                    Carlbye
            Mike_davvy
    Mike_davvy

==Randomized==
0-3ahab
    Carlbye
        0-3ahab
            Carlbye
            Mike_davvy
    -Kern-
        Carlbye
            Mike_davvy
    Mike_davvy
        Carlbye

==Actual==
RushDaSlush02
    Safari_Master
        RushDaSlush02
        RushDaSlush02
            Safari_Master
                RushDaSlush02
                    Safari_Master
                        RushDaSlush02
                            Safari_Master
                                RushDaSlush02

==Randomized==
RushDaSlush02
    Safari_Master
        RushDaSlush02
            Safari_Master
                RushDaSlush02
                    Safari_Master
                        RushDaSlush02
        RushDaSlush02
            Safari_Master
                RushDaSlush02

==Actual==
fadasd1
    blorgenheim
    gogog

In [24]:
display_thread(random_threads, 'e64u2fq')

freeman_lambda
    vietjesus95
    TheLumpyLump
        freeman_lambda
            Kaldricus
        Animastryfe
    explosivecurry13
        prof0ak
        TheLumpyLump
    bbbsoldierbbb


We extract hypergraph features for this new Corpus, as well as for the original Corpus.

In [20]:
hc = convokit.HyperConvo(prefix_len=10, min_thread_len=10, include_root=False)

In [21]:
motif_counts = hc.retrieve_motif_counts(corpus)

In [22]:
motif_counts_random = hc.retrieve_motif_counts(corpus_randomized)

In [26]:
motif_counts_df = pd.DataFrame.from_dict(motif_counts, orient='index')
motif_counts_random_df = pd.DataFrame.from_dict(motif_counts_random, orient='index')

In [27]:
def get_subreddit(threads, thread_id):
    if thread_id not in threads:
        raise ValueError
    return threads[thread_id][thread_id].meta["subreddit"]

In [28]:
subreddits = {thread_id: get_subreddit(threads, thread_id) for thread_id in threads}

In [29]:
subreddits_df = pd.DataFrame.from_dict(subreddits, columns=['subreddit'], orient='index')

In [30]:
motif_counts_df = pd.concat([motif_counts_df, subreddits_df], axis=1)
motif_counts_random_df = pd.concat([motif_counts_random_df, subreddits_df], axis=1)

In [33]:
x_normal = motif_counts_df.groupby('subreddit').mean().mean()

In [41]:
x_random = motif_counts_random_df.groupby('subreddit').mean().mean()

In [52]:
diff_by_percent = pd.DataFrame((x_normal - x_random)*100 / x_random, columns=['count'])
diff_by_percent['triad-motif'] = list(diff_by_percent.index)
diff_by_percent.index = range(len(diff_by_percent))

In [76]:
%matplotlib qt
g = sns.barplot(x="triad-motif", y="count", data=diff_by_percent)
g.set_title("Empirical count means difference from randomized count means (%)")
g.set_ylabel("% difference")
g.set_xticklabels(g.get_xticklabels(), rotation=40, horizontalalignment='right', fontsize='x-small')
plt.tight_layout()
plt.show()

In [31]:
import seaborn as sns

In [32]:
import random

In [42]:
# def get_mean_stats_from_sampled_threads(threads, n=100):
#     sampled_ids = set(random.sample(list(threads), n))
#     randomized = {root:randomize_thread(root, threads[root]) for root in threads if root in sampled_ids}
#     utts = [utt for t in randomized for utt in randomized[t].values()]
#     corpus_randomized = convokit.Corpus(utterances=utts)
#     hc = convokit.HyperConvo(min_thread_len=10, prefix_len=10, include_root=False)
#     motif_counts = hc.retrieve_motif_counts(corpus_randomized)
#     df = pd.DataFrame.from_dict(motif_counts, orient='index')
#     return df.mean()
    

In [45]:
# bootstrap_sample = pd.DataFrame([get_mean_stats_from_sampled_threads(threads) for _ in range(1000)])

In [58]:
# bootstrap_sample['type'] = 'randomized'

In [None]:
# bootstrap_data = []
# for row_idx in range(len(bootstrap_sample)):
#     row = bootstrap_sample.iloc[row_idx]
#     bootstrap_data.extend(list(zip(list(row.index)[:-1], row.values[:-1], (['randomized']*len(row.index))[:-1])))

In [34]:
random_data_long = []
for row_idx in range(len(motif_counts_random_df)):
    row = motif_counts_random_df.iloc[row_idx]
    random_data_long.extend(list(zip(list(row.index)[:-1], row.values[:-1], (['randomized']*len(row.index))[:-1])))

In [35]:
# bootstrap_data.extend(list(zip(list(x_normal.index), x_normal.values, ['empirical']*len(x_normal.index))))
random_data_long.extend(list(zip(list(x_normal.index), x_normal.values, ['empirical']*len(x_normal.index))))

In [36]:
graph_data = pd.DataFrame(random_data_long, columns=['triad-motif', 'count', 'type'])

In [40]:
graph_data.groupby('type').groupby('triad-motif').mean()

AttributeError: Cannot access callable attribute 'groupby' of 'DataFrameGroupBy' objects, try using the 'apply' method

In [37]:
graph_data.head(5)

Unnamed: 0,triad-motif,count,type
0,NO_EDGE_TRIADS,16.0,randomized
1,SINGLE_EDGE_TRIADS,5.0,randomized
2,INCOMING_TRIADS,1.0,randomized
3,OUTGOING_TRIADS,0.0,randomized
4,DYADIC_TRIADS,3.0,randomized


In [175]:
# g = sns.barplot(x='triad-motif', y='count', hue='type', data=graph_data, ci=95)
%matplotlib qt5
g = sns.catplot(x='triad-motif', y='count', hue="type", data=graph_data, kind='bar', ci=95, legend_out=False, palette='pastel')
g.set_xticklabels(rotation=40, horizontalalignment='right', fontsize=8)
plt.tight_layout()
plt.show()

## Subreddit specific

In [134]:
subreddit_list = {'WTF', 'magicTCG', 'techsupport', 'changemyview', 'AskTrumpSupporters'}
motif_list = ['UNIDIRECTIONAL_TRIADS', 'INCOMING_2TO3_TRIADS', 'INCOMING_1TO3_TRIADS','DIRECIPROCAL_TRIADS']

In [135]:
motif_count_limited = motif_counts_df.groupby('subreddit').mean().loc[subreddit_list][motif_list]

In [151]:
motif_count_random_limited = motif_counts_random_df[motif_counts_random_df['subreddit'].apply(lambda x: x in subreddit_list)][motif_list + ["subreddit"]]

In [153]:
data = []
for idx in range(len(motif_count_limited)):
    row = motif_count_limited.iloc[idx]
    data.extend(list(zip([row.name]*len(motif_list), list(row.index), row.values, ['empirical']*len(motif_list))))
for idx in range(len(motif_count_random_limited)):
    row = motif_count_random_limited.iloc[idx]
    data.extend(list(zip([row['subreddit']]*len(motif_list), 
                         list(row.index)[:-1], 
                         row.values[:-1], 
                         ['randomized']*len(motif_list))))

In [154]:
graph_data2 = pd.DataFrame(data, columns=['subreddit', 'triad-motif', 'count', 'type'])

In [155]:
graph_data2

Unnamed: 0,subreddit,triad-motif,count,type
0,WTF,UNIDIRECTIONAL_TRIADS,3.033,empirical
1,WTF,INCOMING_2TO3_TRIADS,0.026,empirical
2,WTF,INCOMING_1TO3_TRIADS,1.672,empirical
3,WTF,DIRECIPROCAL_TRIADS,0.332,empirical
4,changemyview,UNIDIRECTIONAL_TRIADS,0.268,empirical
5,changemyview,INCOMING_2TO3_TRIADS,0.008,empirical
6,changemyview,INCOMING_1TO3_TRIADS,1.368,empirical
7,changemyview,DIRECIPROCAL_TRIADS,0.976,empirical
8,magicTCG,UNIDIRECTIONAL_TRIADS,2.335,empirical
9,magicTCG,INCOMING_2TO3_TRIADS,0.146,empirical


In [176]:
%matplotlib qt5
g = sns.catplot(x="triad-motif", y="count", hue="type", col="subreddit",
                data=graph_data2, kind="bar", legend_out=False, ci=95)

g.set_xticklabels(rotation=40, horizontalalignment='right', fontsize='x-small')
plt.tight_layout()
plt.show()

In [None]:
thread_ids = []
feats = []
for key, feat_dict in threads_feats.items():
    thread_ids.append(key)
    feats.append([clean_value(feat_dict[k]) for k in feat_names])

For later convenience we will store feature values in a dataframe:

In [None]:
random_feat_df = pd.DataFrame(data=r_feats, index=random_thread_ids, columns=feat_names)

In [None]:
feat_df = pd.DataFrame(data=feats, index=thread_ids, columns=feat_names)

Here are some examples of features computed over the three example threads from before:

In [None]:
motif_count_feats = [x for x in feat_names if ('count' in x) and ('mid' not in x) and ('present' not in x)]
prob_feats = [x for x in feat_names if ('trans' in x)]

Let's get an aggregate statistic:

In [None]:
display_thread(threads, 't1_c0odlio')

# Deviations

In [None]:
overall_mean_diff = feat_df.mean() - random_feat_df.mean()
overall_mean_diff[:10]

In [None]:
mean_deviation = overall_mean_diff / random_feat_df.std()
mean_deviation[:10]

In [None]:
mean_deviation[motif_count_feats].sort_values()

- All the paths leading up to DIRECIPROCAL TRIADS, except for UNIDIRECTIONAL TRIADS, occur in the real dataset at rates better than chance. 

- All the triads with closure (and OUTGOING TRIADS) occur at rates less than chance.

In fact, if you think about it, UNIDIRECTIONAL and OUTGOING triads (the two types without closure) are still triads that represent an interaction 'beyond the dyadic relationship'.  And they both occur at rates less than chance.

This perhaps implies that triad motifs with closure are some kind of 'anti-phenomenon'. Are they still 'real' then?

## Specific subreddits

At the aggregate level, it might seem that triad motifs with closure simply do not happen. But perhaps at the level of specific subreddits, we would see a different trend unfold.

Let's group by subreddit then use cosine similarity to find the subreddit that is most unlike the aggregate means.

In [None]:
motif_feat_df = feat_df[motif_count_feats]
motif_feat_df_mean = motif_feat_df.mean()
motif_feat_df_sd = motif_feat_df.std()

Getting subreddit labels:

In [None]:
def get_subreddit(threads, thread_id):
    if thread_id not in threads:
        return None
    return threads[thread_id][thread_id].other["user-info"]["subreddit"]

subreddits = [get_subreddit(threads, thread_id) for thread_id in threads]
# 99145 threads, from 100 subreddits. Roughly 1000 threads per subreddit

In [None]:
motif_feat_df['subreddit'] = subreddits

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
subreddit_means = motif_feat_df.groupby('subreddit').mean()

In [None]:
subreddit_means.loc['AdviceAnimals'].shape

In [None]:
import numpy as np

In [None]:
def cosine_sim(X, Y):
    assert(len(X) == len(Y))
    return np.array(X).dot(np.array(Y)) / (np.linalg.norm(X)*np.linalg.norm(Y))

In [None]:
cosine_sims = []
for subreddit_name in subreddit_means.index:
    cosine_sims.append(cosine_sim(subreddit_means.loc[subreddit_name][:-1], motif_feat_df_mean))
cosine_sims[:10]

In [None]:
subreddit_means['sim'] = cosine_sims

Top 15 most dissimilar (compared to mean) subreddits:

In [None]:
subreddit_means.sort_values(by='sim')[:15]

Let's ignore subreddits based on exchanges / swaps in favour of those that are discussion-based.

In the above list, this would be MLPLounge, DebateReligion, electronic_cigarette, POLITIC (smaller sample), MensRights, conspiracy, teenagers. We exclude POLITIC because it has a smaller sample size than the rest.

### MLPLounge

In [None]:
((subreddit_means.loc['MLPLounge'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

Interestingly, all the features that feature more commonly in MLPLounge are mainly the features involving triadic closure. (Direciprocal triads is an exception to this.) Though, some of the features involving triadic closure are fewer as well.

### DebateReligion

In [None]:
((subreddit_means.loc['DebateReligion'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

### electronic_cigarette

In [None]:
((subreddit_means.loc['electronic_cigarette'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

### MensRights

In [None]:
((subreddit_means.loc['MensRights'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

### conspiracy

In [None]:
((subreddit_means.loc['conspiracy'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

### teenagers

In [None]:
((subreddit_means.loc['teenagers'] - motif_feat_df_mean)/motif_feat_df_sd).sort_values()

These subreddits are all **similarly** different from the average subreddit. They all have:
- Slightly higher inclusion of direciprocal, direciprocal_2to3, trireciprocal triads, suggesting high discussion engagement for particular users.
- Much lower no_edge, single_edge triads, implying fewer drive-by commenters, i.e. these are not expansionary high-activity threads but threads for a small group of participants.

In [None]:
tgts = ["DebateReligion", "electronic_cigarette", "MensRights", "conspiracy", "teenagers"]

In [None]:

for tgt in tgts:
    
    a.append((subreddit_means.loc[tgt] - motif_feat_df_mean).sort_values())

In [None]:
a

In [None]:
subreddit_means.index

In [None]:
random_feat_df['subreddit'] = subreddits
feat_df['subreddit'] = subreddits

In [None]:
motif_prob_feats = motif_count_feats + prob_feats

In [None]:
random_subreddit_means = random_feat_df.groupby('subreddit').mean()


In [None]:
#random_subreddit_means[motif_prob_feats].to_csv("subreddits_x_motifs_random.csv")

In [None]:
#random_subreddit_means[motif_prob_feats].T.to_csv("motifs_x_subreddits_random.csv")

In [None]:
random_subreddit_sd = random_feat_df.groupby('subreddit').std()

In [None]:
random_subreddit_sd.loc['AdviceAnimals'][motif_count_feats]

In [None]:
random_subreddit_sd[motif_count_feats].T.sort_index()

In [None]:
random_subreddit_sd[prob_feats].T.sort_index()

Let's examine how the empirical values deviate from the random thread statistics:

In [None]:
def get_deviations(threads, thread_id, feats):
    """
    Returns a pandas series of % SD deviations
    """
    subreddit = get_subreddit(threads, thread_id)
    feat_stats = feat_df.loc[thread_id][feats]
    
    random_means = random_subreddit_means.loc[subreddit][feats]
    random_sds = random_subreddit_sd.loc[subreddit][feats]
    deviations = (feat_stats - random_means) / random_sds
    return deviations

As an example:

In [None]:
get_deviations(threads, 't1_c32bcq8', motif_prob_feats)

With this, let's create a dataframe where each thread is represented as a series of deviations / non-deviations. We can then group by subreddit and see which subreddit has the greatest percentage of deviations for various feats.

In [None]:
deviation_df = pd.DataFrame(dtype=float)

In [None]:
for thread in threads:
    deviation_df[thread] = get_deviations(threads, thread, motif_prob_feats)

In [None]:
deviation_df.dtypes

In [None]:
deviation_df

In [None]:
deviation_df_T = deviation_df.T

In [None]:
for k in deviation_df_T:
    deviation_df_T[k] = pd.to_numeric(deviation_df_T[k])

In [None]:
deviation_df_T['subreddit'] = subreddits

In [None]:
dev_means = deviation_df_T.groupby('subreddit').mean()

In [None]:
#dev_means.to_csv("deviation_means.csv")

In [None]:
#random_feat_df.mean(axis=0).to_csv("total_avg_feats.csv")