In [1]:
import os
os.getcwd()
os.chdir("..")
os.chdir("..")
os.getcwd()

'/Users/calebchiam/Documents/GitHub/Cornell-Conversational-Analysis-Toolkit'

In [2]:
import convokit
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

In [3]:
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus-small"))

Dataset already exists at /Users/calebchiam/.convokit/downloads/reddit-corpus-small


In [4]:
threads = corpus.utterance_threads(prefix_len=10, include_root=False)

In [5]:
# 'e58slx0'

In [6]:
hc = convokit.HyperConvo(prefix_len=10, min_thread_len=10, include_root=False)


In [7]:
threads_motifs = hc.retrieve_motifs(corpus)

In [8]:
threads_motif_path_stats = hc.retrieve_motif_pathway_stats(corpus)

In [10]:
for thread_id, path_dict in threads_motifs.items():
    if thread_motif_path_stats[thread_id][('NO_EDGE_TRIADS', 'SINGLE_EDGE_TRIADS', 'INCOMING_TRIADS')] >= 1 and \
    thread_motif_path_stats[thread_id][('NO_EDGE_TRIADS', 'SINGLE_EDGE_TRIADS', 'INCOMING_TRIADS')]

NameError: name 'path_dict' is not defined

In [None]:
end_state

In [None]:
motif2 = path_dict[('NO_EDGE_TRIADS', 'SINGLE_EDGE_TRIADS', 'DYADIC_TRIADS', 'INCOMING_1TO3_TRIADS', 'DIRECIPROCAL_TRIADS', 'DIRECIPROCAL_2TO3_TRIADS')][2]

In [None]:
spec = threads['t1_c2zdwxx']
hg = hc._make_hypergraph(uts=spec)
hg.extract_motifs()

In [None]:
motif3 = hg.extract_motifs()['DIRECIPROCAL_2TO3_TRIADS'][0]

In [None]:
viz_motif(motif3, text_limit=40)

In [None]:
display_motif(motif3)

In [None]:
replay_motif(motif3)

In [None]:
viz_motif(motif1, text_limit = 40)

In [None]:
replay_motif(motif1)

# Relaxing conditions

If we do the math, we find that we have 1220 motifs with triadic closure. Probably a fair bit less if we exclude motifs that have edges with deleted texts. Let's relax the condition by having the "User posting a top level comment" be relaxed to "User posting any comment" and having two other Users respond to that.

In [None]:
path_dict2 = defaultdict(list)

In [None]:
triad_dict.keys()

In [None]:
for motif_type, motif_instances in triad_dict.items():
    for motif_instance in motif_instances:
        if len(motif_instance.edges) == 0: continue 
        num_replies_to_root = 0
        
        utts_replied_to = [edge_set[0]['reply_to'] for edge_set in motif_instance.edges]
        if max(Counter(utts_replied_to).values()) == 2:
            path_dict2[get_development_path(motif_instance)].append(motif_instance)

We now have the machinery to do more interesting analyses:

In [None]:
get_num_paths(path_dict2)

In [None]:
print(len(path_dict2))

In [None]:
pd2 = {k: len(v) for k, v in path_dict2.items()}
end_state = defaultdict(int)
for path, count in pd2.items():
    end_state[path[-1]] += count

In [None]:
end_state

In [None]:
pd2

In [None]:
for path in path_dict2:
    print(path)
    print(len(path_dict2[path]))

Excluding non-triadic closure motifs, this gives a total of 2046 motifs with triadic closure to work with.

Once we filter out [deleted]'s, we remove 614 motif instances, giving us 1432 motifs to work with. (Filtering step not shown here.)

In [None]:
def has_deleted_text(motif_inst):
    for edge in motif_inst.edges:
        if edge[0]['text'] == "[deleted]":
            return True
    return False

To visualize some of these to verify correctness:

In [None]:
# for path, instances in path_dict2_trunc.items():
#     motif_a = instances[0]
#     print(motif_a.edges[0][0]['top_level_comment'])
#     viz_motif(motif_a, text_limit=40)
#     replay_motif(motif_a)
#     print()
#     input()

# Predictive tasks

Let's see if we can predict when triadic closure happens. As triadic closure happens at different points in a triad's development, we have to take care to compare like-to-like and that we are examining the same kind of triadic closure; incoming / unidirectional / outgoing -> incoming_2to3 are three different types of triadic closure, for example.

Let's focus more on the '2to3' type of triadic closure, as in: 
- incoming -> incoming_2to3 
- direciprocal -> direciprocal_2to3
- incoming_1to3 -> outgoing_reciprocal

In [None]:
# for k in path_dict2_trunc:
#     print(k)

## Incoming vs incoming_2to3

In [None]:
threads_motifs['t1_c2yz6ed']

In [None]:
# Generate a dict of thread_ids to motifs (disambiguated by paths)
threads_paths = defaultdict(dict)
for thread, motif_dict in threads_motifs.items():
    for motif_type_instances in motif_dict.values():
        for motif_inst in motif_type_instances:
            path = get_development_path(motif_inst)
            if path not in threads_motifs[thread]:
                threads_paths[thread][path] = [motif_inst]
            else:
                threads_paths[thread][path].append(motif_inst)

In [None]:
pd2

In [None]:
def responds_to_same_utt(motif_instance):
    utts_replied_to = [edge_set[0]['reply_to'] for edge_set in motif_instance.edges]
    return max(Counter(utts_replied_to).values())
        

In [None]:
import random

def generate_pos_neg(threads_paths, pos_types, neg_types):
    pos, neg = [], []
    for thread, paths in threads_paths.items(): # thread is top-level-comment
#         print(paths)
        pos_instances = [triad_instance for pos_type in pos_types for triad_instance in paths.get(pos_type, []) 
                         if not has_deleted_text(triad_instance) and responds_to_same_utt(triad_instance)]
        neg_instances = [triad_instance for neg_type in neg_types for triad_instance in paths.get(neg_type, []) 
                         if not has_deleted_text(triad_instance) and responds_to_same_utt(triad_instance)]
        
#         print(len(pos_instances), len(neg_instances))
        if len(pos_instances) == 0 or len(neg_instances) == 0: continue
        
        pos.append(random.choice(pos_instances))
        neg.append(random.choice(neg_instances))
        
    print("- {} positive, {} negative pts".format(len(pos), len(neg)))  
    return pos, neg


In [None]:
pos_types = []
for k in pd2:
    if str(k).startswith("('NO_EDGE_TRIADS', 'SINGLE_EDGE_TRIADS', 'INCOMING_TRIADS', 'INCOMING_2TO3_TRIADS'"):
        pos_types.append(k)
pos_types

In [None]:
pos, neg = generate_pos_neg(threads_paths, 
                            pos_types = pos_types, 
                            neg_types = [('NO_EDGE_TRIADS', 'SINGLE_EDGE_TRIADS', 'INCOMING_TRIADS')]
                           )

In [None]:
viz_motif(random.choice(pos))

### BOW

In [None]:
ys = [1]*len(pos) + [0]*len(neg)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split

random.seed(42)

motifs_text = []
for motif_inst in pos + neg:
    # BOW baseline text
    # motif_text taken from first two edges
    time_sorted_edges = sorted([e[0] for e in motif_inst.edges], key=lambda x: x['timestamp'])
    text1 = " ".join(["1_"+w for w in time_sorted_edges[0]['text'].split(" ")])
    text2 = " ".join(["2_"+w for w in time_sorted_edges[1]['text'].split(" ")])
    motif_text = text1 + " " + text2
    motifs_text.append(motif_text)
    
text_train, text_test, y_train, y_test = train_test_split(motifs_text, ys, test_size=0.3, random_state=42)
cv = CountVectorizer(min_df=0.05, max_df=0.8, ngram_range=(1, 3)) # excluding stop_words field improves performance
X_train = cv.fit_transform(text_train)
X_test = cv.transform(text_test)
        
clf = LogisticRegression(solver="liblinear")
clf.fit(X_train, y_train)

train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)
print("- BOW: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

print()

In [None]:
words_coefs = sorted(list(zip(cv.get_feature_names(), clf.coef_[0])), key=lambda x: abs(x[1]), reverse=True)

In [None]:
words_coefs

In [None]:
X_test

### Time diff between first/second edge, length of first edge text, length of second edge text

In [None]:
def get_features_from_motif(motif_inst):
    time_sorted_edges = sorted([e[0] for e in motif_inst.edges], key=lambda x: x['timestamp'])
    time_diff = time_sorted_edges[1]['timestamp'] - time_sorted_edges[0]['timestamp']
    num_words_1 = len(list(time_sorted_edges[0]['text'].split(" ")))
    num_words_2 = len(list(time_sorted_edges[1]['text'].split(" ")))
    return [time_diff, num_words_1, num_words_2]

In [None]:
X2 = []
for motif_inst in pos + neg:
    X2.append(get_features_from_motif(motif_inst))

X2 = np.array(X2)

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, ys, test_size=0.2, random_state=42)
        
clf = LogisticRegression(solver="liblinear")
clf.fit(X2_train, y2_train)

train_acc = clf.score(X2_train, y2_train)
test_acc = clf.score(X2_test, y2_test)
print("- Basic features: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

### BOW + Basic features

In [None]:
X2 = []
for motif_inst in pos + neg:
    X2.append(get_features_from_motif(motif_inst))

X_combi = list(zip(motifs_text, X2))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_combi, ys, test_size=0.2, random_state=42)
cv = CountVectorizer(min_df=0.05, max_df=0.8, ngram_range=(1,2)) 
X_bow_train = cv.fit_transform([x[0] for x in X_train]).todense()
X_bow_test = cv.transform([x[0] for x in X_test]).todense()

X_combi_train = []
X_combi_test = []
# print(X_bow_train[0].tolist()[0])

for row in range(len(X_train)):
    X_combi_train.append(X_bow_train[row].tolist()[0] + X_train[row][1])

for row in range(len(X_test)):
    X_combi_test.append(X_bow_test[row].tolist()[0] + X_test[row][1])

X_combi_train = np.array(X_combi_train)
X_combi_test = np.array(X_combi_test)
# print(X_combi_train)
clf = LogisticRegression(solver="liblinear")
clf.fit(X_combi_train, y_train)

print(X_combi_train.shape)

train_acc = clf.score(X_combi_train, y_train)
test_acc = clf.score(X_combi_test, y_test)
print("- Combined: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

### Tightening conditions

No better than chance. Maybe if we focus on those that include the User that posts a top-level-comment?

In [None]:
def responds_to_same_toplvlcomm(motif_instance):
    num_replies_to_root = 0
    for edge_set in motif_instance.edges:
        if edge_set[0]['root']: 
            num_replies_to_root += 1
    return num_replies_to_root==2

In [None]:
def generate_pos_neg2(threads_paths, pos_types, neg_types):
    pos, neg = [], []
    for thread in threads_paths: # thread is top-level-comment
        paths = threads_paths[thread]
#         print(paths)
        pos_instances = [triad_instance for pos_type in pos_types for triad_instance in paths.get(pos_type, []) 
                         if not has_deleted_text(triad_instance) and responds_to_same_toplvlcomm(triad_instance)]
        neg_instances = [triad_instance for neg_type in neg_types for triad_instance in paths.get(neg_type, []) 
                         if not has_deleted_text(triad_instance) and responds_to_same_toplvlcomm(triad_instance)]
        
#         print(len(pos_instances), len(neg_instances))
        if len(pos_instances) == 0 or len(neg_instances) == 0: continue
        
        pos.append(random.choice(pos_instances))
        neg.append(random.choice(neg_instances))
        
    print("- {} positive, {} negative pts".format(len(pos), len(neg)))  
    return pos, neg



In [None]:
pos, neg = generate_pos_neg2(threads_paths, 
                            pos_types = pos_types, 
                            neg_types = [('NO_EDGE_TRIADS', 'SINGLE_EDGE_TRIADS', 'INCOMING_TRIADS')]
                           )

### BOW

In [None]:
ys = [1]*len(pos) + [0]*len(neg)
random.seed(42)

motifs_text = []
for motif_inst in pos + neg:
    # BOW baseline text
    # motif_text taken from first two edges
    time_sorted_edges = sorted([e[0] for e in motif_inst.edges], key=lambda x: x['timestamp'])
    text1 = " ".join([w for w in time_sorted_edges[0]['text'].split(" ")])
    text2 = " ".join([w for w in time_sorted_edges[1]['text'].split(" ")])
    motif_text = text1 + " " + text2
    motifs_text.append(motif_text)
    
text_train, text_test, y_train, y_test = train_test_split(motifs_text, ys, test_size=0.2, random_state=42)
cv = CountVectorizer(min_df=0.05, max_df=0.8, ngram_range=(1, 3)) # excluding stop_words field improves performance
X_train = cv.fit_transform(text_train)
X_test = cv.transform(text_test)
        
clf = LogisticRegression(solver="liblinear")
clf.fit(X_train, y_train)

train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)
print("- BOW: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

#57% train and test accuracy (text1/text2 not distinguished, 0.05 / 0.8, no stop_words)

In [None]:
words_coefs = sorted(list(zip(cv.get_feature_names(), clf.coef_[0])), key=lambda x: abs(x[1]), reverse=True)

In [None]:
words_coefs

### Basic features

In [None]:
X2 = []
for motif_inst in pos + neg:
    X2.append(get_features_from_motif(motif_inst))

X2 = np.array(X2)

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, ys, test_size=0.3, random_state=42)
        
clf = LogisticRegression(solver="liblinear")
clf.fit(X2_train, y2_train)

train_acc = clf.score(X2_train, y2_train)
test_acc = clf.score(X2_test, y2_test)
print("- Basic features: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

### Combined

In [None]:
X2 = []
for motif_inst in pos + neg:
    X2.append(get_features_from_motif(motif_inst))

X_combi = list(zip(motifs_text, X2))

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_combi, ys, test_size=0.3, random_state=42)
cv = CountVectorizer(min_df=0.05, max_df=0.8, ngram_range=(1,3)) 
X_bow_train = cv.fit_transform([x[0] for x in X_train]).todense()
X_bow_test = cv.transform([x[0] for x in X_test]).todense()

X_combi_train = []
X_combi_test = []
# print(X_bow_train[0].tolist()[0])

for row in range(len(X_train)):
    X_combi_train.append(X_bow_train[row].tolist()[0] + X_train[row][1])

for row in range(len(X_test)):
    X_combi_test.append(X_bow_test[row].tolist()[0] + X_test[row][1])

X_combi_train = np.array(X_combi_train)
X_combi_test = np.array(X_combi_test)
# print(X_combi_train)
clf = LogisticRegression(solver="liblinear")
clf.fit(X_combi_train, y_train)

print(X_combi_train.shape)

train_acc = clf.score(X_combi_train, y_train)
test_acc = clf.score(X_combi_test, y_test)
print("- Combined: {:.4f} train, {:.4f} test".format(train_acc, test_acc))