In [1]:
import os
os.getcwd()
os.chdir("..")
os.chdir("..")
os.getcwd()

'/home/caleb/Cornell-Conversational-Analysis-Toolkit'

In [2]:
import convokit
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

In [3]:
changemyview_dir = '/sauna/reddit_201810_raw/corpus/cats3~-~changemyview/changemyview'

In [4]:
os.chdir('../../../sauna/reddit_201810_raw/reddit-corpus-2/reddit-corpus')

In [4]:
corpus = convokit.Corpus(filename=changemyview_dir)

In [5]:
threads = corpus.utterance_threads(prefix_len=10, include_root=False)

In [6]:
# 'e58slx0'

In [7]:
hc = convokit.HyperConvo(prefix_len=10, min_thread_len=10, include_root=False)


In [8]:
threads_motifs = hc.retrieve_motifs(corpus)

In [9]:
threads_motif_path_stats = hc.retrieve_motif_pathway_stats(corpus)

In [10]:
threads_paths = hc.retrieve_motif_paths(corpus)

In [11]:
from typing import List

In [12]:
from convokit import TriadMotif

In [13]:
def validate_motif(motif: TriadMotif):
    utts_replied_to = [edge_set[0]['reply_to'] for edge_set in motif.edges]
    return max(Counter(utts_replied_to).values()) == 2

In [14]:
incoming_id = ('NO_EDGE_TRIADS', 'SINGLE_EDGE_TRIADS', 'INCOMING_TRIADS')
incoming_2to3_id = ('NO_EDGE_TRIADS', 'SINGLE_EDGE_TRIADS', 'INCOMING_TRIADS', 'INCOMING_2TO3_TRIADS')

In [15]:
from random import choice

In [16]:
neg = []
pos = []
for thread_id, motif_paths in threads_paths.items():
    valid_incoming = [motif for motif in motif_paths[incoming_id] if validate_motif(motif)]
    valid_2to3 = [motif for motif in motif_paths[incoming_2to3_id] if validate_motif(motif)]

    if valid_incoming and valid_2to3:
        neg.append(choice(valid_incoming))
        pos.append(choice(valid_2to3))

In [17]:
len(neg)

1683

In [18]:
len(pos)

1683

378 / 408 of the pairs satisfy the criteria

In [19]:
pos_bow_feats = dict()
neg_bow_feats = dict()

In [20]:
def get_tlc(motif: TriadMotif):
    return motif.edges[0][0]['top_level_comment']

In [21]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split

pos_text = []
neg_text = []
for motif in pos:
    # BOW baseline text
    # motif_text taken from first two edges
    time_sorted_edges = sorted([e[0] for e in motif.edges], key=lambda x: x['timestamp'])
    text1 = " ".join(["1_"+w.strip() for w in time_sorted_edges[0]['text'].split(" ")])
    text2 = " ".join(["2_"+w.strip() for w in time_sorted_edges[1]['text'].split(" ")])
    pos_text.append(text1 + " " + text2)
    
for motif in neg:
    # BOW baseline text
    # motif_text taken from first two edges
    time_sorted_edges = sorted([e[0] for e in motif.edges], key=lambda x: x['timestamp'])
    text1 = " ".join(["1_"+w.strip() for w in time_sorted_edges[0]['text'].split(" ")])
    text2 = " ".join(["2_"+w.strip() for w in time_sorted_edges[1]['text'].split(" ")])
    neg_text.append(text1 + " " + text2)

# pos_train, pos_test, neg_train, neg_test = train_test_split(pos, neg, test_size=0.2, random_state=42)
pos_ids, neg_ids = [get_tlc(motif) for motif in pos], [get_tlc(motif) for motif in neg]


In [22]:
pos_id_to_text = {pos_ids[i]: pos_text[i] for i in range(len(pos_ids))}
neg_id_to_text = {neg_ids[i]: neg_text[i] for i in range(len(neg_ids))}



In [23]:
train_ids, test_ids = train_test_split(list(pos_id_to_text), test_size=0.2, random_state=42)

In [24]:
pos_train = [pos_id_to_text[id] for id in train_ids]
neg_train = [neg_id_to_text[id] for id in train_ids]
pos_test = [pos_id_to_text[id] for id in test_ids]
neg_test = [neg_id_to_text[id] for id in test_ids]

In [25]:
cv = CountVectorizer(min_df=0.05, max_df=0.8, ngram_range=(1, 3)) # excluding stop_words field improves performance
cv.fit(pos_train + neg_train)



CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.8, max_features=None, min_df=0.05,
                ngram_range=(1, 3), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [26]:
pos_data = cv.transform(pos_train + pos_test).toarray()
neg_data = cv.transform(neg_train + neg_test).toarray()
cols = cv.get_feature_names()
pos_df = pd.DataFrame(pos_data, index=train_ids + test_ids, columns=cols)
neg_df = pd.DataFrame(neg_data, index=train_ids + test_ids, columns=cols)

In [27]:
pos_df.shape

(1594, 376)

In [28]:
def generate_paired_X_y(pos_df, neg_df):
    df = pd.DataFrame(columns=pos_df.columns)
    y = []
    for idx in range(pos_df.shape[0]):
        if idx % 2 == 0:
            df = df.append(pos_df.iloc[idx] - neg_df.iloc[idx])
            y.append(1)
        else:
            df = df.append(neg_df.iloc[idx] - pos_df.iloc[idx])
            y.append(0)
    y = pd.DataFrame(y, index=train_ids+test_ids)
    return df, y

In [29]:
X, y = generate_paired_X_y(pos_df, neg_df)

In [30]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import normalize, StandardScaler, Normalizer



In [31]:
from sklearn.linear_model import LogisticRegression
clf = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])
X_train = X.loc[train_ids]
y_train = y.loc[train_ids]
X_test = X.loc[test_ids]
y_test = y.loc[test_ids]
clf.fit(X.loc[train_ids], y.loc[train_ids])

Pipeline(memory=None,
         steps=[('standardScaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [32]:
train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)
print("- BOW: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

print()

- BOW: 0.7765 train, 0.5423 test



In [33]:
def print_extreme_coefs(clf, feats, k):
    coefs = clf.named_steps['logreg'].coef_[0].tolist()
    
    assert len(feats) == len(coefs)
    feats_coefs = sorted(list(zip(feats, coefs)), key=lambda x: x[1], reverse=True)
    
    print("TOP {} FEATURES".format(k))
    for ft, coef in feats_coefs[:k]:
        print("{}: {:.3f}".format(ft, coef))
    print()
    print("BOTTOM {} FEATURES".format(k))
    for ft, coef in feats_coefs[-k:]:
        print("{}: {:.3f}".format(ft, coef))
    print()

In [48]:
print_extreme_coefs(clf, list(cv.get_feature_names()), k=30)

TOP 30 FEATURES
http: 1.362
1_or: 1.267
2_still: 1.168
2_have 2_to: 1.042
2_be: 1.019
2_the 2_same: 0.972
2_other: 0.915
2_the 2_: 0.914
2_that: 0.908
re: 0.880
1_very: 0.838
1_still: 0.833
1_ 2_ gt: 0.815
2_with: 0.756
2_those: 0.747
1_i 1_think: 0.744
ll: 0.743
1_go: 0.740
1_at: 0.704
2_as 2_a: 0.701
1_is 1_that: 0.697
1_if: 0.692
1_i 1_don: 0.674
deleted: 0.665
2_so: 0.646
2_never: 0.645
1_most: 0.627
1_many: 0.616
2_ deleted: 0.612
2_own: 0.598

BOTTOM 30 FEATURES
2_they: -0.595
1_not: -0.612
2_you: -0.630
1_ 1_: -0.636
1_some: -0.645
2_get: -0.657
2_in 2_the: -0.662
2_would: -0.671
2_you 2_can: -0.692
2_take: -0.697
2_it 2_is: -0.705
2_you re: -0.745
1_more: -0.760
2_didn: -0.818
1_ 2_: -0.822
2_see: -0.850
1_been: -0.903
and: -0.915
2_people: -0.917
1_of: -0.939
1_think: -0.940
2_same: -0.949
1_how: -1.011
1_then: -1.056
1_you: -1.066
2_while: -1.104
1_to: -1.138
com: -1.146
1_don: -1.257
2_fact: -1.668



### Time diff between first/second edge, length of first edge text, length of second edge text

In [35]:
def get_features_from_motif(motif_inst):
    time_sorted_edges = sorted([e[0] for e in motif_inst.edges], key=lambda x: x['timestamp'])
    time_diff = time_sorted_edges[1]['timestamp'] - time_sorted_edges[0]['timestamp']
    num_words_1 = len(list(time_sorted_edges[0]['text'].split(" ")))
    num_words_2 = len(list(time_sorted_edges[1]['text'].split(" ")))
    return [time_diff, num_words_1, num_words_2]

In [36]:
pos_feats = {get_tlc(motif): get_features_from_motif(motif) for motif in pos}
neg_feats = {get_tlc(motif): get_features_from_motif(motif) for motif in neg}

In [37]:
pos_feats_df = pd.DataFrame.from_dict(pos_feats).T
neg_feats_df = pd.DataFrame.from_dict(neg_feats).T

In [38]:
pos_feats_df.columns = ['time_diff', 'first_utt_len', 'second_utt_len']
neg_feats_df.columns = ['time_diff', 'first_utt_len', 'second_utt_len']

In [39]:
X, y = generate_paired_X_y(pos_feats_df, neg_feats_df)

In [40]:
X_train2 = X.loc[train_ids]
y_train2 = y.loc[train_ids]
X_test2 = X.loc[test_ids]
y_test2 = y.loc[test_ids]

clf2 = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])
clf2.fit(X_train2, y_train2)

Pipeline(memory=None,
         steps=[('standardScaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [41]:
y_train2.shape

(1275, 1)

In [42]:
X_train2.shape

(1275, 3)

In [43]:
train_acc = clf2.score(X_train2, y_train2)
test_acc = clf2.score(X_test2, y_test2)
print("- Basic features: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

- Basic features: 0.4980 train, 0.4953 test


### BOW + Basic features

In [44]:
X_train_combined = pd.concat([X_train, X_train2], axis=1)

In [45]:
clf3 = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])
clf3.fit(X_train_combined, y_train)

Pipeline(memory=None,
         steps=[('standardScaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [46]:
X_test_combined = pd.concat([X_test, X_test2], axis=1)

In [47]:
train_acc = clf3.score(X_train_combined, y_train)
test_acc = clf3.score(X_test_combined, y_test)
print("- Basic features: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

- Basic features: 0.7765 train, 0.5423 test


In [50]:
print_extreme_coefs(clf3, list(X_train_combined.columns), k=30)

TOP 30 FEATURES
http: 1.374
1_or: 1.282
2_still: 1.206
2_have 2_to: 1.060
2_other: 0.957
2_be: 0.947
2_the 2_: 0.909
2_the 2_same: 0.891
2_that: 0.871
1_very: 0.863
1_still: 0.848
1_ 2_ gt: 0.809
re: 0.806
2_those: 0.794
1_go: 0.785
1_i 1_think: 0.741
1_at: 0.730
1_if: 0.724
1_is 1_that: 0.714
2_with: 0.714
ll: 0.711
2_never: 0.670
1_i 1_don: 0.669
2_as 2_a: 0.662
deleted: 0.661
1_many: 0.622
2_ deleted: 0.621
1_most: 0.618
2_no: 0.607
2_of 2_the: 0.596

BOTTOM 30 FEATURES
1_where: -0.586
2_you: -0.621
1_not: -0.637
1_some: -0.648
2_get: -0.655
2_you 2_can: -0.664
1_ 1_: -0.682
2_would: -0.682
2_in 2_the: -0.690
2_it 2_is: -0.691
2_take: -0.712
1_more: -0.724
2_you re: -0.741
2_didn: -0.833
1_ 2_: -0.833
2_same: -0.870
2_see: -0.879
and: -0.890
1_been: -0.891
1_think: -0.942
1_of: -0.959
2_people: -0.975
com: -0.994
1_how: -1.009
1_you: -1.037
1_then: -1.058
2_while: -1.120
1_to: -1.159
1_don: -1.265
2_fact: -1.737

