In [1]:
import os
os.getcwd()
os.chdir("..")
os.chdir("..")
os.getcwd()

'/home/caleb/Cornell-Conversational-Analysis-Toolkit'

In [2]:
import convokit
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
from collections import Counter

In [4]:
os.chdir('../../../sauna/reddit_201810_raw/reddit-corpus-2/reddit-corpus')

In [5]:
corpus = convokit.Corpus(filename='reddit-corpus')

In [6]:
threads = corpus.utterance_threads(prefix_len=10, include_root=False)

In [7]:
# 'e58slx0'

In [8]:
hc = convokit.HyperConvo(prefix_len=10, min_thread_len=10, include_root=False)


In [9]:
threads_motifs = hc.retrieve_motifs(corpus)

In [10]:
threads_motif_path_stats = hc.retrieve_motif_pathway_stats(corpus)

In [11]:
threads_paths = hc.retrieve_motif_paths(corpus)

In [12]:
from typing import List

In [13]:
from convokit import TriadMotif

In [14]:
def validate_motif(motif: TriadMotif):
    utts_replied_to = [edge_set[0]['reply_to'] for edge_set in motif.edges]
    return max(Counter(utts_replied_to).values()) == 2

In [15]:
incoming_id = ('NO_EDGE_TRIADS', 'SINGLE_EDGE_TRIADS', 'INCOMING_TRIADS')
incoming_2to3_id = ('NO_EDGE_TRIADS', 'SINGLE_EDGE_TRIADS', 'INCOMING_TRIADS', 'INCOMING_2TO3_TRIADS')

In [16]:
from random import choice

In [17]:
neg = []
pos = []
for thread_id, motif_paths in threads_paths.items():
    valid_incoming = [motif for motif in motif_paths[incoming_id] if validate_motif(motif)]
    valid_2to3 = [motif for motif in motif_paths[incoming_2to3_id] if validate_motif(motif)]

    if valid_incoming and valid_2to3:
        neg.append(choice(valid_incoming))
        pos.append(choice(valid_2to3))

In [18]:
len(neg)

2692

In [19]:
len(pos)

2692

378 / 408 of the pairs satisfy the criteria

In [20]:
pos_bow_feats = dict()
neg_bow_feats = dict()

In [21]:
def get_tlc(motif: TriadMotif):
    return motif.edges[0][0]['top_level_comment']

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
from sklearn.model_selection import train_test_split

pos_text = []
neg_text = []
for motif in pos:
    # BOW baseline text
    # motif_text taken from first two edges
    time_sorted_edges = sorted([e[0] for e in motif.edges], key=lambda x: x['timestamp'])
    text1 = " ".join(["1_"+w.strip() for w in time_sorted_edges[0]['text'].split(" ")])
    text2 = " ".join(["2_"+w.strip() for w in time_sorted_edges[1]['text'].split(" ")])
    pos_text.append(text1 + " " + text2)
    
for motif in neg:
    # BOW baseline text
    # motif_text taken from first two edges
    time_sorted_edges = sorted([e[0] for e in motif.edges], key=lambda x: x['timestamp'])
    text1 = " ".join(["1_"+w.strip() for w in time_sorted_edges[0]['text'].split(" ")])
    text2 = " ".join(["2_"+w.strip() for w in time_sorted_edges[1]['text'].split(" ")])
    neg_text.append(text1 + " " + text2)

# pos_train, pos_test, neg_train, neg_test = train_test_split(pos, neg, test_size=0.2, random_state=42)
pos_ids, neg_ids = [get_tlc(motif) for motif in pos], [get_tlc(motif) for motif in neg]


In [23]:
pos_id_to_text = {pos_ids[i]: pos_text[i] for i in range(len(pos_ids))}
neg_id_to_text = {neg_ids[i]: neg_text[i] for i in range(len(neg_ids))}



In [24]:
train_ids, test_ids = train_test_split(list(pos_id_to_text), test_size=0.2, random_state=42)

In [25]:
pos_train = [pos_id_to_text[id] for id in train_ids]
neg_train = [neg_id_to_text[id] for id in train_ids]
pos_test = [pos_id_to_text[id] for id in test_ids]
neg_test = [neg_id_to_text[id] for id in test_ids]

In [26]:
cv = CountVectorizer(min_df=0.05, max_df=0.8, ngram_range=(1, 3)) # excluding stop_words field improves performance
cv.fit(pos_train + neg_train)



CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=0.8, max_features=None, min_df=0.05,
                ngram_range=(1, 3), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [27]:
pos_data = cv.transform(pos_train + pos_test).toarray()
neg_data = cv.transform(neg_train + neg_test).toarray()
cols = cv.get_feature_names()
pos_df = pd.DataFrame(pos_data, index=train_ids + test_ids, columns=cols)
neg_df = pd.DataFrame(neg_data, index=train_ids + test_ids, columns=cols)

In [28]:
pos_df.shape

(2652, 192)

In [29]:
def generate_paired_X_y(pos_df, neg_df):
    df = pd.DataFrame(columns=pos_df.columns)
    y = []
    for idx in range(pos_df.shape[0]):
        if idx % 2 == 0:
            df = df.append(pos_df.iloc[idx] - neg_df.iloc[idx])
            y.append(1)
        else:
            df = df.append(neg_df.iloc[idx] - pos_df.iloc[idx])
            y.append(0)
    y = pd.DataFrame(y, index=train_ids+test_ids)
    return df, y

In [30]:
X, y = generate_paired_X_y(pos_df, neg_df)

In [31]:
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import normalize, StandardScaler, Normalizer



In [32]:
from sklearn.linear_model import LogisticRegression
clf = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])
X_train = X.loc[train_ids]
y_train = y.loc[train_ids]
X_test = X.loc[test_ids]
y_test = y.loc[test_ids]
clf.fit(X.loc[train_ids], y.loc[train_ids])

Pipeline(memory=None,
         steps=[('standardScaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [33]:
train_acc = clf.score(X_train, y_train)
test_acc = clf.score(X_test, y_test)
print("- BOW: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

print()

- BOW: 0.6068 train, 0.5612 test



In [34]:
def print_extreme_coefs(clf, feats, k):
    coefs = clf.named_steps['logreg'].coef_[0].tolist()
    
    assert len(feats) == len(coefs)
    feats_coefs = sorted(list(zip(feats, coefs)), key=lambda x: x[1], reverse=True)
    
    print("TOP {} FEATURES".format(k))
    for ft, coef in feats_coefs[:k]:
        print("{}: {:.3f}".format(ft, coef))
    print()
    print("BOTTOM {} FEATURES".format(k))
    for ft, coef in feats_coefs[-k:]:
        print("{}: {:.3f}".format(ft, coef))
    print()

In [35]:
print_extreme_coefs(clf, list(cv.get_feature_names()), k=20)

TOP 20 FEATURES
1_the: 0.337
1_ 1_: 0.246
2_: 0.194
2_why: 0.182
1_i: 0.173
1_have: 0.166
2_do: 0.165
2_make: 0.150
1_way: 0.144
2_all: 0.143
1_who: 0.138
2_one: 0.120
1_really: 0.119
1_has: 0.116
2_ gt: 0.115
2_you: 0.103
2_at: 0.102
2_being: 0.099
2_because: 0.099
1_people: 0.097

BOTTOM 20 FEATURES
1_my: -0.072
1_than: -0.072
2_not: -0.078
com: -0.088
2_most: -0.089
1_on 1_the: -0.089
1_they: -0.092
1_most: -0.094
2_if 2_you: -0.095
2_or: -0.097
2_of: -0.100
2_to: -0.104
2_he: -0.112
2_really: -0.117
1_if: -0.122
2_like: -0.128
1_and: -0.135
1_he: -0.137
1_to: -0.142
1_of: -0.164



### Time diff between first/second edge, length of first edge text, length of second edge text

In [36]:
def get_features_from_motif(motif_inst):
    time_sorted_edges = sorted([e[0] for e in motif_inst.edges], key=lambda x: x['timestamp'])
    time_diff = time_sorted_edges[1]['timestamp'] - time_sorted_edges[0]['timestamp']
    num_words_1 = len(list(time_sorted_edges[0]['text'].split(" ")))
    num_words_2 = len(list(time_sorted_edges[1]['text'].split(" ")))
    return [time_diff, num_words_1, num_words_2]

In [37]:
pos_feats = {get_tlc(motif): get_features_from_motif(motif) for motif in pos}
neg_feats = {get_tlc(motif): get_features_from_motif(motif) for motif in neg}

In [38]:
pos_feats_df = pd.DataFrame.from_dict(pos_feats).T
neg_feats_df = pd.DataFrame.from_dict(neg_feats).T

In [39]:
pos_feats_df.columns = ['time_diff', 'first_utt_len', 'second_utt_len']
neg_feats_df.columns = ['time_diff', 'first_utt_len', 'second_utt_len']

In [40]:
X, y = generate_paired_X_y(pos_feats_df, neg_feats_df)

In [41]:
X_train2 = X.loc[train_ids]
y_train2 = y.loc[train_ids]
X_test2 = X.loc[test_ids]
y_test2 = y.loc[test_ids]

clf2 = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])
clf2.fit(X_train2, y_train2)

Pipeline(memory=None,
         steps=[('standardScaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [42]:
y_train2.shape

(2121, 1)

In [43]:
X_train2.shape

(2121, 3)

In [44]:
train_acc = clf2.score(X_train2, y_train2)
test_acc = clf2.score(X_test2, y_test2)
print("- Basic features: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

- Basic features: 0.5158 train, 0.4896 test


### BOW + Basic features

In [45]:
X_train_combined = pd.concat([X_train, X_train2], axis=1)

In [46]:
clf3 = Pipeline([("standardScaler", StandardScaler()), ("logreg", LogisticRegression(solver='liblinear'))])
clf3.fit(X_train_combined, y_train)

Pipeline(memory=None,
         steps=[('standardScaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='liblinear', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)

In [47]:
X_test_combined = pd.concat([X_test, X_test2], axis=1)

In [48]:
train_acc = clf3.score(X_train_combined, y_train)
test_acc = clf3.score(X_test_combined, y_test)
print("- Basic features: {:.4f} train, {:.4f} test".format(train_acc, test_acc))

- Basic features: 0.6096 train, 0.5612 test
