In [22]:
from convokit import Corpus, download, TextCleaner
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer as CV
from xgboost import XGBClassifier
from cleantext import clean

In [60]:
corpus = Corpus(download('reddit-coarse-discourse-corpus'))

Dataset already exists at /Users/calebchiam/.convokit/downloads/reddit-coarse-discourse-corpus


In [61]:
clean_str = lambda s: clean(s,
                            fix_unicode=True,               # fix various unicode errors
                            to_ascii=True,                  # transliterate to closest ASCII representation
                            lower=True,                     # lowercase text
                            no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
                            no_urls=True,                  # replace all URLs with a special token
                            no_emails=True,                # replace all email addresses with a special token
                            no_phone_numbers=True,         # replace all phone numbers with a special token
                            no_numbers=False,               # replace all numbers with a special token
                            no_digits=False,                # replace all digits with a special token
                            no_currency_symbols=True,      # replace all currency symbols with a special token
                            no_punct=False,                 # fully remove punctuation
                            replace_with_url="<URL>",
                            replace_with_email="<EMAIL>",
                            replace_with_phone_number="<PHONE>",
                            replace_with_number="<NUMBER>",
                            replace_with_digit="0",
                            replace_with_currency_symbol="<CUR>",
                            lang="en"
                            )

In [62]:
tc = TextCleaner(text_cleaner=clean_str, replace_text=False, verbosity=10000)

In [63]:
tc.transform(corpus)

10000/115827 utterances processed
20000/115827 utterances processed
30000/115827 utterances processed
40000/115827 utterances processed
50000/115827 utterances processed
60000/115827 utterances processed
70000/115827 utterances processed
80000/115827 utterances processed
90000/115827 utterances processed
100000/115827 utterances processed
110000/115827 utterances processed
115827/115827 utterances processed


<convokit.model.corpus.Corpus at 0x142f6d710>

In [64]:
corpus.random_utterance()

Utterance({'obj_type': 'utterance', '_owner': <convokit.model.corpus.Corpus object at 0x142f6d710>, 'meta': {'post_depth': 3, 'majority_type': 'answer', 'majority_link': 't1_cgn9n5r', 'annotation-types': ['answer', 'answer'], 'annotation-links': ['t1_cgn9n5r', 't1_cgn9n5r'], 'ups': 1, 'cleaned': 'no, when we investigated further we found that their had been a outbreak of some disease, (my memory fails me it was 35 years or so ago) whooping cough maybe? but the entire family except the mother had died 3 kids and her husband, and she just never went back and refused to allow anyone else to do anything with the place.'}, '_id': 't1_cgnaxi7', 'speaker': Speaker({'obj_type': 'speaker', '_owner': <convokit.model.corpus.Corpus object at 0x142f6d710>, 'meta': {}, '_id': 'Mordredbas'}), 'conversation_id': 't3_22hvqh', '_root': 't3_22hvqh', 'reply_to': 't1_cgn9n5r', 'timestamp': None, 'text': 'No, when we investigated further we found that their had been a outbreak of some disease, (my memory fa

### Some data cleaning

Not every utterance has a majority type, so we randomly assign them one from the annotations

In [65]:
corpus.random_utterance().meta

{'post_depth': 2,
 'majority_type': 'elaboration',
 'majority_link': 't1_cgjq67z',
 'annotation-types': ['elaboration', 'elaboration', 'elaboration'],
 'annotation-links': ['t1_cgjq67z', 't1_cgjq67z', 't1_cgjq67z'],
 'ups': 3,
 'cleaned': 'sooner or later importing from japan will be the only option. they sell for cheap over there but importation costs are a bitch.'}

In [66]:
corpus.print_summary_stats()

Number of Speakers: 63573
Number of Utterances: 115827
Number of Conversations: 9483


In [67]:
no_maj = 0
for utt in corpus.iter_utterances():
    if utt.meta['majority_type'] is None:
        no_maj += 1
print(no_maj)

12277


In [68]:
no_text = 0
for utt in corpus.iter_utterances():
    if utt.text is None or utt.text == '':
        no_text += 1
print(no_text)

2571


In [69]:
import random
for utt in corpus.iter_utterances():
    if utt.meta['majority_type'] is None:
        utt.meta['majority_type'] = random.choice(utt.meta['annotation-types'])

### Feature set

In [160]:
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy

In [161]:
nlp = spacy.load('en_core_web_md')

In [163]:
tfidf = TfidfVectorizer(decode_error='ignore', strip_accents='unicode', 
                        tokenizer=lambda text: [token.text for token in nlp(text)],
                        ngram_range=(1,3), min_df=50
                       )

In [164]:
docs = [utt.meta['cleaned'] for utt in corpus.iter_utterances()]

In [165]:
X_content = tfidf.fit_transform(docs)

In [166]:
X_content.shape

(113256, 22577)

In [168]:
for utt in convo.iter_utterances():
    utt.meta['doc'] = nlp(utt.meta['cleaned'])

In [174]:
len(list(nlp(corpus.random_utterance().meta['cleaned']).sents))

3

In [177]:
len(nlp(corpus.random_utterance().meta['cleaned']))

14

In [180]:
corpus.random_utterance().meta

{'post_depth': 0,
 'majority_type': 'announcement',
 'majority_link': 'none',
 'annotation-types': ['announcement', 'other'],
 'annotation-links': ['none', 'none'],
 'ups': None,
 'cleaned': '[update 5.5](<url>) hey, everybody. it\'s roman. i\'ve been reading through your comments of dylan\'s last update, and they actually made me laugh. you people. so uncreative. you really thought i wrecked the car, and he would have been able to press enter? ha. i submitted his last post, morons. right now, you\'re probably all really confused, and this pleases me more than you know. i pulled the wool over everyone\'s eyes. let me get you caught up. this is the third time i\'ve done something like this. you know that creep that\'s been "stalking" me? he\'s my father. i\'ve lived in three different countries, lived three different lives, murdered countless people. it\'s sweet that dylan confessed his love to me, poor boy. he didn\'t even know what hit him. i sure did have everyone fooled, huh? dylan,

In [181]:
for convo in corpus.iter_conversations():
    convo_len = len(list(convo.iter_utterances()))
    for utt in convo.iter_utterances():
        utt.meta['post_depth_norm'] = utt.meta['post_depth'] / convo_len
        utt.meta['num_sentences'] = len(list(doc.sents))
        utt.meta['num_tokens'] = len(doc)
        utt.meta['num_chars'] = len(utt.meta['cleaned'])

In [202]:
for utt in corpus.iter_utterances():
    utt.meta['is-post'] = int(utt.meta['post_depth'] == 0)

In [186]:
corpus.get_utterance('t1_c2ayszn').reply_to

't3_jcga7'

In [187]:
feats = ['post_depth_norm', 'num_sentences', 'num_tokens', 'num_chars']

for utt in corpus.iter_utterances():
    if utt.reply_to is not None:
        try:
            prev_utt = corpus.get_utterance(utt.reply_to)
            for feat in feats:
                utt.meta['prev_' + feat] = prev_utt.meta[feat]
        except KeyError:
            utt.meta['prev_num_sentences'] = -1
            utt.meta['prev_num_tokens'] = -1
            utt.meta['prev_num_chars'] = -1
    else:
        utt.meta['prev_num_sentences'] = -1
        utt.meta['prev_num_tokens'] = -1
        utt.meta['prev_num_chars'] = -1

In [203]:
structural_feats = ['is-post', 'post_depth'] + feats + ['prev_' + f for f in feats[-3:]]
print(structural_feats)

['is-post', 'post_depth', 'post_depth_norm', 'num_sentences', 'num_tokens', 'num_chars', 'prev_num_sentences', 'prev_num_tokens', 'prev_num_chars']


In [204]:
X_structure = []
for utt in corpus.iter_utterances():
    utt_feats = [utt.meta[f] for f in structural_feats]
    X_structure.append(utt_feats)

In [206]:
import numpy as np

In [207]:
X_structure = np.array(X_structure)

In [211]:
X_content.shape

(113256, 22577)

In [209]:
X_structure.shape

(113256, 9)

In [213]:
np.hstack

<function numpy.hstack(tup)>

In [216]:
X_content.shape

(113256, 22577)

In [219]:
stacked = np.hstack((X_content.toarray(), X_structure))

## Multiclass classifier

In [221]:
Y = [utt.meta['majority_type'] for utt in corpus.iter_utterances()]

In [222]:
from sklearn.model_selection import train_test_split

In [225]:
X_train, X_test, y_train, y_test = train_test_split(X_content, Y, test_size=0.2, random_state=42)

In [226]:
num_classes = len(set(Y))

In [227]:
xgb = XGBClassifier(booster='gbtree', objective='multi:softprob', 
                    random_state=42, eval_metric='auc', num_class=num_classes)

xgb.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, eval_metric='auc',
              gamma=0, gpu_id=-1, importance_type='gain',
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=0,
              num_class=10, num_parallel_tree=1, objective='multi:softprob',
              random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=None,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [228]:
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing

In [229]:
y_pred = xgb.predict(X_test)

In [230]:
print(classification_report(y_test, y_pred))

                  precision    recall  f1-score   support

       agreement       0.57      0.24      0.33      1265
    announcement       0.49      0.08      0.14       311
          answer       0.51      0.85      0.64      8520
    appreciation       0.67      0.60      0.63      1928
    disagreement       0.42      0.04      0.08       858
     elaboration       0.35      0.15      0.21      4300
           humor       0.31      0.03      0.05       686
negativereaction       0.28      0.03      0.06       552
           other       0.54      0.05      0.10       713
        question       0.72      0.83      0.77      3519

        accuracy                           0.55     22652
       macro avg       0.49      0.29      0.30     22652
    weighted avg       0.52      0.55      0.48     22652



In [133]:
lb = preprocessing.LabelBinarizer()
lb.fit(y_test)

LabelBinarizer(neg_label=0, pos_label=1, sparse_output=False)

In [134]:
y_test_lb = lb.transform(y_test)

In [135]:
pred_lb = lb.transform(y_pred)

In [141]:
roc_auc_score(y_test_lb, pred_lb, multi_class='ovr')

0.5916967198317197

In [142]:
from sklearn.metrics import classification_report

In [158]:
vectorizer.vocabulary_

{'13': 26,
 '27': 70,
 '12': 24,
 'url': 13433,
 'it': 6199,
 'was': 13740,
 'only': 8595,
 'few': 4012,
 'minutes': 7608,
 'into': 5948,
 'robert': 9914,
 'epic': 3704,
 'that': 11247,
 'got': 4784,
 'feeling': 3996,
 'watching': 13841,
 'great': 4823,
 'movie': 7757,
 'by': 2183,
 'end': 3653,
 'could': 2827,
 'not': 8083,
 'be': 1538,
 'now': 8198,
 'sure': 11004,
 'helps': 5270,
 'since': 10409,
 'this': 12389,
 'created': 2900,
 'an': 567,
 'immediate': 5648,
 'connection': 2738,
 'to': 12647,
 'subject': 10934,
 'matter': 7370,
 'spent': 10739,
 'portion': 9215,
 'of': 8266,
 'with': 14337,
 'my': 7805,
 'in': 5671,
 'lap': 6789,
 'trying': 13245,
 'play': 9116,
 'along': 473,
 'characters': 2458,
 'who': 14194,
 'all': 401,
 'seem': 10165,
 'really': 9618,
 'playing': 9139,
 'and': 607,
 'singing': 10425,
 'these': 12215,
 'songs': 10670,
 'however': 5481,
 'also': 487,
 'am': 530,
 'fan': 3933,
 'country': 2860,
 'western': 14009,
 'so': 10491,
 'have': 5032,
 'easily': 3564,
 

In [148]:
print(classification_report(y_test_lb, pred_lb))

              precision    recall  f1-score   support

           0       0.51      0.26      0.35      1265
           1       0.44      0.09      0.15       311
           2       0.47      0.88      0.62      8520
           3       0.70      0.55      0.61      1928
           4       0.38      0.04      0.07       858
           5       0.32      0.10      0.15      4300
           6       0.08      0.00      0.00       686
           7       0.33      0.04      0.07       552
           8       0.73      0.04      0.07       713
           9       0.68      0.58      0.62      3519

   micro avg       0.51      0.51      0.51     22652
   macro avg       0.46      0.26      0.27     22652
weighted avg       0.49      0.51      0.44     22652
 samples avg       0.51      0.51      0.51     22652



In [149]:
print(classification_report(y_test, y_pred))

                  precision    recall  f1-score   support

       agreement       0.51      0.26      0.35      1265
    announcement       0.44      0.09      0.15       311
          answer       0.47      0.88      0.62      8520
    appreciation       0.70      0.55      0.61      1928
    disagreement       0.38      0.04      0.07       858
     elaboration       0.32      0.10      0.15      4300
           humor       0.08      0.00      0.00       686
negativereaction       0.33      0.04      0.07       552
           other       0.73      0.04      0.07       713
        question       0.68      0.58      0.62      3519

        accuracy                           0.51     22652
       macro avg       0.46      0.26      0.27     22652
    weighted avg       0.49      0.51      0.44     22652



In [112]:
output = pd.DataFrame()
output['expected'] = y_test
output['predicted'] = X_pred

In [115]:
from collections import Counter
Counter(y_test)

Counter({'answer': 8520,
         'elaboration': 4300,
         'question': 3519,
         'announcement': 311,
         'other': 713,
         'appreciation': 1928,
         'disagreement': 858,
         'agreement': 1265,
         'negativereaction': 552,
         'humor': 686})

In [125]:
from sklearn_crfsuite import metrics

{'post_depth': 6,
 'majority_type': 'question',
 'majority_link': 't1_d206qne',
 'annotation-types': ['question', 'question', 'question'],
 'annotation-links': ['t1_d206qne', 't1_d206qne', 't1_d206qne'],
 'ups': 1,
 'cleaned': "wouldn't it be better with spd/cd/atk?",
 'is-reply': 1,
 'post_depth_norm': 0.2608695652173913,
 'num_sentences': 1,
 'num_tokens': 15,
 'num_chars': 38,
 'prev_post_depth_norm': 0.21739130434782608,
 'prev_num_sentences': 1,
 'prev_num_tokens': 15,
 'prev_num_chars': 58,
 'is-post': 0}

In [127]:
len(y_test)

22652

In [128]:
len(X_pred)

22652

In [130]:
X_pred.shape

(22652,)

AttributeError: 'list' object has no attribute 'shape'

In [129]:
print(metrics.flat_classification_report(y_test, X_pred))

ValueError: Found input variables with inconsistent numbers of samples: [190948, 161197]

In [118]:
ids = [utt.id for utt in corpus.iter_utterances()]

In [120]:
len(ids)

113256

In [121]:
len(y_test)

22652

In [119]:
output['id'] = ids

ValueError: Length of values does not match length of index

In [114]:
output.head(100)

Unnamed: 0,expected,predicted
0,answer,answer
1,answer,answer
2,answer,appreciation
3,answer,answer
4,elaboration,answer
5,answer,answer
6,question,answer
7,announcement,question
8,other,answer
9,appreciation,appreciation


In [86]:
corpus.random_utterance().meta

{'post_depth': 1,
 'majority_type': 'answer',
 'majority_link': 't3_31gsg3',
 'annotation-types': ['answer', 'answer', 'appreciation'],
 'annotation-links': ['t3_31gsg3', 't3_31gsg3', 't3_31gsg3'],
 'ups': 1,
 'cleaned': "should be fine. you'd probably be fine going pure fighter, but if you go wizard maybe transmuter and level into an odd str (since transmute would make that even and the other level up could go to con 14). traits: i love reactionary on every character, and magical knack if you splash wizard for extra duration* and stuff.",
 'is-reply': 1}