In [2]:
import os
from convokit import download, Corpus
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
full_model_path = "/reef/sqt2/BERTCRAFT"
single_model_path = "/reef/sqt2/SINGLE_UTT"

wikicorpus = Corpus(filename=download("conversations-gone-awry-corpus"))
cmvcorpus = Corpus(filename=download("conversations-gone-awry-cmv-corpus"))

Dataset already exists at /home/sqt2/.convokit/downloads/conversations-gone-awry-corpus
Dataset already exists at /home/sqt2/.convokit/downloads/conversations-gone-awry-cmv-corpus


In [46]:
def new_test(test_samples, pred_path, corpus, corpus_name):
    all_logits = []
    label_metadata = "conversation_has_personal_attack" if corpus_name == "wikiconv" else "has_removed_comment"
    pred_file = open(pred_path, 'r')
    pred_lines = pred_file.readlines()[1:]
    pred_dict = {}
    for line in pred_lines:
        id2pred = line.split(",")
        
        assert len(id2pred) == 3
        utt_id = id2pred[0]
        utt_pred = int(float(id2pred[2]) > 0.4)
        utt_score = id2pred[2]
        all_logits.append(float(utt_score))
        pred_dict[utt_id] = [int(utt_pred), float(utt_score)]
    # plt.hist(all_logits, bins=20)
    # plt.show()
    for convo in corpus.iter_conversations():
        prev = 0
        # only consider test set conversations (we did not make predictions for the other ones)
        if convo.id in test_samples:
            for utt in convo.iter_utterances():
                if utt.id in pred_dict:
                    utt.meta['forecast_score'] = (pred_dict[utt.id][0] * prev) or (pred_dict[utt.id][1] > 0.8)
                    prev = pred_dict[utt.id][0]
    
    conversational_forecasts_df = {
            "convo_id": [],
            "label": [],
            "prediction": []
        }
    for convo in corpus.iter_conversations():
        if convo.id in test_samples:
            conversational_forecasts_df['convo_id'].append(convo.id)
            conversational_forecasts_df['label'].append(int(convo.meta[label_metadata]))
            forecast_scores = [utt.meta['forecast_score'] for utt in convo.iter_utterances() if 'forecast_score' in utt.meta]
            conversational_forecasts_df['prediction'].append(max(forecast_scores))
    conversational_forecasts_df = pd.DataFrame(conversational_forecasts_df).set_index("convo_id")
    test_labels = conversational_forecasts_df.label
    test_preds = conversational_forecasts_df.prediction
    test_acc = (test_labels == test_preds).mean()
    
    tp = ((test_labels==1)&(test_preds==1)).sum()
    fp = ((test_labels==0)&(test_preds==1)).sum()
    tn = ((test_labels==0)&(test_preds==0)).sum()
    fn = ((test_labels==1)&(test_preds==0)).sum()

    test_precision = tp / (tp + fp)
    test_recall = tp / (tp + fn)
    test_fpr = fp / (fp + tn)
    test_f1 = 2 / (((tp + fp) / tp) + ((tp + fn) / tp))
    return {"accuracy":test_acc, "precision":test_precision, "recall":test_recall, "f1":test_f1}

In [47]:
def get_single_utt_preds(saved_path):
    single_utt_predictions = {}
    all_seeds = os.listdir(saved_path)
    for seed in all_seeds:
        pred_path = os.path.join(saved_path, seed, "predictions.csv")
        pred_file = open(pred_path, 'r')
        pred_lines = pred_file.readlines()[1:]
        for line in pred_lines:
            id2pred = line.split(",")
            
            assert len(id2pred) == 3
            utt_id = id2pred[0]
            utt_pred = id2pred[1]
            if utt_id not in single_utt_predictions:
                single_utt_predictions[utt_id] = int(utt_pred)
            else:
                single_utt_predictions[utt_id] += int(utt_pred)
    return single_utt_predictions
    
def full_evaluate(full_model_name, full_model_path, single_model_name, single_model_path, corpus, corpus_name):
    single_model_path = os.path.join(single_model_path, corpus_name, single_model_name)
    full_model_path = os.path.join(full_model_path, corpus_name, full_model_name)
    
    single_utt_predictions = get_single_utt_preds(single_model_path)
    test_samples, dynamic_samples, single_samples = extract_dynamic_samples(single_utt_predictions, corpus, corpus_name)

    with open('{}.txt'.format(corpus_name), 'w') as f:
        for id in dynamic_samples:
            f.write("%s\n" % id)

    result_dict = {"full_test": {"accuracy":[], "precision":[], "recall":[], "f1":[]},
                  "dynamic_only": {"accuracy":[], "precision":[], "recall":[], "f1":[]},
                  "single_enough": {"accuracy":[], "precision":[], "recall":[], "f1":[]}}

    for seed in range(1,11):        
        pred_path = os.path.join(full_model_path, "seed-{}".format(seed), "predictions.csv")
        full_test = new_test(test_samples, pred_path, corpus, corpus_name)
        for metric in full_test:
            result_dict['full_test'][metric].append(full_test[metric])
        dynamic_only = new_test(dynamic_samples, pred_path, corpus, corpus_name)
        for metric in dynamic_only:
            result_dict['dynamic_only'][metric].append(dynamic_only[metric])
        single_enough = new_test(single_samples, pred_path, corpus, corpus_name)
        for metric in single_enough:
            result_dict['single_enough'][metric].append(single_enough[metric])
    for metric in result_dict['full_test']:
        result_dict['full_test'][metric] = np.mean(result_dict['full_test'][metric])
        result_dict['dynamic_only'][metric] = np.mean(result_dict['dynamic_only'][metric])
        result_dict['single_enough'][metric] = np.mean(result_dict['single_enough'][metric])
    return result_dict
def extract_dynamic_samples(all_predictions, corpus, corpus_name):
    label_metadata = "conversation_has_personal_attack" if corpus_name == "wikiconv" else "has_removed_comment"
    num_convo = 0
    hard_pos, hard_neg = 0, 0
    all_pos, all_neg = 0, 0
    dynamic_samples = []
    test_samples = []
    for convo in corpus.iter_conversations():
        if convo.meta['split'] == 'test':
            test_samples.append(convo.id)
            max_agreement = 0
            for utterance in convo.iter_utterances():
                id = utterance.id
                if id in all_predictions:
                    if all_predictions[id] > max_agreement:
                        max_agreement = all_predictions[id]
            if convo.meta[label_metadata] == False:
                all_neg += 1
                if max_agreement >= 3:
                    hard_neg += 1
                    dynamic_samples.append(convo.id)
            else:
                all_pos += 1
                if max_agreement <= 8:
                    hard_pos += 1
                    dynamic_samples.append(convo.id)
    print("We have {} positive samples and {} negative samples in the test set".format(all_pos, all_neg))
    # print("We have {} positive samples and {} negative dynamic samples".format(hard_pos, hard_neg))

    single_samples = [id for id in test_samples if id not in dynamic_samples]
    print(len(dynamic_samples))
    print(len(single_samples))
    return test_samples, dynamic_samples, single_samples

In [48]:
print(full_evaluate("roberta-base", full_model_path, "roberta-base", single_model_path, wikicorpus, 'wikiconv'))

We have 420 positive samples and 420 negative samples in the test set
420
420
{'full_test': {'accuracy': 0.6488095238095238, 'precision': 0.6419128579597747, 'recall': 0.6757142857142857, 'f1': 0.6573810572690272}, 'dynamic_only': {'accuracy': 0.4133333333333333, 'precision': 0.4284775098524071, 'recall': 0.46261682242990654, 'f1': 0.44372206824574717}, 'single_enough': {'accuracy': 0.8842857142857141, 'precision': 0.8719198117095093, 'recall': 0.8970873786407767, 'f1': 0.8836917831036912}}


In [49]:
print(full_evaluate("roberta-base", full_model_path, "roberta-base", single_model_path, cmvcorpus, 'cmv'))

We have 684 positive samples and 684 negative samples in the test set
643
725
{'full_test': {'accuracy': 0.6628654970760233, 'precision': 0.6472230110242035, 'recall': 0.7290935672514619, 'f1': 0.6826342611722096}, 'dynamic_only': {'accuracy': 0.40979782270606535, 'precision': 0.48020863892416116, 'recall': 0.5335149863760218, 'f1': 0.5026072874705325}, 'single_enough': {'accuracy': 0.8873103448275861, 'precision': 0.8273038729813302, 'recall': 0.9555205047318612, 'f1': 0.8838040981592844}}


# 2 Threshold

In [14]:
from convokit import download, Corpus
wikicorpus = Corpus(filename=download("conversations-gone-awry-corpus"))

Dataset already exists at /home/sqt2/.convokit/downloads/conversations-gone-awry-corpus


In [33]:
import numpy as np
import pandas as pd
def tune_thresholds(val_pred_path, corpus, corpus_name):
    label_metadata = "conversation_has_personal_attack" if corpus_name == "wikiconv" else "has_removed_comment"

    pred_file = open(val_pred_path, 'r')
    pred_lines = pred_file.readlines()[1:]
    pred_dict = {}

    best_acc = 0
    bestthres_set = [0, 0]
    for first_thres in np.arange(0.3,0.5,0.01):
        for second_thres in np.arange(0.5,0.85, 0.01):
            for line in pred_lines:
                id2pred = line.split(",")
                
                assert len(id2pred) == 3
                utt_id = id2pred[0]
                utt_pred = int(float(id2pred[2]) > first_thres)
                utt_score = id2pred[2]
                # all_logits.append(float(utt_score))
                pred_dict[utt_id] = [int(utt_pred), float(utt_score)]
            # plt.hist(all_logits, bins=20)
            # plt.show()
            for convo in corpus.iter_conversations():
                prev = 0
                # only consider test set conversations (we did not make predictions for the other ones)
                if convo.meta['split'] == 'val':
                    for utt in convo.iter_utterances():
                        if utt.id in pred_dict:
                            utt.meta['forecast_score'] = (pred_dict[utt.id][0] * prev) or (pred_dict[utt.id][1] > second_thres)
                            prev = prev or pred_dict[utt.id][0]
            
            conversational_forecasts_df = {
                    "convo_id": [],
                    "label": [],
                    "prediction": []
                }
            for convo in corpus.iter_conversations():
                if convo.meta['split'] == 'val':
                    conversational_forecasts_df['convo_id'].append(convo.id)
                    conversational_forecasts_df['label'].append(int(convo.meta[label_metadata]))
                    forecast_scores = [utt.meta['forecast_score'] for utt in convo.iter_utterances() if 'forecast_score' in utt.meta]
                    conversational_forecasts_df['prediction'].append(max(forecast_scores))
            conversational_forecasts_df = pd.DataFrame(conversational_forecasts_df).set_index("convo_id")
            test_labels = conversational_forecasts_df.label
            test_preds = conversational_forecasts_df.prediction
            test_acc = (test_labels == test_preds).mean()
            
            tp = ((test_labels==1)&(test_preds==1)).sum()
            fp = ((test_labels==0)&(test_preds==1)).sum()
            tn = ((test_labels==0)&(test_preds==0)).sum()
            fn = ((test_labels==1)&(test_preds==0)).sum()
        
            test_precision = tp / (tp + fp)
            test_recall = tp / (tp + fn)
            test_fpr = fp / (fp + tn)
            test_f1 = 2 / (((tp + fp) / tp) + ((tp + fn) / tp))
            
            if test_acc > best_acc:
                best_acc = test_acc
                bestthres_set = [first_thres, second_thres]
    print(best_acc, bestthres_set)
    return {"accuracy":test_acc, "precision":test_precision, "recall":test_recall, "f1":test_f1}

In [34]:
tune_thresholds("/reef/sqt2/SINGLE_UTT_NEW/wikiconv/bert-base-cased/seed-1/pred_val.csv", wikicorpus, 'wikiconv')

0.6202380952380953 [0.49000000000000016, 0.7500000000000002]


{'accuracy': 0.6130952380952381,
 'precision': 0.594059405940594,
 'recall': 0.7142857142857143,
 'f1': 0.6486486486486487}

In [None]:
tune_thresholds("/reef/sqt2/SINGLE_UTT_NEW/wikiconv/bert-base-cased/seed-1/pred_test.csv", wikicorpus, 'wikiconv')

In [None]:

tune_thresholds("/reef/sqt2/BERTCRAFT/wikiconv/roberta-base/seed-1/predictions.csv", wikicorpus, 'wikiconv')