In [1]:
import os
from convokit import download, Corpus
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Fetch each "conversations gone awry" dataset (download() reuses an existing
# local copy, as the cell output shows) and load it as a ConvoKit Corpus.
wiki_corpus_dir = download("conversations-gone-awry-corpus")
wikicorpus = Corpus(filename=wiki_corpus_dir)

cmv_corpus_dir = download("conversations-gone-awry-cmv-corpus")
cmvcorpus = Corpus(filename=cmv_corpus_dir)

Dataset already exists at /home/sqt2/.convokit/downloads/conversations-gone-awry-corpus
Dataset already exists at /home/sqt2/.convokit/downloads/conversations-gone-awry-cmv-corpus


In [16]:
def get_preds(saved_path):
    """Average per-utterance predictions over the seed directories in ``saved_path``.

    Every sub-directory of ``saved_path`` is treated as one seed and must contain
    a ``train_predictions.csv`` file. The first line of that file is skipped as a
    header. Every other non-blank line must consist of exactly three
    comma-separated fields: the first is used as the utterance id and the third
    is parsed as the prediction (float). The middle field is ignored.

    Args:
        saved_path: Directory holding one sub-directory per seed.

    Returns:
        dict mapping utterance id (str) to the sum of its predictions divided by
        the number of seed directories. Empty when there are no seed directories.

    Raises:
        FileNotFoundError: a seed directory has no ``train_predictions.csv``.
        AssertionError: a data line does not have exactly three fields.
        ValueError: a prediction field cannot be parsed as a float.
    """
    # Only directories count as seeds: a stray file next to them would make
    # open() fail and would inflate the divisor. Sorting makes the float
    # summation order (and therefore the result) independent of listing order.
    seed_dirs = sorted(
        entry for entry in os.listdir(saved_path)
        if os.path.isdir(os.path.join(saved_path, entry))
    )
    num_seed = len(seed_dirs)

    pred_sums = {}
    for seed in seed_dirs:
        pred_path = os.path.join(saved_path, seed, "train_predictions.csv")
        # The context manager guarantees the handle is closed even on a parse error.
        with open(pred_path, 'r') as pred_file:
            next(pred_file, None)  # skip the header row
            for line in pred_file:
                if not line.strip():
                    # Tolerate blank lines (e.g. a trailing newline at end of file).
                    continue
                id2pred = line.split(",")
                assert len(id2pred) == 3, (
                    "expected 3 comma-separated fields in %s, got %d: %r"
                    % (pred_path, len(id2pred), line)
                )
                utt_id, utt_pred = id2pred[0], id2pred[2]
                # float() ignores the trailing newline left on the last field.
                pred_sums[utt_id] = pred_sums.get(utt_id, 0.0) + float(utt_pred)

    # NOTE: the divisor is the number of seeds, not the number of files an id
    # appeared in, so an id missing from some seed is averaged as if it scored 0 there.
    return {utt_id: total / num_seed for utt_id, total in pred_sums.items()}
    
def get_hard_neg(saved_path, corpus, low_threshold=0.5, high_threshold=1.0):
    """Pick at most one "hard negative" utterance per negative training conversation.

    A conversation is considered when its ``split`` metadata is ``'train'`` and
    its label metadata equals ``False``. Within such a conversation the utterance
    with the highest averaged prediction strictly between ``low_threshold`` and
    ``high_threshold`` is selected. Utterances without a prediction are skipped.
    Because the running best score starts at 0, a score must also be greater
    than 0 to be selected.

    Args:
        saved_path: Directory passed to ``get_preds``. If it contains the
            substring ``"wikiconv"`` the label field read is
            ``conversation_has_personal_attack``, otherwise ``has_removed_comment``.
        corpus: Object providing ``iter_conversations()``; each conversation must
            expose ``meta`` and ``iter_utterances()``, each utterance an ``id``.
        low_threshold: Exclusive lower bound on the averaged prediction.
        high_threshold: Exclusive upper bound on the averaged prediction.

    Returns:
        List of selected utterance ids, in conversation iteration order.

    Side effects:
        Prints the number of negative training conversations and the number of
        selected hard negatives.
    """
    label_metadata = "conversation_has_personal_attack" if "wikiconv" in saved_path else "has_removed_comment"
    predictions = get_preds(saved_path)
    num_neg = 0
    hard_neg = []

    for convo in corpus.iter_conversations():
        if convo.meta['split'] != 'train':
            continue
        # Keep the explicit comparison with False: a missing/None label must not
        # be counted as a negative conversation.
        if not (convo.meta[label_metadata] == False):  # noqa: E712
            continue

        num_neg += 1
        hard_id = None
        hard_score = 0
        for utterance in convo.iter_utterances():
            utt_id = utterance.id
            # Explicit lookup instead of a bare `except:` so that only a missing
            # prediction is skipped and real errors still surface.
            score = predictions.get(utt_id)
            if score is None:
                continue
            if low_threshold < score < high_threshold and score > hard_score:
                hard_id = utt_id
                hard_score = score
        # `is not None` so a selected id is never dropped for being falsy.
        if hard_id is not None:
            hard_neg.append(hard_id)

    print("Number of negative samples in training set:", num_neg)
    print("Number of HARD negative samples in training set:", len(hard_neg))
    return hard_neg

In [17]:
# Root directories of the saved roberta-base runs. get_preds expects one
# sub-directory per seed inside each root, each holding train_predictions.csv.
# NOTE: get_hard_neg chooses the label metadata field by checking whether the
# path contains the substring "wikiconv", so the wiki path must keep that component.
wikipred_path = "/reef/sqt2/AdverFiltering/wikiconv/roberta-base"
cmvpred_path = "/reef/sqt2/AdverFiltering/cmv/roberta-base"


In [18]:
import json

# low_threshold=0 (with the default high_threshold=1.0) accepts any averaged
# prediction strictly between 0 and 1, so each negative training conversation
# contributes its highest-scored utterance. The ids are stored under "data".
to_save = dict(data=get_hard_neg(wikipred_path, wikicorpus, low_threshold=0))
with open(
    '/home/sqt2/myExperiment/adversarial_filtering/hard_neg/wikiconv.json',
    'w',
    encoding='utf-8',
) as f:
    json.dump(to_save, f, ensure_ascii=False, indent=4)

Number of negative samples in training set: 1254
Number of HARD negative samples in training set: 1254


In [19]:
# Same export for the CMV corpus: low_threshold=0 keeps the highest-scored
# utterance (prediction strictly between 0 and 1) of every negative training
# conversation, and the resulting ids are written under the "data" key.
to_save = dict(data=get_hard_neg(cmvpred_path, cmvcorpus, low_threshold=0))
with open(
    '/home/sqt2/myExperiment/adversarial_filtering/hard_neg/cmv.json',
    'w',
    encoding='utf-8',
) as f:
    json.dump(to_save, f, ensure_ascii=False, indent=4)

Number of negative samples in training set: 2053
Number of HARD negative samples in training set: 2053


In [20]:
# Sanity check: read the saved wikiconv hard-negative file back and count its ids.
import json

# The context manager closes the handle, which the previous bare open() never did.
# The file was written as UTF-8 with ensure_ascii=False, so read it with the same
# encoding instead of the platform default.
with open('/home/sqt2/myExperiment/adversarial_filtering/hard_neg/wikiconv.json', encoding='utf-8') as f:
    # json.load returns a dictionary; the id list is stored under its "data" key.
    data = json.load(f)['data']
print(len(data))

1254
