In [1]:
import random
from convokit import Corpus, download
corpus = Corpus(filename=download("conversations-gone-awry-corpus"))

Dataset already exists at /Volumes/Users/tran_s2/.convokit/downloads/conversations-gone-awry-corpus


In [2]:
print(len(corpus.get_conversation_ids()))

4188


In [3]:
def loadPairs(corpus, split=None, last_only=False):
    """
    Load context-reply pairs from the Corpus, optionally filtering to only conversations
    from the specified split (train, val, or test).
    Each conversation, which has N comments (not including the section header) will
    get converted into N-1 comment-reply pairs, one pair for each reply
    (the first comment does not reply to anything).
    Each comment-reply pair is a tuple consisting of the conversational context
    (that is, all comments prior to the reply), the reply itself, the label (that
    is, whether the reply contained a derailment event), and the comment ID of the
    reply (for later use in re-joining with the ConvoKit corpus).
    The function returns a list of such pairs.
    """
    pairs = []
    count_attack = 0
    count_convo = 0
    for convo in corpus.iter_conversations():
        # consider only conversations in the specified split of the data
        if split is None or convo.meta['split'] == split:
            count_convo += 1
            utterance_list = []
            for utterance in convo.iter_utterances():
                if utterance.meta['is_section_header']:
                    continue
                if utterance.meta['comment_has_personal_attack']:
                    count_attack += 1
                utterance_list.append({"text": utterance.text, 
                                        "is_attack": int(utterance.meta['comment_has_personal_attack']), 
                                        "id": utterance.id})
                
            iter_range = range(1, len(utterance_list)) if not last_only else [len(utterance_list)-1]
            for idx in iter_range:
                reply = utterance_list[idx]["text"]
                label = utterance_list[idx]["is_attack"]
                comment_id = utterance_list[idx]["id"]
                # gather as context all utterances preceding the reply
                context = [u["text"] for u in utterance_list[:idx]]
                pairs.append((context, reply, label, comment_id))

    return pairs
def conversations2utterances(conversations):
    """
    Convert list of conversations into list of utterances for UtteranceModel.
    INPUT:
        conversations: list of list of str
            List of conversations, each conversation is a list of utterances.
    OUTPUT:
        utterances: list of str
            List of utterances in the dataset.
        conversationLength: list of int
            List of number of utterances in conversations.
    """
    conversationLength = [len(convo) for convo in conversations]
    utterances = []
    for convo in conversations:
        for utterance in convo:
            utterances.append(utterance)
    # assert len(utterances) == sum(conversationLength)
    return utterances, conversationLength

def load_data(corpus, context_batch_size = 32, split=None, last_only=False, shuffle=True):
    """
    Load data from corpus into the format ready for UtteranceModel.
    INPUT:
        corpus: convokit.Corpus
        split: str, optional
            If specified, only consider conversations in the specified split of the data.
        last_only: bool, optional
            If True, only consider the last utterance in each conversation.
    OUTPUT:
        utterances: list of str
            List of utterances in the dataset.
        conversationLength: list of int
            List of lengths of conversations in the dataset.
        comment_ids: list of str
            List of ids corresponding to the reply utterance.
        labels: list of int
            List of labels for each context if the next reply contains personal attack.
    """
    pairs = loadPairs(corpus, split, last_only)
    if shuffle:
        random.shuffle(pairs)
    batch_labels = []
    batch_comment_ids = []
    batch_utterances = []
    batch_conversationLength = []
    conversations = []
    labels = []
    comment_ids = []
    for pair in pairs:
        if len(labels) == context_batch_size:
            utterances, conversationLength = conversations2utterances(conversations)
            batch_utterances.append(utterances)
            batch_conversationLength.append(conversationLength)
            batch_labels.append(labels)
            batch_comment_ids.append(comment_ids)
            assert len(conversationLength) == len(comment_ids) == len(labels)
            conversations = []
            labels = []
            comment_ids = []

        context, _, label, comment_id = pair
        conversations.append(context)
        labels.append(label)
        comment_ids.append(comment_id)
    if len(conversations) > 0:
        utterances, conversationLength = conversations2utterances(conversations)
        batch_utterances.append(utterances)
        batch_conversationLength.append(conversationLength)
        batch_labels.append(labels)
        batch_comment_ids.append(comment_ids)
    return batch_utterances, batch_conversationLength, batch_comment_ids, batch_labels

# UtteranceRNN


In [4]:
import requests

class Voc:
    """A class for representing the vocabulary used by a CRAFT model"""

    def __init__(self, name, word2index=None, index2word=None):
        # Default word tokens
        self.PAD_token = 0  # Used for padding short sentences
        self.SOS_token = 1  # Start-of-sentence token
        self.EOS_token = 2  # End-of-sentence token
        self.UNK_token = 3  # Unknown word token

        self.name = name
        self.trimmed = False if not word2index else True # if a precomputed vocab is specified assume the user wants to use it as-is
        self.word2index = word2index if word2index else {"UNK": self.UNK_token}
        self.word2count = {}
        self.index2word = index2word if index2word else {self.PAD_token: "PAD", self.SOS_token: "SOS", self.EOS_token: "EOS", self.UNK_token: "UNK"}
        self.num_words = 4 if not index2word else len(index2word)  # Count SOS, EOS, PAD, UNK

    def addSentence(self, sentence):
        for word in sentence.split(' '):
            self.addWord(word)

    def addWord(self, word):
        if word not in self.word2index:
            self.word2index[word] = self.num_words
            self.word2count[word] = 1
            self.index2word[self.num_words] = word
            self.num_words += 1
        else:
            self.word2count[word] += 1

    # Remove words below a certain count threshold
    def trim(self, min_count):
        if self.trimmed:
            return
        self.trimmed = True

        keep_words = []

        for k, v in self.word2count.items():
            if v >= min_count:
                keep_words.append(k)

        print('keep_words {} / {} = {:.4f}'.format(
            len(keep_words), len(self.word2index), len(keep_words) / len(self.word2index)
        ))

        # Reinitialize dictionaries
        self.word2index = {"UNK": self.UNK_token}
        self.word2count = {}
        self.index2word = {self.PAD_token: "PAD", self.SOS_token: "SOS", self.EOS_token: "EOS", self.UNK_token: "UNK"}
        self.num_words = 4 # Count default tokens
        for word in keep_words:
            self.addWord(word)

# Create a Voc object from precomputed data structures
def loadPrecomputedVoc(corpus_name, word2index_url, index2word_url):
    # load the word-to-index lookup map
    r = requests.get(word2index_url)
    word2index = r.json()
    # load the index-to-word lookup map
    r = requests.get(index2word_url)
    index2word = r.json()
    return Voc(corpus_name, word2index, index2word)

In [5]:
import nltk
import unicodedata
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
import numpy as np
class Tokenizer:
    def __init__(self):
        self.tokenizer = nltk.tokenize.RegexpTokenizer(pattern=r'\w+|[^\w\s]')
        WORD2INDEX_URL = "http://zissou.infosci.cornell.edu/convokit/models/craft_wikiconv/word2index.json"
        INDEX2WORD_URL = "http://zissou.infosci.cornell.edu/convokit/models/craft_wikiconv/index2word.json"
        self.vocab = self.loadPrecomputedVoc("wikiconv", WORD2INDEX_URL, INDEX2WORD_URL)
        self.vocab_size = self.vocab.num_words
    # Create a Voc object from precomputed data structures
    def loadPrecomputedVoc(self, corpus_name, word2index_url, index2word_url):
        # load the word-to-index lookup map
        r = requests.get(word2index_url)
        word2index = r.json()
        # load the index-to-word lookup map
        r = requests.get(index2word_url)
        index2word = r.json()
        return Voc(corpus_name, word2index, index2word)
    def unicodeToAscii(self, utterance):
        return ''.join(
            c for c in unicodedata.normalize('NFD', utterance)
            if unicodedata.category(c) != 'Mn')
    def tokenize(self, utterance):
        # simplify the problem space by considering only ASCII data
        cleaned_text = self.unicodeToAscii(utterance.lower())
        # if the resulting string is empty, nothing else to do
        if not cleaned_text.strip():
            return []
        return self.tokenizer.tokenize(cleaned_text)
    
    def forward(self, utterance):
        tokens = self.tokenize(utterance)
        inputs = []
        for token in tokens:
            if token in self.vocab.word2index:
                inputs.append(self.vocab.word2index[token])
            else:
                inputs.append(self.vocab.UNK_token)
        inputs.append(self.vocab.EOS_token)
        return inputs
    
class EncoderRNN(nn.Module):
    """This module represents the utterance encoder component of CRAFT, responsible for creating vector representations of utterances"""
    def __init__(self, device, hidden_size, embedding_dim, max_utterance_len, batch_size=256, n_layers=1, dropout=0):
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.tokenizer = Tokenizer()
        self.max_utterance_len = max_utterance_len
        self.batch_size = batch_size
        self.device = device
        self.embedding = nn.Embedding(self.tokenizer.vocab_size, embedding_dim).to(device)

        # Initialize GRU; the input_size and hidden_size params are both set to 'hidden_size'
        #   because our input size is a word embedding with number of features == hidden_size
        self.gru = nn.GRU(embedding_dim, hidden_size, n_layers, batch_first = True,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=True).to(device)
    def tokenize(self, utterances):
        inputs = [self.tokenizer.forward(utterance) for utterance in utterances]
        features = np.zeros((len(inputs), self.max_utterance_len), dtype=int)

        # for each review, I grab that review and 
        for i, row in enumerate(inputs):
            features[i, -min(len(row), self.max_utterance_len):] = np.array(row)[:self.max_utterance_len]

        def batch(features, batch_size):
            for i in range(0, len(features), batch_size):
                yield torch.from_numpy(features[i:min(i+batch_size, len(features))]).long().to(self.device)

        return batch(features, self.batch_size)
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        hidden = [weight.new(self.n_layers, batch_size, self.hidden_size).zero_() for _ in range(self.n_layers)]
        return torch.cat(hidden, dim=0).to(self.device)
    def forward(self, utterances, hidden=None):
        dataloader = self.tokenize(utterances)
        outputs = []
        hiddens = []
        for inputs in dataloader:
            inputs = inputs.to(self.device)
            hidden = self.init_hidden(inputs.shape[0])
            embedded = self.embedding(inputs)
            output, hidden = self.gru(embedded, hidden)
            outputs.append(output.to('cpu').detach())
            hiddens.append(hidden.to('cpu').detach())
        outputs = torch.cat(outputs, dim=0)
        hiddens = torch.cat(hiddens, dim=1)
        return outputs, hiddens
        

In [6]:
import os
import wget
def load_Utterance_pretrain(device):
    hidden_size = 500
    encoder_n_layers = 2
    dropout = 0.1
    # MODEL_URL = "http://zissou.infosci.cornell.edu/convokit/models/craft_wikiconv/craft_pretrained.tar"
    # print("Loading saved parameters...")
    # if not os.path.isfile("pretrained_model.tar"):
    #     print("\tDownloading pre-trained CRAFT...")
    #     wget.download(MODEL_URL)
    #     # urlretrieve(MODEL_URL, "craft_pretrained.tar")
    #     print("\t...Done!")
    checkpoint = torch.load("pretrained_model.tar")
    encoder = EncoderRNN(device=device, hidden_size=hidden_size, embedding_dim=hidden_size, max_utterance_len = 300,
                          n_layers=encoder_n_layers, dropout=dropout)
    encoder_sd = checkpoint['en']
    embedding_sd = checkpoint['embedding']
    encoder.embedding.load_state_dict(embedding_sd)
    encoder.load_state_dict(encoder_sd)
    return encoder

In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device = torch.device("cpu")

# Utterance BERT

# Context RNN

In [8]:
class ContextEncoderRNN(nn.Module):
    """This module represents the context encoder component of CRAFT, responsible for creating an order-sensitive vector representation of conversation context"""
    def __init__(self, hidden_size, n_layers=1, dropout=0):
        super(ContextEncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size

        # only unidirectional GRU for context encoding
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, batch_first = True,
                          dropout=(0 if n_layers == 1 else dropout), bidirectional=False)

    def forward(self, input_seq, hidden=None):
        # Pack padded batch of sequences for RNN module
        # packed = torch.nn.utils.rnn.pack_padded_sequence(input_seq, input_lengths)
        # Forward pass through GRU
        outputs, hidden = self.gru(input_seq, hidden)
        # Unpack padding
        # outputs, _ = torch.nn.utils.rnn.pad_packed_sequence(outputs)
        # return output and final hidden state
        return outputs, hidden

class SingleTargetClf(nn.Module):
    """This module represents the CRAFT classifier head, which takes the context encoding and uses it to make a forecast"""
    def __init__(self, hidden_size, dropout=0.1):
        super(SingleTargetClf, self).__init__()

        self.hidden_size = hidden_size

        # initialize classifier
        self.layer1 = nn.Linear(hidden_size, hidden_size)
        self.layer1_act = nn.LeakyReLU()
        self.layer2 = nn.Linear(hidden_size, hidden_size // 2)
        self.layer2_act = nn.LeakyReLU()
        self.clf = nn.Linear(hidden_size // 2, 1)
        self.dropout = nn.Dropout(p=dropout)

    # def forward(self, encoder_outputs, encoder_input_lengths):
    #     # from stackoverflow (https://stackoverflow.com/questions/50856936/taking-the-last-state-from-bilstm-bigru-in-pytorch)
    #     # First we unsqueeze seqlengths two times so it has the same number of
    #     # of dimensions as output_forward
    #     # (batch_size) -> (1, batch_size, 1)
    #     lengths = encoder_input_lengths.unsqueeze(0).unsqueeze(2)
    #     # Then we expand it accordingly
    #     # (1, batch_size, 1) -> (1, batch_size, hidden_size)
    #     lengths = lengths.expand((1, -1, encoder_outputs.size(2)))

    #     # take only the last state of the encoder for each batch
    #     last_outputs = torch.gather(encoder_outputs, 0, lengths-1).squeeze()
    #     # forward pass through hidden layers
    #     layer1_out = self.layer1_act(self.layer1(self.dropout(last_outputs)))
    #     layer2_out = self.layer2_act(self.layer2(self.dropout(layer1_out)))
    #     # compute and return logits
    #     logits = self.clf(self.dropout(layer2_out)).squeeze()
    #     return logits
    def forward(self, encoder_hidden):
        hidden = encoder_hidden[-1,:,:]
        # forward pass through hidden layers
        hidden = hidden.squeeze()
        layer1_out = self.layer1_act(self.layer1(self.dropout(hidden)))
        layer2_out = self.layer2_act(self.layer2(self.dropout(layer1_out)))
        # compute and return logits
        logits = self.clf(self.dropout(layer2_out)).squeeze()
        return logits


In [9]:
hidden_size = 500
context_encoder_n_layers = 2
dropout = 0.1
context_encoder = ContextEncoderRNN(hidden_size, context_encoder_n_layers, dropout)
checkpoint = torch.load("pretrained_model.tar")
context_sd = checkpoint['ctx']
context_encoder.load_state_dict(context_sd)

<All keys matched successfully>

In [10]:
attack_clf = SingleTargetClf(hidden_size, dropout)

In [11]:
context_encoder = context_encoder.to(device)
attack_clf = attack_clf.to(device)

# Pipeline

In [12]:
my_utt = load_Utterance_pretrain(device)

In [4]:
train_utterances, train_conversationLength, train_comment_ids, train_labels = load_data(corpus, split='train', last_only=True, context_batch_size=99999999)
valid_utterances, valid_conversationLength, valid_comment_ids, valid_labels = load_data(corpus, split='val', last_only=True)

In [5]:
print(len(train_labels[0]))
print(sum(train_labels[0])/len(train_labels[0]))
print(sum(valid_labels[0])/len(valid_labels[0]))

2508
0.5
0.4375


In [15]:
def prepare_context_batch(utt_hidden, batch_conversationLength, max_context_len=20):
    assert utt_hidden.shape[1] == sum(batch_conversationLength)
    utt_encoder_summed = utt_hidden[-2,:,:] + utt_hidden[-1,:,:]
    hidden_size = utt_encoder_summed.shape[1]
    context_features = np.zeros((len(batch_conversationLength), max_context_len, hidden_size), dtype=np.float32)

    current_utt_idx = 0
    for i, convo_len in enumerate(batch_conversationLength):
        if convo_len > max_context_len:
            current_utt_idx += convo_len - max_context_len
            convo_len = max_context_len
        context_features[i, -convo_len:, :] = np.array(utt_encoder_summed)[current_utt_idx:current_utt_idx+convo_len, :]
        current_utt_idx += convo_len
    return context_features

In [16]:
learning_rate = 1e-5

In [17]:
from torch import optim

encoder_optimizer = optim.Adam(my_utt.parameters(), lr=learning_rate)
context_encoder_optimizer = optim.Adam(context_encoder.parameters(), lr=learning_rate)
attack_clf_optimizer = optim.Adam(attack_clf.parameters(), lr=learning_rate)



In [18]:
# from torch.nn import BCEWithLogitsLoss
import torch.nn.functional as F
from sklearn.metrics import f1_score
def calculate_f1_score(labels, preds):
    # preds = torch.sigmoid(logits) > 0.5
    # Calculating precision, recall, and F1 score using PyTorch
    TP = ((preds == 1) & (labels == 1)).sum().item()
    FP = ((preds == 1) & (labels == 0)).sum().item()
    FN = ((preds == 0) & (labels == 1)).sum().item()

    precision = TP / (TP + FP) if TP + FP > 0 else 0
    recall = TP / (TP + FN) if TP + FN > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    return f1
def evaluate(encoder, context_encoder, attack_clf, val_utterances, val_conversationLength, val_labels):
    encoder.eval()
    context_encoder.eval()
    attack_clf.eval()
    val_loss = 0
    val_f1 = 0
    val_accuracy = 0
    val_batches = 0
    pos = 0
    all_labels = []
    all_preds = []
    for i in range(len(val_utterances)):
        batch_utterances = val_utterances[i]
        batch_conversationLength = val_conversationLength[i]
        batch_labels = val_labels[i]
        batch_size = len(batch_labels)
        if batch_size == 0:
            continue
        with torch.no_grad():
            utt_outputs, utt_hidden = encoder.forward(batch_utterances)
            context_features = prepare_context_batch(utt_hidden, batch_conversationLength)
            context_features = torch.from_numpy(context_features).to(device)
            context_outputs, context_hidden = context_encoder(context_features)
            logits = attack_clf(context_hidden)
            labels = torch.tensor(batch_labels, dtype=torch.float32).to(device)
            # pos_weight = torch.tensor([1]).type_as(logits)
            # loss_fct = BCEWithLogitsLoss(pos_weight=pos_weight, reduction = 'sum')
            # loss = loss_fct(logits, labels)
            loss = F.binary_cross_entropy_with_logits(logits, labels)
            val_loss += loss.item()
            preds = torch.sigmoid(logits) > 0.5
            pos += preds.sum().item() / len(batch_labels)
            val_f1 += calculate_f1_score(labels.cpu().detach(), logits.cpu().detach())
            val_accuracy += (preds == labels).sum().item() / len(batch_labels)
            val_batches += 1
            all_labels.append(labels.cpu().detach())
            all_preds.append(preds.cpu().detach())
    all_labels = torch.cat(all_labels)
    all_preds = torch.cat(all_preds)
    val_f1 = calculate_f1_score(all_labels, all_preds)
    return val_loss / val_batches, val_f1, val_accuracy/val_batches, pos/len(val_labels)

In [19]:
num_steps = 0
for epoch in range(30):
    for batch_idx in range(len(train_labels)):
        num_steps += 1
        if num_steps % 50 == 0:
            print(num_steps, epoch)
            val_loss, val_f1, val_accuracy, pos= evaluate(my_utt, context_encoder, attack_clf, valid_utterances, valid_conversationLength, valid_labels)
            print("Validation loss: {:.2f} accuracy: {:.2f} f1: {:.2f} pos: {:.2f}".format(val_loss, val_accuracy * 100, val_f1 * 100, pos))
        my_utt.train()
        context_encoder.train()
        attack_clf.train()
        
        batch_utterances = train_utterances[batch_idx]
        batch_conversationLength = train_conversationLength[batch_idx]
        batch_comment_ids = train_comment_ids[batch_idx]
        batch_labels = train_labels[batch_idx]
        encoder_optimizer.zero_grad()
        context_encoder_optimizer.zero_grad()
        attack_clf_optimizer.zero_grad()
        
        encoder_outputs, hidden = my_utt.forward(batch_utterances)
        context_features = torch.from_numpy(prepare_context_batch(hidden, batch_conversationLength)).to(device)
        final_outputs, final_hidden = context_encoder.forward(context_features)
        logits = attack_clf(final_hidden)
        labels = torch.tensor(batch_labels, dtype=torch.float32).to(device)
        # pos_weight = torch.tensor([1]).type_as(logits)
        # loss_fct = BCEWithLogitsLoss(pos_weight=pos_weight, reduction = 'sum')
        # loss = loss_fct(logits, labels)
        loss = F.binary_cross_entropy_with_logits(logits, labels)

        loss.backward()
        clip = 50.0
        # Clip gradients: gradients are modified in place
        _ = torch.nn.utils.clip_grad_norm_(my_utt.parameters(), clip)
        _ = torch.nn.utils.clip_grad_norm_(context_encoder.parameters(), clip)
        _ = torch.nn.utils.clip_grad_norm_(attack_clf.parameters(), clip)

        # Adjust model weights
        encoder_optimizer.step()
        context_encoder_optimizer.step()
        attack_clf_optimizer.step()


50 0
Validation loss: 0.69 accuracy: 49.31 f1: 66.67 pos: 1.00
100 1
Validation loss: 0.69 accuracy: 49.31 f1: 66.67 pos: 1.00
150 1
Validation loss: 0.69 accuracy: 50.00 f1: 66.93 pos: 0.99
200 2
Validation loss: 0.69 accuracy: 53.82 f1: 67.24 pos: 0.89
250 3
Validation loss: 0.69 accuracy: 53.01 f1: 66.21 pos: 0.88
300 3
Validation loss: 0.69 accuracy: 58.10 f1: 59.49 pos: 0.53
350 4
Validation loss: 0.69 accuracy: 58.10 f1: 62.24 pos: 0.59
400 5
Validation loss: 0.69 accuracy: 56.60 f1: 62.26 pos: 0.64
450 5
Validation loss: 0.69 accuracy: 58.80 f1: 56.68 pos: 0.44
500 6
Validation loss: 0.68 accuracy: 57.87 f1: 59.18 pos: 0.51
550 6
Validation loss: 0.68 accuracy: 58.80 f1: 60.28 pos: 0.53
600 7
Validation loss: 0.68 accuracy: 58.91 f1: 56.64 pos: 0.44
650 8
Validation loss: 0.68 accuracy: 59.38 f1: 62.12 pos: 0.56
700 8
Validation loss: 0.68 accuracy: 59.14 f1: 53.98 pos: 0.38
750 9
Validation loss: 0.67 accuracy: 60.07 f1: 58.43 pos: 0.45
800 10
Validation loss: 0.67 accuracy: 59