## **C-PMI: Conditional Pointwise Mutual Information for Turn-level Dialogue Evaluation**

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.32.0-py3-none-any.whl (7.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.5 MB/s[0m eta [36m0:00:0

In [None]:
import os
import json
import tqdm
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler
from torch.utils.data.distributed import DistributedSampler


import math
from transformers import AutoTokenizer, AutoModelWithLMHead

# Old loading code. Use for from-scratch models
#tokenizer = GPT2Tokenizer.from_pretrained('dialogpt')
#model = GPT2LMHeadModel.from_pretrained('gpt2')
#weights = torch.load("dialogpt/small_fs.pkl")
#weights = {k.replace("module.", ""): v for k,v in weights.items()}
#weights["lm_head.weight"] = weights["lm_head.decoder.weight"]
#weights.pop("lm_head.decoder.weight",None)
#model.load_state_dict(weights)


def load_models(name="microsoft/DialoGPT-large", device=None):
  """Load a causal language model and its tokenizer.

  Args:
    name: Model identifier (or local path) handed to ``from_pretrained``.
    device: Device the model is moved to. When ``None`` (default), ``"cuda"``
      is used if CUDA is available, otherwise ``"cpu"``.

  Returns:
    A ``(model, tokenizer)`` tuple.
  """
  # AutoModelWithLMHead is deprecated in transformers; AutoModelForCausalLM is
  # its replacement for decoder-only models. Imported locally so this
  # function does not depend on an edit to the notebook's import cell.
  from transformers import AutoModelForCausalLM

  if device is None:
    device = "cuda" if torch.cuda.is_available() else "cpu"

  tokenizer = AutoTokenizer.from_pretrained(name)
  model = AutoModelForCausalLM.from_pretrained(name)
  model.to(device)
  return model, tokenizer
# Load the model and tokenizer once; eval_score_func reads these two globals.
model, tokenizer = load_models("microsoft/DialoGPT-large")

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/642 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/1.75G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
def score(text, sep, m, tokenizer, model):
  """Return the language-model loss of ``text + sep + m``.

  The concatenated string is tokenized and passed to ``model`` as both the
  input and the labels; the loss the model reports for that call is returned.

  Args:
    text: Conversation so far.
    sep: Separator inserted between ``text`` and ``m``.
    m: Follow-up utterance appended to the conversation.
    tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
    model: Module called as ``model(ids, labels=ids)`` whose first output is
      the loss.

  Returns:
    The loss as a Python float.
  """
  full_text = text + sep + m
  token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(full_text))
  # Build the input on whatever device the model lives on instead of
  # assuming CUDA, so the function also works on CPU-only machines.
  device = next(model.parameters()).device
  tensor_input = torch.tensor([token_ids], device=device)
  with torch.no_grad():
      loss = model(tensor_input, labels=tensor_input)[0]

  return loss.item()

# Follow-up utterances used to probe each turn-level quality. A metric's
# "positive" list may be empty, in which case only the negatives contribute.
_TURN_LEVEL_UTTS = {
  "interesting": {
    "positive": ["Wow that is really interesting.", "That's really interesting!", "Cool! That sounds super interesting."],
    "negative": ["That's not very interesting.", "That's really boring.", "That was a really boring response."]
  },
  "engaging": {
    "positive": ["Wow! That's really cool!", "Tell me more!", "I'm really interested in learning more about this."],
    "negative": ["Let's change the topic.", "I don't really care. That's pretty boring.", "I want to talk about something else."]
  },
  "specific": {
    "positive": ["That's good to know. Cool!", "I see, that's interesting.", "That's a good point."],
    "negative": ["That's a very generic response.", "Not really relevant here.", "That's not really relevant here."]
  },
  "relevant": {
    "positive": [],
    "negative": ["That's not even related to what I said.", "Don't change the topic!", "Why are you changing the topic?"]
  },
  "correct": {
    "positive": [],
    "negative": ["You're not understanding me!", "I am so confused right now!", "I don't understand what you're saying."]
  },
  "semantically appropriate": {
    "positive": ["That makes sense!", "You have a good point."],
    "negative": ["That makes no sense!"]
  },
  "understandable": {
    "positive": ["That makes sense!", "You have a good point."],
    "negative": ["I don't understand at all!", "I'm so confused!", "That makes no sense!", "What does that even mean?"]
  },
  "fluent": {
    "positive": ["That makes sense!", "You have a good point."],
    "negative": ["Is that real English?", "I'm so confused right now!", "That makes no sense!"]
  },
}

# Follow-up utterances used to probe each dialog-level quality.
_DIALOG_LEVEL_UTTS = {
  "coherent": {
    "positive": [],
    "negative": ["You're making no sense at all.", "You're changing the topic so much!", "You are so confusing."]
  },
  "error recovery": {
    "positive": [],
    "negative": ["I am so confused right now.", "You're really confusing.", "I don't understand what you're saying."]
  },
  "consistent": {
    "positive": [],
    "negative": ["That's not what you said earlier!", "Stop contradicting yourself!"],
  },
  "diverse": {
    "positive": [],
    "negative": ["Stop saying the same thing repeatedly.", "Why are you repeating yourself?", "Stop repeating yourself!"]
  },
  "depth": {
    "positive": [],
    "negative": ["Stop changing the topic so much.", "Don't change the topic!"],
  },
  "likeable": {
    "positive": ["I like you!", "You're super polite and fun to talk to", "Great talking to you."],
    "negative": ["You're not very nice.", "You're not very fun to talk to.", "I don't like you."]
  },
  "understand": {
    "positive": [],
    "negative": ["You're not understanding me!", "What are you trying to say?", "I don't understand what you're saying."]
  },
  "flexible": {
    "positive": ["You're very easy to talk to!", "Wow you can talk about a lot of things!"],
    "negative": ["I don't want to talk about that!", "Do you know how to talk about something else?"],
  },
  "informative": {
    "positive": ["Thanks for all the information!", "Wow that's a lot of information.", "You know a lot of facts!"],
    "negative": ["You're really boring.", "You don't really know much."],
  },
  "inquisitive": {
    "positive": ["You ask a lot of questions!", "That's a lot of questions!"],
    "negative": ["You don't ask many questions.", "You don't seem interested."],
  },
}


def _contrast_score(conversation, utts, model, tokenizer, score_func):
  """Mean score of the negative follow-ups minus mean score of the positives."""
  def mean_score(follow_ups):
    total = sum(score_func(conversation, " <|endoftext|> ", m, tokenizer, model)
                for m in follow_ups)
    # max(..., 1) makes an empty list contribute 0 instead of dividing by zero.
    return total / max(len(follow_ups), 1)

  # Positives are scored before negatives, matching the original call order.
  high_score = mean_score(utts["positive"])
  low_score = mean_score(utts["negative"])
  return low_score - high_score


def evaluate(conversation, model, tokenizer, score_func, metrics = ['interesting']):
  """Score a conversation on the requested quality metrics.

  For each metric, ``score_func`` is called once per predefined positive and
  negative follow-up utterance (with ``" <|endoftext|> "`` as separator). The
  metric value is ``mean(negative scores) - mean(positive scores)``.

  Both turn-level and dialog-level metric names are accepted. Previously a
  dialog-level name (e.g. ``"coherent"``) raised ``KeyError`` because it was
  looked up in the turn-level table without a membership check.

  Args:
    conversation: Conversation text handed to ``score_func`` as its first
      argument.
    model: Passed through to ``score_func``.
    tokenizer: Passed through to ``score_func``.
    score_func: Callable ``(conversation, sep, utterance, tokenizer, model)``
      returning a number.
    metrics: Iterable of metric names to compute.

  Returns:
    Dict mapping each requested metric name to its score.

  Raises:
    KeyError: If a requested metric is in neither table. Raised before any
      scoring is done.
  """
  unknown = [metric for metric in metrics
             if metric not in _TURN_LEVEL_UTTS and metric not in _DIALOG_LEVEL_UTTS]
  if unknown:
    raise KeyError(f"Unknown metric(s): {unknown}")

  scores = {}
  # Turn-level metrics are evaluated first, then dialog-level ones, which
  # keeps the insertion order of the result dict the same as before.
  for table in (_TURN_LEVEL_UTTS, _DIALOG_LEVEL_UTTS):
    for metric in metrics:
      if metric in table:
        scores[metric] = _contrast_score(conversation, table[metric],
                                         model, tokenizer, score_func)

  return scores

In [None]:
# Smoke test: score a hand-written conversation with the baseline `score`
# function on the default metric list (['interesting']).
conversation = "<|endoftext|> Hi! <|endoftext|> Hello, how is your day? <|endoftext|> It's good. It's raining a bit, but I am enjoying a good book. How about you? <|endoftext|> It's good, I just got back from walking my dog What book did you read?"
scores = evaluate(conversation,
                      model,
                      tokenizer, score)

In [None]:
# Show the metric -> score dict returned by evaluate()
print(scores)

{'interesting': -0.28983290990193655}


In [None]:
# Download the FED annotation file; it is saved as fed_data.json in the
# current working directory, which is where load_fed_data('.') looks for it.
!wget http://shikib.com/fed_data.json

--2023-08-26 21:55:44--  http://shikib.com/fed_data.json
Resolving shikib.com (shikib.com)... 192.30.252.154, 192.30.252.153
Connecting to shikib.com (shikib.com)|192.30.252.154|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 758828 (741K) [application/json]
Saving to: ‘fed_data.json’


2023-08-26 21:55:44 (4.87 MB/s) - ‘fed_data.json’ saved [758828/758828]



In [None]:
#borrowed from https://github.com/exe1023/DialEvalMetrics/blob/main/data/fed_data/data_loader.py
import json
from pathlib import Path
import numpy as np

def load_fed_data(base_dir):
    """Load the samples of ``fed_data.json`` that carry a ``'response'`` field.

    Every context line and the response have everything up to and including
    the first ``':'`` removed (presumably the speaker label -- confirm against
    the data file) and are whitespace-stripped. For each annotated aspect, the
    mean of its ``int`` ratings is stored; an aspect with no ``int`` rating
    gets ``0``.

    Args:
        base_dir: Directory containing ``fed_data.json``.

    Returns:
        Dict with parallel lists under ``'contexts'`` (list of turns per
        sample), ``'references'`` (always ``'NO REF'``), ``'responses'`` and
        ``'scores'`` (dict aspect -> mean rating).
    """
    base_dir = Path(base_dir)

    # JSON is UTF-8 by specification; don't rely on the platform's default
    # text encoding when reading it.
    with (base_dir / 'fed_data.json').open(encoding='utf-8') as f:
        data = json.load(f)

    contexts, references, responses, scores = [], [], [], []
    for sample in data:
        # Samples without a response are skipped. An explicit membership
        # test replaces the former bare `except:` around sample['response'],
        # which would also have swallowed unrelated errors.
        if 'response' not in sample:
            continue

        context = [
            ':'.join(text_raw.split(':')[1:]).strip()
            for text_raw in sample['context'].split('\n')
        ]
        response = ':'.join(sample['response'].split(':')[1:]).strip()

        aspect_means = {}
        for aspect, ratings in sample['annotations'].items():
            # Exact type check on purpose: only true ints count as ratings.
            int_ratings = [x for x in ratings if type(x) is int]
            aspect_means[aspect] = np.mean(int_ratings) if int_ratings else 0

        contexts.append(context)
        references.append('NO REF')
        responses.append(response)
        scores.append(aspect_means)

    return {
        'contexts': contexts,
        'references': references,
        'responses': responses,
        'scores': scores
    }


def load_fed_dialog_data(base_dir):
    """Load the samples of ``fed_data.json`` that have no ``'response'`` field.

    Every context line has everything up to and including the first ``':'``
    removed and is whitespace-stripped. The last parsed line is then dropped,
    and the new last line is reported as the response while also remaining in
    the context. For each annotated aspect, the mean of its ``int`` ratings is
    stored; an aspect with no ``int`` rating gets ``0``.

    Args:
        base_dir: Directory containing ``fed_data.json``.

    Returns:
        Dict with parallel lists under ``'contexts'``, ``'references'``
        (always ``'NO REF'``), ``'responses'`` and ``'scores'``.

    Raises:
        IndexError: If a sample's context consists of a single line.
    """
    base_dir = Path(base_dir)

    # JSON is UTF-8 by specification; don't rely on the platform's default
    # text encoding when reading it.
    with (base_dir / 'fed_data.json').open(encoding='utf-8') as f:
        data = json.load(f)

    contexts, references, responses, scores = [], [], [], []
    for sample in data:
        # Samples that carry a 'response' are handled by load_fed_data.
        if 'response' in sample:
            continue

        context = [
            ':'.join(text_raw.split(':')[1:]).strip()
            for text_raw in sample['context'].split('\n')
        ]

        # NOTE(review): the last line is discarded and the response is taken
        # from the *remaining* context, so it stays part of the context too.
        # This is correct if the raw context ends with an empty trailing
        # line; otherwise the two statements may be in the wrong order.
        # Behaviour is kept as-is -- confirm against the data file.
        context = context[:-1]
        response = context[-1]

        aspect_means = {}
        for aspect, ratings in sample['annotations'].items():
            # Exact type check on purpose: only true ints count as ratings.
            int_ratings = [x for x in ratings if type(x) is int]
            aspect_means[aspect] = np.mean(int_ratings) if int_ratings else 0

        contexts.append(context)
        references.append('NO REF')
        responses.append(response)
        scores.append(aspect_means)

    return {
        'contexts': contexts,
        'references': references,
        'responses': responses,
        'scores': scores
    }



data = load_fed_data('.')

# Peek at the first five entries of each parallel list.
for field in ('contexts', 'responses', 'scores'):
    print(data[field][:5])

# data = load_fed_dialog_data('.')
# print(data['contexts'][:5])
# print(data['responses'][:5])
# print(data['scores'][:5])

# Turn the dict of parallel lists into a list of per-sample dicts; from here
# on `data` is that list, which is what the later cells index into.
new_data = [
    {'context': ctx, 'response': resp, 'annotations': ann}
    for ctx, resp, ann in zip(data['contexts'], data['responses'], data['scores'])
]
data = new_data

print(data[0])

[['Hi!', "Hi! What's up?", 'Nothing much, how about you', 'Not much either.', 'What are you doing', 'Playing Terraria. What about you?', 'Sitting in a meeting', 'What kind of meeting?', "Can't say"], ['Hi!', "Hi! What's up?", 'Nothing much, how about you', 'Not much either.', 'What are you doing', 'Playing Terraria. What about you?', 'Sitting in a meeting', 'What kind of meeting?', "Can't say", "It's probably boring, isn't it?", 'Haha, yes!'], ['Hi!', "Hi! What's up?", 'Nothing much, how about you', 'Not much either.', 'What are you doing', 'Playing Terraria. What about you?', 'Sitting in a meeting', 'What kind of meeting?', "Can't say", "It's probably boring, isn't it?", 'Haha, yes!', 'What is the meeting about?', 'I cannot tell you', 'What can you tell me?', 'Nothing much except that the weather is pleasant'], ['Hi!', 'Hey! How are you today?', 'good', "I'm glad to hear that! What are your plans for today?", "I'm trying to find a good podcast to listen to", 'What kinds of podcasts do

In [None]:
#ref: https://github.com/exe1023/DialEvalMetrics/blob/f27d717cfb02b08ffd774e60faa6b319a766ae77/usr_fed/fed/fed_server.py#L43
def prep_conv(sample, sep = " <|endoftext|> "):
    """Flatten a sample into a single separator-delimited string.

    The context turns are joined with ``sep``; if the sample has a
    ``'response'`` entry it is appended after one more ``sep``. The result is
    prefixed with ``sep`` minus its first character.
    """
    flattened = sep.join(sample['context'])
    if 'response' in sample:
        flattened = flattened + sep + sample['response']
    return sep[1:] + flattened
# Sanity check: show the flattened string built from the first sample
print(prep_conv(data[0]))



<|endoftext|> Hi! <|endoftext|> Hi! What's up? <|endoftext|> Nothing much, how about you <|endoftext|> Not much either. <|endoftext|> What are you doing <|endoftext|> Playing Terraria. What about you? <|endoftext|> Sitting in a meeting <|endoftext|> What kind of meeting? <|endoftext|> Can't say <|endoftext|> It's probably boring, isn't it?


In [None]:
from scipy.stats import spearmanr

def eval_score_func(data, N = -1, score_func = score, metric = 'interesting'):
  """Print the Spearman correlation between a scoring function and gold labels.

  Each sample whose annotations contain ``metric.capitalize()`` is flattened
  with ``prep_conv`` and scored with ``evaluate`` (using the global ``model``
  and ``tokenizer``). The gold values and the computed scores are then
  correlated with ``spearmanr``; the correlation and p-value are printed.

  Args:
    data: Sequence of sample dicts with ``'annotations'`` plus whatever
      ``prep_conv`` reads.
    N: When greater than -1, only ``data[:N]`` is used; otherwise all of it.
    score_func: Scoring callable forwarded to ``evaluate``.
    metric: Metric name forwarded to ``evaluate``.
  """
  samples = data[:N] if N > -1 else data
  gold_key = metric.capitalize()

  scored_pairs = []
  for sample in tqdm.tqdm(samples):
      annotations = sample['annotations']
      if gold_key not in annotations:
          continue
      predicted = evaluate(prep_conv(sample), model, tokenizer, score_func, [metric])
      scored_pairs.append((annotations[gold_key], predicted[metric]))

  golds = [gold for gold, _ in scored_pairs]
  int_scores = [pred for _, pred in scored_pairs]

  correlation, pvalue = spearmanr(golds, int_scores)
  print("\n")
  print(correlation, pvalue)

In [None]:
def MI_score(text, sep, m, tokenizer, model, style='perplexity'):
  """Combine the model losses of conversation, follow-up, and their join.

  With ``nll(s)`` being the loss ``model`` reports when the tokens of ``s``
  are given as both input and labels, three negated losses are computed::

      lpx  = -nll(text + sep + m)
      lpx1 = -nll(text)
      lpx2 = -nll(sep[1:] + m)

  and combined as

  * ``style="perplexity"``: ``-(lpx / lpx1 / lpx2)``
  * ``style="tf-idf"``:     ``-(exp(lpx2) * lpx / lpx1)``

  Args:
    text: Conversation so far.
    sep: Separator between turns; ``sep[1:]`` prefixes the stand-alone
      follow-up.
    m: Follow-up utterance.
    tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
    model: Module called as ``model(ids, labels=ids)`` whose first output is
      the loss.
    style: ``"perplexity"`` or ``"tf-idf"``.

  Returns:
    The combined score as a float.

  Raises:
    NotImplementedError: If ``style`` is not one of the supported values.
      Raised before any forward pass is run.
  """
  if style not in ("perplexity", "tf-idf"):
    raise NotImplementedError(f"Unknown style: {style!r}")

  # Build inputs on the model's own device instead of assuming CUDA.
  device = next(model.parameters()).device

  def get_avg_nll(text):
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    tensor_input = torch.tensor([token_ids], device=device)
    with torch.no_grad():
        loss = model(tensor_input, labels=tensor_input)[0]
    return loss.item()

  lpx = - get_avg_nll(text + sep + m)
  lpx1 = - get_avg_nll(text)
  lpx2 = - get_avg_nll(sep[1:]+m)

  if style == "perplexity":
    return -(lpx/lpx1/lpx2)
  return -(math.exp(lpx2)*lpx/lpx1)

from functools import partial
# MI_score with the "tf-idf" combination pre-selected, so it matches the
# five-argument score_func signature that evaluate() calls.
MI_score_ent = partial(MI_score, style="tf-idf")

In [None]:
from math import exp
def MI_score_turn(text, sep, m, tokenizer, model, style='pmi'):
  """Turn-level score relating context, last response and a follow-up.

  ``text`` is split on ``sep`` into a context (all turns but the last) and a
  response (the last turn). ``text`` is expected to start with ``sep[1:]``
  and ``sep`` to start with a space; that leading marker is dropped from the
  context.

  With ``nll(s)`` being the loss ``model`` reports when the tokens of ``s``
  are given as both input and labels, and ``p = sep[1:]``::

      lpx  = -nll(text + sep + m)
      lpx1 = -nll(p + context + sep + m)
      lpx2 = -nll(p + response + sep + m)
      lpx3 = -nll(p + m)

  The supported ``style`` values combine these as

  * ``"pmi"``:              ``lpx + lpx3 - lpx1 - lpx2``
  * ``"tf-idf"``:           ``-(exp(lpx2) * lpx / lpx1)``
  * ``"perplexity"``:       ``-(exp(-lpx) - exp(-lpx1) - exp(-lpx2))``
  * ``"interaction_info"``: ``-nll(text) + nll(p + context)
    + nll(p + response) - pmi``
  * ``"sym_pmi"``: mean of ``pmi`` and the same expression with ``lpx``
    replaced by ``-nll(p + response + sep + context + sep + m)``

  Args:
    text: Conversation so far, turns delimited by ``sep``.
    sep: Turn separator.
    m: Follow-up utterance.
    tokenizer: Object providing ``tokenize`` and ``convert_tokens_to_ids``.
    model: Module called as ``model(ids, labels=ids)`` whose first output is
      the loss.
    style: One of the names listed above.

  Returns:
    The combined score as a float.

  Raises:
    NotImplementedError: If ``style`` is not supported. Raised before any
      forward pass is run.
  """
  if style not in ("pmi", "tf-idf", "perplexity", "interaction_info", "sym_pmi"):
    raise NotImplementedError(f"Unknown style: {style!r}")

  # Build inputs on the model's own device instead of assuming CUDA.
  device = next(model.parameters()).device

  def get_avg_nll(text):
    token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
    tensor_input = torch.tensor([token_ids], device=device)
    with torch.no_grad():
        loss = model(tensor_input, labels=tensor_input)[0]
    return loss.item()

  prefix = sep[1:]
  # Prepending a space turns the leading `sep[1:]` marker into a full `sep`,
  # so the first split element is empty and gets dropped. Response and
  # context come from the same split; the only input where this differs from
  # splitting `text` directly is a single-turn `text`, whose response used to
  # keep the leading marker.
  turns = (" " + text).split(sep)
  response = turns[-1]
  context = sep.join(turns[1:-1])

  lpx = - get_avg_nll(text + sep + m)
  lpx1 = - get_avg_nll(prefix + context + sep + m)
  lpx2 = - get_avg_nll(prefix + response + sep + m)

  # These two styles never use lpx3, so skip its forward pass.
  if style == "tf-idf":
    return -(math.exp(lpx2)*lpx/lpx1)
  if style == "perplexity":
    return -(math.exp(-lpx)-math.exp(-lpx1)-math.exp(-lpx2))

  lpx3 = - get_avg_nll(prefix + m)
  ent = (lpx+lpx3-lpx1-lpx2)

  if style == "interaction_info":
    joint = - get_avg_nll(text)
    context_only = - get_avg_nll(prefix + context)
    response_only = - get_avg_nll(prefix + response)
    return joint - context_only - response_only - ent

  if style == "sym_pmi":
    # Same quantity with response and context swapped in the joint string.
    swapped = - get_avg_nll(prefix + response + sep + context + sep + m)
    ent += (swapped+lpx3-lpx1-lpx2)
    ent /= 2

  return ent
from functools import partial
# One pre-bound variant of MI_score_turn per style, so each can be passed as
# the five-argument score_func that evaluate() calls.
MI_score_turn_ppl = partial(MI_score_turn, style="perplexity")
MI_score_turn_sympmi = partial(MI_score_turn, style="sym_pmi")
MI_score_turn_pmi = partial(MI_score_turn, style="pmi")
MI_score_turn_iinfo = partial(MI_score_turn, style="interaction_info")

In [None]:
# Spearman correlation of the sym_pmi turn score with the 'Interesting' annotations
eval_score_func(data, score_func = MI_score_turn_sympmi, metric = 'interesting')

100%|██████████| 375/375 [12:02<00:00,  1.93s/it]



0.4840535921948752 2.0104777671481675e-23





In [None]:
# Spearman correlation of the pmi turn score with the 'Interesting' annotations
eval_score_func(data, score_func = MI_score_turn_pmi, metric = 'interesting')

100%|██████████| 375/375 [08:38<00:00,  1.38s/it]



0.48178805145120673 3.4389643596534263e-23





In [None]:
# Spearman correlation of the pmi turn score with the 'Fluent' annotations
eval_score_func(data, score_func = MI_score_turn_pmi, metric = 'fluent')

100%|██████████| 375/375 [07:08<00:00,  1.14s/it]



0.17577201914343185 0.0006282771332988924





In [None]:
# Spearman correlation of the pmi turn score with the 'Engaging' annotations
eval_score_func(data, score_func = MI_score_turn_pmi, metric = 'engaging')

100%|██████████| 375/375 [08:36<00:00,  1.38s/it]



0.37031182625022735 1.2415179631695478e-13





In [None]:
# Spearman correlation of the sym_pmi turn score with the 'Fluent' annotations
eval_score_func(data, score_func = MI_score_turn_sympmi, metric = 'fluent')

100%|██████████| 375/375 [09:47<00:00,  1.57s/it]



0.16636179962432907 0.0012233144399643131





In [None]:
# Spearman correlation of the sym_pmi turn score with the 'Engaging' annotations
eval_score_func(data, score_func = MI_score_turn_sympmi, metric = 'engaging')

100%|██████████| 375/375 [12:13<00:00,  1.96s/it]



0.36901731325274206 1.5318196338257915e-13





In [None]:
# Remaining turn-level metrics with the sym_pmi score; one correlation and
# p-value is printed per metric, in list order.
for met in ["specific","relevant","correct", "semantically appropriate","understandable" ]:
    print("Ours: (sym-pmi) ")
    eval_score_func(data, score_func = MI_score_turn_sympmi, metric = met)




Ours: (sym-pmi) 


100%|██████████| 375/375 [12:14<00:00,  1.96s/it]




0.28039826754941216 3.333654217294593e-08
Ours: (sym-pmi) 


100%|██████████| 375/375 [06:08<00:00,  1.02it/s]




0.1046720358328147 0.04278876025504507
Ours: (sym-pmi) 


100%|██████████| 375/375 [06:08<00:00,  1.02it/s]




0.14788341418578338 0.004104905063353201
Ours: (sym-pmi) 


100%|██████████| 375/375 [06:03<00:00,  1.03it/s]




0.1789057141093261 0.0004994451558825398
Ours: (sym-pmi) 


100%|██████████| 375/375 [12:08<00:00,  1.94s/it]



0.10754798024390105 0.03736565222771731





In [None]:
# Remaining turn-level metrics with the pmi score; one correlation and
# p-value is printed per metric, in list order.
for met in ["specific","relevant","correct", "semantically appropriate","understandable" ]:
    print("Ours: (pmi) ")
    eval_score_func(data, score_func = MI_score_turn_pmi, metric = met)


Ours: (pmi) 


100%|██████████| 375/375 [08:51<00:00,  1.42s/it]




0.28650290663121786 1.6192487877935912e-08
Ours: (pmi) 


100%|██████████| 375/375 [04:30<00:00,  1.39it/s]




0.12762469978470525 0.013386692491667261
Ours: (pmi) 


100%|██████████| 375/375 [04:28<00:00,  1.39it/s]




0.17640633612673498 0.0005999446003946672
Ours: (pmi) 


100%|██████████| 375/375 [04:26<00:00,  1.41it/s]




0.18103497423894496 0.00042640530415876163
Ours: (pmi) 


100%|██████████| 375/375 [08:48<00:00,  1.41s/it]



0.11144085296037845 0.030961827141767255





In [None]:
# MI_score with the "tf-idf" combination on the 'Interesting' annotations
eval_score_func(data, score_func = MI_score_ent, metric = 'interesting')

100%|██████████| 375/375 [07:36<00:00,  1.22s/it]



0.18893377245202542 0.0002335399079426175





In [None]:
# MI_score with its default style ("perplexity") on the 'Interesting' annotations
eval_score_func(data, score_func = MI_score, metric = 'interesting')

100%|██████████| 375/375 [07:38<00:00,  1.22s/it]



0.3258295395483171 1.007053909951137e-10





In [None]:
# 'Fluent': default score_func (`score`) first, then MI_score (default style)
eval_score_func(data, metric = 'fluent')
eval_score_func(data, score_func = MI_score, metric = 'fluent')

100%|██████████| 375/375 [02:46<00:00,  2.25it/s]




0.015269446751476566 0.7682065652049144


100%|██████████| 375/375 [06:25<00:00,  1.03s/it]



0.011682590702081382 0.8216021297875056





In [None]:
# 'Engaging': default score_func (`score`) first, then MI_score (default style)
eval_score_func(data, metric = 'engaging')
eval_score_func(data, score_func = MI_score, metric = 'engaging')

100%|██████████| 375/375 [03:25<00:00,  1.82it/s]




0.17596608228561322 0.0006194801083057276


100%|██████████| 375/375 [07:40<00:00,  1.23s/it]



0.1772072918067314 0.0005658608024694887





In [None]:
# Remaining turn-level metrics with the default score_func (`score`); one
# correlation and p-value is printed per metric, in list order.
for met in ["specific","relevant","correct", "semantically appropriate","understandable" ]:
    print("Baseline: ")
    eval_score_func(data, metric = met )




Baseline: 


100%|██████████| 375/375 [03:21<00:00,  1.86it/s]




0.23042724187595698 6.540561962379317e-06
Baseline: 


100%|██████████| 375/375 [01:41<00:00,  3.69it/s]




0.1342320233257707 0.009254628596184063
Baseline: 


100%|██████████| 375/375 [01:41<00:00,  3.71it/s]




0.15923306237678347 0.0019814545814939998
Baseline: 


100%|██████████| 375/375 [01:40<00:00,  3.74it/s]




0.07718116654133574 0.13574024204581298
Baseline: 


100%|██████████| 375/375 [03:19<00:00,  1.88it/s]



0.05967535002016753 0.2490015843813382



