# Manual Annotation
This notebook is for running the manual annotation. It supports the following activities:
- Identifying the best response in each batch of generated text.
- Annotating the best response, as identified by the model, from each batch.
- Annotating the best response, as identified manually in the above step from each batch.

The first step, manually identifying the best response, should be done just once by one person, while the annotation steps should be done individually by each annotator. Before getting started, ensure that the annotator name (`ANNOTATOR`) is set correctly.

Pre-requisites:
- The annotation input file, generated through the inferencing.ipynb notebook, must be available in the GDRIVE_BASE location.


## Initial Setup

In [2]:
!pip install krippendorff

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting krippendorff
  Downloading krippendorff-0.6.0-py3-none-any.whl (17 kB)
Installing collected packages: krippendorff
Successfully installed krippendorff-0.6.0


In [3]:
%load_ext autoreload
%autoreload 2

import sys
from google.colab import drive
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm
from  nltk.metrics import agreement
import krippendorff

# Project folder on Google Drive (a relative path); the input and output files
# used in this notebook are built from it.
GDRIVE_BASE = 'drive/MyDrive/MIDS/w266/project/'
# Name of the person running the annotation; it is embedded in the annotation
# file names further down. Keep exactly one of these lines uncommented.
# ANNOTATOR='nico'
# ANNOTATOR='ghiwa'
ANNOTATOR='ram'

# Mount Google Drive so that the files under GDRIVE_BASE are reachable.
drive.mount('/content/drive')
# Put the project folder first on the module search path.
sys.path.insert(0, GDRIVE_BASE)

Mounted at /content/drive


# Best in Batch

In [23]:
# Input of the best-in-batch step: the generated texts to choose from.
BIB_IN_FILE = f'{GDRIVE_BASE}manual_eval_generated_output.csv'
# Output of the best-in-batch step: the same rows plus the `human_top_score` flag.
BIB_OUT_FILE = f'{GDRIVE_BASE}manual_eval_generated_output_with_bib.csv'

def is_completed(batch):
  """Tell whether any row of `batch` already carries the human best-in-batch flag."""
  flagged = batch.human_top_score == True
  return bool(flagged.any())

def mark_best(ignore_completed=False, save_each_step=True):
  """Interactively pick the best generated text in every batch.

  Batches are presented in random order. For each batch the prompt and the
  generated texts are printed and the user enters the number of the best text,
  or "quit" to stop early. The chosen row gets `human_top_score = True`.

  Args:
    ignore_completed: if True, resume from BIB_OUT_FILE and skip batches that
      already have a selection; if False, start over from BIB_IN_FILE with all
      flags reset to False.
    save_each_step: if True, write BIB_OUT_FILE after every selection.

  Raises:
    Exception: if a batch does not contain the expected number of rows.

  Side effects:
    Writes the dataframe to BIB_OUT_FILE when the session ends (also on "quit").
  """
  # Load the annotation input
  if ignore_completed:
    # Load from previously saved work.
    df_results = pd.read_csv(BIB_OUT_FILE)
  else:
    # Load from the original annotation input.
    df_results = pd.read_csv(BIB_IN_FILE)
    # Add a new column to hold the best in batch flag.
    df_results['human_top_score'] = False
  num_batches = df_results.batch_id.max()
  batch_size = int(df_results.shape[0] / num_batches)
  msg = f'Enter 1 to {batch_size} or "quit"'

  # Random order of batches.
  batch_ids = np.arange(num_batches)
  np.random.shuffle(batch_ids)
  with tqdm(total=num_batches, unit='item', unit_scale=True) as pbar:
    for cur_batch_id in batch_ids:
      # Adjust for batch ids starting from 1.
      cur_batch_id = cur_batch_id + 1
      batch = df_results[df_results.batch_id == cur_batch_id]
      if len(batch) != batch_size:
        # We should never get here; something is wrong with the input.
        raise Exception(f'Batch {cur_batch_id} has {len(batch)} items but expecting {batch_size} items.')
      if is_completed(batch):
        print(f'Skipping batch {cur_batch_id} as it is already completed.')
        # Count skipped batches so the bar reflects overall progress when resuming.
        pbar.update(1)
        continue

      # str() guards against a missing (NaN) prompt, which cannot be sliced.
      cur_prompt = str(batch.prompt.iloc[0])
      pbar.set_postfix(batch_id=cur_batch_id, prompt=cur_prompt[0:10] + '...', refresh=True)
      # Display the batch; `indices` maps the displayed option number to the dataframe index.
      indices = {}
      print(f'PROMPT: {cur_prompt}')
      for option, (index, row) in enumerate(batch.iterrows(), start=1):
        print(f'{option}. {row["generated"]}')
        indices[option] = index

      # Keep asking until a valid option or "quit" is entered.
      user_quit = False
      while True:
        user_input = input(f'{msg}: ').strip()
        if user_input == 'quit':
          user_quit = True
          break
        try:
          user_opt = int(user_input)
        except ValueError:
          user_opt = -1
        if user_opt in indices:
          df_results.loc[indices[user_opt], 'human_top_score'] = True
          if save_each_step:
            df_results.to_csv(BIB_OUT_FILE, index=False)
          break
        print(msg)
      if user_quit:
        # The current batch was not marked, so it is not counted as done.
        break
      pbar.update(1)
    df_results.to_csv(BIB_OUT_FILE, index=False)

In [24]:
# Run this cell to start an interactive session to identify the best generated text in each batch.
# The session walks through the batches in random order and presents one batch at a time.
# A batch is defined as the N (currently N=5) texts generated from one model configuration for one prompt.
# For each batch shown, select the best text. The result is written to BIB_OUT_FILE.
# Set ignore_completed=True to continue from a previous session: the previously saved output is
# loaded and batches that already have a selection are skipped. With ignore_completed=False the
# session starts over from the original input file.
# save_each_step=True ensures that the dataframe is saved after each batch is marked.
mark_best(ignore_completed=True, save_each_step=True)

  0%|          | 0.00/120 [00:00<?, ?item/s]

Skipping batch 48 as it is already completed.
Skipping batch 13 as it is already completed.
Skipping batch 3 as it is already completed.
Skipping batch 90 as it is already completed.
Skipping batch 100 as it is already completed.
Skipping batch 2 as it is already completed.
Skipping batch 104 as it is already completed.
Skipping batch 32 as it is already completed.
Skipping batch 28 as it is already completed.
Skipping batch 69 as it is already completed.
Skipping batch 46 as it is already completed.
Skipping batch 102 as it is already completed.
Skipping batch 8 as it is already completed.
Skipping batch 88 as it is already completed.
Skipping batch 94 as it is already completed.
Skipping batch 114 as it is already completed.
PROMPT: "There's no going back now Mr. Brandon, what should we do?" Penny asked desperately.
1. " I don ' t know, " said Mrs. March.
2. " I don ' t know, " said Mrs. Watkinson.
3. " I don ' t know, " said Mrs. Moffat.
4. " No, I don ' t.
5. nan
Enter 1 to 5 or "q

In [22]:
# Show the final results with best of batch marked.
# Reload the saved output and report how many batches have a human-selected best text.
df_results = pd.read_csv(BIB_OUT_FILE)
print('BIB annotation completed for {} of {} entries.'.format(
    int((df_results.human_top_score == True).sum()),
    df_results.batch_id.max()))
df_results

BIB annotation completed for 119 of 120 entries.


Unnamed: 0,batch_id,name,model,tuned,dataset,prompt,score,generated,model_top_score,human_top_score
0,1,t5_s3,t5-v1_1-base-s3-finetuned,True,s3,"My stomach did a flip, then a flop, I couldn't...",0.000084,"I thought of the money, but then I saw another...",False,False
1,1,t5_s3,t5-v1_1-base-s3-finetuned,True,s3,"My stomach did a flip, then a flop, I couldn't...",0.000063,"""Come here, come here!"" cried John.",False,True
2,1,t5_s3,t5-v1_1-base-s3-finetuned,True,s3,"My stomach did a flip, then a flop, I couldn't...",0.000060,"I went to the hole and looked at it, and drew ...",False,False
3,1,t5_s3,t5-v1_1-base-s3-finetuned,True,s3,"My stomach did a flip, then a flop, I couldn't...",0.000096,I sprang back in the water and threw my hands ...,False,False
4,1,t5_s3,t5-v1_1-base-s3-finetuned,True,s3,"My stomach did a flip, then a flop, I couldn't...",0.000183,I sat up and looked.,True,False
...,...,...,...,...,...,...,...,...,...,...
595,120,baseline,facebook/opt-350m,False,,Do you think my mom will let me have pizza for...,0.017286,Do you think my mom will let me have pizza for...,False,False
596,120,baseline,facebook/opt-350m,False,,Do you think my mom will let me have pizza for...,0.030708,Do you think my mom will let me have pizza for...,True,False
597,120,baseline,facebook/opt-350m,False,,Do you think my mom will let me have pizza for...,0.017263,Do you think my mom will let me have pizza for...,False,False
598,120,baseline,facebook/opt-350m,False,,Do you think my mom will let me have pizza for...,0.026418,Do you think my mom will let me have pizza for...,False,True


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


# Bug Fixes

In [53]:
# Update annotated file with results from models that were excluded during annotation
# Pre-annotation file: the best-in-batch output (same path as BIB_OUT_FILE).
PRE_ANNOTATION_FILE = f'{GDRIVE_BASE}manual_eval_generated_output_with_bib.csv'
# Post-annotation file: the current annotator's saved annotations; bug_fix_1 overwrites it.
POST_ANNOTATION_FILE = f'{GDRIVE_BASE}{ANNOTATOR}_final_annotations.csv'
# Backup file name. The script will first create a backup of the annotated file before overwriting it.
POST_ANNOTATION_FILE_BACK = f'{GDRIVE_BASE}{ANNOTATOR}_final_annotations_test_backup.csv'
import shutil

def is_match(cur_gen_id, pre_row, post_row):
  """Check that a pre-annotation row and a post-annotation row describe the same item.

  The rows are compared on 'name', 'generated', 'prompt', 'model_top_score' and
  'human_top_score'. A value that is missing (NaN) on both sides counts as equal.
  When the rows differ, both are printed to help with debugging.

  Args:
    cur_gen_id: id of the row being compared (not used in the comparison).
    pre_row: row (any mapping indexable by column name) from the pre-annotation data.
    post_row: row (any mapping indexable by column name) from the post-annotation data.

  Returns:
    True if all compared columns agree, False otherwise.
  """
  def is_col_match(col_name):
    pre_val = pre_row[col_name]
    post_val = post_row[col_name]
    # NaN never compares equal to itself, so a value missing on both sides
    # has to be recognised explicitly.
    if pd.isna(pre_val) and pd.isna(post_val):
      return True
    return bool(pre_val == post_val)

  compared_cols = ('name', 'generated', 'prompt', 'model_top_score', 'human_top_score')
  matching = all(is_col_match(col_name) for col_name in compared_cols)
  if not matching:
    print('================Pre================')
    print(pre_row)
    print('================Post================')
    print(post_row)
  return matching

def bug_fix_1():
  """Rebuild the annotator's file so that it contains every pre-annotation row.

  Reads PRE_ANNOTATION_FILE, adds the annotation columns with default values
  ('gen_id' = row index, 'annotated' = False, every metric = -1), and then copies
  the existing annotations from POST_ANNOTATION_FILE into the rows with the same
  'gen_id'. POST_ANNOTATION_FILE is backed up to POST_ANNOTATION_FILE_BACK and
  then overwritten with the merged data.

  Note: relies on METRICS, which is defined in the "Manual Scoring Of Results"
  section further down, so that cell must have been run first.

  Returns:
    The merged dataframe.

  Raises:
    Exception: if an annotated row does not match the pre-annotation row with
      the same gen_id (see is_match).
  """
  # Read the pre-annotation file.
  df_pre = pd.read_csv(PRE_ANNOTATION_FILE)
  # Add the annotation related cols with defaults
  df_pre['gen_id'] = df_pre.index
  df_pre['annotated'] = False
  for metric in METRICS:
    df_pre[metric] = -1

  # Iterate through post annotation file and update pre with the data.
  df_post = pd.read_csv(POST_ANNOTATION_FILE)
  for _, post_row in df_post.iterrows():
    cur_gen_id = post_row.gen_id
    pre_row = df_pre.loc[cur_gen_id]
    if not is_match(cur_gen_id, pre_row, post_row):
      raise Exception(f'Rows at {cur_gen_id} are not matching.')
    df_pre.loc[cur_gen_id, 'annotated'] = post_row['annotated']
    for metric in METRICS:
      df_pre.loc[cur_gen_id, metric] = post_row[metric]

  # Backup old data before overwriting. The byte-for-byte copy is the backup;
  # it is deliberately not re-written through pandas, which would replace the
  # exact copy with a re-serialised version of the data.
  shutil.copyfile(POST_ANNOTATION_FILE, POST_ANNOTATION_FILE_BACK)
  df_pre.to_csv(POST_ANNOTATION_FILE, index=False)
  return df_pre


In [None]:
# Uncomment the line below to run the fix for the t5_s3, opt_s3 and b2b_s3 models
# that were missing during annotation.
# bug_fix_1()

# Manual Scoring Of Results

In [17]:
# Input of the manual scoring step: the best-in-batch output file.
MA_IN_FILE=f'{GDRIVE_BASE}manual_eval_generated_output_with_bib.csv'
# Output of the manual scoring step: one file per annotator.
MA_OUT_FILE=f'{GDRIVE_BASE}{ANNOTATOR}_final_annotations.csv'
# Metrics scored for every generated text; each one becomes a column in the output.
METRICS = [
    'relevance',
    'readability',
    'grammaticality',
    'non-redundancy',
    'kid-friendly']
# Score scale shown to the annotator as (label, numeric score) pairs.
SCALES = [
    ('outstanding', 7),
    ('very good', 6),
    ('good', 5),
    ('neutral', 4),
    ('bad', 3),
    ('very bad', 2),
    ('unusable', 1)]

def load_data(in_file, out_file, ignore_completed):
  """Load the rows that need manual scoring.

  Args:
    in_file: CSV with the best-in-batch output; read when ignore_completed is False.
    out_file: CSV with previously saved annotations; read when ignore_completed is True.
    ignore_completed: True to resume from `out_file`, False to start from `in_file`.
      When starting fresh, the columns 'gen_id' (original row index), 'annotated'
      (False) and one column per metric (-1) are added.

  Returns:
    A new dataframe (an independent copy, safe to modify) holding only rows that
    come from a model trained on the 's3' dataset or from the 'baseline' model,
    and that are flagged as top entry by the model or by the human.
  """
  if ignore_completed:
    df_data = pd.read_csv(out_file)
  else:
    df_data = pd.read_csv(in_file)
    # Add the extra cols for annotation.
    df_data['gen_id'] = df_data.index # Explicit id columns as we want to retain
                                      # the original indices even after filtering.
    df_data['annotated'] = False
    for metric in METRICS:
      df_data[metric] = -1

  # Columns: 'batch_id', 'name', 'model', 'tuned', 'dataset', 'prompt', 'score', 'generated', 'model_top_score', 'human_top_score'
  # Pick only entries from models trained on S3 dataset (or the baseline) and also
  # only the top entries - one of model_top_score, human_top_score is True.
  from_wanted_model = (df_data.dataset == 's3') | (df_data.name == 'baseline')
  is_top_entry = (df_data.model_top_score) | (df_data.human_top_score)
  # .copy() detaches the result from df_data, so callers can assign into it
  # without triggering pandas' SettingWithCopy warning.
  return df_data[from_wanted_model & is_top_entry].copy()

def annotate_one(cur_index, cur_row):
  """Ask the user to score one generated text on every metric.

  The user is prompted once per entry in METRICS and must enter one of the
  numeric scores listed in SCALES, or "quit" to stop.

  Args:
    cur_index: index of the row being scored (not used here).
    cur_row: the row being scored (not used here).

  Returns:
    A tuple (quit, scores). `quit` is True if the user entered "quit". `scores`
    maps every metric to its score; metrics not scored yet keep the value -1.
  """
  print('='*50)
  scale_text = ', '.join(f'{scale}({score})' for scale, score in SCALES)
  msg = f'Enter one of {scale_text}'
  # Accept exactly the scores on the displayed scale rather than a hard-coded range.
  valid_scores = {score for _, score in SCALES}
  print(msg)
  # Initialize the map with -1 scores.
  scores = {metric: -1 for metric in METRICS}
  for metric in METRICS:
    # Get score for current metric.
    print(f'\tScore generated text for {metric}: ')
    while scores[metric] == -1:
      user_input = input('Your score (or "quit"): ').strip()
      if user_input == 'quit':
        return True, scores
      try:
        user_opt = int(user_input)
      except ValueError:
        # Not a number: handled like any other invalid entry below.
        user_opt = None
      if user_opt in valid_scores:
        scores[metric] = user_opt
      else:
        print(f'Invalid entry. Please {msg}')
  return False, scores

def annotate(annotator, in_file, out_file, ignore_completed=False, save_each_step=True):
  """Run an interactive session to score the selected generated texts.

  The rows returned by load_data are visited in random order. For each row the
  prompt and the generated text are shown and annotate_one collects the scores.
  Entering "quit" ends the session; the scores of the row in progress are dropped.

  Args:
    annotator: name of the annotator (not used here; the output location is
      given by `out_file`).
    in_file: CSV to start from when ignore_completed is False.
    out_file: CSV the annotations are written to, and read from when
      ignore_completed is True.
    ignore_completed: if True, resume from `out_file` and skip annotated rows.
    save_each_step: if True, write `out_file` after every scored row.

  Side effects:
    Writes the annotated rows to `out_file` when the session ends (also on "quit").
  """
  # Work on an independent copy so the assignments below never hit a slice
  # of another dataframe.
  df_data = load_data(in_file, out_file, ignore_completed).copy()
  num_entries = df_data.shape[0]
  print(f'Total annotation batch size: {num_entries}')

  # Random order of items.
  indices = df_data.index.values.copy()
  np.random.shuffle(indices)
  print(f'Score scale: ')
  with tqdm(total=indices.shape[0], unit='item', unit_scale=True) as pbar:
    for cur_index in indices:
      cur_row = df_data.loc[cur_index]
      if ignore_completed and cur_row['annotated']:
        print(f'Skipping row {cur_index} as it is already annotated.')
        # Count skipped rows so the bar reflects overall progress when resuming.
        pbar.update(1)
        continue

      # str() guards against missing (NaN) values, which have no .replace().
      cur_prompt = str(cur_row['prompt'])
      # Drop the prompt when the model echoed it at the start of its output.
      cur_generated = str(cur_row['generated']).replace(cur_prompt + ' ', '')
      # Display the prompt and the generated text.
      pbar.set_postfix(prompt=cur_prompt[0:10] + '...', generated=cur_generated[0:10] + '...', refresh=True)
      print(f'PROMPT: {cur_prompt}')
      print(f'GENERATED: {cur_generated}')
      user_quit, scores = annotate_one(cur_index, cur_row)
      if user_quit:
        # The current row was not fully scored, so it is neither saved nor counted.
        break
      pbar.update(1)
      df_data.loc[cur_index, 'annotated'] = True
      for metric in METRICS:
        df_data.loc[cur_index, metric] = scores[metric]
      if save_each_step:
        df_data.to_csv(out_file, index=False)
    df_data.to_csv(out_file, index=False)



In [86]:
# Ensure that the following variables are set correctly:
#   ANNOTATOR - name of the annotator
#   MA_IN_FILE - must be the file created through the "Best In Batch" section above.
#   MA_OUT_FILE - output file that will hold annotations. (Please keep the annotator name in the filename).
# Run this cell to start an interactive session to annotate each generated text.
# This will walk through the generated texts in random order and present one text at a time.
# It will prompt you to evaluate on one metric at a time. Each metric is evaluated on a scale of 1 to 7 with 1 being "Unusable" and 7 being "Outstanding".
# You can quit any time by entering "quit" at a prompt. Partial scores for the current text will be ignored if you enter quit.
# If you want to restart from where you left off then set ignore_completed to True.
# Ensure that save_each_step=True so that no data is lost.
# Finally the process saves annotations in a file specific to ANNOTATOR.

annotate(ANNOTATOR, MA_IN_FILE, MA_OUT_FILE, ignore_completed=True, save_each_step=True)

Total annotation batch size: 4
Score scale: 


  0%|          | 0.00/4.00 [00:00<?, ?item/s]

PROMPT: Lara felt very sad and scared.
GENERATED: Continue the next sentence of the story making the language appropriate for kids between 6 and 12 years old: And that's it.  She started crying. She wanted to leave. But she had to stay.
Enter one of outstanding(7), very good(6), good(5), neutral(4), bad(3), very bad(2), unusable(1)
	Score generated text for relevance: 
Your score (or "quit"): 7
	Score generated text for readability: 
Your score (or "quit"): 7
	Score generated text for grammaticality: 
Your score (or "quit"): 7
	Score generated text for non-redundancy: 
Your score (or "quit"): 7
	Score generated text for kid-friendly: 
Your score (or "quit"): 7
PROMPT: All the dragons of the world lived on one mountain called the dragon mountain.
GENERATED: Continue the next sentence of the story making the language appropriate for kids between 6 and 12 years old: It was a mountain on which all dragons lived. The dragons that lived
Enter one of outstanding(7), very good(6), good(5), neu

In [87]:
# Reload the saved annotations and report how many rows have been scored.
df_annotated = pd.read_csv(MA_OUT_FILE)
print('Manual scoring completed for {} of {} entries.'.format(
    int((df_annotated.annotated == True).sum()),
    len(df_annotated)))

df_annotated

Manual scoring completed for 4 of 4 entries.


Unnamed: 0,batch_id,name,model,tuned,dataset,prompt,score,generated,model_top_score,human_top_score,gen_id,annotated,relevance,readability,grammaticality,non-redundancy,kid-friendly
0,9,baseline,facebook/opt-350m,False,,Lara felt very sad and scared.,0.00148,Continue the next sentence of the story making...,False,True,40,True,5,5,5,5,5
1,9,baseline,facebook/opt-350m,False,,Lara felt very sad and scared.,0.011939,Continue the next sentence of the story making...,True,False,42,True,7,7,7,7,7
2,18,baseline,facebook/opt-350m,False,,All the dragons of the world lived on one moun...,0.006471,Continue the next sentence of the story making...,False,True,85,True,1,2,1,2,1
3,18,baseline,facebook/opt-350m,False,,All the dragons of the world lived on one moun...,0.006506,Continue the next sentence of the story making...,True,False,88,True,6,6,6,6,6


# Score Models

In [4]:
# Annotators whose annotation files are loaded for scoring.
ANNOTATORS = ['ghiwa', 'nico', 'ram']
# NOTE(review): this pattern ends in `_final_annotations_test.csv`, whereas the manual scoring
# section writes `{ANNOTATOR}_final_annotations.csv` - confirm which files should be scored.
ANNOTATION_FILE_FORMAT = '{}{}_final_annotations_test.csv' # Needs GDRIVE_BASE and annotator.
# File the score summary is written to by score_all.
FINAL_SCORE_SUMMARY_FILE=f'{GDRIVE_BASE}model_annotator_metric_mean.csv'
# Model names, as they appear in the `name` column of the annotation files.
MODELS = ['t5_s3', 'opt_s3', 'b2b_s3', 'baseline']
# Same metric list as in the "Manual Scoring Of Results" section; repeated here so
# this section can be run without running that one.
METRICS = [
    'relevance',
    'readability',
    'grammaticality',
    'non-redundancy',
    'kid-friendly']
# Same score scale as in the "Manual Scoring Of Results" section, as (label, numeric score) pairs.
SCALES = [
    ('outstanding', 7),
    ('very good', 6),
    ('good', 5),
    ('neutral', 4),
    ('bad', 3),
    ('very bad', 2),
    ('unusable', 1)]

def scores_for_krippendorff(annotator, annotations, human_top_score, metric, metrics=None):
  """Collect one annotator's scores as a flat list for Krippendorff's alpha.

  Args:
    annotator: name of the annotator (not used here).
    annotations: dataframe with this annotator's annotations.
    human_top_score: if True, use the rows flagged `human_top_score`; otherwise
      use the rows flagged `model_top_score`.
    metric: name of the metric column to return, or None to return the scores
      of all metrics concatenated metric by metric.
    metrics: metric columns to concatenate when `metric` is None; defaults to
      the notebook-level METRICS list.

  Returns:
    A list with the selected scores.
  """
  flag_col = 'human_top_score' if human_top_score else 'model_top_score'
  df_filtered = annotations[annotations[flag_col] == True]
  if metric is not None:
    return list(df_filtered[metric].values)

  if metrics is None:
    metrics = METRICS
  scores = []
  for cur_metric in metrics:
    scores.extend(df_filtered[cur_metric].values)
  # Return only after every metric has been added (not after the first one).
  return scores

def validate(annotations):
  """Check that the annotation sets are complete and of equal size.

  Args:
    annotations: dict mapping annotator name to that annotator's dataframe.

  Raises:
    Exception: if no annotations are given, if the dataframes do not all have
      the same number of rows, or if any row is not marked as annotated.
  """
  if not annotations:
    raise Exception('No annotations were provided.')
  num_items = next(iter(annotations.values())).shape[0]
  # Ensure all entries have been annotated.
  for annotator, df_annotations in annotations.items():
    if df_annotations.shape[0] != num_items:
      raise Exception('Number of entries do not match.')
    if (df_annotations['annotated'] == False).any():
      raise Exception(f'Not all entries are annotated by {annotator}.')

def load_annotations(annotators):
  """Read every annotator's annotation file and validate the loaded set.

  Returns a dict mapping annotator name to the dataframe read from the file
  named by ANNOTATION_FILE_FORMAT.
  """
  annotations = {
      name: pd.read_csv(ANNOTATION_FILE_FORMAT.format(GDRIVE_BASE, name))
      for name in annotators}
  validate(annotations)
  return annotations

def krippendorff_scores(annotations):
  """Compute Krippendorff's alpha (ordinal) across the annotators.

  Alpha is computed per metric and over all metrics together, once for the
  model-selected items and once for the human-selected items.

  Args:
    annotations: dict mapping annotator name to that annotator's dataframe;
      must have an entry for every name in ANNOTATORS.

  Returns:
    A list of rows
    [model, model_top_score, human_top_score, annotator, metric, score],
    where model and annotator are always 'all' and metric is 'alpha_<metric>'
    or 'alpha_all'.
  """
  def alpha_row(human_top_score, metric, label):
    # One list of scores per annotator forms the reliability matrix. It is kept
    # in its own variable so it can never be mixed up with the result rows.
    reliability_data = [
        scores_for_krippendorff(annotator, annotations[annotator], human_top_score=human_top_score, metric=metric)
        for annotator in ANNOTATORS]
    k_score = krippendorff.alpha(reliability_data=reliability_data, level_of_measurement='ordinal')
    return ['all', not human_top_score, human_top_score, 'all', label, k_score]

  rows = []
  # Agreement at each metric level: model-selected items first, then human-selected.
  for metric in METRICS:
    rows.append(alpha_row(False, metric, f'alpha_{metric}'))
    rows.append(alpha_row(True, metric, f'alpha_{metric}'))

  # Agreement over all metrics together.
  rows.append(alpha_row(False, None, 'alpha_all'))
  rows.append(alpha_row(True, None, 'alpha_all'))
  return rows

def model_scores(annotations, human_top_score):
  """Compute the mean score per annotator and metric.

  Args:
    annotations: dict mapping annotator name to that annotator's dataframe.
    human_top_score: value the `human_top_score` column is filtered on.

  Returns:
    A list of rows
    [model, model_top_score, human_top_score, annotator, metric, mean].
  """
  # Logical negation; `!x` would be an IPython shell escape, not a boolean operator.
  model_top_score = not human_top_score
  mean_scores = []
  for model in MODELS:
    # NOTE(review): only the 'baseline' model is scored although MODELS lists
    # four models - confirm whether this restriction is intended.
    if model != 'baseline':
      continue
    for annotator in ANNOTATORS:
      df = annotations[annotator]
      df_filtered = df[(df.human_top_score == human_top_score) & (df.name == model)]
      for metric in METRICS:
        mean = df_filtered[metric].mean()
        mean_scores.append([model, model_top_score, human_top_score, annotator, metric, mean])
  return mean_scores

def score_all():
  """Load all annotations, compute the agreement scores and save them as CSV.

  The result is written to FINAL_SCORE_SUMMARY_FILE; nothing is returned.
  """
  annotations = load_annotations(ANNOTATORS)
  score_rows = krippendorff_scores(annotations)

  # Per-model mean scores are currently left out of the summary:
  # score_rows.extend(model_scores(annotations, human_top_score=True))
  # score_rows.extend(model_scores(annotations, human_top_score=False))
  column_names = ['model', 'model_top_score', 'human_top_score', 'annotator', 'metric', 'score']
  df_scores = pd.DataFrame(score_rows, columns=column_names)

  # Save the model-annotator-metric-mean data.
  df_scores.to_csv(FINAL_SCORE_SUMMARY_FILE, index=None)


In [5]:
# Compute the scores for all annotators and write them to FINAL_SCORE_SUMMARY_FILE.
score_all()