# Manual Annotation
This notebook is for running the manual annotation. It supports three activities:
- Identifying the best response in each batch of generated text.
- Annotating the best response, as identified by the model, from each batch.
- Annotating the best response, as identified manually in the above step, from each batch.

The first step, manually identifying the best response, should be done just once by one person, while the annotation steps should be done individually by each annotator. Before getting started, ensure that the annotator name (`ANNOTATOR`) is set correctly.

Pre-requisites:
- The annotation input file, generated through the inferencing.ipynb notebook, must be available in the GDRIVE_BASE location.


## Initial Setup

In [2]:
%load_ext autoreload
%autoreload 2

import sys
from google.colab import drive
import pandas as pd
import numpy as np
import torch
from tqdm.notebook import tqdm
from  nltk.metrics import agreement

# Project folder on Google Drive. It holds the shared `common` module and is
# used as the prefix for the annotation file names built later in the notebook.
# NOTE(review): this is a relative path while the drive is mounted at
# '/content/drive' -- it presumably relies on the working directory being
# '/content'; confirm.
GDRIVE_BASE = 'drive/MyDrive/MIDS/w266/project/'
# Name of the person annotating; it becomes part of the output file name.
# Uncomment exactly one of the lines below.
# ANNOTATOR='nico'
# ANNOTATOR='ghiwa'
ANNOTATOR='ram'

# Mount Google Drive and put the project folder first on the import path so
# that `import common` below resolves to the module stored there.
drive.mount('/content/drive')
sys.path.insert(0, GDRIVE_BASE)

import common

# Print the version of the shared module that was picked up.
print(f'common.__version__: {common.__version__}')
# tuning_configs = common.create_configs(GDRIVE_BASE)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
common.__version__: 1.4


# Best in Batch

In [None]:
def is_completed(batch):
  """Tell whether a batch already has a human-selected best response.

  Args:
    batch: DataFrame holding the rows of one batch; it must have a
      `human_top_score` column.

  Returns:
    True when at least one row has `human_top_score` equal to True,
    otherwise False.
  """
  chosen_rows = batch.loc[batch.human_top_score == True]
  return chosen_rows.shape[0] > 0

def mark_best(ignore_completed=False, save_each_step=True):
  """Interactively mark the best generated text of every batch.

  Batches are visited in random order. For each batch that has no row flagged
  in `human_top_score` yet, the prompt and the numbered generated texts are
  printed and the user is asked for the number of the best one, or "quit".
  The chosen row gets `human_top_score = True`.

  Args:
    ignore_completed: When True, resume from the file at
      `common.annotation_with_best_loc(GDRIVE_BASE)` so batches marked in an
      earlier session are skipped. When False, start from the file at
      `common.annotation_input_loc(GDRIVE_BASE)` with every flag reset to
      False; any previously saved progress file is overwritten on save.
    save_each_step: When True, write the results after every selection.

  Raises:
    Exception: If a batch does not contain the expected number of rows.

  The results are always written once more when the loop ends (also after
  "quit").
  """
  out_loc = common.annotation_with_best_loc(GDRIVE_BASE)

  # Load the annotation input.
  if ignore_completed:
    # Load from previously saved work.
    df_results = pd.read_csv(out_loc)
  else:
    # Load from the original annotation input.
    df_results = pd.read_csv(common.annotation_input_loc(GDRIVE_BASE))
    # Add a new column to hold the best in batch flag.
    df_results['human_top_score'] = False
  # Batch ids are expected to run from 1 to num_batches, all of equal size.
  num_batches = df_results.batch_id.max()
  batch_size = int(df_results.shape[0] / num_batches)

  # Random order of batches.
  batch_ids = np.arange(num_batches)
  np.random.shuffle(batch_ids)
  msg = f'Enter 1 to {batch_size} or "quit"'
  with tqdm(total=num_batches, unit='item', unit_scale=True) as pbar:
    for cur_batch_id in batch_ids:
      # Adjust for batch ids starting from 1.
      cur_batch_id = cur_batch_id + 1
      batch = df_results[df_results.batch_id == cur_batch_id]
      if len(batch) != batch_size:
        # We should never get here; something is wrong with the input.
        raise Exception(f'Batch {cur_batch_id} has {len(batch)} items but expecting {batch_size} items.')
      if is_completed(batch):
        print(f'Skipping batch {cur_batch_id} as it is already completed.')
        # Count skipped batches too, so a resumed session shows real progress.
        pbar.update(1)
        continue

      pbar.set_postfix(batch_id=cur_batch_id, prompt=batch.prompt.iloc[0][0:10] + '...', refresh=True)
      # Display the batch and map the displayed number to the dataframe index.
      print(f'PROMPT: {batch.prompt.iloc[0]}')
      choices = {}
      for position, (index, row) in enumerate(batch.iterrows(), start=1):
        print(f'{position}. {row["generated"]}')
        choices[position] = index

      # Keep asking until a valid number or "quit" is entered.
      user_quit = False
      while True:
        user_input = input(f'{msg}: ').strip()
        if user_input.lower() == 'quit':
          user_quit = True
          break
        # Only the parse is guarded, so errors raised while updating or saving
        # the dataframe are not mistaken for bad input.
        try:
          user_opt = int(user_input)
        except ValueError:
          user_opt = None
        if user_opt in choices:
          df_results.loc[choices[user_opt], 'human_top_score'] = True
          if save_each_step:
            df_results.to_csv(out_loc, index=False)
          break
        print(msg)

      if user_quit:
        # The current batch was not marked, so it is not counted as done.
        break
      pbar.update(1)
    df_results.to_csv(out_loc, index=False)

In [None]:
# Run this cell to start an interactive session to identify the best generated text in each batch.
# It walks through the batches in random order and presents one batch at a time.
# A batch is defined as the N (currently N=5) texts generated from one model configuration for one prompt.
# For each batch shown, select the best text. The method writes the final result to a new file.
# Set ignore_completed to True if continuing from a previous session; with False the
# flags start from scratch and the previously saved result file is overwritten.
# save_each_step=True ensures that the dataframe is saved after each batch is marked.
mark_best(ignore_completed=False, save_each_step=True)

  0%|          | 0.00/18.0 [00:00<?, ?item/s]

Skipping batch 12 as it is already completed.
Skipping batch 8 as it is already completed.
Skipping batch 5 as it is already completed.
Skipping batch 4 as it is already completed.
Skipping batch 9 as it is already completed.
Skipping batch 16 as it is already completed.
Skipping batch 11 as it is already completed.
Skipping batch 7 as it is already completed.
Skipping batch 3 as it is already completed.
Skipping batch 18 as it is already completed.
Skipping batch 6 as it is already completed.
Skipping batch 13 as it is already completed.
Skipping batch 15 as it is already completed.
Skipping batch 14 as it is already completed.
Skipping batch 2 as it is already completed.
Skipping batch 17 as it is already completed.
Skipping batch 1 as it is already completed.
Skipping batch 10 as it is already completed.


In [None]:
# Show the final results with best of batch marked.
# Reads back the file written by mark_best(); the bare expression on the last
# line makes the notebook render the dataframe.
df_results = pd.read_csv(common.annotation_with_best_loc(GDRIVE_BASE))
df_results

# Manual Scoring Of Results

In [6]:
# Input for manual scoring.
# NOTE(review): presumably the same file that the "Best in Batch" section
# writes via common.annotation_with_best_loc(GDRIVE_BASE) -- confirm the names
# match.
MA_IN_FILE=f'{GDRIVE_BASE}annotation_input_with_bib.csv'
# Per-annotator output file; the annotator name is part of the file name.
MA_OUT_FILE=f'{GDRIVE_BASE}{ANNOTATOR}_final_annotations.csv'
# Dimensions on which each generated text is scored; one dataframe column is
# created per metric.
METRICS = [
    'relevance',
    'readability',
    'grammaticality',
    'non-redundancy',
    'kid-friendly']
# (label, numeric score) pairs shown to the annotator, best to worst.
SCALES = [
    ('outstanding', 7),
    ('very good', 6),
    ('good', 5),
    ('neutral', 4),
    ('bad', 3),
    ('very bad', 2),
    ('unusable', 1)]

def load_data(in_file, out_file, ignore_completed):
  """Load the rows to annotate, either fresh or from a previous session.

  Args:
    in_file: CSV with the generated texts and the `model_top_score` /
      `human_top_score` flags. Read only when `ignore_completed` is False.
    out_file: CSV written by an earlier annotation session. Read only when
      `ignore_completed` is True.
    ignore_completed: True to resume from `out_file`; False to start from
      `in_file` with `annotated` set to False and every metric set to -1.

  Returns:
    An independent DataFrame (a copy, safe to assign into) restricted to rows
    whose `name` is 'baseline' and that are flagged as top entry by the model
    or by a human.
  """
  if ignore_completed:
    df_data = pd.read_csv(out_file)
    # The output file stores the original row ids in a 'gen_id' column.
    # Restore them as the index; otherwise the ids are replaced by 0..n-1 and
    # the next save writes a second 'gen_id' column.
    if 'gen_id' in df_data.columns:
      df_data = df_data.set_index('gen_id')
  else:
    df_data = pd.read_csv(in_file)
    # Add the extra cols for annotation.
    df_data['annotated'] = False
    for metric in METRICS:
      df_data[metric] = -1

  # Columns: 'batch_id', 'name', 'model', 'tuned', 'dataset', 'prompt', 'score', 'generated', 'model_top_score', 'human_top_score'
  # Keep only baseline entries that are a top entry, i.e. at least one of
  # model_top_score / human_top_score is True.
  # Alternative filter that also keeps models trained on the S3 dataset:
  # df_filtered = df_data[((df_data.dataset == 's3') | (df_data.name == 'baseline')) & ((df_data.model_top_score) | (df_data.human_top_score))]
  is_baseline = df_data['name'] == 'baseline'
  is_top_entry = df_data['model_top_score'] | df_data['human_top_score']
  # Copy so that later .loc assignments act on an independent frame rather
  # than on a slice of df_data (avoids SettingWithCopyWarning).
  df_filtered = df_data[is_baseline & is_top_entry].copy()
  return df_filtered

def annotate_one(cur_index, cur_row):
  """Ask the user to score one generated text on every metric.

  For each entry of METRICS the user is prompted until a score listed in
  SCALES is entered. Entering "quit" stops immediately.

  Args:
    cur_index: Index of the row being annotated (not used here; kept for the
      caller's interface).
    cur_row: The row being annotated (not used here; kept for the caller's
      interface).

  Returns:
    A `(quit, scores)` tuple. `quit` is True when the user entered "quit".
    `scores` maps each metric name to the entered score; metrics not scored
    before quitting keep the placeholder -1.
  """
  print('='*50)
  msg = f"Enter one of {', '.join([f'{scale}({score})' for scale, score in SCALES])}"
  print(msg)
  # Accept exactly the scores declared in SCALES instead of a hard-coded range.
  valid_scores = {score for _, score in SCALES}
  # Initialize the map with -1 scores.
  scores = {metric: -1 for metric in METRICS}
  for metric in METRICS:
    # Get score for current metric.
    print(f'\tScore generated text for {metric}: ')
    while True:
      user_input = input('Your score (or "quit"): ').strip()
      if user_input.lower() == 'quit':
        return True, scores
      try:
        user_opt = int(user_input)
      except ValueError:
        # Non-numeric input is reported the same way as an out-of-range one.
        user_opt = None
      if user_opt in valid_scores:
        scores[metric] = user_opt
        break
      print(f'Invalid entry. Please {msg}')
  return False, scores

def annotate(annotator, in_file, out_file, ignore_completed=False, save_each_step=True):
  """Run an interactive annotation session over the selected generated texts.

  Rows returned by `load_data` are visited in random order. For each row the
  prompt and the generated text (with the leading prompt removed) are shown
  and `annotate_one` collects one score per metric. Scores are stored only
  when all metrics of a row were answered; if the user quits part-way, the
  partial scores of that row are discarded.

  Args:
    annotator: Name of the annotator (currently not used inside the function;
      the caller encodes it in `out_file`).
    in_file: CSV to start from when `ignore_completed` is False.
    out_file: CSV the annotations are written to, with the row index saved in
      a `gen_id` column. Also the file to resume from when `ignore_completed`
      is True.
    ignore_completed: True to resume and skip rows already marked `annotated`.
    save_each_step: True to write `out_file` after every annotated row.

  Returns:
    The DataFrame with the `annotated` flag and metric columns filled in. It
    is also written to `out_file` when the session ends.
  """
  df_data = load_data(in_file, out_file, ignore_completed)
  num_entries = df_data.shape[0]
  print(f'Total annotation batch size: {num_entries}')

  # Random order of items.
  indices = df_data.index.values.copy()
  np.random.shuffle(indices)
  # Show the scale once up front; previously only the bare heading was printed.
  print('Score scale: ' + ', '.join(f'{scale}({score})' for scale, score in SCALES))
  with tqdm(total=indices.shape[0], unit='item', unit_scale=True) as pbar:
    for cur_index in indices:
      cur_row = df_data.loc[cur_index]
      cur_prompt = cur_row['prompt']
      # Strip the prompt from the generated text so only the continuation shows.
      cur_generated = cur_row['generated'].replace(cur_prompt + ' ', '')
      if ignore_completed and cur_row['annotated']:
        print(f'Skipping row {cur_index} as it is already annotated.')
        # Count skipped rows too, so a resumed session shows real progress.
        pbar.update(1)
        continue

      # Display the prompt and the generated text.
      pbar.set_postfix(prompt=cur_prompt[0:10] + '...', generated=cur_generated[0:10] + '...', refresh=True)
      print(f'PROMPT: {cur_prompt}')
      print(f'GENERATED: {cur_generated}')
      user_quit, scores = annotate_one(cur_index, cur_row)
      if user_quit:
        # Partial scores of the current row are dropped and it is not counted.
        break
      df_data.loc[cur_index, 'annotated'] = True
      for metric in METRICS:
        df_data.loc[cur_index, metric] = scores[metric]
      if save_each_step:
        df_data.to_csv(out_file, index=True, index_label='gen_id')
      pbar.update(1)
    df_data.to_csv(out_file, index=True, index_label='gen_id')
  return df_data




In [7]:
# Ensure that the following variables are set correctly:
#   ANNOTATOR - name of the annotator
#   MA_IN_FILE - must be the file created through the "Best In Batch" section above.
#   MA_OUT_FILE - output file that will hold annotations. (Please keep the annotator name in the filename).
# Run this cell to start an interactive session to annotate each generated text.
# This will walk through the generated texts in random order and present one text at a time.
# It will prompt you to evaluate on one metric at a time. Each metric is evaluated on a scale of 1 to 7 with 1 being "Unusable" and 7 being "Outstanding".
# You can quit any time by entering "quit" at a prompt. Partial scores for the current text will be ignored if you enter quit.
# If you want to restart from where you left off then set ignore_completed to True.
# Ensure that save_each_step=True so that no data is lost.
# Finally the process saves annotations in a file specific to ANNOTATOR.

annotate(ANNOTATOR, MA_IN_FILE, MA_OUT_FILE, ignore_completed=False, save_each_step=True)

Total annotation batch size: 4
Score scale: 


  0%|          | 0.00/4.00 [00:00<?, ?item/s]

PROMPT: Lara felt very sad and scared.
GENERATED: Continue the next sentence of the story making the language appropriate for kids between 6 and 12 years old:  "She was crying for no reason. She was not crying because she was scared, she wasn't
Enter one of outstanding(7), very good(6), good(5), neutral(4), bad(3), very bad(2), unusable(1)
	Score generated text for relevance: 
Your score (or "quit"): 7
	Score generated text for readability: 
Your score (or "quit"): 7
	Score generated text for grammaticality: 
Your score (or "quit"): 7
	Score generated text for non-redundancy: 
Your score (or "quit"): 7
	Score generated text for kid-friendly: 
Your score (or "quit"): 7
PROMPT: All the dragons of the world lived on one mountain called the dragon mountain.
GENERATED: Continue the next sentence of the story making the language appropriate for kids between 6 and 12 years old: The dragon is a dragon that lived in the mountains of North America
Enter one of outstanding(7), very good(6), good(

Unnamed: 0,batch_id,name,model,tuned,dataset,prompt,score,generated,model_top_score,human_top_score,annotated,relevance,readability,grammaticality,non-redundancy,kid-friendly
40,9,baseline,facebook/opt-350m,False,,Lara felt very sad and scared.,0.00148,Continue the next sentence of the story making...,False,True,True,7,7,7,7,7
42,9,baseline,facebook/opt-350m,False,,Lara felt very sad and scared.,0.011939,Continue the next sentence of the story making...,True,False,True,5,5,5,5,5
85,18,baseline,facebook/opt-350m,False,,All the dragons of the world lived on one moun...,0.006471,Continue the next sentence of the story making...,False,True,True,6,6,6,6,6
88,18,baseline,facebook/opt-350m,False,,All the dragons of the world lived on one moun...,0.006506,Continue the next sentence of the story making...,True,False,True,4,4,4,4,4


In [8]:
# Read back this annotator's saved annotations and display them for a visual check.
df_annotated = pd.read_csv(MA_OUT_FILE)
df_annotated

Unnamed: 0,gen_id,batch_id,name,model,tuned,dataset,prompt,score,generated,model_top_score,human_top_score,annotated,relevance,readability,grammaticality,non-redundancy,kid-friendly
0,40,9,baseline,facebook/opt-350m,False,,Lara felt very sad and scared.,0.00148,Continue the next sentence of the story making...,False,True,True,7,7,7,7,7
1,42,9,baseline,facebook/opt-350m,False,,Lara felt very sad and scared.,0.011939,Continue the next sentence of the story making...,True,False,True,5,5,5,5,5
2,85,18,baseline,facebook/opt-350m,False,,All the dragons of the world lived on one moun...,0.006471,Continue the next sentence of the story making...,False,True,True,6,6,6,6,6
3,88,18,baseline,facebook/opt-350m,False,,All the dragons of the world lived on one moun...,0.006506,Continue the next sentence of the story making...,True,False,True,4,4,4,4,4


In [5]:
# NOTE(review): same statements as the previous cell, and its execution count
# (5) is lower than that cell's (8), so the output shown below it comes from an
# earlier run -- consider removing this cell.
df_annotated = pd.read_csv(MA_OUT_FILE)
df_annotated

Unnamed: 0.1,Unnamed: 0,batch_id,name,model,tuned,dataset,prompt,score,generated,model_top_score,human_top_score,annotated,relevance,readability,grammaticality,non-redundancy,kid-friendly
0,40,9,baseline,facebook/opt-350m,False,,Lara felt very sad and scared.,0.00148,Continue the next sentence of the story making...,False,True,False,-1,-1,-1,-1,-1
1,42,9,baseline,facebook/opt-350m,False,,Lara felt very sad and scared.,0.011939,Continue the next sentence of the story making...,True,False,True,7,6,6,6,7
2,85,18,baseline,facebook/opt-350m,False,,All the dragons of the world lived on one moun...,0.006471,Continue the next sentence of the story making...,False,True,False,-1,-1,-1,-1,-1
3,88,18,baseline,facebook/opt-350m,False,,All the dragons of the world lived on one moun...,0.006506,Continue the next sentence of the story making...,True,False,True,5,3,6,1,7


# Score Models

In [None]:
# Annotators whose files are combined when scoring the models.
ANNOTATORS = ['ghiwa', 'nico', 'ram']
# Template for an annotator's file path; same naming as MA_OUT_FILE.
ANNOTATION_FILE_FORMAT = '{}{}_final_annotations.csv' # Needs GDRIVE_BASE and annotator.
# Same values as the METRICS / SCALES defined in the "Manual Scoring Of
# Results" cell; presumably repeated so this section can run on its own --
# keep the two definitions in sync.
METRICS = [
    'relevance',
    'readability',
    'grammaticality',
    'non-redundancy',
    'kid-friendly']
SCALES = [
    ('outstanding', 7),
    ('very good', 6),
    ('good', 5),
    ('neutral', 4),
    ('bad', 3),
    ('very bad', 2),
    ('unusable', 1)]

def validate(annotations):
  """Check that every annotator annotated the same, complete set of entries.

  Args:
    annotations: Dict mapping annotator name to that annotator's DataFrame.
      Each frame must have a boolean `annotated` column.

  Raises:
    Exception: If the frames do not all have the same number of rows, or if
      any frame still contains a row with `annotated == False`.

  An empty dict is accepted: there is nothing to compare.
  """
  if not annotations:
    return
  # Use the first annotator's row count as the reference. A dict view cannot
  # be indexed with [0], so take the first value through an iterator.
  num_items = next(iter(annotations.values())).shape[0]
  # Ensure all entries have been annotated.
  for annotator, df_annotations in annotations.items():
    if df_annotations.shape[0] != num_items:
      raise Exception(
          f'Number of entries do not match: {annotator} has '
          f'{df_annotations.shape[0]} but {num_items} were expected.')
    if (df_annotations['annotated'] == False).any():
      raise Exception(f'Not all entries are annotated by {annotator}.')

def load_annotations(annotators):
  """Load and validate the annotation file of every annotator.

  Args:
    annotators: Iterable of annotator names. Each name is combined with
      GDRIVE_BASE through ANNOTATION_FILE_FORMAT to locate that annotator's
      CSV file.

  Returns:
    Dict mapping annotator name to the DataFrame read from that annotator's
    file.

  Raises:
    Exception: Propagated from `validate` when the files are inconsistent or
      incomplete.
  """
  annotations = {}
  for annotator in annotators:
    file = ANNOTATION_FILE_FORMAT.format(GDRIVE_BASE, annotator)
    annotations[annotator] = pd.read_csv(file)
  validate(annotations)
  # Hand the validated frames back; without this the caller only gets None.
  return annotations

  


