# Tuned - Inference

## Install Dependencies

In [1]:
GDRIVE_BASE = 'drive/MyDrive/MIDS/w266/project/'

!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.4-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 KB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.27.4
Looking in indexes: https://pypi.org/simple, https://us

## Connect to Google Drive
We will be loading data from google drive and also save trained models to google drive. So lets mount google drive.

In [2]:
import sys
from google.colab import drive
drive.mount('/content/drive')
sys.path.insert(0, GDRIVE_BASE)

Mounted at /content/drive


## Imports and Constants

In [3]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer, OPTForCausalLM
from transformers import BertTokenizerFast, BertTokenizer, BertConfig
from transformers import EncoderDecoderModel, EncoderDecoderConfig
import torch
import transformers
from collections import deque
from tqdm.notebook import tqdm
import common

print(f'common.__version__: {common.__version__}')

transformers.logging.set_verbosity_error()
tuning_configs = common.create_configs(GDRIVE_BASE)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

common.__version__: 1.4


In [4]:
def tokenizer_for(config):
  if config.model_family == 'bert':
    return BertTokenizerFast.from_pretrained("bert-base-cased")
  else:
    return AutoTokenizer.from_pretrained(config.base_model)

def model_for(config):
  if config.model_family == 't5':
    if config.tuned:
      return T5ForConditionalGeneration.from_pretrained(config.tuned_model_path).to(device)
    else:
      return T5ForConditionalGeneration.from_pretrained(config.model_name).to(device)

  if config.model_family == 'opt':
    if config.tuned:
      return OPTForCausalLM.from_pretrained(config.tuned_model_path).to(device)
    else:
      return OPTForCausalLM.from_pretrained(config.model_name).to(device)  

  return EncoderDecoderModel.from_pretrained(config.tuned_model_path).to(device)


# Story Bot

In [None]:
class StoryBot:
  """Class to mimic a bot that continues the story."""
  def __init__(self, inferencer, n_iters=20, lines_to_use=1):
    """
      Creates the interactive story bot.
      inferencer - class to use for generating lines of the story.
      n_iters - number of iterations to do for story generation.
      lines_to_use - The number of lines from the story to use as context for 
                     generating the next line.
    """
    self._n_iters = n_iters
    self.lines_to_use = lines_to_use
    self.inferencer = inferencer
    self.re_init()

  def re_init(self):
    self.story = []
    # Initialize queue to hold just the last "lines_to_use" lines of story.
    self.context_lines = deque([], self.lines_to_use)

  def display_line_choices(self, output_lines):
    print('Choose the line of your choice:')
    for i, line in enumerate(output_lines):
      print(f'{i}:', line)
    print(f'{i+1}: Regenerate')
    print(f'{i+2}: End')

  def get_user_choice(self):
    output_lines = self.inferencer(self.context_lines)
    if len(output_lines) > 1:
      self.display_line_choices(output_lines)
      user_opt = -1
      while user_opt == -1:
        try:
            user_input = input('Input the number of your choice (or ): ')
            user_opt = int(user_input)
            if user_opt < len(output_lines):
              return output_lines[user_opt]
            elif user_opt == len(output_lines):
              return 'regenerate'
            elif user_opt == len(output_lines) + 1:
              return 'end'
        except ValueError:
            user_opt = -1
    else:
      return output_lines[0]

  def print_story(self):
    for i, line in enumerate(self.story):
      if i%2 == 0:
        print(f'User: {line}') 
      else:
        print(f'Generated: {line}') 

  def __call__(self):
    print('*'*50)
    print('Welcome to StoryBot!\n')
    print('This program simulates an MMS kind of interaction with a bot to create a story sequentially.')
    print('When the prompt appears below, start typing as if it were the input on your mobile.')
    print('Enter end to end the story and restart to restart.') 
    print('*'*50, '\n')
    restart = False
    i = 0
    while i < self._n_iters:
      if i > 0:
        print('The story so far:')
        self.print_story()
      i = i + 1
      # get the sentence from the user
      sentence_in = input('Enter next line (or end): ').strip()
      # accomodate special prompts
      if sentence_in == 'end':
        break
      if sentence_in == 'restart':
        i = 0
        self.re_init()
        continue
      self.context_lines.append(sentence_in)
      self.story.append(sentence_in)
      output = 'regenerate'
      while output == 'regenerate':
        output = self.get_user_choice()
      if output == 'end':
        break
      self.context_lines.append(output)
      self.story.append(output)

    print()
    print('\n======== Final story: =========\n')
    self.print_story()



In [None]:
def run_story_bot(config, device):
  tokenizer = tokenizer_for(config)
  model = model_for(config)
  if config.model_family == 't5':
    inferencer = common.T5Inferencer(device, model, tokenizer, prompt=config.prompt)
  elif config.model_family == 'opt':
    inferencer = common.OptInferencer(device, model, tokenizer)
  else:
    inferencer = common.B2BInferencer(device, model, tokenizer)
  story_bot = StoryBot(inferencer, n_iters=5, lines_to_use=1)
  story_bot()


In [None]:
# Run story bot on t5 s1
run_story_bot(tuning_configs['b2b_s1'], device)

**************************************************
Welcome to StoryBot!

This program simulates an MMS kind of interaction with a bot to create a story sequentially.
When the prompt appears below, start typing as if it were the input on your mobile.
Enter end to end the story and restart to restart.
************************************************** 

Enter next line (or end): He did not like what he saw.




Choose the line of your choice:
0: He had never seen such a thing in his life.
1: He had never seen such a thing in his life, and he was afraid of it.
2: He had never seen such a thing before.
3: Regenerate
4: End
Input the number of your choice (or ): 1



User: He did not like what he saw.
Generated: He had never seen such a thing in his life, and he was afraid of it.


# Batch Inferencing

In [5]:
def max_score_index(scores):
  max_score = scores[0]
  max_score_index = 0
  for i, score in enumerate(scores):
    if score > max_score:
      # New max score
      max_score = score
      max_score_index = i
  return max_score_index

def generate_next_line(batch_id, config, line, device, num_sequences=5, max_new_tokens=50):
  tokenizer = tokenizer_for(config)
  model = model_for(config)
  test_inputs = tokenizer([config.prompt + line], return_tensors='pt')
  input_ids = test_inputs['input_ids'].to(device)
  if config.model_family == 't5':
    outputs = model.generate(
        input_ids,
        num_beams=num_sequences,
        no_repeat_ngram_size=2,
        num_return_sequences=num_sequences,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_k=0,
        return_dict_in_generate=True,
        output_scores=True,
        renormalize_logits=True,
      )
  elif config.model_family == 'opt':
    outputs = model.generate(
      input_ids,
      num_beams=num_sequences,
      no_repeat_ngram_size=2,
      num_return_sequences=num_sequences,
      max_length = max_new_tokens,
      do_sample=True,
      top_k=0,
      early_stopping=True,
      return_dict_in_generate=True,
      output_scores=True,
      renormalize_logits=True
    )
  else:
    outputs = model.generate(
        input_ids, 
        # attention_mask=attention_mask,
        max_new_tokens=max_new_tokens,
        num_beams=num_sequences,
        no_repeat_ngram_size=2,
        early_stopping=True,
        num_return_sequences=num_sequences,
        return_dict_in_generate=True,
        output_scores=True,
        renormalize_logits=True)
    
  scores = (-1 * outputs.sequences_scores).tolist()
  test_output_ids = outputs.sequences
  decoded = [tokenizer.decode(out_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).replace('\n', ' ') for out_ids in test_output_ids]

  result = [[batch_id, config.name, config.model_name, config.tuned, config.dataset, line, score, output, False] for score, output in zip(scores, decoded)]
  result[max_score_index(scores)][-1] = True
  return result

def generate_for(first_lines, device, models=None):
  """
    Generates a new line for each line in first_lines and for each model. 
    If models is not None then the generation is limited to the models specified 
    else it is generated for all models in tuning_configs.
  """
  if models is None:
    models = tuning_configs.keys()
  with tqdm(total=len(models) * len(first_lines), unit='text generation', unit_scale=True) as pbar:
    results = []
    batch_id = 1
    for line in first_lines:
      for name in models:
        config = tuning_configs[name]
        pbar.set_postfix(model=name, prompt=line[0:10] + '...', refresh=True)
        results.extend(generate_next_line(batch_id, config, line, device))
        batch_id = batch_id + 1
        pbar.update(1)
    return pd.DataFrame(results, columns=['batch_id', 'name', 'model', 'tuned', 'dataset', 'prompt', 'score', 'generated', 'model_top_score'])



In [6]:
%%time
# Warmup run to force all downloads.
first_lines = ['"I have spoken." he said.']
df_warmup = generate_for(first_lines, device, ['t5_s1', 'opt_s2', 'b2b_s2', 'baseline'])


  0%|          | 0.00/4.00 [00:00<?, ?text generation/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/605 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/685 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/644 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/441 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/663M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

CPU times: user 27.8 s, sys: 6.66 s, total: 34.5 s
Wall time: 1min 42s


In [8]:
%%time
first_lines = ['Lara felt very sad and scared.', 'All the dragons of the world lived on one mountain called the dragon mountain.']
df_results = generate_for(first_lines, device)
  

  0%|          | 0.00/18.0 [00:00<?, ?text generation/s]



CPU times: user 1min 30s, sys: 11.3 s, total: 1min 42s
Wall time: 3min 53s


In [10]:
df_results.to_csv(common.annotation_input_loc(GDRIVE_BASE), index=None)
df_results

Unnamed: 0,batch_id,name,model,tuned,dataset,prompt,score,generated,model_top_score
0,1,t5_s1,google/t5-v1_1-base-s1-finetuned,True,s1,Lara felt very sad and scared.,0.000038,"""Are you going to do that?"" asked Lavinia.",False
1,1,t5_s1,google/t5-v1_1-base-s1-finetuned,True,s1,Lara felt very sad and scared.,0.000031,"""It is a pity that it is so old,"" she said.",False
2,1,t5_s1,google/t5-v1_1-base-s1-finetuned,True,s1,Lara felt very sad and scared.,0.000079,The old woman sat down and waited.,True
3,1,t5_s1,google/t5-v1_1-base-s1-finetuned,True,s1,Lara felt very sad and scared.,0.000033,"""You are so kind, sir,"" she cried.",False
4,1,t5_s1,google/t5-v1_1-base-s1-finetuned,True,s1,Lara felt very sad and scared.,0.000035,"""I'm afraid of her!"" she whispered.",False
...,...,...,...,...,...,...,...,...,...
85,18,baseline,facebook/opt-350m,False,,All the dragons of the world lived on one moun...,0.006471,Continue the next sentence of the story making...,False
86,18,baseline,facebook/opt-350m,False,,All the dragons of the world lived on one moun...,0.002831,Continue the next sentence of the story making...,False
87,18,baseline,facebook/opt-350m,False,,All the dragons of the world lived on one moun...,0.004651,Continue the next sentence of the story making...,False
88,18,baseline,facebook/opt-350m,False,,All the dragons of the world lived on one moun...,0.006506,Continue the next sentence of the story making...,True
