<a href="https://colab.research.google.com/github/google-research/tapas/blob/master/notebooks/retrieval_predictions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##### **Copyright 2020 The Google AI Language Team Authors**

This notebook is adapted from the original repository [here](https://github.com/google-research/tapas/blob/master/DENSE_TABLE_RETRIEVER.md).

Licensed under the Apache License, Version 2.0 (the "License");

In [3]:
# Copyright 2021 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Run TAPAS retrieval models description
This notebook shows how to use the retrieval models introduced in the paper [Open Domain Question Answering over Tables via Dense Retrieval](https://arxiv.org/pdf/2103.12011.pdf).
1.   Load pre-trained and fine-tuned models.
      > * the dual encoder. (called tapas_retriever)
      > * the reader models. (called tapas_reader)
2.   Add handcrafted queries and extract their interactions and tf-examples.
3.   Get nearest neighbors for each query, and extract the interactions to pass to the reader.
4.   Call the reader on the new interactions and print 
      > * the query.
      > * the probability of this table containing the answer.
      > * the table with a highlighted answer found by the reader.




In [4]:
# Imports
import os, ast, csv, sys, shutil, IPython

sys.path.append(os.path.join(os.getcwd(), ".."))

import pandas as pd
import tensorflow._api.v2.compat.v1 as tf

tf.get_logger().setLevel('ERROR')

from apache_beam.runners.direct import direct_runner
from tapas.utils import (create_utils, 
                         tf_example_utils, 
                         prediction_utils, 
                         number_annot_utils, 
                         eval_table_retriever_utils)
from tapas.protos import interaction_pb2


In [5]:
# 1.   Load pre-trained and fine-tuned models if you haven't already.

# # The dual encoder model
# ! gsutil cp "gs://tapas_models/2021_04_27/tapas_nq_hn_retriever_medium.zip" "tapas_retriever.zip" && unzip tapas_retriever.zip
# ! mv tapas_nq_hn_retriever_medium/ tapas_retriever

# # The reader model
# ! gsutil cp "gs://tapas_models/2021_04_27/tapas_nq_hn_reader_large.zip" "tapas_reader.zip" && unzip tapas_reader.zip
# ! mv tapas_nq_hn_reader_large tapas_reader

# # Load the released nq_tables data.
# os.makedirs('data', exist_ok=True)
# ! gsutil -m cp -R gs://tapas_models/2021_07_22/nq_tables/* data/


# Code

In [6]:
# 2.   Add handcrafted tables, and queries
# 2.1.   Create the needed directories.
def create_directories():
  """Prepares the on-disk layout used by the retriever and the reader.

  Creates the `results/nq_retrieval` and `results/nq_reader` directory trees
  (directories that already exist are left alone). In each `model` directory
  it writes a `checkpoint` file pointing at `model.ckpt-0` and copies the
  checkpoint files from `tapas_retriever/` or `tapas_reader/` under that name.
  The retriever's `tables.tsv` and `bert_config.json` are copied as well.
  """
  checkpoint_pointer = 'model_checkpoint_path: "model.ckpt-0"'
  checkpoint_suffixes = ('.data-00000-of-00001', '.index', '.meta')

  def make_dirs(*paths):
    # Create the directories in the given order; existing ones are fine.
    for path in paths:
      os.makedirs(path, exist_ok=True)

  def install_checkpoint(source_dir, model_dir):
    # Write the pointer file first, then copy the weights as checkpoint 0.
    with open(f'{model_dir}/checkpoint', 'w') as pointer_file:
      pointer_file.write(checkpoint_pointer)
    for suffix in checkpoint_suffixes:
      shutil.copyfile(f'{source_dir}/model.ckpt{suffix}',
                      f'{model_dir}/model.ckpt-0{suffix}')

  # Files used by the dual encoder (retriever).
  retriever_model_dir = 'results/nq_retrieval/model'
  make_dirs(retriever_model_dir)
  install_checkpoint('tapas_retriever', retriever_model_dir)
  for extra_file in ('tables.tsv', 'bert_config.json'):
    shutil.copyfile(f'tapas_retriever/{extra_file}',
                    f'{retriever_model_dir}/{extra_file}')
  make_dirs('results/nq_retrieval/tf_examples',
            'results/nq_retrieval/queries')

  # Files used by the reader.
  reader_model_dir = 'results/nq_reader/model'
  make_dirs(reader_model_dir,
            'results/nq_reader/queries',
            'results/nq_reader/nq_retrieval/tf_examples',
            'results/nq_reader/nq_retrieval/model')
  install_checkpoint('tapas_reader', reader_model_dir)

# 2.2.   Code to extract the data: the interactions, then the tf_examples.
# The `interaction_pb2.Interaction` protobuf object is the data structure we
# use to store examples, and then to call the prediction script.
def get_table(document_title, table_data):
  """Builds an `interaction_pb2.Table` from a '|'-separated text table.

  Args:
    document_title: str used both as the table's document title and as its
      table id; may be an empty str.
    table_data: str with one table row per line and cells separated by '|'.
      Blank lines are skipped and every cell is stripped of surrounding
      whitespace. The first remaining line provides the column headers.

  Returns:
    The populated `interaction_pb2.Table`. It has no columns and no rows when
    `table_data` contains no non-blank line.
  """
  parsed_rows = []
  for raw_line in table_data.split("\n"):
    if not raw_line.strip():
      continue
    parsed_rows.append([cell.strip() for cell in raw_line.split("|")])

  result = interaction_pb2.Table()
  result.document_title = document_title
  result.table_id = document_title

  if parsed_rows:
    header_cells, body_rows = parsed_rows[0], parsed_rows[1:]
    for header_text in header_cells:
      result.columns.add().text = header_text
    for body_row in body_rows:
      row_proto = result.rows.add()
      for cell_text in body_row:
        row_proto.cells.add().text = cell_text

  return result

def extract_queries(queries):
  """Yields one `interaction_pb2.Interaction` per query string.

  The interactions are what gets written to the queries interaction file.
  Each one carries a single question and a placeholder table titled "FAKE",
  and is passed through `number_annot_utils.add_numeric_values` before being
  yielded.

  Args:
    queries: list of str queries.

  Yields:
    Interactions with id `queries_<index>` whose only question has id
    `queries_<index>-0_0`.
  """
  for position, query_text in enumerate(queries):
    query_interaction = interaction_pb2.Interaction()
    query_interaction.id = f"queries_{position}"

    only_question = query_interaction.questions.add()
    only_question.original_text = query_text
    only_question.id = f"{query_interaction.id}-0_0"

    # The retriever only embeds the question, so a dummy table is attached.
    placeholder_table = get_table("FAKE", " | \n | \n")
    query_interaction.table.CopyFrom(placeholder_table)
    number_annot_utils.add_numeric_values(query_interaction)

    yield query_interaction

def write_tfrecord(filename, examples):
  """Serializes every example and writes it to a TFRecord file.

  Args:
    filename: path of the TFRecord file to write.
    examples: iterable of objects exposing `SerializeToString()`, for instance
      interaction protos.
  """
  with tf.io.TFRecordWriter(filename) as record_writer:
    # Serialization stays lazy: each example is serialized right before it is
    # written.
    serialized_examples = (item.SerializeToString() for item in examples)
    for payload in serialized_examples:
      record_writer.write(payload)

def get_config():
  """Builds the conversion config used to turn queries into tf-examples.

  Returns:
    A `tf_example_utils.RetrievalConversionConfig` that uses the retriever
    vocabulary, a sequence length of 512 (also used as the maximum column and
    row id), untrimmed cells, unstripped column names and the document title.
  """
  sequence_limit = 512
  return tf_example_utils.RetrievalConversionConfig(
      vocab_file="tapas_retriever/vocab.txt",
      max_seq_length=sequence_limit,
      max_column_id=sequence_limit,
      max_row_id=sequence_limit,
      strip_column_names=False,
      cell_trim_length=-1,
      use_document_title=True,
  )

def extract_queries_data(queries):
  """Writes the query interactions, then converts them to tf-examples.

  The interactions go to `results/nq_retrieval/queries/queries.tfrecord`; a
  retrieval pipeline run with Beam's direct runner then reads that file and
  writes `queries.tfrecord` under `results/nq_retrieval/tf_examples`.

  Args:
    queries: list of str queries.
  """
  interactions_path = "results/nq_retrieval/queries/queries.tfrecord"
  tf_examples_path = os.path.join("results/nq_retrieval/tf_examples",
                                  "queries.tfrecord")

  # Step 1: query strings -> serialized interactions.
  write_tfrecord(interactions_path, extract_queries(queries))

  # Step 2: interactions -> tf-examples; block until the pipeline is done.
  conversion_pipeline = create_utils.build_retrieval_pipeline(
      input_files=[interactions_path],
      output_files=[tf_examples_path],
      input_format=create_utils.InputFormat.INTERACTION,
      config=get_config(),
  )
  pipeline_result = direct_runner.DirectRunner().run(conversion_pipeline)
  pipeline_result.wait_until_finish()

# 3.   Retrieval: Extract the queries interactions to pass to the reader
# 3.1.   Extracts the queries embeddings.
def get_queries_embeddings():

  !python -m retrieval_main --config-name retrieval_predict

  with open("results/nq_retrieval/model/queries/predict_results_1.tsv") as csvfile_reader:

    reader = csv.DictReader(csvfile_reader, delimiter='\t')
    for i, row in enumerate(reader):

      print("Adding query_id: ", row["query_id"], ":", queries[i])

# 3.2.   Get the nearest neighbor tables for each query.
def get_nearest_neighbors(num_neighbors):
  """Retrieves the closest tables for every predicted query.

  Reads the query predictions and the table embeddings from
  `results/nq_retrieval/model`, builds a table index and retrieves the
  neighbors of each query. Progress is printed using the notebook-level
  `queries` list.

  Args:
    num_neighbors: number of top neighbors to keep per query.

  Returns:
    dict mapping the query index to a dict
    `{table_id: (similarity score, index of the table in the tables list)}`.
  """
  predicted_queries = eval_table_retriever_utils.read_queries(
      "results/nq_retrieval/model/queries/predict_results_1.tsv")
  candidate_tables = eval_table_retriever_utils.read_tables(
      "results/nq_retrieval/model/tables.tsv", make_tables_unique=False)
  table_index = eval_table_retriever_utils.build_table_index(candidate_tables)
  similarities, neighbors = eval_table_retriever_utils._retrieve(
      predicted_queries, table_index)

  selected_tables = {}
  for query_index, scores in enumerate(similarities):
    print("Query index", query_index, ":", queries[query_index])
    per_query = {}
    selected_tables[query_index] = per_query
    for rank in range(num_neighbors):
      neighbor_index = neighbors[query_index][rank]
      table_id = candidate_tables[neighbor_index].table_id
      per_query[table_id] = (scores[rank], neighbor_index)
      print("           Related table id:", table_id)
      print("           Table's score ", scores[rank])
      print("           ----------------------------------------------")

  return selected_tables

# 3.3.   Extract the interactions to pass to the reader.
def iterate_tables(input_file):
  """Reads `interaction_pb2.Table` protos from a TFRecord file.

  Args:
    input_file: path of a TFRecord file whose records are serialized
      `interaction_pb2.Table` messages.

  Yields:
    One parsed `interaction_pb2.Table` per record, in file order.
  """
  # Fixed: the original line read `for value intf.io...` (missing space
  # between `in` and `tf`), a syntax error that broke the whole cell.
  for value in tf.io.tf_record_iterator(input_file):
    table = interaction_pb2.Table()
    table.ParseFromString(value)
    yield table

def create_queries_tables_interactions(selected_tables):
  """Yields one reader interaction per (query, retrieved table) pair.

  Query interactions are read from
  `results/nq_retrieval/queries/queries.tfrecord` and tables from
  `data/tables/tables.tfrecord`. Progress is printed using the notebook-level
  `queries` list.

  Args:
    selected_tables: dict mapping a query index to a dict keyed by table id
      whose values hold the table's score at position 0 (the structure
      returned by `get_nearest_neighbors`).

  Yields:
    A copy of the query interaction with the retrieved table attached. Its id
    is `<query interaction id>_<table id>` and its first question id is that
    id followed by `_0`. Table ids missing from the tables file are reported
    and skipped.
  """
  query_interactions = prediction_utils.iterate_interactions(
      "results/nq_retrieval/queries/queries.tfrecord")
  tables_by_id = {}
  for stored_table in iterate_tables("data/tables/tables.tfrecord"):
    tables_by_id[stored_table.table_id] = stored_table

  for query_index, query_interaction in enumerate(query_interactions):
    print("\n Query index:", query_index, ":", queries[query_index])
    retrieved = selected_tables[query_index]
    for table_id in retrieved:
      if table_id not in tables_by_id:
        print("  > Not found in table file.")
      else:
        matched_table = tables_by_id[table_id]
        print("  > Converted query:", query_interaction.questions[0].original_text)
        reader_interaction = interaction_pb2.Interaction()
        reader_interaction.CopyFrom(query_interaction)
        reader_interaction.id = f"{reader_interaction.id}_{matched_table.table_id}"
        reader_interaction.questions[0].id = f"{reader_interaction.id}_0"
        reader_interaction.table.CopyFrom(matched_table)
        yield reader_interaction

      # For a matched table these lines run once the consumer resumes the
      # generator, i.e. after the interaction above has been handed out.
      print("      Related table id:  ", table_id)
      print("      Table's score:     ", retrieved[table_id][0])
      print("  ----------------------------------------------")

def create_interactions_for_reader(selected_tables):
  """Writes the (query, table) interactions consumed by the reader.

  Args:
    selected_tables: per-query retrieved tables, as returned by
      `get_nearest_neighbors`.
  """
  reader_queries_path = "results/nq_reader/queries/reader_queries.tfrecord"
  write_tfrecord(reader_queries_path,
                 create_queries_tables_interactions(selected_tables))

# 4.   Reader: Get the answer given the question and the table
def get_converter(max_seq_length):
  """Gets a classifier converter.

  Args:
    max_seq_length: maximum sequence length; also used as the maximum column
      id and the maximum row id.

  Returns:
    A `tf_example_utils.ToClassifierTensorflowExample` configured with the
    reader vocabulary, unstripped column names and no aggregation candidates.
  """
  conversion_options = dict(
      vocab_file="tapas_reader/vocab.txt",
      max_seq_length=max_seq_length,
      max_column_id=max_seq_length,
      max_row_id=max_seq_length,
      strip_column_names=False,
      add_aggregation_candidates=False,
  )
  classifier_config = tf_example_utils.ClassifierConversionConfig(
      **conversion_options)
  return tf_example_utils.ToClassifierTensorflowExample(classifier_config)

def convert_interactions_to_examples(converter):
  """Converts the reader interactions to examples with the Tapas converter.

  Args:
    converter: object whose `convert(interaction, index)` returns an example.

  Yields:
    The example converted from question 0 of every interaction stored in
    `results/nq_reader/queries/reader_queries.tfrecord`. Interactions whose
    conversion raises `ValueError` are reported on stdout and skipped.
  """
  reader_queries_path = "results/nq_reader/queries/reader_queries.tfrecord"
  for current in prediction_utils.iterate_interactions(reader_queries_path):
    try:
      yield converter.convert(current, 0)
    except ValueError as error:
      print(f"Can't convert interaction: {current.id} error: {error}")
        
def write_tf_example(filename, examples):
  """Writes serialized tf-examples to a TFRecord file.

  This used to be a verbatim copy of `write_tfrecord`; it now delegates to it
  so both writers share a single implementation.

  Args:
    filename: path of the TFRecord file to write.
    examples: iterable of objects exposing `SerializeToString()`; an empty
      iterable produces a file without records.
  """
  write_tfrecord(filename, examples)

class Colors:
  """ANSI escape sequences used to highlight answers in printed tables."""
  # Bright blue foreground: opens a highlighted answer span.
  ANSWER = "\033[94m"
  # Bright magenta foreground: the color used for regular table text.
  BASE = "\033[95m"
  # Despite the name, this is the "reset all attributes" sequence.
  BLACK = "\033[0m"

  
def set_answer_color(input, begin, end):
  """Joins the tokens of a cell, coloring the answer span.

  Args:
    input: sequence of tokens exposing `original_text`.
    begin: index of the first answer token.
    end: index one past the last answer token.

  Returns:
    The token texts joined by single spaces, with `Colors.ANSWER` inserted
    before token `begin` and `Colors.BASE` appended after token `end - 1`.
  """
  pieces = []
  for token in input:
    pieces.append(token.original_text)

  last = end - 1
  pieces[begin] = Colors.ANSWER + pieces[begin]
  pieces[last] = pieces[last] + Colors.BASE
  return " ".join(pieces)


def get_table_df(table):
  """Turns a table into a dataframe for a more readable print-out.

  Args:
    table: object with `columns` and `rows`, where every column and every
      cell of `row.cells` exposes `text`.

  Returns:
    A `pd.DataFrame` whose first row holds the column headers and whose
    following rows hold the table rows; every text is wrapped in
    `Colors.BASE` on both sides.
  """
  def tinted(cells):
    # Wrap each text in the base color so later highlights can be undone.
    return [Colors.BASE + cell.text + Colors.BASE for cell in cells]

  grid = [tinted(table.columns)]
  grid.extend(tinted(row.cells) for row in table.rows)
  return pd.DataFrame(grid)

def predict():
  """Runs the reader on the retrieved tables and prints the answers.

  Steps:
    1. Converts the interactions stored in
       `results/nq_reader/queries/reader_queries.tfrecord` to tf-examples and
       writes them to `test.tfrecord`; an empty `dev.tfrecord` is written next
       to it.
    2. Launches `reader_main` with the `reader_predict` config through the
       notebook shell.
    3. Reads `results/nq_reader/model/test.tsv` and, for every row, prints the
       query, the table id, the `logits_cls` value, every predicted answer
       with its score, and the table with the answer tokens highlighted.
  """
  max_seq_length = 512
  # Convert the reader interactions to tf-examples. `examples` is a generator,
  # so the conversion actually happens while the test file is being written.
  converter = get_converter(max_seq_length)
  examples = convert_interactions_to_examples(converter)
  write_tf_example("results/nq_reader/nq_retrieval/tf_examples/test.tfrecord", examples)
  # The dev split is written without any record.
  # NOTE(review): presumably the reader script expects this file to exist — confirm.
  write_tf_example("results/nq_reader/nq_retrieval/tf_examples/dev.tfrecord", [])
  # Run the reader prediction script (IPython shell escape).
  !python -m reader_main --config-name reader_predict
        
  # Display results
  # NOTE(review): presumably written by the reader run above — confirm.
  results_path = "results/nq_reader/model/test.tsv"
  
  # Index what is needed for display by question id: the printable dataframe,
  # the table id, the tokenized table and the query text.
  interactions = prediction_utils.iterate_interactions(
      "results/nq_reader/queries/reader_queries.tfrecord")
  tables = {
      interaction.questions[0].id : (get_table_df(interaction.table),
                                     interaction.table.table_id,
                                     converter._tokenize_table(interaction.table),
                                     interaction.questions[0].original_text)
      for interaction in interactions}

  with open(results_path) as csvfile:
    reader = csv.DictReader(csvfile, delimiter='\t')
    
    for row in reader:
      # Look up the display data through the row's "question_id" column.
      df, table_id, table_tokens, query_text = tables[row["question_id"]]
      # Reset the terminal colors before printing the query block.
      print(Colors.BLACK)
      print("query >", query_text)
      print("            > table id: ", table_id)
      print("            > table prediction score: ", row["logits_cls"])
      # The "answers" column holds a Python literal: a list of dicts that are
      # read below through the keys row_index, column_index,
      # begin_token_index, end_token_index and score.
      answers = ast.literal_eval(row["answers"])
      for a in answers:
        
        index_r = a["row_index"]
        index_c = a["column_index"]
        # The +1 skips the header, which get_table_df stores as row 0 of `df`.
        # NOTE(review): assumes the tokenized table also keeps the header at
        # index 0 of `rows` — confirm.
        colored_answer = set_answer_color(
            table_tokens.rows[index_r+1][index_c],
            a["begin_token_index"], a["end_token_index"])
        # Replace the cell in the printable table by its highlighted version.
        df.iat[index_r+1, index_c] = colored_answer
        print("            > Answer cell:", Colors.BASE + colored_answer)
        print(Colors.BLACK + "            > Answer score:", a["score"], "\n")

      # Lift the pandas display limits so the whole table is printed.
      with pd.option_context(
          'display.max_rows', None, 'display.max_columns', None,
          'expand_frame_repr', False, 'display.unicode.ambiguous_as_wide', False,
          'display.max_colwidth', None):

        print("Table:\n")
        print(Colors.BASE + df.to_string(index=False, header=False))
      print(Colors.BLACK +"-------------------------------------------------------------------------------------\n")


## Run predict

In [7]:
# Move up one directory unless the working directory is already named "IR2"
# (presumably the project root; adjust the name to match your checkout), then
# create the results directories and copy the model checkpoints into them.
if os.path.split(os.getcwd())[-1] != "IR2":
    os.chdir(os.path.join(os.getcwd(), ".."))

create_directories()

# Queries are taken from the NQ dev set; the trailing comment on each line is
# the expected answer. `queries` must stay a notebook-level name because the
# retrieval functions read it when printing progress.
queries = ["how many seasons are there in one tree hill",               # 9
           "when does the movie the thinning come out",                 # October 12, 2016
           "who plays the mom in charlie and the chocolate factory",    # Helena Bonham Carter
           "what is bam from alaskan bush people's real name",          # Joshua
           "when did the new ipad come out 2017"]                       # March 24, 2017

# Retrieval: write the query tf-examples, then run the retriever on them.
extract_queries_data(queries)
get_queries_embeddings()

# Number of retrieved tables kept per query and passed on to the reader.
num_neighbors = 4 #@param {type:"integer"}
selected_tables = get_nearest_neighbors(num_neighbors=num_neighbors)
create_interactions_for_reader(selected_tables=selected_tables)

# Reader: predict and print the answers on the retrieved tables.
predict()




2024-12-15 15:26:28.404221: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'cudart64_101.dll'; dlerror: cudart64_101.dll not found
2024-12-15 15:26:28.404859: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Instructions for updating:
non-resource variables are not supported in the long term




INFO:tensorflow:Using config: {'_model_dir': 'C:\\Users\\Greg\\AppData\\Local\\Temp\\tmpjvlx4_n5', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 4.0, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experim





[2024-12-15 15:26:36,698][tensorflow][INFO] - Using config: {'_model_dir': 'C:\\Users\\Greg\\AppData\\Local\\Temp\\tmpjvlx4_n5', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 1000, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 4.0, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=1000, num_shards=1, num_cores_per_replica=

2024-12-15 15:31:03.061942: W tensorflow/stream_executor/platform/default/dso_loader.cc:55] Could not load dynamic library 'cudart64_101.dll'; dlerror: cudart64_101.dll not found
2024-12-15 15:31:03.062379: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
Instructions for updating:
non-resource variables are not supported in the long term
2024-12-15 15:31:10.014437: I tensorflow/core/platform/cpu_feature_guard.cc:143] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2
2024-12-15 15:31:10.027804: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x1c6b7095e70 initialized for platform Host (this does not guarantee that XLA will be used). Devices:
2024-12-15 15:31:10.028528: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): Host, Default Version
2024-12-15 15:31:10.032546: I tensorflow/stream_executor/platform/default/dso_loader.cc

[0m
query > how many seasons are there in one tree hill
            > table id:  One Tree Hill (season 5)_46AFF134C6496E14
            > table prediction score:  -9.5666485
            > Answer cell: [95mseason 1 season 2 season 3 season 4 season [94m5[95m season 6 season 7 season 8 season 9
[0m            > Answer score: -2.733701467514038 

Table:

[95m                                                       [95m[hide] v t e One Tree Hill[95m                                                                                                                                                                                                                                                                                                                                                                                                                        [95m[hide] v t e One Tree Hill.1[95m
 season 1 season 2 season 3 season 4 season [94m5[95m season 6 season 7 season 8 season 9        