In [1]:
!pip install openai



In [2]:
!pip install transformers
!pip install ujson



In [3]:
import logging
import os
import os.path
import shutil
import tempfile
import zipfile
import json
import pandas as pd
import numpy as np
import torch
import glob
from os.path import exists
import io


import openai
import pickle
from transformers import GPT2TokenizerFast,AutoTokenizer, AutoModelForCausalLM

import collections
from contextlib import nullcontext
from collections import namedtuple
import numpy as np
import random
import re 
import string
from typing import List


In [4]:
!git clone -b cpu_inference https://github.com/stanford-futuredata/ColBERT.git

Cloning into 'ColBERT'...
remote: Enumerating objects: 1817, done.[K
remote: Counting objects: 100% (1191/1191), done.[K
remote: Compressing objects: 100% (473/473), done.[K
remote: Total 1817 (delta 820), reused 952 (delta 691), pack-reused 626[K
Receiving objects: 100% (1817/1817), 1.25 MiB | 4.04 MiB/s, done.
Resolving deltas: 100% (1088/1088), done.


In [5]:
import os
import sys
sys.path.insert(0, 'ColBERT/')

from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Collection
# from colbert.searcher import Searcher

NOTE: Redirects are currently not supported in Windows or MacOs.
  assert(not torch.cuda.is_available(), "cupy must be installed in GPU mode")


We also install a small portion from the SQA training dataset, for demonstration purposes. This is a TSV file containing table-question pairs. Besides this, we also download the `table_csv` directory, which contains the actual tabular data.

Note that you can download the entire SQA dataset on the [official website](https://www.microsoft.com/en-us/download/details.aspx?id=54253).

Let's create a new dataframe that groups questions which are asked in a sequence related to the table. We can do this by adding a `sequence_id` column, which is a combination of the `id` and `annotator` columns:

In [6]:
import smart_open
import pandas as pd

In [7]:
json_path = 'https://raw.githubusercontent.com/wenhuchen/OTT-QA/master/data/traindev_tables.json'
with smart_open.open(json_path, 'r') as file:
    train_dev_tables = json.load(file)

qa_train_json_path = 'https://raw.githubusercontent.com/wenhuchen/OTT-QA/master/released_data/train.json'
with smart_open.open(qa_train_json_path, 'r') as file:
    qa_train = json.load(file)

qa_dev_json_path = 'https://raw.githubusercontent.com/wenhuchen/OTT-QA/master/released_data/dev.json'
with smart_open.open(qa_dev_json_path, 'r') as file:
    qa_dev = json.load(file)

In [8]:
# some clean-ups
def _rm_table_context(tbl):
    return [[cell[0] for cell in row] for row in tbl]

def _rm_header_context(headers):
    return [header[0] for header in headers]

def _preprocess_table(table):
    return "\n".join([table.to_csv(index=False, line_terminator='<row>')])

def _get_link_pair(tbl):
    pairs = []
    for row in tbl:
        for cell in row:
            if len(cell[1])>0:
                pairs.append({cell[1][0]:cell[0]})
    return pairs

In [9]:
tables = train_dev_tables.values()
tables_df = pd.DataFrame(tables)

tables_df['data_clean'] = tables_df['data'].apply(_rm_table_context)
tables_df['header_clean'] = tables_df['header'].apply(_rm_header_context)
tables_df['table_context'] = tables_df.apply(lambda row:f'{row.title} | {_preprocess_table(pd.DataFrame(row.data_clean, columns=row.header_clean))}', axis = 1)
tables_df['link_pairs'] = tables_df['data'].apply(_get_link_pair)

In [10]:
qa_train_df = pd.DataFrame(qa_train)
qa_dev_df = pd.DataFrame(qa_dev)

# join with the question df
qa_train_enriched_df = qa_train_df.merge(tables_df, how = 'left', left_on = 'table_id', right_on = 'uid')
qa_dev_enriched_df = qa_dev_df.merge(tables_df, how = 'left', left_on = 'table_id', right_on = 'uid')

In [39]:
from collections import namedtuple
import pandas as pd 

sys.path.insert(0, './')
# from dpr import has_answer, DPR_normalize
from utility.utils.dpr import has_answer, DPR_normalize

def ottQaDF2Squad(ottqaDf, filter_w_answer = False):
  SquadExample = namedtuple("SquadExample",  "id table_id title context question_context question answers")
  examples = []
  processed_tables = []
  processed_context = []
  for idx, row in ottqaDf.iterrows():
    if filter_w_answer==False or (has_answer([DPR_normalize(ans) for ans in [row['answer-text']]], row.table_context)):
        id = row['question_id']
        table_id = row['table_id']
        title = row['title']    
        question = row['question']
        answers = [row['answer-text']]
        question_context = f"""Question: {row.question.strip()}, Table: {row.table_context.strip()}"""
        examples.append(SquadExample(id, table_id, title, row.table_context, question_context, question, answers))

        processed_tables.append(row.table_context)
        processed_context.append(question_context)
  return examples, processed_tables, processed_context

training_examples, training_flatten_tables, training_question_context = ottQaDF2Squad(qa_train_enriched_df)
test_examples, test_flatten_tables, test_question_context = ottQaDF2Squad(qa_dev_enriched_df)
len(test_examples), len(training_examples)

(2214, 41469)

# Pinecone


In [13]:
!pip install datasets pinecone-client sentence_transformers
# torch-scatter
# !pip install pinecone-client

Collecting torch>=1.6.0
  Using cached torch-2.0.0-cp39-none-macosx_10_9_x86_64.whl (139.8 MB)
Installing collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 1.13.1
    Uninstalling torch-1.13.1:
      Successfully uninstalled torch-1.13.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
blanc 0.3.0 requires torch<2.0,>=1.0, but you have torch 2.0.0 which is incompatible.[0m[31m
[0mSuccessfully installed torch-2.0.0


In [14]:
!pip install pinecone-client



In [15]:
import pinecone
# connect to pinecone environment
pinecone.init(
    api_key="04975cde-ba97-4e6c-9b6d-72b8e42800bd",
    environment="us-west4-gcp"  # find next to API key in console
)

# # you can choose any name for the index
index = pinecone.Index('table-qa-test')
index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2214}},
 'total_vector_count': 2214}

In [16]:
pinecone.init(
    api_key='cdb240cb-c0c3-4ac1-bdba-aa9c62751239',
    environment='eu-west1-gcp'  # find next to API key in console
)
index_qc = pinecone.Index('table-qa-train')
index_qc.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 41469}},
 'total_vector_count': 41469}

In [17]:
from tqdm.auto import tqdm
import torch
# !pip install sentence_transformers
from sentence_transformers import SentenceTransformer
# set device to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# load the table embedding model from huggingface models hub
retriever = SentenceTransformer('deepset/all-mpnet-base-v2-table', device=device)

In [18]:
import pinecone
# connect to pinecone environment
pinecone.init(
    api_key="04975cde-ba97-4e6c-9b6d-72b8e42800bd",
    environment="us-west4-gcp"  # find next to API key in console
)

# you can choose any name for the index
index_name = 'table-qa-test'
# check if the table-qa index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=768,
        metric='cosine'
    )
# connect to table-qa index we created
index = pinecone.Index(index_name)


In [19]:
from tqdm.auto import tqdm
import torch
from sentence_transformers import SentenceTransformer
# set device to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# load the table embedding model from huggingface models hub
retriever = SentenceTransformer('deepset/all-mpnet-base-v2-table', device=device)

def embed_table_to_pinecone(index, processed_tables, batch_size=64):
  # we will use batches of 64 by default

  for i in tqdm(range(0, len(processed_tables), batch_size)):
      # find end of batch
      i_end = min(i+batch_size, len(processed_tables))
      # extract batch
      batch = processed_tables[i:i_end]
      # generate embeddings for batch
      emb = retriever.encode(batch).tolist()
      # create unique IDs ranging from zero to the total number of tables in the dataset
      ids = [f'{idx}' for idx in range(i, i_end)]
      # add all to upsert list
      to_upsert = list(zip(ids, emb))
      # upsert/insert these records to pinecone
      _ = index.upsert(vectors=to_upsert)
  # check that we have all vectors in index

# embed_table_to_pinecone(index=index, processed_tables=test_flatten_tables)

index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 2214}},
 'total_vector_count': 2214}

In [20]:
pinecone.init(
    api_key='cdb240cb-c0c3-4ac1-bdba-aa9c62751239',
    environment='eu-west1-gcp'  # find next to API key in console
)

# you can choose any name for the index
index_name = 'table-qa-train'
# check if the table-qa index exists
if index_name not in pinecone.list_indexes():
    # create the index if it does not exist
    pinecone.create_index(
        index_name,
        dimension=768,
        metric='cosine'
    )
# connect to table-qa index we created
index_qc = pinecone.Index(index_name)

def embed_table_to_pinecone(index, processed_tables, batch_size=64):
  # we will use batches of 64 by default

  for i in tqdm(range(0, len(processed_tables), batch_size)):
      # find end of batch
      i_end = min(i+batch_size, len(processed_tables))
      # extract batch
      batch = processed_tables[i:i_end]
      # generate embeddings for batch
      emb = retriever.encode(batch).tolist()
      # create unique IDs ranging from zero to the total number of tables in the dataset
      ids = [f'{idx}' for idx in range(i, i_end)]
      # add all to upsert list
      to_upsert = list(zip(ids, emb))
      # upsert/insert these records to pinecone
      _ = index.upsert(vectors=to_upsert)
  # check that we have all vectors in index


# embed_table_to_pinecone(index= index_qc, processed_tables=training_question_context)

index_qc.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 41469}},
 'total_vector_count': 41469}

In [None]:
# from transformers import pipeline, TapasTokenizer, TapasForQuestionAnswering
# model_name = 'google/tapas-base-finetuned-wtq'
# # load the tokenizer and the model from huggingface model hub
# tokenizer = TapasTokenizer.from_pretrained(model_name)
# model = TapasForQuestionAnswering.from_pretrained(model_name, local_files_only=False)
# # load the model and tokenizer into a question-answering pipeline
# pipe = pipeline('table-question-answering',  model=model, tokenizer=tokenizer, device=0)

# # pipe(table=df_result.astype(str), query=query)


In [None]:
# def query_pinecone(query, tables):
#     # generate embedding for the query
#     xq = retriever.encode([query]).tolist()
#     # query pinecone index to find the table containing answer to the query
#     result = index.query(xq, top_k=1)
#     # return the relevant table from the tables list
#     return tables[int(result['matches'][0]['id'])]

# def get_answer_from_table(txt_input, query):
#     # run the table and query through the question-answering pipeline
#     answers = pipe(table=txt_input, query=query)
#     return answers

# query = test_examples[0].question
# table = query_pinecone(query, tables_df['table_context'])

# # import io
# # df = pd.read_csv(io.StringIO(table))
# get_answer_from_table(table, query)


In [21]:
def _find_generated_answer(tokens, newline="\n" ): 
    """Our LMs tend to insert initial newline characters before
    they begin generating text. This function ensures that we 
    properly capture the true first line as the answer while
    also ensuring that token probabilities are aligned."""        
    answer_token_indices = []
    char_seen = False            
    for i, tok in enumerate(tokens):
        # This is the main condition: a newline that isn't an initial
        # string of newlines:
        if tok == newline and char_seen:
            break
        # Keep the initial newlines for consistency:
        elif tok == newline and not char_seen:
            answer_token_indices.append(i)
        # Proper tokens:
        elif tok != newline:
            char_seen = True
            answer_token_indices.append(i)
    return answer_token_indices

In [22]:
def build_few_shot_qa_prompt(question, passage, train_exs, joiner="\n\n"):
    segs = []
    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n"""
    segs += [header]
    for t in train_exs:
        if len(t.answers)>=1:
            ans = t.answers[0]
        else:
            ans = t.answers
        segs += [
            f"Title: {t.title}",
            f"Background: {t.context}",
            f"Q: {t.question}",
            f"A: {ans}"
        ]
    title, context = passage.split(" | ", 1)
    segs += [
        f"Title: {title}",
        f"Background: {context}",
        f"Q: {question}",
        f"A:"
    ]
    return joiner.join(segs)

In [1]:
# from neeva.aws import fetch_secret
renee_api_key = ''

def run_gpt_chat(prompt, model="gpt-4", **gpt3_kwargs):

    # Fill this in with the value from your OpenAI account. First
    # verify that your account is set up with a spending limit that
    # you are comfortable with. If you just opened your account,
    # you should have $18 in credit and so won't need to supply any
    # payment information.
    openai.api_key = renee_api_key

    # assert engine.startswith("text"), \
    #     "Please use an engine whose name begins with 'text'."
    data = []
    api_response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an analyst trying to get answer from the table"},
            {"role": "user", "content": prompt},
        ]
    )
    response = api_response["choices"][0]["message"]["content"]
    data.append({
        "prompt": prompt,
        "generated_answer": response})

    return data     


def run_gpt3(prompts, engine= 'text-davinci-003', temperature=0.7, top_p=0.95, **gpt3_kwargs):
    """To use this function, sign up for an OpenAI account at
        
    https://beta.openai.com/signup

    
    Parameters
    ----------
    prompts : iterable of str
    engine : str
        This has to be one of the models whose name begins with "text".
        The "instruct" class of models can't be used, since they seem
        to depend on some kinds of QA-relevant supervision.        
        For options, costs, and other details: 
        https://beta.openai.com/docs/engines/gpt-3                
    temperature : float
        It seems best to set it low for this task!
    top_p : float
        
    For information about values for `gpt3_kwargs`, see
    
    https://beta.openai.com/docs/api-reference/completions
    
    Returns
    -------
    list of dicts   
    
    """
    # Fill this in with the value from your OpenAI account. First
    # verify that your account is set up with a spending limit that
    # you are comfortable with. If you just opened your account,
    # you should have $18 in credit and so won't need to supply any
    # payment information.
    openai.api_key = renee_api_key

    # assert engine.startswith("text"), \
    #     "Please use an engine whose name begins with 'text'."
        
    response = openai.Completion.create(
        engine=engine,      
        prompt=prompts,
#         temperature=temperature,
#         top_p=top_p,
        echo=False,   # This function will not work
        logprobs=1,   # properly if any of these
        n=1)

    # From here, we parse each example to get the values
    # we need:
    data = []
    for ex, prompt in zip(response["choices"], prompts):
        tokens = ex["logprobs"]["tokens"]
        logprobs = ex["logprobs"]["token_logprobs"]        
        probs = list(np.exp(logprobs))
        if "<|endoftext|>" in tokens:
            end_i = tokens.index("<|endoftext|>")
            tokens = tokens[ : end_i]  # This leaves off the "<|endoftext|>"
            probs = probs[ : end_i]    # token -- perhaps dubious.
        ans_indices = _find_generated_answer(tokens)
        answer_tokens = [tokens[i] for i in ans_indices]
        answer_probs = [probs[i] for i in ans_indices]
        answer = "".join(answer_tokens)        
        data.append({
            "prompt": prompt,
            "generated_text": ex["text"],
            "generated_tokens": tokens,
            "generated_probs": probs,
            "generated_answer": answer,
            "generated_answer_tokens": answer_tokens,
            "generated_answer_probs": answer_probs})
    return data     

In [25]:
def normalize_answer(s: str) -> str:
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
        return re.sub(regex, ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def get_tokens(s: str) -> List[str]:
    """Normalize string and split string into tokens."""
    if not s:
        return []
    return normalize_answer(s).split()


def compute_exact(a_gold: str, a_pred: str) -> int:
    """Compute the Exact Match score."""
    return int(normalize_answer(a_gold) == normalize_answer(a_pred))


def compute_f1_from_tokens(gold_toks: List[str], pred_toks: List[str]) -> float:
    """Compute the F1 score from tokenized gold answer and prediction."""
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())

    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)

    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(pred_toks)
    recall = 1.0 * num_same / len(gold_toks)

    f1 = (2 * precision * recall) / (precision + recall)
    return f1

def compute_recall_from_tokens(gold_toks: List[str], pred_toks: List[str]) -> float:
    """Compute the F1 score from tokenized gold answer and prediction."""
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())

    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)

    if num_same == 0:
        return 0

    recall = 1.0 * num_same / len(gold_toks)
    return recall

def compute_precision_from_tokens(gold_toks: List[str], pred_toks: List[str]) -> float:
    """Compute the F1 score from tokenized gold answer and prediction."""
    common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
    num_same = sum(common.values())

    if len(gold_toks) == 0 or len(pred_toks) == 0:
        # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
        return int(gold_toks == pred_toks)

    if num_same == 0:
        return 0

    precision = 1.0 * num_same / len(pred_toks)
    return precision

def compute_recall(a_gold: str, a_pred: str) -> float:
    """Compute the recall score."""
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    return compute_recall_from_tokens(gold_toks, pred_toks)

def compute_precision(a_gold: str, a_pred: str) -> float:
    """Compute the precision score."""
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    return compute_precision_from_tokens(gold_toks, pred_toks)

def compute_f1(a_gold: str, a_pred: str) -> float:
    """Compute the F1 score."""
    gold_toks = get_tokens(a_gold)
    pred_toks = get_tokens(a_pred)
    return compute_f1_from_tokens(gold_toks, pred_toks)

In [192]:
def evaluate(examples, prompts, gens):
    """Generic evalution function.
    
    Parameters
    ----------
    examples: iterable of `SquadExample` instances
    prompts: list of str
    preds: list of LM-generated texts to evaluate as answers
    
    Returns
    -------
    dict with keys "em_per", "macro_f1", "examples", where
    each "examples" value is a dict
    
    """        
    results = []
    for ex, prompt, gen in zip(examples, prompts, gens):
        answers = ex.answers
        pred = extract_answer(gen['generated_answer'])
        # The result is the highest EM from the available answer strings:
        em = max([compute_exact(ans, pred) for ans in answers])
        f1 = max([compute_f1(ans, pred) for ans in answers])
        recall = max([compute_recall(ans, pred) for ans in answers])
        precision = max([compute_precision(ans, pred) for ans in answers])
        gen.update({
            "id": ex.id, 
            "question": ex.question, 
            "prompt": prompt,
            "prediction": pred, 
            "answers": answers, 
            "em": em,
            "f1": f1,
            'recall': recall,
            'precision': precision
        })
        results.append(gen)
    data = {}        
    data["macro_f1"] = np.mean([d['f1'] for d in results])
    data["recall"] = np.mean([d['recall'] for d in results])
    data["precision"] = np.mean([d['precision'] for d in results])
    data["em_per"] = sum([d['em'] for d in results]) / len(results)
    data["examples"] = results
    return data

In [27]:
# from utility.utils.dpr import has_answer, DPR_normalize

def success_at_k(examples,tables, k=20):
    scores = []
    for ex in examples: 
        scores.append(evaluate_retrieval_example(ex,tables, k=k))
    return sum(scores) / len(scores)

# def pinecone_search(query, tables,k=1):
#     # generate embedding for the query
#     xq = retriever.encode([query]).tolist()
#     # query pinecone index to find the table containing answer to the query
#     result = index.query(xq, top_k=k)
#     return result       

def query_pinecone(query, tables, k=3):
    # generate embedding for the query
    xq = retriever.encode([query]).tolist()
    # query pinecone index to find the table containing answer to the query
    result = index.query(xq, top_k=k)
    # return the relevant table from the tables list
    return [tables[int(r['id'])] for r in result['matches']]
            
def evaluate_retrieval_example(ex, tables,k=3):    
    # results = searcher.search(ex.question, k=k)
    results = query_pinecone(ex.question, tables, k=k)
    for passage in results:
        if ex.context == passage:
            return 1
    # for passage_id, passage_score,_ in zip(*results):
    #     passage = searcher.collection[passage_id]
    #     score = has_answer([DPR_normalize(ans) for ans in ex.answers], passage)
    #     if score:
    #         return 1
    return 0

In [40]:
len(test_examples)

2214

In [42]:
%%time
print(success_at_k(test_examples, test_flatten_tables, k=1))

0.9968383017163505
CPU times: user 24min 43s, sys: 5min 9s, total: 29min 52s
Wall time: 6min 21s


In [43]:
%%time
print(success_at_k(test_examples, test_flatten_tables, k=3))

In [44]:
def build_few_shot_open_qa_prompt(question, passage, train_exs, joiner="\n\n"):
    """Few-shot OpenQA prompts.

    Parameters
    ----------
    question : str
    passage : str
        Presumably something retrieved via search.
    train_exs : iterable of SQuAD train examples
        These can be obtained via a random sample from 
        `squad_train` as defined above.
    joiner : str
        The character to use to join pieces of the prompt 
        into a single str.

    Returns
    -------
    str, the prompt

    """
    ##### YOUR CODE HERE
    segs = []  
    for t in train_exs:
        segs += [
            f"Title: {t.title}",
            f"Background: {t.context}",
            f"Q: {t.question}",
            f"A: {t.answers[0]}"
        ]
    title, context = passage.split(" | ", 1)
    segs += [
        f"Title: {title}",
        f"Background: {context}",
        f"Q: {question}",
        f"A:"
    ]
    return joiner.join(segs) 


In [45]:
def answer_scoring(passages, passage_probs, prompts, gen_func=run_gpt3):
    """Implements our basic scoring strategy.

    Parameters
    ----------
    passages : list of str
    passage_probs : list of float
    prompts : list of str
    gen_func : either `run_eleuther` or `run_gpt3`

    Returns
    -------
    list of pairs (score, dict), sorted with the largest score first.
    `dict` should be the return value of `gen_func` for an example.

    """
    data = []
    for passage, passage_prob, prompt in zip(passages, passage_probs, prompts):
        # Run `gen_func` on [prompt] (crucially, the singleton list here),
        # and get the dictionary `gen` from the singleton list `gen_func`
        # returns, and then use the values to score `gen` according to our
        # scoring method.
        #
        # Be sure to use "generated_answer_probs" for the scores.

        gen = gen_func(prompt)[0]
        gen_probs = gen['generated_answer_probs'] 
        score = passage_prob*np.prod(gen_probs)
        data.append((score, gen)) 

    # Return `data`, sorted with the highest scoring `(score, gen)`
    # pair given first.

    return sorted(data, key=lambda x: x[0], reverse=True)


In [128]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def get_top_hyper_passages(question, table_id, k):
    def _find_top_k_relevant_texts(question, context, k):
        vectorizer = TfidfVectorizer()
        # Add the question to the list of context texts
        texts = [question] + list(context.values())
        # Calculate the tf-idf matrix
        tfidf_matrix = vectorizer.fit_transform(texts)
        # Calculate the cosine similarity between the question and context texts
        cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).flatten()
        # Find the indices of the top k most similar texts
        top_k_indices = np.argsort(cosine_similarities)[-k:][::-1]
        # Find the corresponding ids in the context dictionary
        top_k_ids = [list(context.keys())[idx] for idx in top_k_indices]
        return top_k_ids

    # check & return the highest rouge as hack
    json_path = f'https://raw.githubusercontent.com/wenhuchen/OTT-QA/master/data/traindev_request_tok/{table_id}.json'
    with smart_open.open(json_path, 'r') as file:
        hyper_passages = json.load(file)
    tbl_pairs = tables_df[tables_df.uid == table_id]['link_pairs'].values[0]
    tbl_pairs_dict = {k: v for d in tbl_pairs for k, v in d.items()}
    top_k_ids = _find_top_k_relevant_texts(question, hyper_passages, k)
    return [(tbl_pairs_dict[i], f'{tbl_pairs_dict[i]}: {hyper_passages[i]}') for i in  top_k_ids]
    

def get_passages_with_scores(question, exs, tables, k=5, embed_hyper_psg = False):
    """Pseudo-probabilities from the retriever.

    Parameters
    ----------
    question : str
    k : int
        Number of passages to retrieve.

    Returns
    -------
    passages (list of str), passage_probs (np.array)

    """
    # Use the `searcher` to get `k` passages for `questions`:

    # generate embedding for the query
    xq = retriever.encode([question]).tolist()
    # query pinecone index to find the table containing answer to the query
    results = index.query(xq, top_k=k)

    passages, tbl_ids, scores= [],[], []
    for r in results['matches']:
        table_passage = tables[int(r['id'])]
        table_id = exs[int(r['id'])].table_id

        if embed_hyper_psg:
            # TODO implement below; just pick two most relevant passage
            hyper_passages = get_top_hyper_passages(question, table_id, 15)
            for psg in hyper_passages:
                table_passage = table_passage.replace(psg[0], psg[1])
        
        passages.append(table_passage)
        tbl_ids.append(table_id)
        scores.append(r['score'])

    # Softmax normalize the scores and convert the list to
    # a NumPy array:
    import scipy
    passage_probs = scipy.special.softmax(scores)
    
    return passages,tbl_ids

In [47]:
def get_question_context(question, passage, squd_examples):
    """Pseudo-probabilities from the retriever.

    Parameters
    ----------
    question : str
    k : int
        Number of passages to retrieve.

    Returns
    -------
    passages (list of str), passage_probs (np.array)

    """
    # Use the `searcher` to get `k` passages for `questions`:

    # generate embedding for the query
    qc = question + " | " + passage
    xq = retriever.encode([qc]).tolist()
    # query pinecone index to find the table containing answer to the query
    results = index_qc.query(xq, top_k=1)

    return squd_examples[int(results['matches'][0]['id'])]

In [230]:

def create_few_shots_query_exs():
    # USING A FIXED EXAMPLE FOR NOW
    return f"""
    Question: Which award did the first book of Gary Zukav receive?
    Rationale: Let's think step by step. To answer this question, we first need to find out the name of the first book of Gary Zukav.
    Search Query: "What is the name of Gary Zukav's first book?"

    ---

    Question: The heir to the Du Pont family fortune sponsored what wrestling team?
    Rationale: Let's think step by step. To answer this question, we first need to find out who the heir to the Du Pont family fortune is.
    Search Query: "Heir to the Du Pont family fortune"

    ---

    Question: Who was the director of the 2009 movie featuring Peter Outerbridge as William Easton?
    Rationale: Let's think step by step. To answer this question, we first need to find out the name of the 2009 movie featuring Peter Outerbridge as William Easton.
    Search Query: "2009 movie featuring Peter Outerbridge as William Easton"
    """

def create_few_shots_answer_exs(train_exs):
    few_shots = []
    for i in train_exs:
        few_shots.append(f"Question: {i.question} \n Answer: {i.answers[0]}")
    return '\n'.join(few_shots)

def search_query_prompt(few_shot_exs, question):
    return f"""
    Write a search query in a question format that will help answer a complex question.

    ---

    Follow the following format.

    Question: ${{the question to be answered}}
    Rationale: Let's think step by step. To answer this question, we first need to find out ${{the missing information}}
    Search Query: ${{a simple question for seeking the missing information}}

    ---
    {few_shot_exs}
    ---

    Question: {question}
    Rationale: Let's think step by step. To answer this question, we first need to find out
    """

def search_query_w_context_prompt(few_shot_exs, question, context, prior_search_query):
    return f"""
    Write a search query that will help answer a complex question.

    ---

    Follow the following format.

    Context:
    ${{sources that may contain relevant content}}

    Question: ${{the question to be answered}}

    Rationale: Let's think step by step. Based on the context, we have learned the following. ${{information from the context that provides useful clues}}

    Search Query: ${{a simple question for seeking the missing information}}

    ---
    {few_shot_exs}
    ---

    Context:{context}

    Question: {question}

    Rationale: Let's think step by step. First we find {prior_search_query}, based on the context, we have learned the following: 
    """

def create_combined_context(tbl_psg, hyper_psg):
    psg = f"""Table: \n {tbl_psg} \n"""
    hyper_psg_txt = []
    for i in hyper_psg:
        hyper_psg_txt.append(i[1])
    return psg + '\n'.join(hyper_psg_txt)
    
def answer_prompt(few_shots_exs, context, question):
        
    return f"""
    Answer questions with short factoid answers.

    ---
    {few_shots_exs}
    ---

    Follow the following format.

    Context:
    ${{sources that may contain relevant content}}

    Question: ${{the question to be answered}}

    Rationale: Let's think step by step. ${{a step-by-step deduction that identifies the correct response, which will be provided below}}

    Answer: ${{a short factoid answer, often between 1 and 5 words}}

    ---

    Context: {context}

    Question: {question}

    Rationale: Let's think step by step. 
    """

def extract_search_query(output):
    return output.split('Search Query:')[1].strip()

def extract_answer(output):
    answer = output.split('Answer:')[1].strip()
    if len(answer) == 0:
        answer = 'Not-answerable'
    return answer


In [226]:

def run_multi_hop(question, exs, flatten_tables, model="gpt-3.5-turbo"):    
    prompts = []
    generations = []
    
    # hop 0
    init_prompt = search_query_prompt(create_few_shots_query_exs(), question)
    result0 = run_gpt_chat(init_prompt, model)
    prompts.append(result0[0]['prompt'])
    generations.append(result0[0])
    search_query0 = extract_search_query(result0[0]['generated_answer'])
    # RM 0
    passages, table_ids = get_passages_with_scores(question, exs, flatten_tables, k=1)
    
    # hop 1
    prompt1 = search_query_w_context_prompt(create_few_shots_query_exs(), question, passages[0], search_query0)
    result1 = run_gpt_chat(prompt1, model)
    prompts.append(result1[0]['prompt'])
    generations.append(result1[0])
    search_query1 = extract_search_query(result1[0]['generated_answer'])
    # RM 1
    hyper_psgs = get_top_hyper_passages(search_query1, table_ids[0], 3)
    
    # hop 2
    combined_txt = create_combined_context(passages[0], hyper_psgs)
    train_few_shots = create_few_shots_answer_exs(training_examples[:3])
    prompt2 = answer_prompt(train_few_shots, combined_txt, question)
    result2 = run_gpt_chat(prompt2, model)
    prompts.append(result2[0]['prompt'])
    generations.append(result2[0])
    answer = extract_answer(result2[0]['generated_answer'])

    return answer, prompts, generations

def evaluate_my_answer_scoring_multi_hop(
        examples,
        full_examples,
        full_flatten_tables,
        model = "gpt-3.5-turbo"):
    samples = []
    prompts = []
    # A list of dicts that you get from `gen_func`.
    gens = []
    answer = ''

    for exs in examples:
        try:
            answer, prompts_multi_hop, generations_multi_hop = run_multi_hop(exs.question, full_examples, full_flatten_tables, model)
            prompts += [generations_multi_hop]
            # just keep last generation
            gens += [generations_multi_hop[-1]]
            samples.append(exs)
        except:
            pass

        print(exs.question)
        print('expected answer:', exs.answers[0])
        print('predicted answer:', answer)
        print("#"*10)
    return evaluate(samples,prompts,gens)

# result =evaluate_my_answer_scoring(examples=dev_exs,squad_train=squad_train,joiner="\n\n",gen_func=run_gpt3)['macro_f1']


In [200]:
# def answer_scoring_eval(passages, passage_probs, prompts, avg_answer_len=3, gen_func=run_gpt3):
#     """Implements our basic scoring strategy.

#     Parameters
#     ----------
#     passages : list of str
#     passage_probs : list of float
#     prompts : list of str
#     gen_func : either `run_eleuther` or `run_gpt3`

#     Returns
#     -------
#     list of pairs (score, dict), sorted with the largest score first.
#     `dict` should be the return value of `gen_func` for an example.

#     """
#     data = []
#     for passage, passage_prob, prompt in zip(passages, passage_probs, prompts):
#         # Run `gen_func` on [prompt] (crucially, the singleton list here),
#         # and get the dictionary `gen` from the singleton list `gen_func`
#         # returns, and then use the values to score `gen` according to our
#         # scoring method.
#         #
#         # Be sure to use "generated_answer_probs" for the scores.
#         gen = gen_func(prompt)[0]
#         gen_probs = gen['generated_answer_probs'] 
#         score_n = passage_prob*np.minimum(np.prod(gen_probs)*len(gen_probs)/avg_answer_len,1)
#         data.append((score_n, gen,prompt)) 

#     # Return `data`, sorted with the highest scoring `(score, gen)`
#     # pair given first.

#     return sorted(data, key=lambda x: x[0], reverse=True)

def answer_scoring_eval_gpt(prompts, avg_answer_len=3, model = 'gpt-4', gen_func=run_gpt_chat):
    """Implements our basic scoring strategy.

    Parameters
    ----------
    passages : list of str
    prompts : list of str
    gen_func : either `run_gpt_chat`

    Returns
    -------
    list of pairs (score, dict), sorted with the largest score first.
    `dict` should be the return value of `gen_func` for an example.

    """
    data = []
    for prompt in prompts:
        # Run `gen_func` on [prompt] (crucially, the singleton list here),
        # and get the dictionary `gen` from the singleton list `gen_func`
        # returns, and then use the values to score `gen` according to our
        # scoring method.
        #
        # Be sure to use "generated_answer_probs" for the scores.
        gen = gen_func(prompt, model)[0]
        data.append((prompt, gen))

    return data


def evaluate_my_answer_scoring(
        examples,
        flatten_tables,
        squad_train,
        joiner="\n\n",
        gen_func=run_gpt3):

    joiner="\n\n",
    gen_func=run_gpt3

    prompts = []
    # A list of dicts that you get from `gen_func`.
    gens = []
    samples = []

    for exs in examples:
        passages, passage_probs = get_passages_with_scores(exs.question, examples, flatten_tables, k=1)
        train_exs = [] 
        for passage in passages:
            train_exs.append(get_question_context(exs.question, passage, squad_train))
        # TODO modify here - need to build better multi-hop...
        ps_candidates = [build_few_shot_open_qa_prompt(exs.question, p, train_exs) for p in passages]
        try:
#             results = answer_scoring_eval(passages, passage_probs, ps_candidates,avg_answer_len=10,gen_func=run_gpt3)
            results = answer_scoring_eval_gpt(ps_candidates, avg_answer_len=10, model="gpt-3.5-turbo", gen_func=run_gpt_chat)
            prompts += [results[0][1]]
            gens += [results[0][2]]
            samples.append(exs)
        except:
            pass
        print(exs.question)
        print('expected answer:', exs.answers[0])
        print('predicted answer:', results[0][1]['generated_answer'])
        print("#"*10)
    return evaluate(samples,prompts,gens)

# result =evaluate_my_answer_scoring(examples=dev_exs,squad_train=squad_train,joiner="\n\n",gen_func=run_gpt3)['macro_f1']


In [188]:
# # dev_exs = sorted(test_examples, key=lambda x: hash(x.id))[: 50]
# dev_exs = test_examples[0:30]
# result = evaluate_my_answer_scoring(examples=dev_exs, flatten_tables=test_flatten_tables, squad_test = test_examples, squad_train=training_examples,joiner="\n\n",gen_func=run_gpt3)

In [233]:
dev_exs = test_examples[0:30]
result_ = evaluate_my_answer_scoring_multi_hop(examples=dev_exs, full_examples = test_examples, full_flatten_tables=test_flatten_tables)

Who created the series in which the character of Robert , played by actor Nonso Anozie , appeared ?
expected answer: Lynda La Plante
predicted answer: Lynda La Plante.
##########
What did the 2nd championship win at the Sevens Grand Prix Series for the team with the most top 4 finishes qualify them for ?
expected answer: 2016 Summer Olympics
predicted answer: Insufficient information provided to answer the question.
##########
This 70 's Kishore Kumar song was in a film produced by Alankar Chitra and directed by Shanker Mukherjee ?
expected answer: Main Pyaasa Tum
predicted answer: "Main Pyaasa Tum" from the film Faraar.
##########
What is the full birth name of the Bradford A.F.C player that only played for the team in 2011 ?
expected answer: Guy Peter Bromley Branston
predicted answer: Peter Thorne
##########
How many academic staff are at the university in Budapest that has the official abbreviation BME ?
expected answer: 961
predicted answer: Peter Thorne
##########
The 1995 Toohey

In [231]:
# dev_exs = sorted(test_examples, key=lambda x: hash(x.id))[: 50]
dev_exs = test_examples[5:6]
result = evaluate_my_answer_scoring_multi_hop(examples=dev_exs, full_examples = test_examples, full_flatten_tables=test_flatten_tables, model = 'gpt-4')

The 1995 Tooheys 1000 driver who was second-to-last in the Tooheys Top 10 was born where ?
expected answer: Sydney
predicted answer: Sydney
##########


In [232]:
print(result['examples'][0]['prompt'][1]['generated_answer'])

"1995 Tooheys 1000 | "Pos","No",Team,"Driver","Car",TT10,Qual<row>"Pole",015,"Holden Racing Team","Craig Lowndes","Holden VR Commodore",2:11.5540,2:12.32<row>2,1,Winfield Racing,"Mark Skaife","Holden VR Commodore",2:11.9504,2:12.10<row>3,11,"Castrol Perkins Racing","Larry Perkins","Holden VR Commodore",2:12.4759,2:11.58<row>4,7,"Coca-Cola Racing","Wayne Gardner","Holden VR Commodore",2:12.5422,2:12.36<row>5,30,"Peter Jackson Racing","Glenn Seton","Ford EF Falcon",2:12.5438,2:11.60<row>6,35,"Peter Jackson Racing","Alan Jones","Ford EF Falcon",2:12.7075,2:12.79<row>7,18,Shell FAI Racing,"Steven Johnson","Ford EF Falcon",2:13.1149,2:12.46<row>8,17,Shell FAI Racing,"Dick Johnson","Ford EF Falcon",2:13.1854,2:11.90<row>9,25,"Castrol Longhurst Ford","Tony Longhurst","Ford EF Falcon",2:13.8883,2:13.11<row>10,05,"Holden Racing Team","Peter Brock","Holden VR Commodore",DNF,2:12.03<row>"

Based on the context, the second-to-last driver in the Tooheys Top 10 is Tony Longhurst.

Search Query: "Ton

In [None]:
# Random test

In [None]:
# Multi-hop logic 
# First try without stopping criteria
# Then try add stopping criteria
# All try 2 hops