# Enterprise Search Q & A Automation notebook 
_Augmented with document retrieval from Google Enterprise Search_

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/retrieval-augmented/enterprise-search/examples/question_answering.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/retrieval-augmented/enterprise-search/examples/question_answering.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/blob/main/retrieval-augmented/enterprise-search/examples/question_answering.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>

## Install pre-requisites

If running in Colab, install the pre-requisites into the runtime. Otherwise, the notebook is assumed to be running in Vertex AI Workbench, in which case it is recommended to install the pre-requisites from a terminal using the `--user` option.

In [None]:
! pip install google-cloud-aiplatform google-cloud-discoveryengine langchain==0.0.229 pydantic==1.10.8 typing-inspect==0.8.0 typing_extensions==4.5.0 --upgrade --user

---

#### ⚠️ Do not forget to click the "RESTART RUNTIME" button above.

---

## Authenticate

If running in Colab, authenticate with `google.colab.auth`; otherwise the notebook is assumed to be running on Vertex AI Workbench.

In [None]:
import os
import sys
import json
# "google.colab" is present in sys.modules only when the Colab package has been
# loaded, which this notebook uses as the signal that it is running in Colab.
# In any other environment this cell performs no authentication step.
if "google.colab" in sys.modules:
    from google.colab import auth as google_auth
    # Run Colab's user authentication for this runtime.
    google_auth.authenticate_user()

## Configure notebook environment

In [115]:
# Local import of retriever class
import sys, os

# Add the parent directory to the module search path (presumably where the
# `utils` package lives - confirm against the repository layout). The guard
# keeps the cell idempotent: re-running it no longer appends the same
# directory to sys.path again.
if os.path.abspath("..") not in sys.path:
    sys.path.append(os.path.abspath(".."))
from utils.es import es_raw_search_summary

## Column names for the index values defined below

- "Query"
- "Golden Doc"
- "Golden Doc Page Number"
- "Golden Answer"
- "Top 5 Docs"
- "Top 5 extractive answers"
- "Top 5 extractive segments"
- "Answer / Summary"

In [None]:
# Zero-based column positions, assigned in one step from consecutive integers.
# NOTE(review): the order appears to mirror the column names listed in the
# markdown cell before this one ("Query" ... "Answer / Summary") - confirm
# against the actual input file.
(
    queryindex,
    goldendocindex,
    goldendocpagenum,
    goldenanswer,
    top5docsindex,
    top5extansindex,
    top5extsegindex,
    answerindex,
) = range(8)

## Define function to read the header and all rows from the CSV file

In [None]:
def readthequestions(filename, header_name="question"):
    """Read the header and every data row of a CSV file.

    Args:
        filename: Path of the CSV file to read. Its first line is used as the
            header row.
        header_name: Kept for backward compatibility only; it is not used.
            Every column of every row is returned, not a single column.

    Returns:
        A ``(header, questions)`` tuple. ``header`` is the list of column
        names (``None`` when the file is empty) and ``questions`` is a list
        containing one dict per data row, keyed by column name.
    """
    import csv

    # The csv module expects files opened with newline=''; otherwise line
    # endings embedded inside quoted fields are rewritten by the text layer
    # before the parser ever sees them.
    with open(filename, 'r', newline='') as csvfile:
        csv_reader = csv.DictReader(csvfile)
        # Reading fieldnames consumes the header line; the remaining lines
        # are the data rows.
        header = csv_reader.fieldnames
        questions = list(csv_reader)

    # Print the extracted rows
    print(questions)
    return header, questions

## Function to convert search results into a list of dictionaries that we can track

In [None]:
def convert_results_tojson(query, results):
    """Flatten search results into a list of plain dictionaries.

    Args:
        query: The query; it is copied into every returned dictionary.
        results: Iterable of search results. Each result is expected to expose
            ``result.document.id`` and ``result.document.derived_struct_data``
            (a mapping that supports ``.get``). May be empty or ``None``.

    Returns:
        A list of dictionaries. One dictionary is produced for every result
        whose document has a ``derived_struct_data`` attribute; it contains
        ``query``, ``id`` and ``rank`` (``"<position>/<total>"``) plus, when
        the corresponding chunks exist, ``documentlink``,
        ``extractive_answer_content``, ``extractive_answer_content_page``,
        ``extractive_segment_content`` and ``snippet``. When a document has
        several chunks of one type, the last chunk wins. When ``results`` is
        falsy, a single placeholder dictionary with ``None`` values is
        returned instead.
    """
    if not results:
        # Placeholder record so a query without hits is still represented.
        return [{
            "rank": None,
            "query": query,
            "id": None,
            "extractive_segment_content": None,
            "documentlink": None,
            "extractive_answer_content_page": None,
        }]

    jsonresults = []
    for result in results:
        if not hasattr(result.document, "derived_struct_data"):
            continue
        doc_data = result.document.derived_struct_data
        record = {"query": query, "id": result.document.id}

        # Each loop overwrites the same keys, so only the last chunk of a
        # given type is kept for a document.
        for chunk in doc_data.get("extractive_answers", []):
            record["documentlink"] = doc_data.get('link', '')
            record["extractive_answer_content"] = chunk.get("content", "")
            record["extractive_answer_content_page"] = chunk.get('pageNumber', '')

        for chunk in doc_data.get("extractive_segments", []):
            record["extractive_segment_content"] = chunk.get("content", "")

        for chunk in doc_data.get("snippets", []):
            record["snippet"] = chunk.get("snippet", "")

        jsonresults.append(record)

    # Retrofit the ranking once the total number of kept results is known.
    total = len(jsonresults)
    for position, record in enumerate(jsonresults, start=1):
        record['rank'] = str(position) + "/" + str(total)

    return jsonresults

In [None]:
def format_results(results, topN = 5):
    """Collect the first ``topN`` extractive answers and segments as text.

    Args:
        results: Iterable of search results. Each result is expected to expose
            ``result.document.derived_struct_data`` (a mapping that supports
            ``.get``). May be empty or ``None``.
        topN: Maximum number of extractive answers, and separately of
            extractive segments, collected across all results.

    Returns:
        A ``(top5docs, top5segments, top5answers)`` tuple of strings, in that
        order. Every entry in each string is followed by two newline
        characters. ``top5docs`` has one ``Doc: <link>  Page: <page>`` entry
        per collected answer, ``top5segments`` holds the segment contents and
        ``top5answers`` the answer contents, both with embedded newlines
        removed. All three are empty strings when nothing was collected.
    """
    doc_entries = []
    segment_entries = []
    answer_entries = []

    for result in results or []:
        # Nothing more can be collected once both limits are reached.
        if len(answer_entries) >= topN and len(segment_entries) >= topN:
            break
        if not hasattr(result.document, "derived_struct_data"):
            continue
        doc_data = result.document.derived_struct_data

        for chunk in doc_data.get("extractive_answers", []):
            # Enforce the limit per chunk so a document with several answers
            # cannot push the total past topN.
            if len(answer_entries) >= topN:
                break
            answer_entries.append(chunk.get("content", "").replace("\n", ""))
            # str() because the page number is not guaranteed to be a string;
            # concatenating a number here would raise TypeError.
            page = str(chunk.get('pageNumber', ''))
            doc_entries.append("Doc: " + doc_data.get('link', '') + "  " + "Page: " + page)

        for chunk in doc_data.get("extractive_segments", []):
            if len(segment_entries) >= topN:
                break
            segment_entries.append(chunk.get("content", "").replace("\n", ""))

    def _joined(entries):
        # Each entry is terminated by a blank line.
        return "".join(entry + "\n\n" for entry in entries)

    return _joined(doc_entries), _joined(segment_entries), _joined(answer_entries)

### Set the following constants to reflect your environment
* The queries used in the examples here relate to a GCS bucket containing Alphabet investor PDFs, but these should be customised to your own data.

In [None]:
# Replace both placeholders with the values for your environment before
# running the search cell further down; that cell reads PROJECT_ID and
# SEARCH_ENGINE_ID, so they must be defined here.
PROJECT_ID = "<PROJECT_ID>"
SEARCH_ENGINE_ID = "<ES Store ID>"

### Gather all the enterprise search results possible

In [None]:
import utils.es
import csv
import json

# First read the questions: `header` receives the CSV column names and
# `questions` receives one dict per data row of input.csv.
header, questions = readthequestions("input.csv")

In [116]:
# Columns this cell fills in for every question.
result_columns = [
    "Answer / Summary",
    "Top 5 Docs",
    "Top 5 extractive segments",
    "Top 5 extractive answers",
]
# Output columns are the input header followed by any result column the input
# file does not already contain, so the header row and the data rows always
# have the same columns in the same order.
output_columns = list(header or []) + [col for col in result_columns if col not in (header or [])]

with open("output.tsv", "w", newline="") as tsvfile:
    # extrasaction="ignore": keys that are not named columns (e.g. surplus
    # fields of a malformed input row) are skipped instead of raising.
    tsvwriter = csv.DictWriter(tsvfile, fieldnames=output_columns, delimiter='\t', extrasaction="ignore")
    tsvwriter.writeheader()
    for q in questions:
        print("Question is {}".format(q['Query']))
        summary, rawresult = es_raw_search_summary(PROJECT_ID, SEARCH_ENGINE_ID, q['Query'])
        # `output` is the same dict object as `q`, so the results are also
        # stored on the matching entry of `questions`.
        output = q
        output['Answer / Summary'] = summary
        output['Top 5 Docs'], output['Top 5 extractive segments'], output['Top 5 extractive answers'] = format_results(rawresult, topN = 5)
        tsvwriter.writerow(output)

Question is What is braca1?
Filter is :None
Question is is there a test for braca2 ?
Filter is :None
