# **Vertexai Init**

In [1]:
import sys

# Additional authentication is required for Google Colab
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

In [2]:
PROJECT_ID = "gen-lang-client-0341374211"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

if "google.colab" in sys.modules:
    # Define project information
    PROJECT_ID = PROJECT_ID
    LOCATION = LOCATION

    # Initialize Vertex AI
    import vertexai
    vertexai.init(project=PROJECT_ID, location=LOCATION)

In [3]:
search_query = """Sea food near Googleplex
1600 Amphitheatre Parkway
Mountain View, CA 94043
United States"""
#'how to make a great pastrami sandwich'

# **Realtime Google Search with Langchain**

In [4]:
!pip install -U duckduckgo_search
!python3 -m pip install googlesearch-python
!pip install -q langchain playwright beautifulsoup4 html2text

Collecting duckduckgo_search
  Downloading duckduckgo_search-4.1.0-py3-none-any.whl (25 kB)
Collecting curl-cffi>=0.5.10 (from duckduckgo_search)
  Downloading curl_cffi-0.5.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m53.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: curl-cffi, duckduckgo_search
Successfully installed curl-cffi-0.5.10 duckduckgo_search-4.1.0
Collecting googlesearch-python
  Downloading googlesearch-python-1.2.3.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: googlesearch-python
  Building wheel for googlesearch-python (setup.py) ... [?25l[?25hdone
  Created wheel for googlesearch-python: filename=googlesearch_python-1.2.3-py3-none-any.whl size=4209 sha256=d4143c48a3be7c07ea91a1be8ce2b2e5774956e9537aa63bf4007d9dd66b53db
  Stored in directory: /root/.cache/pip/wheels/98/24/e9/6c22550294

In [5]:
search_query = 'Sea food near Googleplex\n1600 Amphitheatre Parkway\nMountain View, CA 94043\nUnited States'

In [6]:
google_search_results = []
structured_response = []

In [7]:
number_of_results = 2
from googlesearch import search
results = search(search_query, lang="en", num_results=number_of_results)

In [8]:
for result in results:
  if not result.startswith("https://www.tripadvisor.com"):
    google_search_results.append(result)

In [9]:
google_search_results

['https://www.yelp.com/search?cflt=seafood&find_loc=Mountain+View%2C+CA+94043',
 'https://us.trip.com/travel-guide/mountain-view-34682-restaurant/googleplex-18697240/']

In [10]:
import html2text
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer

async def do_webscraping(link):
    try:
        urls = [link]
        loader = AsyncHtmlLoader(urls)
        docs = loader.load()

        html2text_transformer = Html2TextTransformer()
        docs_transformed = html2text_transformer.transform_documents(docs)

        if docs_transformed != None and len(docs_transformed) > 0:
            metadata = docs_transformed[0].metadata
            title = metadata.get('title', '')
            return {
                'summary': docs_transformed[0].page_content,
                'title': title,
                'metadata': metadata,
                'clean_content': html2text.html2text(docs_transformed[0].page_content)
            }
        else:
            return None

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

In [11]:
for link in google_search_results:
  print(link)
  response = await do_webscraping(link)
  if response != None:
    structured_response.append(response)

https://www.yelp.com/search?cflt=seafood&find_loc=Mountain+View%2C+CA+94043


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.63s/it]


https://us.trip.com/travel-guide/mountain-view-34682-restaurant/googleplex-18697240/


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.42s/it]


In [12]:
structured_response

[{'summary': 'Yelp\n\nYelp for Business\n\nWrite a Review\n\nLog InSign Up\n\nRestaurants\n\nDelivery\n\nBurgers\n\nChinese\n\nItalian\n\nReservations\n\nJapanese\n\nMexican\n\nThai\n\nHome Services\n\nContractors\n\nElectricians\n\nHome Cleaners\n\nHVAC\n\nLandscaping\n\nLocksmiths\n\nMovers\n\nPlumbers\n\nAuto Services\n\nAuto Repair\n\nAuto Detailing\n\nBody Shops\n\nCar Wash\n\nCar Dealers\n\nOil Change\n\nParking\n\nTowing\n\nMore\n\nDry Cleaning\n\nPhone Repair\n\nBars\n\nNightlife\n\nHair Salons\n\nGyms\n\nMassage\n\nShopping\n\nMore\n\nFilters\n\n$$$$$$$$$$\n\nSuggested\n\nOpen Now\n\n\\--:--\n\nOffers Delivery\n\nReservations\n\nFree Wi-Fi\n\nOutdoor Seating\n\nDogs Allowed\n\nFeatures\n\nOffers Takeout\n\nGood for Groups\n\nGood for Dinner\n\nGood for Kids\n\nSee all\n\nDistance\n\nBird\'s-eye View\n\nDriving (5 mi.)\n\nBiking (2 mi.)\n\nWalking (1 mi.)\n\nWithin 4 blocks\n\nYelpRestaurantsSeafood\n\n# The Best 10 Seafood Restaurants near Mountain View, CA 94043\n\nSort:Recom

# **Information Extraction**

In [13]:
import re
import json

def extract_json(input_string):
    # Extract JSON within ``` block
    matches = re.findall(r'```(.*?)```', input_string, re.DOTALL)

    if matches:
        # Join the matches into a single string
        json_content = ''.join(matches)

        # Remove periods
        json_content = re.sub(r'\.', '', json_content)

        return json_content
    else:
        print("No ``` block found.")
        return None

In [14]:
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Part

def execute_prompt(prompt, max_output_tokens=8192):
  model = GenerativeModel("gemini-pro")
  responses = model.generate_content(
    prompt,
    generation_config={
        "max_output_tokens": max_output_tokens,
        "temperature": 0,
        "top_p": 1
    },
  stream=True,
  )

  final_response = []

  for response in responses:
      final_response.append(response.candidates[0].content.parts[0].text)

  return ".".join(final_response)

In [15]:
def get_text_extract_prompt(title, summary):
  prompt = f"""
  Here is its title: {title}
  Here is some text extracted:
  ---------
  {summary}
  ---------

  Web pages can have a lot of useless junk in them.
  For example, there might be a lot of ads, or a
  lot of navigation links, or a lot of text that
  is not relevant to the topic of the page. We want
  to extract only the useful information from the text.

  You can use the url and title to help you understand
  the context of the text.
  Please extract only the useful information from the text.
  Try not to rewrite the text, but instead extract
  only the useful information from the text.
  """
  return prompt

In [16]:
summarries = []

In [17]:
for structured_response_item in structured_response:
    title = structured_response_item['title']
    summary = structured_response_item['summary']
    if summary != "<html><body></body></html>":
      print(f'Summary for Title: {title}\n')
      text_extract_prompt = get_text_extract_prompt(title, summary)
      prompt_response = execute_prompt(text_extract_prompt)
      summarries.append(prompt_response)

Summary for Title: THE BEST 10 Seafood Restaurants near MOUNTAIN VIEW, CA 94043 - Last Updated December 2023 - Yelp

Summary for Title: Googleplex restaurants, addresses, phone numbers, photos, real user reviews, 1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA, Mountain View restaurant recommendations - Trip.com



In [18]:
summarries

['1. Limón: Peruvian seafood and cocktail bar with a delicious menu of mouth.-watering seafood and land animal options.\n2. Pacific Catch: Seafood,. tacos, and sushi bar with outdoor seating and Korean-style seafood pancakes.\n3. The Sea by Alexander’s Steakhouse: Seafood, steakhouse,. and bar with fresh seafood and jumbo shrimp.\n4. Cap’t Loui: Seafood, fish & chips, and Cajun/Creole with free parking. and large group friendly options.\n5. Rustic House Oyster Bar and Grill - Los Altos: Seafood, bar, and American with a grouper special and outdoor seating.\n6. The City Fish: Sandwiches, seafood, and fish &. chips with big portions of quality seafood at a reasonable price.\n7. King’s Fish House - San Jose: Seafood with outdoor seating and a tartare sauce with a great balance of spicy and seafood taste.\n8. La M.area of the Sea: Seafood food stand with fresh oysters and great condiments.\n9. Supreme Crab: Seafood and Cajun/Creole with sports on TV and large group friendly options.\n10. G

# **Retrieval-augmented generation (RAG)**

In [19]:
!pip install -U -q google.generativeai

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/146.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/146.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━[0m [32m81.9/146.9 kB[0m [31m1.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m146.9/146.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [20]:
%pip install -Uq chromadb pydantic typing-extensions==4.6.0

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m508.6/508.6 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m381.9/381.9 kB[0m [31m30.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.1/93.1 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.3/60.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m78.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.9/57.9 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━

In [21]:
import textwrap
import chromadb
import numpy as np
import pandas as pd
from google.colab import userdata

import google.generativeai as genai
import google.ai.generativelanguage as glm

# Used to securely store your API key
from google.colab import userdata

from IPython.display import Markdown
from chromadb import Documents, EmbeddingFunction, Embeddings
genai.configure(api_key=userdata.get('google_key'))

In [22]:
collection_name = 'localstore'

In [23]:
class GeminiEmbeddingFunction(EmbeddingFunction):
  def __call__(self, input: Documents) -> Embeddings:
    model = 'models/embedding-001'
    title = "Custom query"
    return genai.embed_content(model=model,
                                content=input,
                                task_type="retrieval_document",
                                title=title)["embedding"]
def create_chroma_db(documents, name):
  chroma_client = chromadb.Client()
  db = chroma_client.create_collection(name=name, embedding_function=GeminiEmbeddingFunction())

  for i, d in enumerate(documents):
    db.add(
      documents=d,
      ids=str(i)
    )
  return db

In [24]:
# Set up the DB
db = create_chroma_db(summarries, collection_name)

In [25]:
pd.DataFrame(db.peek(3))

Unnamed: 0,ids,embeddings,metadatas,documents,uris,data
0,0,"[0.059015557169914246, -0.04665600508451462, -...",,1. Limón: Peruvian seafood and cocktail bar wi...,,
1,1,"[0.038881026208400726, -0.018561283126473427, ...",,- Googleplex is located at 1600 Amphitheatre P...,,


In [26]:
def get_relevant_passage(query, db):
  passage = db.query(query_texts=[query], n_results=1)['documents'][0][0]
  return passage

In [27]:
def make_prompt(query, relevant_passage):
  escaped = relevant_passage.replace("'", "").replace('"', "").replace("\n", " ")
  prompt = ("""You are a helpful and informative bot that answers questions using text from the reference passage included below. \
  If the passage is irrelevant to the answer, you may ignore it.
  QUESTION: '{query}'
  PASSAGE: '{relevant_passage}'

    ANSWER:
  """).format(query=query, relevant_passage=escaped)

  return prompt

In [28]:
query = "where can I get seafood"
passage = get_relevant_passage(query, db)
Markdown(passage)
prompt = make_prompt(query, passage)
Markdown(prompt)

You are a helpful and informative bot that answers questions using text from the reference passage included below.   If the passage is irrelevant to the answer, you may ignore it.
  QUESTION: 'where can I get seafood'
  PASSAGE: '1. Limón: Peruvian seafood and cocktail bar with a delicious menu of mouth.-watering seafood and land animal options. 2. Pacific Catch: Seafood,. tacos, and sushi bar with outdoor seating and Korean-style seafood pancakes. 3. The Sea by Alexander’s Steakhouse: Seafood, steakhouse,. and bar with fresh seafood and jumbo shrimp. 4. Cap’t Loui: Seafood, fish & chips, and Cajun/Creole with free parking. and large group friendly options. 5. Rustic House Oyster Bar and Grill - Los Altos: Seafood, bar, and American with a grouper special and outdoor seating. 6. The City Fish: Sandwiches, seafood, and fish &. chips with big portions of quality seafood at a reasonable price. 7. King’s Fish House - San Jose: Seafood with outdoor seating and a tartare sauce with a great balance of spicy and seafood taste. 8. La M.area of the Sea: Seafood food stand with fresh oysters and great condiments. 9. Supreme Crab: Seafood and Cajun/Creole with sports on TV and large group friendly options. 10. Gochi - Mountain View: Japanese, wine bar, and seafood with a live wait time of 31-.46 minutes.'

    ANSWER:
  

In [29]:
model = genai.GenerativeModel('gemini-pro')
answer = model.generate_content(prompt)
Markdown(answer.text)

1. Limón: Peruvian seafood and cocktail bar with a delicious menu of mouth.-watering seafood and land animal options. 
2. Pacific Catch: Seafood,. tacos, and sushi bar with outdoor seating and Korean-style seafood pancakes. 
3. The Sea by Alexander’s Steakhouse: Seafood, steakhouse,. and bar with fresh seafood and jumbo shrimp. 
4. Cap’t Loui: Seafood, fish & chips, and Cajun/Creole with free parking. and large group friendly options. 
5. Rustic House Oyster Bar and Grill - Los Altos: Seafood, bar, and American with a grouper special and outdoor seating. 
6. The City Fish: Sandwiches, seafood, and fish &. chips with big portions of quality seafood at a reasonable price. 
7. King’s Fish House - San Jose: Seafood with outdoor seating and a tartare sauce with a great balance of spicy and seafood taste. 
8. La M.area of the Sea: Seafood food stand with fresh oysters and great condiments. 
9. Supreme Crab: Seafood and Cajun/Creole with sports on TV and large group friendly options. 
10. Gochi - Mountain View: Japanese, wine bar, and seafood with a live wait time of 31-.46 minutes.