# **Vertexai Init**

In [1]:
import sys

# Additional authentication is required for Google Colab
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

In [2]:
PROJECT_ID = "gen-lang-client-0341374211"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

if "google.colab" in sys.modules:
    # Define project information
    PROJECT_ID = PROJECT_ID
    LOCATION = LOCATION

    # Initialize Vertex AI
    import vertexai
    vertexai.init(project=PROJECT_ID, location=LOCATION)

# **Realtime Google Search with Langchain**

In [3]:
search_query = """Sea food near Googleplex
1600 Amphitheatre Parkway
Mountain View, CA 94043
United States"""

In [4]:
!pip install -U duckduckgo_search
!python3 -m pip install googlesearch-python
!pip install -q langchain playwright beautifulsoup4 html2text

Collecting duckduckgo_search
  Downloading duckduckgo_search-4.1.0-py3-none-any.whl (25 kB)
Collecting curl-cffi>=0.5.10 (from duckduckgo_search)
  Downloading curl_cffi-0.5.10-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: curl-cffi, duckduckgo_search
Successfully installed curl-cffi-0.5.10 duckduckgo_search-4.1.0
Collecting googlesearch-python
  Downloading googlesearch-python-1.2.3.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: googlesearch-python
  Building wheel for googlesearch-python (setup.py) ... [?25l[?25hdone
  Created wheel for googlesearch-python: filename=googlesearch_python-1.2.3-py3-none-any.whl size=4209 sha256=86c7795aaa152e5b0a303b7138934f002913deae780fd9c9484361579b214250
  Stored in directory: /root/.cache/pip/wheels/98/24/e9/6c22550294

In [5]:
search_query = 'Sea food near Googleplex\n1600 Amphitheatre Parkway\nMountain View, CA 94043\nUnited States'

In [6]:
google_search_results = []
structured_response = []

In [7]:
number_of_results = 2
from googlesearch import search
results = search(search_query, lang="en", num_results=number_of_results)

In [8]:
for result in results:
  if not result.startswith("https://www.tripadvisor.com"):
    google_search_results.append(result)

In [9]:
google_search_results

['https://www.yelp.com/search?cflt=seafood&find_loc=Mountain+View%2C+CA+94043',
 'https://us.trip.com/travel-guide/mountain-view-34682-restaurant/googleplex-18697240/']

In [10]:
import html2text
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import Html2TextTransformer

async def do_webscraping(link):
    try:
        urls = [link]
        loader = AsyncHtmlLoader(urls)
        docs = loader.load()

        html2text_transformer = Html2TextTransformer()
        docs_transformed = html2text_transformer.transform_documents(docs)

        if docs_transformed != None and len(docs_transformed) > 0:
            metadata = docs_transformed[0].metadata
            title = metadata.get('title', '')
            return {
                'summary': docs_transformed[0].page_content,
                'title': title,
                'metadata': metadata,
                'clean_content': html2text.html2text(docs_transformed[0].page_content)
            }
        else:
            return None

    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

In [11]:
for link in google_search_results:
  print(link)
  response = await do_webscraping(link)
  if response != None:
    structured_response.append(response)

https://www.yelp.com/search?cflt=seafood&find_loc=Mountain+View%2C+CA+94043


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.44s/it]


https://us.trip.com/travel-guide/mountain-view-34682-restaurant/googleplex-18697240/


Fetching pages: 100%|##########| 1/1 [00:01<00:00,  1.79s/it]


In [12]:
structured_response

[{'summary': 'Yelp\n\nYelp for Business\n\nWrite a Review\n\nLog InSign Up\n\nRestaurants\n\nDelivery\n\nBurgers\n\nChinese\n\nItalian\n\nReservations\n\nJapanese\n\nMexican\n\nThai\n\nHome Services\n\nContractors\n\nElectricians\n\nHome Cleaners\n\nHVAC\n\nLandscaping\n\nLocksmiths\n\nMovers\n\nPlumbers\n\nAuto Services\n\nAuto Repair\n\nAuto Detailing\n\nBody Shops\n\nCar Wash\n\nCar Dealers\n\nOil Change\n\nParking\n\nTowing\n\nMore\n\nDry Cleaning\n\nPhone Repair\n\nBars\n\nNightlife\n\nHair Salons\n\nGyms\n\nMassage\n\nShopping\n\nMore\n\nFilters\n\n$$$$$$$$$$\n\nSuggested\n\nOpen Now\n\n\\--:--\n\nOffers Delivery\n\nReservations\n\nFree Wi-Fi\n\nOutdoor Seating\n\nDogs Allowed\n\nFeatures\n\nOffers Takeout\n\nGood for Groups\n\nGood for Dinner\n\nGood for Kids\n\nSee all\n\nDistance\n\nBird\'s-eye View\n\nDriving (5 mi.)\n\nBiking (2 mi.)\n\nWalking (1 mi.)\n\nWithin 4 blocks\n\nYelpRestaurantsSeafood\n\n# The Best 10 Seafood Restaurants near Mountain View, CA 94043\n\nSort:Recom

# **Information Extraction**

In [13]:
import re
import json

def extract_json(input_string):
    # Extract JSON within ``` block
    matches = re.findall(r'```(.*?)```', input_string, re.DOTALL)

    if matches:
        # Join the matches into a single string
        json_content = ''.join(matches)

        # Remove periods
        json_content = re.sub(r'\.', '', json_content)

        return json_content
    else:
        print("No ``` block found.")
        return None

In [14]:
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Part

def execute_prompt(prompt, max_output_tokens=8192):
  model = GenerativeModel("gemini-pro")
  responses = model.generate_content(
    prompt,
    generation_config={
        "max_output_tokens": max_output_tokens,
        "temperature": 0,
        "top_p": 1
    },
  stream=True,
  )

  final_response = []

  for response in responses:
      final_response.append(response.candidates[0].content.parts[0].text)

  return ".".join(final_response)

In [15]:
def get_text_extract_prompt(title, summary):
  prompt = f"""
  Here is its title: {title}
  Here is some text extracted:
  ---------
  {summary}
  ---------

  Web pages can have a lot of useless junk in them.
  For example, there might be a lot of ads, or a
  lot of navigation links, or a lot of text that
  is not relevant to the topic of the page. We want
  to extract only the useful information from the text.

  You can use the url and title to help you understand
  the context of the text.
  Please extract only the useful information from the text.
  Try not to rewrite the text, but instead extract
  only the useful information from the text.
  """
  return prompt

In [16]:
summarries = []

In [17]:
for structured_response_item in structured_response:
    title = structured_response_item['title']
    summary = structured_response_item['summary']
    if summary != "<html><body></body></html>":
      print(f'Summary for Title: {title}\n')
      text_extract_prompt = get_text_extract_prompt(title, summary)
      prompt_response = execute_prompt(text_extract_prompt)
      summarries.append(prompt_response)

Summary for Title: THE BEST 10 Seafood Restaurants near MOUNTAIN VIEW, CA 94043 - Last Updated December 2023 - Yelp

Summary for Title: Googleplex restaurants, addresses, phone numbers, photos, real user reviews, 1600 Amphitheatre Pkwy, Mountain View, CA 94043, USA, Mountain View restaurant recommendations - Trip.com



In [18]:
summarries

['1. Limón: Peruvian seafood and cocktail bar with a delicious menu featuring mouth.-watering seafood and land animal options.\n2. Pacific Catch: Seafood,. tacos, and sushi bar with Korean-style seafood pancakes and poke.\n3. The Sea by Alexander’s Steakhouse: Seafood, steakhouse, and. bar with fresh seafood and jumbo shrimp.\n4. La Marea of the Sea: Seafood food stand with fresh oysters and great condiments.\n5. Cap.’t Loui: Seafood, fish & chips, and Cajun/Creole restaurant with a large group-friendly atmosphere.\n6. Rustic House Oyster Bar and Grill - Los Altos: Seafood, bar, and American restaurant with a grou.per special.\n7. King’s Fish House - San Jose: Seafood restaurant with a tartare sauce that has a great balance of spicy and seafood taste.\n8. Cook’s Seafood: Seafood, seafood market, and fish. & chips restaurant with big portions of quality seafood at a reasonable price.\n9. The City Fish: Sandwiches, seafood, and fish & chips restaurant with big portions of quality seafood

# **Topic Extraction**

In [19]:
topics = []

In [20]:
def get_topic_extraction_prompt(content):
    prompt = f"""Label the main topic or topics in the following text: {content}"""
    prompt = prompt + """1. Identify and list the primary topic or category or provide a short description of the main subject matter of the text.
      2. If there are subtopics or secondary themes mentioned in the text, list them as well. If the text discusses multiple topics, provide a list of these topics and describe their relevance.
      3. Consider the context and tone of the text to determine the most appropriate topics. Take into account keywords, phrases, or specific terms that relate to the topics.
      4. If any notable entities (people, places, brands, products, etc.) are mentioned in the text that play a role in the topics, mention them and their associations.
      5. If the text suggests any actions, decisions, or recommendations related to the identified topics, provide a brief summary of these insights.

      Ensure that your labeling is clear, concise, and reflects the most significant topics or categories found in the text.

      Here's the output schema:

      ```
      {
          "Topic": "",
          "Subtopics": [""],
          "Context": "",
          "NotableEntities": [],
          "Recommendations": ""
      }
      ```

      Do not respond with your own suggestions or recommendations or feedback."""
    return prompt


In [21]:
for structured_response_item in structured_response:
  title = structured_response_item['title']
  summary = structured_response_item['summary']
  if summary != "<html><body></body></html>":
    print(f'Topics: {title}\n')
    prompt = get_topic_extraction_prompt(summary)
    response = execute_prompt(prompt)
    print(response)
    extracted_json = extract_json(response)
    if extracted_json != None:
      topics.append(extracted_json)

Topics: THE BEST 10 Seafood Restaurants near MOUNTAIN VIEW, CA 94043 - Last Updated December 2023 - Yelp

```
{
  "Topic": "Seafood Restaurants",
  "Sub.topics": [
    "Top 10 Seafood Restaurants near Mountain View,. CA 94043",
    "Related Searches in Mountain View, CA",
    "Trending Searches in Mountain View, CA",
    ."Related Articles",
    "Frequently Asked Questions and Answers"
  ],
  "Context": "The text is a Yelp page that lists the top. 10 seafood restaurants near Mountain View, CA. It also includes related searches, trending searches, related articles, and frequently asked questions about seafood restaurants in the area.",
  "NotableEntities": [
    "Yelp",
.    "Mountain View, CA",
    "Limón",
    "Pacific Catch",
    "The Sea by Alexander's Steakhouse",
    "La Marea of the Sea",
    "Cap't Lou.i",
    "Rustic House Oyster Bar and Grill - Los Altos",
    "King's Fish House - San Jose",
    "Cook's Seafood",
    "The City Fish",
    "Gochi - Mountain View"
  ],
  "Recomme

In [22]:
topics

['\n{\n  "Topic": "Seafood Restaurants",\n  "Subtopics": [\n    "Top 10 Seafood Restaurants near Mountain View, CA 94043",\n    "Related Searches in Mountain View, CA",\n    "Trending Searches in Mountain View, CA",\n    "Related Articles",\n    "Frequently Asked Questions and Answers"\n  ],\n  "Context": "The text is a Yelp page that lists the top 10 seafood restaurants near Mountain View, CA It also includes related searches, trending searches, related articles, and frequently asked questions about seafood restaurants in the area",\n  "NotableEntities": [\n    "Yelp",\n    "Mountain View, CA",\n    "Limón",\n    "Pacific Catch",\n    "The Sea by Alexander\'s Steakhouse",\n    "La Marea of the Sea",\n    "Cap\'t Loui",\n    "Rustic House Oyster Bar and Grill - Los Altos",\n    "King\'s Fish House - San Jose",\n    "Cook\'s Seafood",\n    "The City Fish",\n    "Gochi - Mountain View"\n  ],\n  "Recommendations": []\n}\n',
 '\n{\n  "Topic": "Googleplex",\n  "Subtopics": [\n    "Reviews o

# **Keyword Extraction**

In [23]:
keywords = []

In [24]:
def get_keyword_extraction_prompt(content):
    prompt = f"""Extract key keywords or phrases from the following text: {content}"""
    prompt = prompt + """1. Identify and list the most important keywords or key phrases in the text. These keywords should capture the main topics, concepts, or subjects discussed in the text.
      2. If there are subtopics or secondary themes mentioned in the text, list them as well. Ensure that the extracted keywords accurately represent the content's context.
      3. Include the exact text span or sentence where each keyword or phrase is found in the original text.
      4. If there are any ambiguous keywords or phrases, indicate the uncertainty and provide possible interpretations or context that might clarify the intended meaning.
      5. Consider the context, relevance, and frequency of the keywords when determining their significance.
      6. If the text suggests any actions, decisions, or recommendations related to the extracted keywords, provide a brief summary of these insights.

      Ensure that your keyword extraction results are relevant, concise, and capture the essential topics within the text.

      Here's the output schema:

      ```
      {
          "KeywordExtraction": [
              {
                  "Keyword": "",
                  "Context": "",
                  "TextSpan": ""
              }
          ]
      }
      ```

      Do not respond with your own suggestions or recommendations or feedback.
    """
    return prompt


In [25]:
summarries

['1. Limón: Peruvian seafood and cocktail bar with a delicious menu featuring mouth.-watering seafood and land animal options.\n2. Pacific Catch: Seafood,. tacos, and sushi bar with Korean-style seafood pancakes and poke.\n3. The Sea by Alexander’s Steakhouse: Seafood, steakhouse, and. bar with fresh seafood and jumbo shrimp.\n4. La Marea of the Sea: Seafood food stand with fresh oysters and great condiments.\n5. Cap.’t Loui: Seafood, fish & chips, and Cajun/Creole restaurant with a large group-friendly atmosphere.\n6. Rustic House Oyster Bar and Grill - Los Altos: Seafood, bar, and American restaurant with a grou.per special.\n7. King’s Fish House - San Jose: Seafood restaurant with a tartare sauce that has a great balance of spicy and seafood taste.\n8. Cook’s Seafood: Seafood, seafood market, and fish. & chips restaurant with big portions of quality seafood at a reasonable price.\n9. The City Fish: Sandwiches, seafood, and fish & chips restaurant with big portions of quality seafood

In [26]:
for summary in summarries:
  if summary != "":
    prompt = get_keyword_extraction_prompt(summary)
    response = execute_prompt(prompt)
    extracted_json = extract_json(response)
    if extracted_json != None:
      keywords.append(extracted_json)

In [27]:
keywords

['\n{\n  "KeywordExtraction": [\n    {\n      "Keyword": "Limón",\n      "Context": "Limón: Peruvian seafood and cocktail bar with a delicious menu featuring mouth-watering seafood and land animal options",\n      "TextSpan": "Limón"\n    },\n    {\n      "Keyword": "Seafood",\n      "Context": "Limón: Peruvian seafood and cocktail bar with a delicious menu featuring mouth-watering seafood and land animal options",\n      "TextSpan": "seafood"\n    },\n    {\n      "Keyword": "Cocktail bar",\n      "Context": "Limón: Peruvian seafood and cocktail bar with a delicious menu featuring mouth-watering seafood and land animal options",\n      "TextSpan": "cocktail bar"\n    },\n    {\n      "Keyword": "Pacific Catch",\n      "Context": "Pacific Catch: Seafood, tacos, and sushi bar with Korean-style seafood pancakes and poke",\n      "TextSpan": "Pacific Catch"\n    },\n    {\n      "Keyword": "Seafood tacos",\n      "Context": "Pacific Catch: Seafood, tacos, and sushi bar with Korean-style s

# **Automated Tagger**

In [28]:
taggers = []

In [29]:
def get_automated_tagger_extraction_prompt(content):
    prompt = f"""Automate the tagging of the following unstructured data: {content}"""
    prompt = prompt + """1. Identify and extract the most relevant tags, keywords, or categories for the given data. These tags should succinctly represent the content's main themes, subjects, or topics.
        2. List the extracted tags, and provide a brief description or rationale for each tag to help users understand their significance.
        3. If there are subcategories or hierarchies in the tags, ensure that they are appropriately nested or organized.
        4. Consider the context, content, and domain-specific knowledge when selecting tags. Ensure that the tags accurately reflect the essence of the data.
        5. If any tags are ambiguous or could have multiple interpretations, address these challenges and provide explanations for the chosen tags.
        6. If there are specific tasks or analyses where the tagged data will be used, describe these use cases and how the tags are expected to be applied.
        7. If the data contains temporal or dynamic elements, mention any trends, changes, or time-sensitive aspects that might impact the tags.

        Ensure that your automated tagging results are clear, relevant, and make the data more accessible and useful.

        Here's the output schema:

        ```
        {
            "AutomatedTagging": {
                "Tags": [
                    {
                        "Tag": "",
                        "Sentences": []
                    }
                ]
            }
        }
        ```

        Do not respond with your own suggestions or recommendations or feedback.
    """
    return prompt


In [30]:
for summary in summarries:
  if summary != "":
    prompt = get_automated_tagger_extraction_prompt(summary)
    response = execute_prompt(prompt)
    extracted_json = extract_json(response)
    if extracted_json != None:
      taggers.append(extracted_json)

In [31]:
taggers

['\n{\n    "AutomatedTagging": {\n        "Tags": [\n            {\n                "Tag": "Seafood",\n                "Sentences": [\n                    "Limón: Peruvian seafood and cocktail bar with a delicious menu featuring mouth-watering seafood and land animal options",\n                    "Pacific Catch: Seafood, tacos, and sushi bar with Korean-style seafood pancakes and poke",\n                    "The Sea by Alexander’s Steakhouse: Seafood, steakhouse, and bar with fresh seafood and jumbo shrimp",\n                    "La Marea of the Sea: Seafood food stand with fresh oysters and great condiments",\n                    "Cap’t Loui: Seafood, fish & chips, and Cajun/Creole restaurant with a large group-friendly atmosphere",\n                    "Rustic House Oyster Bar and Grill - Los Altos: Seafood, bar, and American restaurant with a grouper special",\n                    "King’s Fish House - San Jose: Seafood restaurant with a tartare sauce that has a great balance of spi

# **Intent Extraction**

In [32]:
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Part

def get_intent_extraction_prompt(content):
  schema = """
  ```"intents":[
    "intent": "",
    "statement": ""
  ]```
  """
  prompt = f"""You are an expert intent detector. Your job is to detect and list down all the intents within the below content. Output the same in the specified JSON schema format.
    Here's the content:
    ---
    {content}
    ---
    Here's the schema:
    {schema}
    Do not respond with your own suggestions or recommendations or feedback.
 """
  return prompt

In [33]:
intents = []

In [34]:
instruct_prompt = f"get me the summary for the following content"
prompt = get_intent_extraction_prompt(instruct_prompt)
response = execute_prompt(prompt)
extracted_json = extract_json(response)
if extracted_json != None:
  intents.append(extracted_json)

In [35]:
intents

['\n{\n  "intents": [\n    {\n      "intent": "GetSummary",\n      "statement": "get me the summary for the following content"\n    }\n  ]\n}\n']