# Getting Started


In [102]:

PROJECT_ID = "analytics-ml-ai"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}
ME_REGION = "us-central1"
ME_DIMENSIONS = 768 # when using Vertex PaLM Embedding
ME_DISPLAY_NAME = "rfpbot_all_products_stage"
ME_DESCRIPTION = "rfpbot across all products stage"
ME_EMBEDDING_DIR   = "gs://rfpbot-stage-me" # @param {type:"string"}
DOC_LIMIT = 10 #0 allows for all links to be scraped
CHUNK_SIZE = 3500
CHUNK_OVERLAP = 100

In [97]:
# Utils
import uuid
import json
import time
import uuid
from typing import List
import numpy as np
import requests
import os


# HTML

from urllib.parse import urljoin
from bs4 import BeautifulSoup

# Vertex AI
import vertexai
from google.cloud import aiplatform
print(f"Vertex AI SDK version: {aiplatform.__version__}")



# Langchain
import langchain

print(f"LangChain version: {langchain.__version__}")

from langchain.embeddings import VertexAIEmbeddings
from langchain.llms import VertexAI


# Import custom Matching Engine packages
from utils.matching_engine import MatchingEngine
from utils.matching_engine_utils import MatchingEngineUtils



Vertex AI SDK version: 1.27.0
LangChain version: 0.0.201


In [50]:
starting_url = 'https://cloud.google.com/'

In [51]:
def crawl_and_collect_URLS(url):
    if url in visited_urls:
        return
    
    visited_urls.add(url)
    
    if DOC_LIMIT != 0 and len(visited_urls) > DOC_LIMIT:  # This is to limit content for testing, remove for full thing
        return
    
    print(f"Crawling: {url}")
    
    try:
        response = requests.get(url)
    except Exception as e:
        print(f"Exception {e} on {url}")

    soup = BeautifulSoup(response.text, 'html.parser')

    with open('urls.json', 'w') as f:
        json.dump(list(visited_urls),f)
        
    #retrieve_content(url, soup)

    links = soup.find_all('a', href=True)
    
    keep_filters = ['https://cloud.google.com']
    drop_filters = ['release-notes',
                    'reference',
                    'samples',
                    '#',
                    '?',
                    'hybrid', #apigee
                    'blog',
                    'signin',
                    'SignUp',
                    'pdf',
                    'json',
                    'changelog',
                    'ex:',
                    'find-a-partner',
                   ]
    kept_links = []
    
    for link in links:
        for f in keep_filters:
            if f in link['href']:
                kept_links.append(link['href'])
        for f in drop_filters:
            for l in kept_links:
                if f in l:
                    kept_links.remove(l)
    
    for link in kept_links:
        absolute_url = urljoin(starting_url, link)
        crawl_and_collect_URLS(absolute_url)

In [52]:
visited_urls = set()
crawl_and_collect_URLS(starting_url)

Crawling: https://cloud.google.com/
Crawling: https://cloud.google.com/why-google-cloud/
Crawling: https://cloud.google.com/why-google-cloud
Crawling: https://cloud.google.com/ai
Crawling: https://cloud.google.com/solutions/
Crawling: https://cloud.google.com/multicloud
Crawling: https://cloud.google.com/infrastructure
Crawling: https://cloud.google.com/data-cloud
Crawling: https://cloud.google.com/open-cloud
Crawling: https://cloud.google.com/trust-center


In [55]:
print(len(visited_urls))

590


In [56]:
with open('urls.json') as f:
      data = json.load(f)

In [57]:
len(data)

10

In [71]:
def get_content(url):
    # if DOC_LIMIT != 0 and len(visited_urls) > DOC_LIMIT:  # This is to limit content for testing, remove for full thing
        # return
    
    filename = '../notebooks/web_text_raw/' + url[8::].replace("/", "_") + '.txt'

    if os.path.isfile(filename):
        return #if we have file, skip writing the rest
        
    print(f"Crawling: {url}")
    
    try:
        response = requests.get(url)
    except Exception as e:
        print(f"Exception {e} on {url}")

    soup = BeautifulSoup(response.text, 'html.parser')
       
    with open(filename, 'w', encoding='utf-8') as f:
        #json.dump(soup.get_text(),f)
        f.write(str(soup))

In [72]:
import threading
from concurrent.futures import ThreadPoolExecutor

executor = ThreadPoolExecutor(max_workers=10)

futures = []
for list_item in data:
    future = executor.submit(get_content, list_item)
    futures.append(future)

for future in futures:
    future.result()

print("All threads finished")

Crawling: https://cloud.google.com/ai
Crawling: https://cloud.google.com/open-cloud
Crawling: https://cloud.google.com/why-google-cloud
Crawling: https://cloud.google.com/
Crawling: https://cloud.google.com/infrastructure
Crawling: https://cloud.google.com/trust-center
Crawling: https://cloud.google.com/data-cloud
Crawling: https://cloud.google.com/why-google-cloud/
Crawling: https://cloud.google.com/solutions/
Crawling: https://cloud.google.com/multicloud
All threads finished


In [73]:
from langchain.docstore.document import Document
from bs4 import BeautifulSoup
from langchain.text_splitter import RecursiveCharacterTextSplitter

import langchain
import os
import json
print(f"LangChain version: {langchain.__version__}")

LangChain version: 0.0.201


In [86]:
# Get all file names in the current directory
import glob
#file_names = os.listdir("web_text_raw")
file_names = glob.glob(os.path.join("web_text_raw/", '*')) #use glob to skip "hidden files in dire
print(len(file_names))

10


In [90]:
def load_and_split(filename):
    if filename.startswith("."):
        return
    
    with open("web_text_raw/" + filename) as f:
        soup = BeautifulSoup(f, 'html.parser')
        
    all_content = ""
    all_docs = {}
    url = 'https://' + filename.replace("_","/")[:-4] # remove .txt

    lines = soup.get_text("\n", strip=True).splitlines()
    for line in lines:
        all_content = all_content + " " + line
    
    all_docs[url] = {}    
    all_docs[url]['content'] = all_content
    
    raw_articles_from_file = []
    for k, v in all_docs.items():
        raw_articles_from_file.append(Document(page_content=v['content'], metadata={'source': k}))
        
    count = 1

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
    doc_splits = text_splitter.split_documents(raw_articles_from_file)
    for doc in doc_splits:
        doc.metadata['id'] = doc.metadata['source'] + "_" + str(count) # should we hash the URL + Heading?
        doc.page_content = doc.page_content + " source: " + url #add source into the page_content since vertex doesnt seem to support metadata
        count = count + 1
    #print(f"# of documents = {len(doc_splits)}")
    
    with open('web_text_processed/' + filename.replace(".txt", ".jsonl"), 'w', encoding='utf-8') as f:
        for item in doc_splits:
            json.dump({'page_content': item.page_content, 'metadata': item.metadata}, f)
            f.write('\n')
            
    return len(doc_splits)

In [91]:
import threading
from concurrent.futures import ThreadPoolExecutor


executor = ThreadPoolExecutor(max_workers=2)
total_splits = 0


futures = []
for list_item in file_names:
    future = executor.submit(load_and_split, list_item.split("/")[1])
    futures.append(future)

for future in futures:
    total_splits = total_splits + future.result()

print("All threads finished")

All threads finished


In [92]:
print(total_splits)

125


## Configure Matching Engine as Vector Store

#### Get Matching Engine Index id and Endpoint id

In [104]:
mengine = MatchingEngineUtils(PROJECT_ID, LOCATION, ME_DISPLAY_NAME)

In [105]:
ME_INDEX_ID, ME_INDEX_ENDPOINT_ID = mengine.get_index_and_endpoint()
print(f"ME_INDEX_ID={ME_INDEX_ID}")
print(f"ME_INDEX_ENDPOINT_ID={ME_INDEX_ENDPOINT_ID}")

ME_INDEX_ID=projects/184378960328/locations/us-central1/indexes/9057504110734999552
ME_INDEX_ENDPOINT_ID=projects/184378960328/locations/us-central1/indexEndpoints/7247057060532060160


In [111]:
REQUESTS_PER_MINUTE = 300


llm = VertexAI(
    model_name='text-bison@001',
    max_output_tokens=512,
    temperature=0.1,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

# Chat instance integrated with langChain
#chat = VertexChat()

# Embeddings API integrated with langChain
embedding = VertexAIEmbeddings(requests_per_minute=REQUESTS_PER_MINUTE)

embedding

VertexAIEmbeddings(client=<vertexai.language_models._language_models._PreviewTextEmbeddingModel object at 0x7f0fea7239a0>, model_name='textembedding-gecko', temperature=0.0, max_output_tokens=128, top_p=0.95, top_k=40, stop=None, project=None, location='us-central1', credentials=None)

In [112]:
# initialize vector store Object
me = MatchingEngine.from_components(
    project_id=PROJECT_ID,
    region=LOCATION,
    gcs_bucket_name=ME_EMBEDDING_DIR,
    embedding=embedding,
    index_id=ME_INDEX_ID,
    endpoint_id=ME_INDEX_ENDPOINT_ID
)

In [116]:
# Get all file names in the current directory
import glob
import os
import jsonlines
import uuid
#file_names = os.listdir("web_text_raw")
file_names = glob.glob(os.path.join("web_text_processed/", '*')) #use glob to skip "hidden files in dire
print(len(file_names))

10


In [124]:
def stream_to_me(filename):
    
    documents = []
    
    if filename.startswith("."):
        return
    
    with jsonlines.open("web_text_processed/" + filename) as reader:
        for line in reader:
            documents.append(Document(page_content=line['page_content'], source=dict(line['metadata'])))

    ids = me.add_documents(documents)
    print(ids)
    # texts = [doc.page_content for doc in documents]
    # me.add_texts(texts=texts, metadatas=[{'url':'https://cloud.google.com//security/compliance/eba-eu/'}])
    # print(documents)
    return len(documents)



In [125]:
import threading
from concurrent.futures import ThreadPoolExecutor


executor = ThreadPoolExecutor(max_workers=4)
total_loaded = 0


futures = []
for list_item in file_names:
    future = executor.submit(stream_to_me, list_item.split("/")[1])
    futures.append(future)

for future in futures:
    total_loaded = total_loaded + future.result()

print("All threads finished")

INFO:root:Indexed 12 documents to Matching Engine.
INFO:root:Indexed 12 documents to Matching Engine.
INFO:root:Indexed 12 documents to Matching Engine.


[UUID('d7a166ef-b358-4dfd-ba32-5aa14f23f57a'), UUID('654af399-a05f-4b1d-bd6f-70373827ee4f'), UUID('cd2a11da-854c-4795-807e-9d713911d42f'), UUID('f32a17f2-bbe9-40c5-97b5-8706dcc5f334'), UUID('71b917d1-e7f1-4348-b220-a04509b31de9'), UUID('c1e72870-959c-45ee-84c8-fb574bfff067'), UUID('5dcbbc76-9114-4845-b578-79ae44bcefa3'), UUID('493c15b6-5d87-4f05-918d-62333df88fb6'), UUID('71e93595-9159-4416-b8fb-b996d13b3a3f'), UUID('7c61f76b-12a0-4b3a-b752-52913ad55fa0'), UUID('99adfd1a-fe9a-47d9-a192-5b71e91f1a87'), UUID('350dfb5a-29c9-4680-a107-bfbb628f4b39')]
[UUID('5badbfe6-0ba2-41c7-ad30-7a71ba0bdace'), UUID('35910f15-3251-4eb7-b204-5780236deec0'), UUID('db14df24-f00f-4717-90dd-2fe1af90c516'), UUID('c7f21a28-8790-4e78-8d6c-f4d6d9f1214b'), UUID('1b42e502-76bd-4bb3-8b89-5286d6feaca1'), UUID('cda6247e-9edf-4b30-9121-f85119cde647'), UUID('e01411f5-ce0d-4e17-9da1-b441c5920b18'), UUID('daaa65e2-21fc-4f73-aa8d-b0053a90c073'), UUID('b0872efc-57b2-4799-9bd4-6f6f533d0e5c'), UUID('a64a8e49-83da-4809-b8e2-de

INFO:root:Indexed 15 documents to Matching Engine.


[UUID('7e4cc8d0-a197-4bb7-9d6e-0355448e0195'), UUID('919fafbf-cf5e-4855-aceb-0d262fd001ad'), UUID('217e3246-b126-4c08-bcf9-6a7fee1d5a2c'), UUID('1a084c03-eb77-47f5-810f-e0b8d1fcd8b6'), UUID('9875efa2-a8d3-4ff9-955c-72f43c547296'), UUID('6bccead0-ea3b-49a2-97f5-7bb74c80fdef'), UUID('5c40aad2-19a5-4043-9b29-55307dceed14'), UUID('82f42331-4956-4148-b9da-11f8b2df1b94'), UUID('fe62323a-9224-4aa8-ac3a-dde23bc14c36'), UUID('4a56f4e4-6c90-42b0-81ff-b52f778800d2'), UUID('0078f2bf-1c07-401a-936d-6f3145cd8596'), UUID('d085d33e-f519-451c-a57c-b33056de159e'), UUID('6ce5153b-4a04-4882-aea4-0673fc1dde24'), UUID('62ba60e6-c304-4312-8787-be6886bc7d7d'), UUID('70b881df-e9fa-44d6-ac80-b307d8de5243')]


INFO:root:Indexed 12 documents to Matching Engine.
INFO:root:Indexed 12 documents to Matching Engine.
INFO:root:Indexed 12 documents to Matching Engine.


[UUID('13884906-3a6d-4689-bc8a-8cbaa1876dc0'), UUID('d4b79276-7fbd-4dfb-ae28-fedc2d842cd8'), UUID('fe6d47e4-1e29-4773-8d8e-e0ac74318df2'), UUID('7ff57763-e858-417d-9ec7-dcfbb29ecd3e'), UUID('7a71df29-c9a4-43da-81a5-2526fde25cfa'), UUID('12962a6d-4dcc-4d7d-9c49-57b59585e1b0'), UUID('35b97cab-cd51-4ee2-abb7-bc5535bd76f2'), UUID('b816bf6f-c235-43fa-8ac6-ffafc2eb82d5'), UUID('10e88a48-5f26-433a-b66a-9f32f500346e'), UUID('cbd96c38-08c8-4e40-a2b6-b454b4668b8a'), UUID('bdfc9682-b40c-419a-8614-51eb7ebdadec'), UUID('c5d0b6f3-48b6-4fc6-b4c1-7f52884a5d0c')]
[UUID('acb94fb9-5dcf-4c66-ae0b-72d2a8af8c6f'), UUID('5dfab393-74b6-46f8-8c06-a32e4dbb7e87'), UUID('b81361a8-893b-4fdd-8177-662c0c761740'), UUID('f18ec3ff-c26e-4939-9d80-1773488945dc'), UUID('671ca17b-9b5c-4268-a52f-849e80181586'), UUID('a34b54d2-b725-45c2-89cd-2c72be01ac7c'), UUID('6e36dc47-c5c1-4ec7-9b5c-ac4751b26ad5'), UUID('15d1233d-a888-48d0-916b-967006b0244d'), UUID('8e41092e-c4e7-4eb1-a018-a40ebade4618'), UUID('5598a820-677c-4fc3-940a-d0

INFO:root:Indexed 13 documents to Matching Engine.


[UUID('e3dcad77-2894-4fe6-9aeb-d5bc688addb5'), UUID('85801d6c-ba30-42a8-a292-0b6451c246d0'), UUID('f96174dc-dcc6-4962-bcb4-e4c55db7f218'), UUID('587cbbd4-d539-48a4-8e27-ee2422aa23b1'), UUID('e42675e3-adc4-4fcc-afe0-c019bf29416f'), UUID('81248466-9f21-426d-aecb-882d6af21f99'), UUID('1ef42295-5e02-48df-aa93-b281e3841d5a'), UUID('13d144cd-614b-4717-b39f-d418f82c54df'), UUID('7f1fabb5-bb2b-4fcb-8a82-cd8bc7911402'), UUID('1f00559e-ba16-4897-8503-c9bd82f1772c'), UUID('51514298-f523-4789-8a1b-3a9d0365ddc9'), UUID('745c3108-b611-4738-a80e-47043a85d51a'), UUID('e062e2a4-fbaf-4028-92a9-7b0d46a86cef')]


INFO:root:Indexed 12 documents to Matching Engine.
INFO:root:Indexed 13 documents to Matching Engine.


[UUID('268d27fd-6aba-42d2-885c-11cb93da0824'), UUID('6b675c4c-5c4a-4940-ae38-c6a3d374289e'), UUID('0a5375d7-1050-4ba5-872e-b06211c5ae43'), UUID('95561ee4-bd19-45d9-ac9d-72458739474e'), UUID('cf852e10-4d32-435b-8469-454af1a82ea6'), UUID('272c444a-4600-4ee1-9497-1f6ebd33c270'), UUID('e5bfe563-6810-4547-b153-acdce1176765'), UUID('dbd4345b-6314-4d4b-b482-e14e6beb04ac'), UUID('428f9d29-2880-44a6-a11d-a680e702cbe7'), UUID('158c670a-233b-4395-9af3-7e2696f380d7'), UUID('07b12adf-8560-43c9-a52b-8d07560db5aa'), UUID('a2821957-3765-4e7f-b00d-e37eb2b9cbd9')]
[UUID('2e5b8af5-ab90-407b-8ac3-199c0363a1b0'), UUID('383d3ba5-fcb1-4e8e-8e0d-e8891348c6de'), UUID('fd8f8483-ec9c-40b3-bcb5-9f02de2dcad4'), UUID('88c376e9-b8b3-4bb5-9a43-2b4c41fb45f2'), UUID('63d68921-6d7e-4782-9776-c715cc81c8c7'), UUID('b5a34381-1085-4ab8-bd31-17f0ce9cc545'), UUID('b66915a8-0f2a-40dc-b747-4445fb206e40'), UUID('5bf354d6-f296-4698-a232-4b62ac143f1c'), UUID('12242bc1-c27b-4581-8444-63c4baa6ed6b'), UUID('ef61a50c-1d1a-4841-93fb-00

In [129]:
# Test whether search from vector store is working
me.similarity_search("Multicloud Solutions", k=2)

[Document(page_content="Smart analytics Artificial Intelligence Security Productivity & work transformation Industry solutions DevOps solutions Small business solutions See all solutions Resources Google Cloud documentation Google Cloud quickstarts Google Cloud Marketplace Learn about cloud computing Support Code samples Cloud Architecture Center Training Certifications Google for Developers Google Cloud for Startups System status Release Notes Engage Contact sales Find a Partner Become a Partner Events Podcasts Developer Center Press Corner Google Cloud on YouTube Google Cloud Tech on YouTube Follow on Twitter Join User Research We're hiring. Join Google Cloud! Google Cloud Community Cookie Settings About Google Privacy Site terms Google Cloud terms Our third decade of climate action: join us Sign up for the Google Cloud newsletter Subscribe English Bahasa Indonesia Deutsch Español Español – América Latina Français Italiano Português – Brasil 中文 – 简体 中文 – 繁體 日本語 한국어 source: https://cl