In [None]:
!pip install --upgrade --user google-cloud-aiplatform google-cloud-storage firebase-admin
!pip install langchain_community
!pip install google-cloud-aiplatform
!pip install google-cloud-storage
!pip install langchain_community
!pip install langchain
!pip install pymuPDF
!pip install -U langchain-google-vertexai


In [None]:
import IPython
from IPython.display import Markdown, display
import time

# Restart the kernel (do_shutdown(True) requests a restart rather than a plain
# shutdown) -- presumably so the packages installed above are picked up.
# Everything defined so far, including the imports in this cell, is lost.
IPython.Application.instance().kernel.do_shutdown(True)

In [None]:
# get project ID
PROJECT_ID = ! gcloud config get project
PROJECT_ID = PROJECT_ID[0]
LOCATION = "us-central1"

# generate an unique id for this session
from datetime import datetime

UID = datetime.now().strftime("%m%d%H%M")

In [None]:
! gcloud services enable compute.googleapis.com aiplatform.googleapis.com storage.googleapis.com --project "{PROJECT_ID}"


In [None]:
from google.cloud import firestore

# Create the Firestore client bound to this notebook's project
db = firestore.Client(project=PROJECT_ID)

# Status message only -- nothing in this cell queries the database
print("Firestore database initialized.")

In [None]:
import requests

# URL of the PDF file
url = "https://www.nyc.gov/assets/doh/downloads/pdf/rii/fpc-manual.pdf"

# Download the PDF file. The timeout keeps a stalled connection from hanging
# the kernel indefinitely.
response = requests.get(url, timeout=60)

# Fail loudly on a 4xx/5xx answer instead of silently saving an error page
# under a .pdf name, which would only surface later as a parsing failure.
response.raise_for_status()

# Save the PDF file
pdf_filename = "fpc-manual.pdf"
with open(pdf_filename, 'wb') as file:
    file.write(response.content)

print(f"PDF downloaded as {pdf_filename}")


In [None]:
import fitz  # PyMuPDF

# Path of the PDF document to read
pdf_document = "fpc-manual.pdf"

# Collect the text of every page, in page order. The context manager closes
# the document once extraction is done, so the file handle is not leaked.
with fitz.open(pdf_document) as pdf:
    pages_content = [page.get_text() for page in pdf]

# Print the first 500 characters of the first page as a sample; guard against
# a document without pages so the cell does not die with an IndexError.
if pages_content:
    print(pages_content[0][:500])
else:
    print(f"No pages found in {pdf_document}")


In [None]:
from google.cloud import firestore

# Firestore client for this project
db = firestore.Client(project=PROJECT_ID)

# Collection that receives one document per PDF page
collection_name = "page_content"
pages_collection = db.collection(collection_name)

# Write each page under its 1-based page number (as a string document ID);
# the page text goes into the "content" field.
for page_number, content in enumerate(pages_content, start=1):
    doc_id = str(page_number)
    pages_collection.document(doc_id).set({"content": content})
    print(f"Stored page {page_number} in Firestore with document ID {doc_id}")


In [None]:
from google.cloud import firestore

# Initialize Firestore
db = firestore.Client(project=PROJECT_ID)

# Firestore collection name
collection_name = "page_content"

# The preceding cell already writes every page to this collection. This cell
# used to repeat that upload verbatim, doubling the Firestore writes without
# changing the stored data. Instead, read the first and last page back to
# confirm the upload actually landed.
if pages_content:
    for page_number in sorted({1, len(pages_content)}):
        doc_id = str(page_number)  # Page number as document ID
        snapshot = db.collection(collection_name).document(doc_id).get()
        if not snapshot.exists:
            raise RuntimeError(
                f"Page {page_number} is missing from Firestore collection "
                f"'{collection_name}'; re-run the upload cell above."
            )
        print(f"Verified page {page_number} in Firestore with document ID {doc_id}")
else:
    print("No pages to verify.")


In [None]:
# Placeholder data: each sample text is paired with a hand-written stand-in
# vector. No embedding model is involved here; replace both with real data.
# NOTE(review): these stand-in vectors have 3 components, while the index
# created later in this notebook is configured with dimensions=768 -- confirm
# they are swapped for real embeddings before the index is built.
_placeholder_pairs = [
    ("This is a sample text for embedding.", [0.1, 0.2, 0.3]),
    ("Another example text to generate embeddings.", [0.4, 0.5, 0.6]),
]

texts = [sample_text for sample_text, _ in _placeholder_pairs]
embeddings = [sample_vector for _, sample_vector in _placeholder_pairs]


In [None]:
import json

# Serialize the embeddings as JSON Lines: one {"id", "embedding"} object per
# line, with ids counting up from "1" in list order.
# NOTE(review): Vector Search input files are presumably expected to carry a
# .json/.csv/.avro suffix -- confirm whether this .jsonl copy is needed.
jsonl_filename = "embeddings.jsonl"
records = (
    {"id": str(position), "embedding": vector}
    for position, vector in enumerate(embeddings, start=1)
)
with open(jsonl_filename, 'w') as f:
    f.writelines(json.dumps(record) + "\n" for record in records)
    print(f"JSON-L file created: {jsonl_filename}")


In [None]:
from google.cloud import storage

# Initialize a storage client bound to the notebook's project, matching how
# the Firestore client is created.
client = storage.Client(project=PROJECT_ID)

# The bucket is named after the project. The notebook only creates it in a
# later `gsutil mb` cell, so on a fresh top-to-bottom run it does not exist
# yet at this point: reuse it when present, otherwise create it here.
bucket_name = PROJECT_ID
bucket = client.lookup_bucket(bucket_name)  # None when the bucket is missing
if bucket is None:
    bucket = client.create_bucket(bucket_name, location=LOCATION)

# Upload the JSON-L file under its local file name
blob = bucket.blob(jsonl_filename)
blob.upload_from_filename(jsonl_filename)

print(f"{jsonl_filename} uploaded to {bucket_name}.")


In [None]:
from google.cloud import aiplatform
from google.cloud import storage
from google.cloud.aiplatform.gapic import PredictionServiceClient
from google.cloud.aiplatform.gapic.schema import predict

# Point the Vertex AI SDK at this notebook's project and region
aiplatform.init(
    project=PROJECT_ID,
    location=LOCATION,
)


In [None]:
import json

# Inputs expected from earlier cells:
#   embeddings -- list of embedding vectors, one per entry
# NOTE(review): the ids are described as page numbers, but nothing here ties
# `embeddings` to `pages_content` -- confirm the two lists line up.

# One record per embedding, keyed by its 1-based position as a string
jsonl_data = [
    {"id": str(page_number), "embedding": vector}
    for page_number, vector in enumerate(embeddings, start=1)
]

# Emit the records as JSON Lines (one JSON object per line)
jsonl_filename = "embeddings.json"
with open(jsonl_filename, 'w') as jsonl_file:
    jsonl_file.write("".join(json.dumps(record) + "\n" for record in jsonl_data))

print(f"JSON-L file '{jsonl_filename}' created successfully.")


In [None]:
from google.cloud import storage

# The target bucket shares its name with the project
bucket_name = PROJECT_ID

# Storage client and handle on the target bucket
client = storage.Client()
bucket = client.bucket(bucket_name)

# Push the local file to the bucket, keeping the file name as the object name
blob = bucket.blob(jsonl_filename)
blob.upload_from_filename(jsonl_filename)

print(f"File '{jsonl_filename}' uploaded to bucket '{bucket_name}'.")


In [None]:
from google.cloud import aiplatform
from vertexai.language_models import TextEmbeddingModel

# Configure the Vertex AI SDK for this project and region
aiplatform.init(location=LOCATION, project=PROJECT_ID)

# Name of the pretrained text embedding model to load
_EMBEDDING_MODEL_NAME = "textembedding-gecko@002"
embedding_model = TextEmbeddingModel.from_pretrained(_EMBEDDING_MODEL_NAME)


In [None]:
# Cloud Storage URI of the bucket named after the project
BUCKET_URI = "gs://" + PROJECT_ID


In [None]:
! gsutil mb -l "$LOCATION" -p "$PROJECT_ID" "$BUCKET_URI"
! gsutil cp "gs://github-repo/data/vs-quickstart/product-embs.json" "$BUCKET_URI"


In [None]:
! gsutil cp "gs://github-repo/data/vs-quickstart/product-embs.json" . # for query tests


In [None]:
# Import the Vertex AI SDK and configure it for this project and region
from google.cloud import aiplatform

_vertex_settings = {"project": PROJECT_ID, "location": LOCATION}
aiplatform.init(**_vertex_settings)

In [None]:
# Create a Tree-AH index from the data found under BUCKET_URI.
# NOTE(review): earlier cells upload embeddings.jsonl, embeddings.json and
# product-embs.json to that same bucket -- confirm every file there holds
# 768-dimensional vectors, as configured below.
_index_settings = dict(
    display_name="assessment-index-endpoint",
    contents_delta_uri=BUCKET_URI,
    dimensions=768,
    approximate_neighbors_count=10,
)
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(**_index_settings)

In [None]:
# Create the index endpoint, with its public endpoint enabled
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name="assessment-index-endpoint",
    public_endpoint_enabled=True,
)

In [None]:
# Identifier under which the index is deployed on the endpoint
DEPLOYED_INDEX_ID = "assessment_index_endpoint"


In [None]:
# Deploy the index created above to the endpoint under DEPLOYED_INDEX_ID
my_index_endpoint.deploy_index(
    index=my_index,
    deployed_index_id=DEPLOYED_INDEX_ID,
)
