### Create structured data from unstructured data

In [2]:
# Install/upgrade the Vertex AI SDK and the LangChain packages; --quiet suppresses most pip output.
# NOTE(review): no langchain import is visible in this notebook — confirm the two langchain packages are still needed.
%pip install --upgrade --quiet google-cloud-aiplatform \
                                 langchain-google-vertexai \
                                 langchain

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/7.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/7.1 MB[0m [31m52.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m7.1/7.1 MB[0m [31m101.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m68.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/93.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.3/93.3 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m53.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.2/413.2 kB[0m [31m30.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
!pip install PyPDF2


Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━[0m [32m225.3/232.6 kB[0m [31m7.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [11]:
import json
import os
from pathlib import Path
from PyPDF2 import PdfReader, PdfWriter
import vertexai
from vertexai.generative_models import GenerativeModel, Part
from google.cloud import storage

# Google Cloud project details used by both pipeline stages.
project_id = "saffatandsourik"
location = "us-central1"
bucket_name = "formula1-ss"

# Folder structure in GCS (object-name prefixes; each ends with "/"):
#   raw_folder   - source PDFs read by split_documents()
#   split_folder - chunked PDFs written by split_documents() and read by extract()
#   llm_folder   - JSON results written by extract()
raw_folder = "initial-loads/racerpedia/raw/"
split_folder = "initial-loads/racerpedia/split/"
llm_folder = "initial-loads/racerpedia/llm_text/"

# LLM model details.
model_name = "gemini-2.0-flash"

# Instruction sent to the model together with each PDF. The example block is
# fenced with ```json ... ```; extract() strips such fences before parsing.
prompt = """Extract the following details from the document and return the response in JSON format. Ensure that each field is a simple key-value pair (no arrays or nested objects):

- Circuit Name
- City
- Country
- Latitude (as a float)
- Longitude (as a float)
- Capacity (as an integer, exclude commas)
- FIA Grade
- Circuit Status (Active/Inactive)

Ensure that:
- Fields with missing values are set to `null`
- Numbers are properly formatted (e.g., floats should not have unnecessary trailing zeros)
- Text values do not contain newline characters
- The response is **a single flat JSON object** (not an array or nested structure)

Example output:
```json
{
    "circuit_name": "Albert Park Circuit",
    "city": "Melbourne",
    "country": "Australia",
    "latitude": -37.8497,
    "longitude": 144.9683,
    "capacity": 125000,
    "fia_grade": "1",
    "circuit_status": "Active"
}
```
"""

def _upload_pdf_chunk(bucket, pdf_writer, gcs_path):
    """Write the pages accumulated in ``pdf_writer`` to the object ``gcs_path`` in ``bucket``."""
    with bucket.blob(gcs_path).open("wb") as out:
        pdf_writer.write(out)


def split_documents(pages_per_chunk=500):
    """Split every PDF under ``raw_folder`` into chunks and upload them to ``split_folder``.

    Each chunk is written to
    ``{split_folder}{original name without extension}_{first page}_{last page}.pdf``
    with 1-based, inclusive page numbers. A PDF with no more than
    ``pages_per_chunk`` pages produces a single object containing all its pages.

    Args:
        pages_per_chunk: Maximum number of pages per uploaded chunk (default 500).

    Raises:
        ValueError: If ``pages_per_chunk`` is smaller than 1.
    """
    if pages_per_chunk < 1:
        raise ValueError("pages_per_chunk must be a positive integer")

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blobs = storage_client.list_blobs(bucket_name, prefix=raw_folder)

    for blob in blobs:
        # Skip the folder placeholder object and anything that is not a PDF.
        if blob.name == raw_folder or not blob.name.endswith(".pdf"):
            continue

        print(f"Downloading {blob.name} from GCS for processing...")
        base_filename = os.path.splitext(os.path.basename(blob.name))[0]  # Remove .pdf extension

        # Keep the GCS stream open until every chunk is written: the pages added
        # to the writer still refer back to the reader's underlying stream.
        with blob.open("rb") as pdf_stream:
            pdf_reader = PdfReader(pdf_stream)  # Read PDF directly from GCS
            pdf_writer = PdfWriter()
            start_page = 1
            page_num = 0  # stays 0 for a PDF without pages, so nothing is uploaded

            for page_num, page_data in enumerate(pdf_reader.pages, 1):
                pdf_writer.add_page(page_data)

                if page_num % pages_per_chunk == 0:
                    gcs_path = f"{split_folder}{base_filename}_{start_page}_{page_num}.pdf"
                    print(f"Uploading split file: {gcs_path} to GCS")
                    _upload_pdf_chunk(bucket, pdf_writer, gcs_path)

                    # Start a fresh writer for the next chunk.
                    pdf_writer = PdfWriter()
                    start_page = page_num + 1

            # Flush the remaining pages. "<=" (not "<") so that a trailing chunk
            # of exactly one page — including a single-page PDF — is not dropped.
            if start_page <= page_num:
                gcs_path = f"{split_folder}{base_filename}_{start_page}_{page_num}.pdf"
                print(f"Uploading final split file: {gcs_path} to GCS")
                _upload_pdf_chunk(bucket, pdf_writer, gcs_path)

def extract():
    """Run Gemini over each split PDF and store the parsed JSON response in GCS.

    For every ``.pdf`` object under ``split_folder`` the model named by
    ``model_name`` is called with the PDF (referenced by its ``gs://`` URI) and
    ``prompt``. The response text is parsed as JSON after removing markdown code
    fences, then uploaded to ``{llm_folder}{pdf name without extension}.json``.

    PDFs whose output object already exists are skipped. Any error while
    generating or parsing a response is printed and that PDF is skipped, so one
    bad document does not abort the whole run.
    """
    vertexai.init(project=project_id, location=location)
    model = GenerativeModel(model_name)

    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blobs = storage_client.list_blobs(bucket_name, prefix=split_folder)

    for blob in blobs:
        # Skip the folder placeholder object and anything that is not a PDF.
        if blob.name == split_folder or not blob.name.endswith(".pdf"):
            continue

        # Define output path in GCS. splitext swaps only the trailing extension,
        # so a ".pdf" occurring earlier in the file name is left untouched.
        stem = os.path.splitext(os.path.basename(blob.name))[0]
        gcs_output_path = f"{llm_folder}{stem}.json"
        blob_json = bucket.blob(gcs_output_path)

        # Check if file already exists in GCS
        if blob_json.exists(storage_client):
            print(f"Skipping {gcs_output_path}, already exists in GCS.")
            continue

        print(f"Extracting text from {blob.name}")

        # Process PDF with Gemini model
        file_content = Part.from_uri(f"gs://{bucket_name}/{blob.name}", "application/pdf")

        try:
            resp = model.generate_content([file_content, prompt])
            extracted_text = resp.candidates[0].text.strip()
            # The model may wrap its answer in ```json ... ``` fences; drop them before parsing.
            extracted_json = json.loads(extracted_text.replace("```json", "").replace("```", ""))

        except Exception as e:
            # Deliberate best-effort: report the failure and move on to the next PDF.
            print(f"Error processing {blob.name}: {e}")
            continue

        # Upload extracted JSON directly to GCS
        blob_json.upload_from_string(json.dumps(extracted_json, indent=4), content_type="application/json")

        print(f"Successfully uploaded extracted JSON to {gcs_output_path}")

# Run both pipeline stages in order: first split the PDFs and upload the
# chunks to GCS, then extract JSON from the chunks and upload it to GCS.
if __name__ == "__main__":
    for pipeline_stage in (split_documents, extract):
        pipeline_stage()


Downloading initial-loads/racerpedia/raw/Albert_Park_Circuit.pdf from GCS for processing...
Uploading final split file: initial-loads/racerpedia/split/Albert_Park_Circuit_1_9.pdf to GCS
Downloading initial-loads/racerpedia/raw/Algarve_International_Circuit.pdf from GCS for processing...
Uploading final split file: initial-loads/racerpedia/split/Algarve_International_Circuit_1_13.pdf to GCS
Downloading initial-loads/racerpedia/raw/Australian_Grand_Prix.pdf from GCS for processing...
Uploading final split file: initial-loads/racerpedia/split/Australian_Grand_Prix_1_29.pdf to GCS
Downloading initial-loads/racerpedia/raw/Autódromo_Hermanos_Rodríguez.pdf from GCS for processing...
Uploading final split file: initial-loads/racerpedia/split/Autódromo_Hermanos_Rodríguez_1_13.pdf to GCS
Downloading initial-loads/racerpedia/raw/Bahrain_International_Circuit.pdf from GCS for processing...
Uploading final split file: initial-loads/racerpedia/split/Bahrain_International_Circuit_1_13.pdf to GCS
Down

In [8]:
from google.cloud import aiplatform

# Initialise the Vertex AI SDK for this project/region.
aiplatform.init(
    project="saffatandsourik",
    location="us-central1",
)

# Print the display name and full resource name of each model returned by Model.list().
models = aiplatform.Model.list()
for registered_model in models:
    print(registered_model.display_name, registered_model.resource_name)


In [9]:
from vertexai.generative_models import GenerativeModel

# NOTE(review): this hard-codes "gemini-pro", whereas the pipeline cell sets
# model_name = "gemini-2.0-flash" — confirm which model id is intended here.
model = GenerativeModel("gemini-pro")  # Use the correct model name
