In [2]:
import os
import io
import json
from google.cloud import storage, vision_v1
from google.cloud.vision_v1 import types
from google.protobuf.json_format import MessageToDict
import pandas as pd
import requests
import time
import fitz
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "gcloud_key.json"


In [52]:
def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    """Upload a local file to a GCS bucket and return its ``gs://`` URI.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_name: Path of the local file to upload.
        destination_blob_name: Object name to create inside the bucket.

    Returns:
        The ``gs://<bucket>/<blob>`` URI of the uploaded object.
    """
    target_blob = storage.Client().bucket(bucket_name).blob(destination_blob_name)
    target_blob.upload_from_filename(source_file_name)

    print(f"File {source_file_name} uploaded to {destination_blob_name}.")
    gcs_uri = f"gs://{bucket_name}/{destination_blob_name}"
    return gcs_uri

def delete_from_gcs(bucket_name, blob_name):
    """Delete a single object from a GCS bucket.

    Args:
        bucket_name: Name of the bucket holding the object.
        blob_name: Name of the object to delete.
    """
    doomed_blob = storage.Client().bucket(bucket_name).blob(blob_name)
    doomed_blob.delete()

    print(f"Blob {blob_name} deleted.")

def download_from_gcs(bucket_name, source_blob_name, destination_file_name):
    """Download a single object from a GCS bucket to a local file.

    Args:
        bucket_name: Name of the bucket holding the object.
        source_blob_name: Name of the object to download.
        destination_file_name: Local path the object is written to.
    """
    remote_blob = storage.Client().bucket(bucket_name).blob(source_blob_name)
    remote_blob.download_to_filename(destination_file_name)

    print(f"Blob {source_blob_name} downloaded to {destination_file_name}.")

def _gcs_object_prefix(gcs_uri):
    """Return the object-name prefix of a ``gs://bucket/prefix`` URI ('' if none)."""
    scheme = "gs://"
    if not gcs_uri.startswith(scheme):
        raise ValueError(f"Expected a gs:// URI, got: {gcs_uri!r}")
    _bucket, _sep, prefix = gcs_uri[len(scheme):].partition("/")
    return prefix


def detect_text_in_pdf(bucket_name, gcs_input_uri, gcs_output_uri, local_output_folder, batch_size = 1):
    """Run Vision document OCR on a PDF in GCS and fetch the JSON results locally.

    Submits an asynchronous DOCUMENT_TEXT_DETECTION request for the PDF at
    ``gcs_input_uri``, waits up to 300 seconds for it to finish, then downloads
    every ``output-*.json`` result written under ``gcs_output_uri`` into
    ``local_output_folder``. A result blob is deleted from GCS only after it
    has been downloaded successfully.

    Args:
        bucket_name: Bucket that is listed for result files and that downloads
            and deletions are issued against.
        gcs_input_uri: ``gs://`` URI of the PDF to process.
        gcs_output_uri: ``gs://`` URI prefix the results are written to.
        local_output_folder: Local directory for the downloaded JSON files
            (created if missing).
        batch_size: Number of pages per output JSON file.

    Raises:
        ValueError: If ``gcs_output_uri`` is not a ``gs://`` URI.
        Exception: Whatever ``operation.result`` raises when the operation
            fails or does not finish within the timeout.
    """
    # Validate the output URI before submitting the billable request.
    output_prefix = _gcs_object_prefix(gcs_output_uri)

    client = vision_v1.ImageAnnotatorClient()

    # Configure the request for PDF/TIFF processing
    input_config = vision_v1.InputConfig(
        gcs_source=vision_v1.GcsSource(uri=gcs_input_uri),
        mime_type='application/pdf'  # or 'image/tiff' for TIFF files
    )

    output_config = vision_v1.OutputConfig(
        gcs_destination=vision_v1.GcsDestination(uri=gcs_output_uri),
        batch_size=batch_size  # Specifies how many pages go into each output file
    )

    # Create the request
    async_request = vision_v1.AsyncAnnotateFileRequest(
        features=[vision_v1.Feature(type=vision_v1.Feature.Type.DOCUMENT_TEXT_DETECTION)],
        input_config=input_config,
        output_config=output_config
    )

    # Perform the request; the actual results are read from GCS below.
    operation = client.async_batch_annotate_files(requests=[async_request])

    print('Waiting for the operation to finish...')
    operation.result(timeout=300)

    # Ensure the local output folder exists
    os.makedirs(local_output_folder, exist_ok=True)

    # Download all output files written under the requested output prefix
    result_name_start = output_prefix + "output-"
    storage_client = storage.Client()
    for blob in storage_client.list_blobs(bucket_name, prefix=output_prefix):
        if not (blob.name.startswith(result_name_start) and blob.name.endswith(".json")):
            continue
        local_output_file = os.path.join(local_output_folder, os.path.basename(blob.name))
        try:
            download_from_gcs(bucket_name, blob.name, local_output_file)
        except Exception as e:
            print(f"Error downloading the output file {blob.name}: {e}")
            # Keep the blob in GCS so the result is not lost and can be retried.
            continue

        # Delete the result file from GCS only once it is safely on disk
        delete_from_gcs(bucket_name, blob.name)

    print(f'Results saved to {local_output_folder}')


In [29]:
# Manual cleanup of a leftover result blob (this works).
# Note: `bucket_name` is assigned in a later cell, so that cell must have been run first.
delete_from_gcs(bucket_name, "vision/output-1-to-1.json")

Blob vision/output-1-to-1.json deleted.


In [97]:
source_file_name = 'test1.pdf'

def get_number_of_pages(pdf_path):
    """Return the number of pages in the PDF at ``pdf_path``."""
    # The context manager closes the document even if reading it raises.
    with fitz.open(pdf_path) as document:
        return document.page_count

num_pages = get_number_of_pages(source_file_name)
print(f"RUN THE NEXT BLOCK IF YOU ARE SURE YOU WANT TO RUN {num_pages} PAGES?")
print("KEEP IN MIND 1000 PAGES COSTS AROUND $1.50+$0.005*1000/5 = $2.5 (upper bound)")
# The $2.5 figure is a rough, deliberately conservative estimate, not a precise quote.

RUN THE NEXT BLOCK IF YOU ARE SURE YOU WANT TO RUN 13 PAGES?
KEEP IN MIND 1000 PAGES COSTS AROUND $1.50+$0.005*1000/5 = $2.5 (upper bound)


In [54]:
bucket_name = 'sri-cloud-vision-api-bucket'
destination_blob_name = 'uploaded_file.pdf'
local_output_folder = 'outputs'
gcs_output_uri = f"gs://{bucket_name}/vision/"
batch_size = 7

gcs_input_uri = upload_to_gcs(bucket_name, source_file_name, destination_blob_name) #this works

# Create the same folder that is passed to detect_text_in_pdf below.
os.makedirs(local_output_folder, exist_ok = True)

try:
    s = time.time()
    detect_text_in_pdf(bucket_name, gcs_input_uri, gcs_output_uri, local_output_folder, batch_size)
    e = time.time()
    print("slowest operation: ", e-s)
finally:
    # Always remove the uploaded PDF, even when OCR fails, so it does not linger in the bucket.
    delete_from_gcs(bucket_name, destination_blob_name) #this works

# Timing note: roughly 3 seconds per page, which is acceptable.
# Idea: batch by min((n+1)/2, 10).
# The GPT-4o call is fast, so overall this is still DMTOCS_pro speed; probably slightly
# faster with non-degenerate batching.

File test1.pdf uploaded to uploaded_file.pdf.
Waiting for the operation to finish...
Blob vision/output-1-to-7.json downloaded to outputs/output-1-to-7.json.
Blob vision/output-1-to-7.json deleted.
Blob vision/output-8-to-13.json downloaded to outputs/output-8-to-13.json.
Blob vision/output-8-to-13.json deleted.
Results saved to outputs
slowest operation:  36.5544331073761
Blob uploaded_file.pdf deleted.


In [79]:
# Only consider Vision result files (output-<first>-to-<last>.json); this skips
# .DS_Store and any other stray file that would break the page-number parsing below.
filelist = [
    name for name in os.listdir("outputs")
    if name.startswith("output-") and name.endswith(".json")
]
# Sort numerically by each file's first page number so pages are read in order.
tl = [(int(name.split('-')[1]), name) for name in filelist]
tl.sort()
ptd = {}  # page number -> extracted text of that page
cnter = 0  # number of pages consumed so far
for _first_page, result_name in tl:
    with open(os.path.join("outputs", result_name), 'r', encoding="utf-8") as result_file:
        td = json.load(result_file)
    for page_response in td["responses"]:
        page_number = page_response["context"]["pageNumber"]
        # Pages must be contiguous; a gap or repeat usually means stale or missing result files.
        assert page_number == cnter + 1, (
            f"{result_name}: expected page {cnter + 1}, found page {page_number}"
        )
        cnter += 1
        # Pages with no detected text have no "fullTextAnnotation" entry; store "" for them.
        ptd[cnter] = page_response.get("fullTextAnnotation", {}).get("text", "")
with open("formatted_outputs.json", 'w', encoding="utf-8") as formatted_file:
    json.dump(ptd, formatted_file, indent=4)

In [None]:
# OK, now I plug the extracted text into GPT-4o as context, to get it to do the multimodal part.
# Also, I should probably look at the token estimate (and the page count) before doing this.

In [108]:
# Any selection of pages works; as an example, select by page number:
pages = list(range(1, 7))
# Concatenate the chosen pages, each followed by a blank line.
full_text = "".join(ptd[page] + '\n\n' for page in pages)

In [118]:
# Crude estimate: count whitespace-separated words and assume two tokens per word.
len_estimate = len(full_text.split())
estimated_tokens = 2 * len_estimate
print(f"ESTIMATED NUMBER OF TOKENS: {estimated_tokens}")
print("THIS COSTS $0.150 PER 1 MILLION TOKENS. BEWARNED")

ESTIMATED NUMBER OF TOKENS: 7422
THIS COSTS $0.150 PER 1 MILLION TOKENS. BEWARNED


In [1]:
import os
from getpass import getpass

from openai import OpenAI

# Never hard-code the key in the notebook: read it from the environment and,
# if it is not set, prompt for it without echoing the input.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
client = OpenAI(api_key=api_key)

In [116]:
# gpt-4o-mini is very cheap, so use it for the summary request.
# The extracted pages go in as the system message; the question is the user message.
summary_messages = [
    {"role": "system", "content": full_text},
    {"role": "user", "content": "Explain the core concepts of this paper, and all key ideas in detail to me."},
]
completion = client.chat.completions.create(model="gpt-4o-mini", messages=summary_messages)

print(completion.choices[0].message.content)

In [115]:
# Show the text of the first choice of the current `completion` again.
latest_message = completion.choices[0].message
print(latest_message.content)

The paper titled "GINopic: Topic Modeling with Graph Isomorphism Network" introduces a novel approach to topic modeling that integrates the principles of graph theory with advanced neural network techniques, specifically Graph Isomorphism Networks (GIN). Below are the core concepts and key ideas discussed in the paper:

### 1. **Background on Topic Modeling**
Topic modeling is a technique used to discover the underlying topics present in large collections of texts in an unsupervised manner. Traditional models, like Latent Dirichlet Allocation (LDA), assume that each document is a mixture of topics, represented as distributions over words. While effective, these earlier methods may overlook the complex dependencies between words in a document.

### 2. **Motivation for GINopic**
Recent advances in topic modeling have incorporated contextualized embeddings (e.g., BERT) to account for word semantics. However, these models typically treat documents as sequences of words and often ignore the

In [117]:
# gpt-4o-mini is very cheap, so use it for the follow-up request as well.
# NOTE: this cell reads the previous `completion` and then overwrites it, so re-running
# the cell feeds its own last answer back in as context instead of the paper summary.
completion = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
    # The role must be spelled "system"; the original "sysztem" is not a valid chat role.
    {"role": "system", "content": completion.choices[0].message.content},
    {"role": "user", "content": "Tell me how to build this. Give me some starter code using python, and and recommend infrastructure needed to build it."}
  ]
)

print(completion.choices[0].message.content)

To build **GINopic**, you will need to set up a Python environment with appropriate libraries for data manipulation, graph processing, and neural network training. Below, I will provide starter code, an outline of the architecture, and some recommendations for infrastructure requirements.

### Recommended Infrastructure

1. **Python Environment**: 
   - Use Python 3.7 or higher.
   - Recommended libraries include:
     - `numpy`
     - `pandas`
     - `torch` (PyTorch)
     - `torch_geometric` (for graph neural networks)
     - `scikit-learn` (for data processing and evaluation)
     - `matplotlib` (for visualizations)
     - `transformers` (for embeddings like BERT, if needed)

2. **Compute Resources**: 
   - A machine with a decent GPU (e.g., NVIDIA RTX series) for training your model as GINs can be computationally intensive.
   - At least 16 GB of RAM for handling large datasets.

3. **Storage**: 
   - Sufficient storage for your datasets. SSDs are preferable for faster I/O operatio