In [1]:
import os

# Run the rest of the notebook from the parent of the directory the kernel
# started in (presumably the project root — TODO confirm).
# The start directory is captured only once: a plain relative chdir("..")
# would climb one more level every time this cell is re-executed.
if "NOTEBOOK_START_DIR" not in globals():
    NOTEBOOK_START_DIR = os.path.abspath(".")
os.chdir(os.path.join(NOTEBOOK_START_DIR, ".."))


In [3]:
from databricks.sdk import WorkspaceClient
from databricks.sdk.core import Config
from databricks.connect import DatabricksSession

# Workspace API client, built with no explicit arguments.
w = WorkspaceClient()

# get shared cluster id: the first listed cluster whose name contains
# "Shared Compute". `next` stops iterating at the first hit, and the extra
# truthiness check skips clusters whose name is missing instead of raising a
# TypeError on `"..." in None`.
shared_cluster_id = next(
    (
        c.cluster_id
        for c in w.clusters.list()
        if c.cluster_name and "Shared Compute" in c.cluster_name
    ),
    None,
)

# Fail with an actionable message rather than a bare "list index out of range".
if shared_cluster_id is None:
    raise LookupError(
        'No cluster with "Shared Compute" in its name was found in this workspace'
    )

In [4]:
# Build an SDK config, pin it to the shared cluster found above, and open
# (or reuse) a Databricks Connect session with that config.
config = Config()
config.cluster_id = shared_cluster_id
spark = (
    DatabricksSession.builder
    .sdkConfig(config)
    .getOrCreate()
)

In [5]:
# Components of the research table's name.
catalog, schema = "scottmckean_catalog", "blast"
research_table_name = "research_paper_urls"

# Fully qualified name in the form <catalog>.<schema>.<table>.
research_table_path = ".".join((catalog, schema, research_table_name))

In [6]:
# Batch job: run one keyword search per topic and append only the papers that
# are not already present in the research table.
from delta.tables import DeltaTable

# Handle to the existing Delta table that new search results are merged into.
existing_research = DeltaTable.forName(spark, research_table_path)

# NOTE(review): `search_arxiv_papers` is neither defined nor imported in the
# visible cells (the recorded run of this cell ended in a NameError); import it
# from its project module before running this cell.
for keywords_search in [
    'geothermal energy',
    'lithium brine',
    'natural gas exploration'
  ]:
    # presumably returns a Spark DataFrame that includes a `unique_id` column
    # — verify against the helper's definition.
    new_research = search_arxiv_papers(spark, keywords_search, max_results=100)

    # Insert-only merge keyed on `unique_id`. MERGE only de-duplicates the
    # source against the target: rows repeated *within* the source batch would
    # all be inserted, so the batch is de-duplicated on the key first.
    existing_research.alias("existing").merge(
        new_research.dropDuplicates(["unique_id"]).alias("updates"),
        "existing.unique_id = updates.unique_id"
    ).whenNotMatchedInsertAll().execute()

NameError: name 'search_arxiv_papers' is not defined

In [15]:
from pyspark.sql.functions import udf, col
from pyspark.sql.types import BinaryType
import requests

# Upper bound (seconds) for connecting to / reading from a single URL, so one
# unresponsive server cannot stall a Spark task indefinitely.
DOWNLOAD_TIMEOUT_SECONDS = 30

# Define UDF to download PDF
@udf(returnType=BinaryType())
def download_pdf(url):
    """Fetch ``url`` over HTTP and return the raw response body.

    Args:
        url: Address to download. A null or empty value is skipped.

    Returns:
        The response body as bytes when the server answers with status 200;
        ``None`` for a missing URL, any other status code, or any error raised
        while downloading (including a timeout).

    NOTE(review): the body is not checked to actually be a PDF. The recorded
    sample output shows payloads starting with bytes 3C 3F 78 6D 6C (ASCII
    "<?xml"), which suggests the stored URLs may not point at PDF files —
    confirm against the table contents.
    """
    if not url:
        return None
    try:
        response = requests.get(url, timeout=DOWNLOAD_TIMEOUT_SECONDS)
        if response.status_code == 200:
            return response.content
        return None
    except Exception as e:
        # Deliberately best-effort: a failed download yields a null cell
        # instead of failing the whole job.
        print(f"Error downloading {url}: {str(e)}")
        return None

In [24]:
# Read the research table and, for a 5-row sample, add a `content` column
# holding whatever `download_pdf` returns for each row's `url`.
source_table = spark.table(research_table_path)
result_df = (
    source_table
    .limit(5)
    .withColumn("content", download_pdf(col("url")))
)

In [25]:
# Print a preview of the sample (defaults spelled out: up to 20 rows, long
# cell values truncated).
result_df.show(n=20, truncate=True)

+--------------------+--------------------+-----------+--------------------+
|               title|                 url|  unique_id|             content|
+--------------------+--------------------+-----------+--------------------+
|Geothermal Casimi...|http://arxiv.org/...|0710.4473v2|[3C 3F 78 6D 6C 2...|
|Petrophysical ana...|http://arxiv.org/...|0802.1931v1|[3C 3F 78 6D 6C 2...|
|Geometry-Temperat...|http://arxiv.org/...|0912.0125v1|[3C 3F 78 6D 6C 2...|
|Geothermal Casimi...|http://arxiv.org/...|1003.3420v1|[3C 3F 78 6D 6C 2...|
|Simulation of a H...|http://arxiv.org/...|1205.2449v1|[3C 3F 78 6D 6C 2...|
+--------------------+--------------------+-----------+--------------------+



In [26]:
from io import BytesIO

# Take the first row's downloaded bytes for demonstration and wrap them in an
# in-memory file object.
first_row = result_df.select("content").first()
if first_row is None:
    # first() yields None when the DataFrame has no rows.
    raise ValueError("result_df is empty: there is no row to take content from")

content = first_row["content"]
if content is None:
    # download_pdf returns None when a download fails; BytesIO(None) would
    # silently produce an empty buffer and hide that failure.
    raise ValueError("The first row has no downloaded content (download failed)")

bytes_io_object = BytesIO(content)

In [29]:
from unstructured.partition.auto import partition

# NOTE(review): the recorded run of this cell raised
# "ImportError: failed to find libmagic" — the native libmagic library
# presumably has to be installed in the kernel environment before this works.

# Rewind so the parser always reads from the start of the buffer, even when
# this cell is re-run or the stream was read elsewhere first.
bytes_io_object.seek(0)
partition(file=bytes_io_object)

ImportError: failed to find libmagic.  Check your installation