In [0]:
# Notebook setup: build a Databricks workspace client and display the name of
# the user returned by the workspace's current-user lookup.
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()  # no explicit host/credentials are passed here
me = w.current_user.me()
display('current user:', me.display_name)




In [0]:
#%pip install biopython
%pip install dateparser

# imports
from Bio import Entrez, Medline
import pandas as pd
import dateparser
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, concat_ws, regexp_replace, to_date, trim, lit, to_date, when, length, udf, regexp_replace
from pyspark.sql.types import StringType

def fallback_dateparser(date_str):
    """Parse a free-form date string into an ISO ``YYYY-MM-DD`` string.

    Used as the body of a Spark UDF, so it must cope with nulls and with
    values that cannot be parsed at all.

    Args:
        date_str: The raw date text (for example a MEDLINE ``DP`` value such
            as ``"2024 Mar 15"`` or ``"2024 Mar"``). May be ``None`` or empty.

    Returns:
        The parsed date formatted as ``YYYY-MM-DD``, or ``None`` when the
        input is empty or ``dateparser`` cannot interpret it.
    """
    if not date_str:
        return None

    # By default dateparser fills a missing day/month from the *current*
    # date, so "2024 Mar" would produce a different result depending on the
    # day the notebook runs. Pin missing parts to the first day/month so the
    # output is deterministic.
    # The settings dict is built inside the function so the UDF stays
    # self-contained when it is shipped to executors.
    settings = {
        "PREFER_DAY_OF_MONTH": "first",
        "PREFER_MONTH_OF_YEAR": "first",
    }
    parsed = dateparser.parse(date_str, settings=settings)
    return parsed.strftime("%Y-%m-%d") if parsed else None

def get_pubmed_date_range_past_months(months=1):
    """Return the date window ending today and starting ``months`` months ago.

    Args:
        months: How many calendar months to look back. Defaults to 1, which
            matches the previously hard-coded window.

    Returns:
        A ``(start, end)`` tuple of strings formatted as ``YYYY/MM/DD``
        (the format used in the PubMed search term), where ``end`` is today.

    Raises:
        ValueError: If ``months`` is negative, which would put the start of
            the window after its end.
    """
    if months < 0:
        raise ValueError(f"months must be non-negative, got {months}")

    today = datetime.today()
    # relativedelta does calendar-aware month arithmetic (e.g. it clamps
    # Mar 31 minus one month to the last day of February).
    range_start = today - relativedelta(months=months)

    # Format dates as YYYY/MM/DD (PubMed compatible)
    date_format = "%Y/%m/%d"
    return range_start.strftime(date_format), today.strftime(date_format)

# Shared Spark session for the rest of the notebook.
spark = SparkSession.builder.getOrCreate()
# Contact address that Biopython attaches to Entrez requests.
Entrez.email = "mikeandersen622@gmail.com"

# global udfs
# Wraps fallback_dateparser so it can be applied to a string column; the UDF
# returns the parsed date as a string (or null).
parse_pubmed_date_udf = udf(fallback_dateparser, StringType())

# pubmed search conditions
# pulls documents tagged with MeSH terms for physical conditioning
# TODO: expand to relevant sets of mesh terms, including cardio
# Call the helper that is actually defined in this notebook; the previous
# name (get_pubmed_date_range_past_3_months) does not exist and raised
# NameError.
start_date, end_date = get_pubmed_date_range_past_months()
print(f"Search range: {start_date} to {end_date}")

search_term = f'"Physical Conditioning, Human"[MeSH Terms] AND ("{start_date}"[Date - Publication] : "{end_date}"[Date - Publication])'

# Page through the search results `batch_size` IDs at a time, requesting at
# most `target_count` PMIDs in total.
batch_size = 100
target_count = 200
pmid_list = []

# get total counts of pubs (retmax=0 asks for the hit count without any IDs)
handle = Entrez.esearch(db="pubmed", term=search_term, retmax=0)
try:
    total_count = int(Entrez.read(handle)["Count"])
finally:
    # Entrez.read does not close a handle it was given, so release it here.
    handle.close()
print(f"Total available: {total_count}")
displayHTML(f"<b>total available {total_count}</b>")

# fetch results in batches
fetch_limit = min(target_count, total_count)
for start in range(0, fetch_limit, batch_size):
    handle = Entrez.esearch(
        db="pubmed",
        term=search_term,
        # Cap the final page so we never request more than fetch_limit IDs
        # when target_count is not a multiple of batch_size.
        retmax=min(batch_size, fetch_limit - start),
        retstart=start)
    try:
        batch_result = Entrez.read(handle)
    finally:
        handle.close()
    pmid_list.extend(batch_result["IdList"])

    displayHTML(f"<b>Fetched {len(pmid_list)} PMIDs so far...</b>")

# fetch details in batches...
# For each slice of PMIDs, download the MEDLINE-format records and keep only
# the fields needed downstream. Missing fields come back as None via .get().
all_records = []
for i in range(0, len(pmid_list), batch_size):
    batch_pmids = pmid_list[i:i+batch_size]
    handle = Entrez.efetch(db="pubmed", id=batch_pmids, rettype="medline", retmode="text")
    try:
        # Medline.parse is lazy, so materialise the records before the
        # handle is closed.
        records = list(Medline.parse(handle))
    finally:
        handle.close()

    all_records.extend(
        {
            "pmid": rec.get("PMID"),
            "title": rec.get("TI"),
            "abstract": rec.get("AB"),
            "authors": rec.get("AU"),
            "journal": rec.get("JT"),
            "date": rec.get("DP"),
        }
        for rec in records
    )

# convert to a pyspark df then store
# Reuses the `spark` session created earlier in this cell; calling
# getOrCreate() again would only return the same session.
pdf = pd.DataFrame(all_records)
df = spark.createDataFrame(pdf)
# DataFrame.show() prints the rows itself and returns None, so it must not be
# wrapped in display().
df.show(10, truncate=False)

# Clean up text columns and derive a parsed publication date:
#   - abstract: newlines replaced by spaces
#   - title: trimmed, newlines removed
#   - pub_date: `date` run through the date-parsing UDF
# The raw `authors` and `date` columns are dropped afterwards.
normalized_df = (
    df.withColumn("abstract", regexp_replace(col("abstract"), "\n", " "))
      .withColumn("title", regexp_replace(trim(col("title")), "\n", ""))
      .withColumn("pub_date", parse_pubmed_date_udf(col("date")))
      .drop("authors", "date"))

# NOTE(review): limiting to 3 rows looks like a debugging cap — confirm
# before using normalized_df for anything beyond a preview.
normalized_df = normalized_df.limit(3)
display(normalized_df)