In [0]:
# DO NOT MODIFY


# This sets up the API utils for creating managed ingestion pipelines in Databricks.


import requests
import json


# The notebook context supplies both the API token and the workspace API URL used below.
notebook_context = dbutils.notebook.entry_point.getDbutils().notebook().getContext()
api_token = notebook_context.apiToken().get()
workspace_url = notebook_context.apiUrl().get()

# Base endpoint that every pipeline helper in this cell builds its URL from.
api_url = f"{workspace_url}/api/2.0/pipelines"

# Headers sent with each request: bearer-token authorization and a JSON content type.
headers = {
    "Authorization": f"Bearer {api_token}",
    "Content-Type": "application/json",
}


def check_response(response):
    """Print the outcome of a pipelines API call.

    Args:
        response: Object exposing ``status_code``, ``json()`` and ``text``
            (the helpers in this cell pass the result of a ``requests`` call).

    On status 200 the JSON body is pretty-printed. For any other status a
    one-line failure summary is printed, using the ``message`` field of the
    JSON body when there is one and the raw response text otherwise.
    Nothing is returned.
    """
    if response.status_code == 200:
        print("Response from API:\n{}".format(json.dumps(response.json(), indent=2, sort_keys=False)))
        return

    # Error bodies are not guaranteed to be JSON (an HTML gateway page, an empty
    # body, ...). json() raises a ValueError subclass in that case, which used to
    # hide the real HTTP error behind a decoding traceback.
    try:
        payload = response.json()
    except ValueError:
        payload = None

    # Only a JSON object can carry a "message" key; anything else falls back to the raw text.
    if isinstance(payload, dict):
        message = payload.get('message', response.text)
    else:
        message = response.text

    print(f"Failed to retrieve data: error_code={response.status_code}, error_message={message}")




def create_pipeline(pipeline_definition: str):
  """Create a pipeline by POSTing a definition to the pipelines endpoint.

  Args:
    pipeline_definition: Request body, sent unchanged as the POST data.

  The response is passed to check_response, which prints the outcome.
  """
  check_response(
    requests.post(url=api_url, headers=headers, data=pipeline_definition)
  )




def edit_pipeline(id: str, pipeline_definition: str):
  """Replace the definition of an existing pipeline via PUT.

  Args:
    id: Pipeline identifier, appended to the pipelines endpoint URL.
    pipeline_definition: Request body, sent unchanged as the PUT data.

  The response is passed to check_response, which prints the outcome.
  """
  pipeline_url = f"{api_url}/{id}"
  check_response(
    requests.put(url=pipeline_url, headers=headers, data=pipeline_definition)
  )




def delete_pipeline(id: str):
  """Send a DELETE request for the pipeline with the given id.

  Args:
    id: Pipeline identifier, appended to the pipelines endpoint URL.

  The response is passed to check_response, which prints the outcome.
  """
  pipeline_url = f"{api_url}/{id}"
  check_response(requests.delete(url=pipeline_url, headers=headers))




def list_pipeline(filter: str):
  """List pipelines, optionally restricted by a filter expression.

  Args:
    filter: Filter expression sent under the "filter" key of the request
      body. An empty string (or None) sends an empty body, i.e. no filter.

  The response is passed to check_response, which prints the outcome.
  """
  # json.dumps escapes quotes and backslashes inside the filter. The previous
  # hand-assembled string became invalid JSON for a filter such as
  # name LIKE "sales%". For plain filters the bytes sent are unchanged.
  # NOTE(review): the filter still travels as a body on a GET request, as before;
  # confirm whether the API expects it as a query parameter instead.
  body = json.dumps({"filter": filter}) if filter else ""
  response = requests.get(url=api_url, headers=headers, data=body)
  check_response(response)




def get_pipeline(id: str):
  """Fetch a single pipeline with a GET request.

  Args:
    id: Pipeline identifier, appended to the pipelines endpoint URL.

  The response is passed to check_response, which prints the outcome.
  """
  pipeline_url = f"{api_url}/{id}"
  check_response(requests.get(url=pipeline_url, headers=headers))




def start_pipeline(id: str, full_refresh: bool=False):
  """Request a new update (run) of a pipeline.

  Args:
    id: Pipeline identifier; the request goes to "<pipelines endpoint>/<id>/updates".
    full_refresh: Rendered as a lowercase literal into the "full_refresh" field
      of the request body. Defaults to False.

  The body always sets "validate_only" to false and "cause" to "API_CALL".
  The response is passed to check_response, which prints the outcome.
  """
  # str(True).lower() -> "true": Python's bool repr turned into a JSON literal.
  refresh_literal = str(full_refresh).lower()
  body = f"""
  {{
    "full_refresh": {refresh_literal},
    "validate_only": false,
    "cause": "API_CALL"
  }}
  """
  updates_url = f"{api_url}/{id}/updates"
  check_response(requests.post(url=updates_url, headers=headers, data=body))




def stop_pipeline(id: str):
  """Placeholder: stopping is not implemented here.

  Args:
    id: Pipeline identifier; accepted but not used.

  Only prints a fixed notice; no API request is made.
  """
  notice = "cannot stop pipeline"
  print(notice)


In [0]:
# Do not modify the PREVIEW channel in the code below. 


# If you want to ingest all drives in your SharePoint site, use the schema spec. If you want to ingest only some drives in your SharePoint site, use the table spec.


# By default, the API will use SCD type 1 for the data. This means that it will overwrite the file in the destination if it's edited in the source. If you prefer to preserve historical file versions and use SCD type 2, then you should specify that in the config.


# table spec (ingests one named drive into one destination table):
#source_schema = SHAREPOINT SITE ID (Enter this URL in browser to find SITE ID https://{yourcompanyname}.sharepoint.com/sites/{yourcompanysite}/_api/site/id). The Edm.Guid will be the SITE ID
#source_table = SHAREPOINT DRIVE NAME
#destination_table = table created under destination_catalog.destination_schema (the later cells read sandbox.bronze.documents)
#
# The object below is keyed "table" because source_table / destination_table are
# table-spec fields. It was previously keyed "schema", which is the spec for
# ingesting every drive of the site and does not take per-table fields.

pipeline_spec = """
{
 "name": "sharepoint_ingestion_pipeline",
 "ingestion_definition": {
     "connection_name": "sharepoint_oauth",
     "objects": [
        {
          "table": {
            "source_schema": "ENTER-YOUR-SHAREPOINT-SITE-ID", 
            "source_table": "Shared Documents",
            "destination_catalog": "sandbox",
            "destination_schema": "bronze",
            "destination_table": "documents",
            "table_configuration": {
              "scd_type": "SCD_TYPE_1"
            }
          }
        }
      ]
 },
 "channel": "PREVIEW"
}
"""


# Submit the definition; the API response (including the new pipeline id on success) is printed.
create_pipeline(pipeline_spec)


In [0]:
# Trigger an update of the pipeline; replace the placeholder with the real pipeline id first.
start_pipeline(id="ENTER-YOUR-PIPELINE-ID", full_refresh=False)

In [0]:
# Inspect how the target Excel file is stored in the bronze table
from pyspark.sql.functions import col, length, isnull

# Load the bronze table; it is narrowed to the Excel file by name below
df = spark.read.table("sandbox.bronze.documents")

excel_file_check = (
    df.where(col('file_metadata.name').contains("ENTER-YOUR-SHAREPOINT-EXCEL-FILE.xlsx"))
      .select(
          # File identity, size and type
          col("file_id"),
          col("file_metadata.name").alias("file_name"),
          col("file_metadata.size_in_bytes").alias("file_size"),
          col("source_metadata.mime_type").alias("mime_type"),
          # Whether the content is available inline, and how long it is
          isnull(col("content.inline_content")).alias("inline_content_is_null"),
          length(col("content.inline_content")).alias("inline_content_length"),
          # Whether a content file path is recorded instead
          col("content.content_file_path").alias("content_file_path"),
          isnull(col("content.content_file_path")).alias("file_path_is_null"),
      )
)

display(excel_file_check)

In [0]:
%pip install openpyxl
# Auto type-preserving Excel import with safe null handling
import pandas as pd
from io import BytesIO
from pyspark.sql.functions import col, to_date, coalesce

# Step 1: Read from bronze table and pick the row for the target workbook
df = spark.read.table("sandbox.bronze.documents")

excel_file_name = "ENTER-YOUR-SHAREPOINT-EXCEL-FILE.xlsx"

excel_row = (
    df.filter(col('file_metadata.name').contains(excel_file_name))
      .select(col("content.inline_content").alias("binary_content"))
      .first()
)

# first() returns None when no row matches; fail here with an actionable message
# rather than with "'NoneType' object is not subscriptable" on the next line.
if excel_row is None:
    raise ValueError(
        f"No file whose name contains {excel_file_name!r} was found in sandbox.bronze.documents"
    )

binary_content = excel_row["binary_content"]

# inline_content can be null (the check cell above reports inline_content_is_null);
# stop before handing an empty buffer to the Excel parser.
if binary_content is None:
    raise ValueError(
        f"content.inline_content is null for {excel_file_name!r}; the file bytes are not available inline"
    )

# Step 2: Read Excel with pandas (auto type inference)
excel_df = pd.read_excel(
    BytesIO(binary_content),
    sheet_name='SampleData',
    engine='openpyxl',
)

print(f"Read Excel: {excel_df.shape[0]} rows {excel_df.shape[1]} columns")

# Step 3: Replace NaN/NaT with None
# NOTE(review): on recent pandas versions where(..., None) may leave NaN in float
# columns untouched - confirm that missing values arrive in Spark as expected.
excel_df = excel_df.where(pd.notnull(excel_df), None)

# Step 4: Convert to Spark DataFrame with inferred schema
spark_excel_df = spark.createDataFrame(excel_df)
print("Converted to Spark DataFrame (auto-inferred schema)")
spark_excel_df.printSchema()

# Step 5: Dynamic typecasting for date columns
# Formats are tried in this order; the first one that parses wins.
date_patterns = ["yyyy-MM-dd", "dd-MM-yyyy", "MM/dd/yyyy"]


def dynamic_date_cast(col_name):
    """Build a date-typed column expression for ``col_name``.

    ``to_date`` is applied once per entry of ``date_patterns`` and once more
    without an explicit format; ``coalesce`` keeps the first non-null result.
    The expression is aliased back to ``col_name`` so the column keeps its name.
    """
    source = col(col_name)
    candidates = [to_date(source, pattern) for pattern in date_patterns]
    candidates.append(to_date(source))
    return coalesce(*candidates).alias(col_name)

# Columns whose name mentions "date" or "last_updated" (case-insensitive) get the date cast.
date_cols = [
    field.name
    for field in spark_excel_df.schema.fields
    if any(marker in field.name.lower() for marker in ("date", "last_updated"))
]
other_cols = [col(name) for name in spark_excel_df.columns if name not in date_cols]

# Untouched columns come first, followed by the re-cast date columns.
final_df = spark_excel_df.select(
    *(other_cols + [dynamic_date_cast(name) for name in date_cols])
)

# Step 6: Write final table (replaces any existing contents)
final_df.write.saveAsTable("sandbox.silver.your_sharepoint_table_name", mode="overwrite")

# Step 7: Display the final table
display(spark.read.table("sandbox.silver.your_sharepoint_table_name"))

In [0]:
# Clean-up: sends a DELETE request for the pipeline; replace the placeholder with the real pipeline id first.
delete_pipeline(id="ENTER-YOUR-PIPELINE-ID")