# Python Notebooks on Microsoft Fabric

In [None]:
import duckdb
import notebookutils
import pandas as pd
import pyarrow as pa

# Load a semicolon-delimited CSV from OneLake into pandas, expose it to DuckDB
# as an Arrow table, and run two queries against it: a 20-row preview ordered
# by "Index" and an organization count per "Industry".

# Get authentication token using Fabric's credential helper
access_token = notebookutils.credentials.getToken('storage')

# Options handed to pandas, which forwards them to the filesystem backend that
# resolves the abfss:// URL.
# NOTE(review): the key names are assumed to be the ones the Fabric runtime's
# storage backend expects - confirm against the Fabric documentation.
storage_options = {
    "bearer_token": access_token,
    "use_fabric_endpoint": "true",
}

# Your ABFSS path
abfss_path = "abfss://ws_fmdk_solution@onelake.dfs.fabric.microsoft.com/lh_fmdk_solution_config.Lakehouse/Files/sample_data/organizations.csv"

# Read CSV with semicolon delimiter into a Pandas DataFrame
pandas_df = pd.read_csv(
    abfss_path,
    sep=';',  # Use semicolon as delimiter
    storage_options=storage_options,
)

# Convert the Pandas DataFrame to a PyArrow table
arrow_table = pa.Table.from_pandas(pandas_df)

# Initialize an in-memory DuckDB connection. It is intentionally left open so
# that follow-up cells can keep querying through `con`.
con = duckdb.connect()

# Register the Arrow table under an explicit name. Without this the queries
# below would only resolve `arrow_table` through DuckDB's implicit lookup of
# Python variables in the calling scope, which can be disabled by
# configuration; explicit registration makes the table name part of the
# connection itself.
con.register("arrow_table", arrow_table)

# Preview: first 20 rows ordered by the "Index" column
query = """
SELECT *
FROM arrow_table
ORDER BY "Index"
LIMIT 20
"""

# Execute the query and materialize the result as a pyarrow.Table
result_arrow_table = con.execute(query).fetch_arrow_table()

# Now you have a PyArrow table with the first 20 rows
print(f"Number of rows in result: {result_arrow_table.num_rows}")
print(f"Columns: {result_arrow_table.column_names}")

# Display the result PyArrow table
print("\nResult PyArrow Table:")
print(result_arrow_table.to_pandas())

# Optional: additional DuckDB queries can run against the same registered table
industry_count_query = """
SELECT "Industry", COUNT(*) as "Organization Count"
FROM arrow_table
GROUP BY "Industry"
ORDER BY "Organization Count" DESC
"""

industry_counts = con.execute(industry_count_query).fetch_arrow_table()
print("\nOrganization counts by industry:")
print(industry_counts.to_pandas())