## **Install Package**

In [None]:
# Install fabric_remote_tools 0.1.1 into the kernel's environment from the
# source tarball hosted in the project's GitHub repository (main branch).
%pip install https://github.com/renan-peres/fabric-remote-tools/raw/main/fabric_remote_tools-0.1.1.tar.gz

## **Import Modules & Authenticate**

In [1]:
from fabric_remote_tools import FabricAuth, OneLakeUtils
import os
from dotenv import load_dotenv
load_dotenv()  # Read key/value pairs from the local .env file into the environment

# Load Fabric environment variables (.env file)
account_name = os.getenv("ACCOUNT_NAME")
workspace_id = os.getenv("WORKSPACE_ID")
lakehouse_id = os.getenv("LAKEHOUSE_ID")

# Fail fast with a readable message: os.getenv returns None for unset variables,
# which would otherwise only surface later as an opaque error inside a library call.
missing_settings = [
    env_name
    for env_name, env_value in (
        ("ACCOUNT_NAME", account_name),
        ("WORKSPACE_ID", workspace_id),
        ("LAKEHOUSE_ID", lakehouse_id),
    )
    if not env_value
]
if missing_settings:
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

# Get authentication token
token = FabricAuth.get_DefaultAzureCredential()

# Get file system client (used by the upload / list / download / delete cells)
file_system_client = FabricAuth.get_FileSystemClient(token, account_name, workspace_id)

## **Write to Lakehouse (Files/Tables)**

### **Local Tables (Delta)**

In [3]:
# One entry per upload; the uploads run in the order listed.
local_table_uploads = [
    # Single table
    {
        "source_path": "../assets/data/Tables/venture_funding_deals_delta_partitioned",
        "target_path": "Tables/local_venture_funding_deals_delta_partitioned",
    },
    # Multiple tables in a folder
    {
        "source_path": "../assets/data/Tables",
        "target_path": "Tables/",
    },
]

for table_upload in local_table_uploads:
    OneLakeUtils.write_to_lakehouse(
        file_system_client=file_system_client,
        lakehouse_id=lakehouse_id,
        upload_from="local",
        **table_upload,
    )

Uploading local Delta Table '../assets/data/Tables/venture_funding_deals_delta_partitioned' to 'Tables/local_venture_funding_deals_delta_partitioned'
Successfully uploaded 40 out of 40 files to 'Tables/local_venture_funding_deals_delta_partitioned'
Uploading multiple tables from '../assets/data/Tables' to 'Tables/'
Successfully uploaded 2 out of 2 files to 'Tables/venture_funding_deals_delta'
Successfully uploaded 40 out of 40 files to 'Tables/venture_funding_deals_delta_partitioned'


### **Local Files/Folders**

In [None]:
# (source_path, target_path) pairs, uploaded in the order listed.
local_file_uploads = [
    # Whole folder
    ("../assets/data/Files", "Files/"),
    # Individual subfolder inside a folder
    ("../assets/data/Files/Contoso", "Files/Contoso"),
    # Specific file in a folder
    ("../assets/data/Files/Contoso/contoso_sales.csv", "Files/Contoso/contoso_sales.csv"),
]

for local_source, lakehouse_target in local_file_uploads:
    OneLakeUtils.write_to_lakehouse(
        file_system_client=file_system_client,
        lakehouse_id=lakehouse_id,
        upload_from="local",
        source_path=local_source,
        target_path=lakehouse_target,
    )

### **GitHub (Public Repo)** 

In [None]:
# All three uploads read from the same public repository.
cookbook_repo_url = "https://github.com/renan-peres/Polars-Cookbook.git"

github_public_uploads = [
    # Whole GitHub repository
    {"target_path": "Files/GitHub/Polars-Cookbook"},
    # Single table (Delta) in repository
    {
        "target_path": "Tables/github_venture_funding_deals_delta",
        "folder_path": "data/venture_funding_deals_delta",
    },
    # Specific folder from GitHub repository
    {"target_path": "Files/GitHub/data", "folder_path": "data"},
]

for repo_upload in github_public_uploads:
    OneLakeUtils.write_to_lakehouse(
        file_system_client=file_system_client,
        lakehouse_id=lakehouse_id,
        upload_from="github",
        source_path=cookbook_repo_url,
        **repo_upload,
    )

### **GitHub (Private Repo)** 

In [None]:
# Private-repo credentials and repository name come from the environment (.env file).
github_token = os.getenv("GH_PERSONAL_ACCESS_TOKEN")
github_username = os.getenv("GH_USERNAME")
gh_repo_name = os.getenv("GH_REPO_NAME")

# Arguments shared by both uploads below.
private_repo_args = dict(
    file_system_client=file_system_client,
    lakehouse_id=lakehouse_id,
    upload_from="github_private",
    github_token=github_token,
    github_username=github_username,
    repo_name=gh_repo_name,
)

# Whole GitHub private repository
OneLakeUtils.write_to_lakehouse(
    **private_repo_args,
    target_path=f"Files/GitHub/{gh_repo_name}",
)

# Specific folder from GitHub private repository
OneLakeUtils.write_to_lakehouse(
    **private_repo_args,
    target_path="Files/GitHub/data",
    folder_path="data",
)

### **Azure DevOps (Private Repo)**

In [None]:
# Azure DevOps connection settings come from the environment (.env file).
organization_url = os.getenv("ADO_ORGANIZATIONAL_URL")
personal_access_token = os.getenv("ADO_PERSONAL_ACCESS_TOKEN")
project_name = os.getenv("ADO_PROJECT_NAME")
repo_name = os.getenv("ADO_REPO_NAME")

# Arguments shared by both uploads below.
ado_repo_args = dict(
    file_system_client=file_system_client,
    lakehouse_id=lakehouse_id,
    upload_from="azure_devops",
    project_name=project_name,
    repo_name=repo_name,
    organization_url=organization_url,
    personal_access_token=personal_access_token,
)

# Whole Azure DevOps repository
OneLakeUtils.write_to_lakehouse(
    **ado_repo_args,
    target_path=f"Files/AzureDevOps/{repo_name}",
)

# Specific folder from Azure DevOps repository
# (note the leading slash in folder_path, unlike the GitHub examples)
OneLakeUtils.write_to_lakehouse(
    **ado_repo_args,
    target_path="Files/AzureDevOps/data",
    folder_path="/data",
)

## **List Items from Lakehouse (Files/Tables)**

In [None]:
# List the items under one top-level lakehouse directory.
OneLakeUtils.list_items(
    file_system_client=file_system_client,
    lakehouse_id=lakehouse_id,
    target_directory_path="Tables",  # Tables or Files
    # print_output=True,  # Optional
)

## **DeltaLake Table Operations**

### **Read Table from Lakehouse into a DataFrame**

In [4]:
from fabric_remote_tools import FabricAuth, OneLakeUtils
import os
from dotenv import load_dotenv
load_dotenv()  # Load environment variables from .env file

# Authenticate with a client-secret credential.
# NOTE(review): this rebinds `file_system_client`, which an earlier cell set to the
# result of FabricAuth.get_FileSystemClient(...). The write cell below passes this
# value as `storage_options`, so later cells that expect the original client will
# see this object instead when the notebook is run top to bottom.
file_system_client = FabricAuth().get_ClientSecretCredential()

# Build the abfss:// URI of the table inside the lakehouse
workspace_name = os.getenv("WORKSPACE_NAME")
lakehouse_name = os.getenv("LAKEHOUSE_NAME")
table_name = "Tables/venture_funding_deals_delta"
table_path = (
    f"abfss://{workspace_name}@onelake.dfs.fabric.microsoft.com/"
    f"{lakehouse_name}.Lakehouse/{table_name}"
)

# Read the table from the lakehouse into a DataFrame
df = OneLakeUtils().read_deltalake(
    file_system_client=file_system_client,
    table_path=table_path,
    engine='duckdb',  # Supported options: 'duckdb', 'polars'
    # version=11,  # Optional: specify the version to read
    # row_limit=10,  # Optional
)

display(df)

┌─────────────────┬────────────────┬───────────────────────┬─────────────────┬─────────────────────────┬───────────────┐
│     Company     │     Amount     │    Lead investors     │    Valuation    │        Industry         │ Date reported │
│     varchar     │    varchar     │        varchar        │     varchar     │         varchar         │    varchar    │
├─────────────────┼────────────────┼───────────────────────┼─────────────────┼─────────────────────────┼───────────────┤
│ Stripe          │ $6,500,000,000 │ n/a                   │ $50,000,000,000 │ Fintech                 │ 3/15/23       │
│ Inflection AI   │ $1,300,000,000 │ Microsoft, Reid Hof…  │ $4,000,000,000  │ Artificial intelligence │ 6/29/23       │
│ Anthropic       │ $1,250,000,000 │ Amazon                │ $4,000,000,000  │ Artificial intelligence │ 9/25/23       │
│ Lessen          │ $500,000,000   │ n/a                   │ $2,000,000,000  │ Real estate             │ 1/11/23       │
│ Databricks      │ $500,000,000

### **Query DataFrame with DuckDB**

In [5]:
# Install (or upgrade to the latest) magic_duckdb quietly, then load the extension
# so the %%dql cell magic used below is available.
# NOTE(review): the version is not pinned, and pip may ask for a kernel restart
# before the upgraded package can be used.
%pip install magic_duckdb --upgrade --quiet
%load_ext magic_duckdb

Note: you may need to restart the kernel to use updated packages.


In [6]:
%%dql
-- Turn off DuckDB's progress bar printing.
PRAGMA disable_print_progress_bar;
-- Summary statistics for the DataFrame `df` loaded in the read cell.
SUMMARIZE df;
-- Column names and types of `df`; the recorded cell output shows this result.
DESCRIBE SELECT * FROM df;

Unnamed: 0,column_name,column_type,null,key,default,extra
0,Company,VARCHAR,YES,,,
1,Amount,VARCHAR,YES,,,
2,Lead investors,VARCHAR,YES,,,
3,Valuation,VARCHAR,YES,,,
4,Industry,VARCHAR,YES,,,
5,Date reported,VARCHAR,YES,,,


In [8]:
%%dql
-- Materialize 20 rows of `df` as the DuckDB table `df_tranf`, replacing any
-- existing table of that name. The write cell below reads from `df_tranf`.
-- NOTE(review): there is no ORDER BY, so which 20 rows are kept is not guaranteed.
CREATE OR REPLACE TABLE df_tranf AS 
    SELECT *
    FROM df
    LIMIT 20;

Unnamed: 0,Count
0,20


### **Write DataFrame to Lakehouse as a Delta Table**

In [9]:
from deltalake.writer import write_deltalake
import duckdb
import pyarrow
import polars as pl

# Export the DuckDB table created above as Arrow data.
# For a Polars DataFrame, use `df.to_arrow()` instead.
rows_to_append = duckdb.sql("SELECT * FROM df_tranf").arrow()

# Write the rows to the lakehouse table at `table_path`.
# NOTE(review): with mode="append", every re-run of this cell adds the rows again.
write_deltalake(
    table_or_uri=table_path,
    data=rows_to_append,
    storage_options=file_system_client,
    mode="append",  # Supported options: 'append', 'overwrite'
    engine="rust",
)

### **DESCRIBE HISTORY**

In [10]:
from deltalake import DeltaTable
import pandas as pd

# Initialize the DeltaTable with the same storage options the write cell passes to
# write_deltalake, so opening the table does not depend on whatever credentials
# happen to be available in the process environment.
dt = DeltaTable(table_path, storage_options=file_system_client)

# Retrieve the full commit history of the table
history = dt.history()

# Convert the history list to a pandas DataFrame
history_df = pd.DataFrame(history)

# The history timestamps are epoch milliseconds; convert them to datetimes
history_df['timestamp'] = pd.to_datetime(history_df['timestamp'], unit='ms')

# Display the five most recent versions, newest first
display(history_df.sort_values(by='version', ascending=False).head(5))

Unnamed: 0,timestamp,operation,operationParameters,clientVersion,version,readVersion,isolationLevel,isBlindAppend,engineInfo,operationMetrics,txnId,tags
0,2024-07-04 12:31:35.831,WRITE,{'mode': 'Append'},delta-rs.0.18.0,10,,,,,,,
1,2024-07-03 19:21:49.276,WRITE,{'mode': 'Append'},delta-rs.0.18.1,9,,,,,,,
2,2024-07-03 19:21:42.788,WRITE,{'mode': 'Overwrite'},delta-rs.0.18.1,8,,,,,,,
3,2024-07-02 19:52:55.075,VACUUM END,{'status': 'COMPLETED'},,7,6.0,SnapshotIsolation,True,Apache-Spark/3.4.1.5.3.20240528.1 Delta-Lake/2...,"{'numDeletedFiles': '0', 'numVacuumedDirectori...",4ebe44b4-d133-4e06-b659-1deb812dee77,
4,2024-07-02 19:52:53.249,VACUUM START,"{'retentionCheckEnabled': True, 'defaultRetent...",,6,5.0,SnapshotIsolation,True,Apache-Spark/3.4.1.5.3.20240528.1 Delta-Lake/2...,"{'numFilesToDelete': '0', 'sizeOfDataToDelete'...",60f1f933-7958-47c1-bf8e-46031dac75f2,


## **Download Items from Lakehouse (Files/Tables)**

In [3]:
# Build a dedicated OneLake file system client for this cell. The read cell earlier
# in the notebook rebinds `file_system_client` to the value it later passes as Delta
# `storage_options`, so on a top-to-bottom run that name no longer holds the client
# created by FabricAuth.get_FileSystemClient in the authentication cell.
onelake_fs_client = FabricAuth.get_FileSystemClient(token, account_name, workspace_id)

# Arguments shared by every download below.
download_args = dict(
    file_system_client=onelake_fs_client,
    lakehouse_id=lakehouse_id,
)

# Download a single table
# NOTE(review): the other cells use "Tables/venture_funding_deals_delta"; confirm
# that a table named "venture_funding_deals" actually exists in the lakehouse.
OneLakeUtils.download_from_lakehouse(**download_args, target_file_path="Tables/venture_funding_deals")

# Download a single file
OneLakeUtils.download_from_lakehouse(**download_args, target_file_path="Files/Contoso/contoso_sales.csv")

# Download a Files subfolder
OneLakeUtils.download_from_lakehouse(**download_args, target_file_path="Files/Contoso/")

# Download all tables
OneLakeUtils.download_from_lakehouse(**download_args, target_file_path="Tables/")

# Download all files
OneLakeUtils.download_from_lakehouse(**download_args, target_file_path="Files/")

Error downloading 'Tables/venture_funding_deals': name 'OneLakeUtils' is not defined


## **Delete Items from Lakehouse (Files/Tables)**

In [None]:
# Build a dedicated OneLake file system client for this cell. The read cell earlier
# in the notebook rebinds `file_system_client` to the value it later passes as Delta
# `storage_options`, so on a top-to-bottom run that name no longer holds the client
# created by FabricAuth.get_FileSystemClient in the authentication cell.
onelake_fs_client = FabricAuth.get_FileSystemClient(token, account_name, workspace_id)

# Arguments shared by every delete below.
delete_args = dict(
    file_system_client=onelake_fs_client,
    lakehouse_id=lakehouse_id,
)

# NOTE(review): the calls run in order and go from narrow to broad; the last two
# target the whole "Tables/" and "Files/" directories. Run only the ones you need.

# Delete a single table
OneLakeUtils.delete_file(**delete_args, lakehouse_dir_path="Tables/venture_funding_deals_delta")

# Delete a single file
OneLakeUtils.delete_file(**delete_args, lakehouse_dir_path="Files/Contoso/contoso_sales.csv")

# Delete a subfolder
OneLakeUtils.delete_file(**delete_args, lakehouse_dir_path="Files/Contoso")

# Delete all tables
OneLakeUtils.delete_file(**delete_args, lakehouse_dir_path="Tables/")

# Delete all files
OneLakeUtils.delete_file(**delete_args, lakehouse_dir_path="Files/")