# Fabric Lakehouse Replicator


## This notebook clones lakehouses and their tables (including data) across Fabric workspaces.

###### Author: [Rui Cunha]
###### GitHub: [https://github.com/ruippcunha/msfabric_samples]

#### Disclaimer

This notebook is provided as a sample implementation intended for development and testing purposes only. It is not designed or recommended for use in production environments.
Although this notebook was developed by a Microsoft employee, it is not an officially supported solution, use it  at your own risk. . Microsoft and the author assume no responsibility for any issues or data loss that may result from its use in live systems.
Before applying any part of this solution in production, ensure it is thoroughly reviewed, tested, and adapted to meet your organization’s security, compliance, and operational requirements.

Please note that replicating Lakehouse data across workspaces may result in additional storage costs.
You can find more information about Microsoft Fabric storage pricing here:[Microsoft Fabric Pricing](https://azure.microsoft.com/en-us/pricing/details/microsoft-fabric/)



In [4]:
#toggle this as a parameter cell

# Define the source workspace ID from where you want to replicate the Lakehouse(s)
source_workspace_id = "6d421ea5-43d4-47c6-a0f4-99eb71d8dd50"

StatementMeta(, 7d830960-3bb9-4a1f-b898-dfb222c8534a, 6, Finished, Available, Finished)

In [5]:
import time
import requests
import json
from notebookutils import mssparkutils
import re
import sempy.fabric as fabric 
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit
# -------------------------------
# Parameters
# -------------------------------
#source_workspace_id = {source_workspace_id}
if not source_workspace_id:
    raise ValueError("❌ 'source_workspace_id' parameter is required.")
# -------------------------------
# Get Current Workspace ID
# -------------------------------
print("Getting the workspace ID")
client = fabric.FabricRestClient()
workspace_id = fabric.get_notebook_workspace_id()
print(f"Current workspace ID: {workspace_id}")

print(f"Replicating all lakehouses from Workspace {source_workspace_id} to this Workspace with id {workspace_id}")
source_workspace_id = "6d421ea5-43d4-47c6-a0f4-99eb71d8dd50" #workspace where the source LH exists 
# -------------------------------
# Authentication
# -------------------------------
token = mssparkutils.credentials.getToken("https://api.fabric.microsoft.com")
headers = {
    "Authorization": f"Bearer {token}",
    "Content-Type": "application/json"
}

# -------------------------------
# List Lakehouses in Source Workspace
# -------------------------------
print(f"Listing all Lakehouses in workspace {source_workspace_id}")
lakehouse_url = f"https://api.fabric.microsoft.com/v1/workspaces/{source_workspace_id}/items?type=Lakehouse"
response = requests.get(lakehouse_url, headers=headers)
if response.status_code == 200:
    lakehouses = response.json().get("value", [])
    print(f"Found {len(lakehouses)} lakehouses in workspace {source_workspace_id}:\n")
# -------------------------------
# Process Each Lakehouse
# -------------------------------
    for lh in lakehouses:
        lh_id = lh['id']
        lh_name = lh['displayName']
        print(f"\n📘 Processing Lakehouse: {lh_name} (ID: {lh_id})")
        # Step 3.1: Create the Lakehouse in this workspace
        create_lh_url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/lakehouses"
        create_lh_body = {
        "displayName": lh_name,
        "description": f"Cloned from workspace {source_workspace_id}"
        }
        create_lh_response = requests.post(create_lh_url, headers=headers, json=create_lh_body)
        lh = create_lh_response.json()
        
        flag_new = False
        if create_lh_response.status_code == 201: #if LH does not exist
                    new_lh_id = lh["id"]
                    new_lh_name = lh["displayName"]
                    flag_new = True
                    print(f"✅ Created new lakehouse: {lh_name} (ID: {new_lh_id})")
        else:
            print(f"ℹ️ Lakehouse '{lh_name}' already exists. Retrieving existing ID...")
            lakehouse_url = f"https://api.fabric.microsoft.com/v1/workspaces/{workspace_id}/items?type=Lakehouse"
            response = requests.get(lakehouse_url, headers=headers)
            if response.status_code == 200:
                lakehouses = response.json().get("value", [])                
                for lh in lakehouses:
                    lh_name_aux = lh['displayName']
                    if lh_name == lh_name_aux : 
                        new_lh_id = lh['id'] ##finding the lakehouse ID of the existing LH
                        new_lh_name = lh_name
        if flag_new:                    
                    
                    # -------------------------------
                    # Refresh SQL Endpoint
                    # -------------------------------
                    print(f"📄 Refreshing sql endpoint for Lakehouse {lh_name}")
                    
                    #Instantiate the client
                    client = fabric.FabricRestClient()
                    
                    sqlendpoint = None
                    max_attempts = 10 #  limit to avoid infinite loop
                    attempt = 0
                    while sqlendpoint is None and attempt < max_attempts:
                        try:
                            sqlendpoint = fabric.FabricRestClient().get(f"/v1/workspaces/{workspace_id}/lakehouses/{new_lh_id}").json()['properties']['sqlEndpointProperties']['id']
                            
                            if sqlendpoint:
                                print(f"✅ SQL endpoint is now available: {sqlendpoint}")
                                j_son = fabric.FabricRestClient().get(f"/v1/workspaces/{workspace_id}/lakehouses/{new_lh_id}").json()
                                uri = f"/v1/workspaces/{workspace_id}/sqlEndpoints/{sqlendpoint}/refreshMetadata?preview=true" 
                                payload = {} 
                                try:
                                    response = client.post(uri,json= payload, lro_wait = True) 
                                    sync_status = json.loads(response.text)
                                    display(sync_status)
                                except Exception as e: print(e)
                                break
                            else:
                                print(f"ℹ️ Attempt {attempt + 1}: SQL endpoint not available yet. Retrying in 1 minute...")
                            
                        except Exception as e:
                                print(f"ℹ️Attempt {attempt + 1}: SQL endpoint not available yet. Retrying in 1 minute...")
                        time.sleep(60)
                        attempt += 1
        else:
            print(f"ℹ️ sql endpoint for lakehouse {lh_name} already exists. No refresh required")

        # -------------------------------
        # Process Lakehouse tables
        # -------------------------------
        tables_url = f"https://api.fabric.microsoft.com/v1/workspaces/{source_workspace_id}/lakehouses/{lh_id}/tables"
        tables_response = requests.get(tables_url, headers=headers)
        if tables_response.status_code == 200:
            
            tables_data = tables_response.json()
            tables = tables_data.get("data", []) 
            if tables:
                print(f"  📄 Starting processing Lakehouse tables:")
                for table in tables:
                    print(f" Processing table - {table['name']}")
                    table_name = table['name']
                    #get the table schema and recreate the table
                    path = f"abfss://{source_workspace_id}@onelake.dfs.fabric.microsoft.com/{lh_id}/Tables/{table_name}"                    
                    
                    df = spark.read.format("delta").load(path)
                    schema = df.schema
                   # Create an empty DataFrame with the same schema
                    empty_df = spark.createDataFrame([], schema)
                    # Set the target path for the new table in the new lakehouse
                    target_path = f"abfss://{workspace_id}@onelake.dfs.fabric.microsoft.com/{new_lh_id}/Tables/{table_name}"  # f"Tables/{table_name}" # Relative path inside the lakehouse
                    # Write the empty DataFrame as a Delta table
                    empty_df.write.format("delta").mode("overwrite").save(target_path)
                    
                    #copy storage files from source to destination
                    # Define source and target paths
                    source_path = path                    
                    # Clean up the destination path if it exists
                    if mssparkutils.fs.exists(target_path):
                        #print(f"🧹 Cleaning up existing contents at: {target_path}")
                        mssparkutils.fs.rm(target_path, recurse=True)
                    # Recreate target directory                    
                    mssparkutils.fs.mkdirs(target_path)
                    # List all files and folders in the source table directory
                    items = mssparkutils.fs.ls(source_path)
                    # Copy each item to the target path
                    for item in items:
                        source_item_path = item.path 
                        target_item_path = f"{target_path}/{source_item_path.split('/')[-1]}"                     
                        mssparkutils.fs.cp(source_item_path, target_item_path, recurse=True)
                        print(f"✅ Table '{table_name}' copied successfully.")

                    print(f"✅ Successfully processed lakehouse: {lh_name}")
                    
                  
            else:
                print("  ⚠️ No tables found in this lakehouse.")
        else:
            print(f"  ❌ Failed to retrieve tables: {tables_response.status_code} - {tables_response.text}")
else:
    print(f"❌ Failed to retrieve lakehouses: {response.status_code} - {response.text}")

print(f"ℹ️ Lakehouse replication completed")

StatementMeta(, 7d830960-3bb9-4a1f-b898-dfb222c8534a, 7, Finished, Available, Finished)

Getting the workspace ID
Current workspace ID: 2f696004-c373-434e-8cd8-4c6a1c1da359
Replicating all lakehouses from Workspace 6d421ea5-43d4-47c6-a0f4-99eb71d8dd50 to this Workspace with id 2f696004-c373-434e-8cd8-4c6a1c1da359
Listing all Lakehouses in workspace 6d421ea5-43d4-47c6-a0f4-99eb71d8dd50
Found 3 lakehouses in workspace 6d421ea5-43d4-47c6-a0f4-99eb71d8dd50:


📘 Processing Lakehouse: LH_PUBLICHOLIDAYS (ID: 87943777-283d-47dd-83ce-00555a212823)
ℹ️ Lakehouse 'LH_PUBLICHOLIDAYS' already exists. Retrieving existing ID...
ℹ️ sql endpoint for lakehouse LH_PUBLICHOLIDAYS already exists. No refresh required
  📄 Starting processing Lakehouse tables:
 Processing table - publicholidays
🧹 Cleaning up existing contents at: abfss://2f696004-c373-434e-8cd8-4c6a1c1da359@onelake.dfs.fabric.microsoft.com/dca9f735-6fed-4ace-9422-2d534542747f/Tables/publicholidays
✅ Table 'publicholidays' copied successfully.
✅ Table 'publicholidays' copied successfully.
✅ Successfully processed lakehouse: LH_PUBL