##### **1. Installation and Imports**

In [None]:
pip install azure-storage-file-datalake azure-identity

##### **Importing Required Libraries**

In [None]:
import os
import requests
import json
from azure.storage.filedatalake import DataLakeServiceClient
from azure.identity import DefaultAzureCredential
import concurrent.futures
from delta.tables import DeltaTable
from pyspark.sql import SparkSession
from concurrent.futures import ThreadPoolExecutor
from trident_token_library_wrapper import PyTridentTokenLibrary as tl


##### **2. Configuration and Environment Setup**

In [None]:
# Obtain a Key Vault token, then read the tenant id, client id and client secret
# from the vault with it.
KEY_VAULT_URL = "https://glocalazure2023.vault.azure.net/"
access_token = mssparkutils.credentials.getToken("keyvault")

tenant_id, client_id, client_secret = (
    tl.get_secret_with_token(KEY_VAULT_URL, secret_name, access_token)
    for secret_name in ("tenantid", "clientid", "clientsecret")
)

In [6]:
# Publish the credentials as environment variables in a single call.
# Presumably these are what DefaultAzureCredential() picks up later in the notebook.
os.environ.update(
    {
        "AZURE_CLIENT_ID": client_id,
        "AZURE_CLIENT_SECRET": client_secret,
        "AZURE_TENANT_ID": tenant_id,
    }
)

StatementMeta(, 10f70b56-aca4-4cc1-bcbf-3d2e600baeee, 8, Finished, Available)

In [7]:
# Disable Delta's retention-duration check so that a retention period shorter
# than 168 hours (7 days) can be used.
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")

StatementMeta(, 10f70b56-aca4-4cc1-bcbf-3d2e600baeee, 9, Finished, Available)

In [8]:
# Configs for Delta Lake maintenance
# NOTE(review): the value set here is 0 hours, not the 168-hour default mentioned in the
# trailing comment — confirm that such a short retention is really intended.
vacuumRetentionInHours = 0 # Default 7 days (24 * 7 = 168 hours)


StatementMeta(, 10f70b56-aca4-4cc1-bcbf-3d2e600baeee, 10, Finished, Available)

##### **3. Parallel Task Executor Function**

In [9]:
# Estimate the number of executor cores available to this session: number of entries in
# the executor memory status map minus one (presumably to exclude the driver), multiplied
# by the local CPU count. os.cpu_count() may return None, so fall back to 1.
executorCores = (sc._jsc.sc().getExecutorMemoryStatus().keySet().size()-1)*(os.cpu_count() or 1)

# Heavy tasks: at least 2 cores per thread. Integer division keeps the value a whole
# number, and the floor of 1 avoids a zero worker count (which ThreadPoolExecutor rejects)
# when no executors are reported.
min_Workers_Heavyprocess = max(1, executorCores // 2)

# Light tasks: one thread per core, never fewer than one.
max_Workers_Lightprocess = max(1, executorCores)


StatementMeta(, 10f70b56-aca4-4cc1-bcbf-3d2e600baeee, 11, Finished, Available)

In [10]:
def runParallelTasks(task_name, param_list, worker_count):
    """Run ``task_name`` once per entry of ``param_list`` on a thread pool.

    Args:
        task_name: Callable taking a single parameter.
        param_list: Iterable of parameters; each one is also used as a key of the
            returned dict, so it must be hashable.
        worker_count: Number of worker threads passed to ThreadPoolExecutor.

    Returns:
        dict mapping each parameter to its result. Falsy results are left out.

    Raises:
        Whatever exception a task raised, re-raised when its result is read.
    """
    collected = {}
    with concurrent.futures.ThreadPoolExecutor(worker_count) as pool:
        # Remember which parameter each future belongs to.
        pending = {}
        for item in param_list:
            pending[pool.submit(task_name, item)] = item

        for finished in concurrent.futures.as_completed(pending.keys()):
            outcome = finished.result()
            if not outcome:
                continue
            collected[pending[finished]] = outcome
    return collected

StatementMeta(, 10f70b56-aca4-4cc1-bcbf-3d2e600baeee, 12, Finished, Available)

##### **4. Fetching Workspace Details and Delta Paths**

In [11]:
# Fetch the list of workspaces from the Power BI REST API and prepare the OneLake client.
resource_url = 'https://analysis.windows.net/powerbi/api'
api_url = 'https://analysis.windows.net/powerbi/api/.default'
auth = DefaultAzureCredential()
access_token = auth.get_token(api_url)
access_token_value = access_token.token
base_url = 'https://api.powerbi.com/v1.0/myorg/groups'
headers = {'Authorization': f'Bearer {access_token_value}'}

# requests applies no timeout by default; set one so an unresponsive endpoint cannot
# block the notebook indefinitely.
response = requests.get(base_url, headers=headers, timeout=60)
if response.status_code != 200:
    raise ValueError(f"Error: {response.status_code} - {response.text}")
workspaces = response.json()['value']

account_name = 'onelake'
# Reuse the credential created above instead of building a second one.
service_client = DataLakeServiceClient(account_url=f"https://{account_name}.dfs.fabric.microsoft.com", credential=auth)

# Lakehouses and tables that should not be optimized by this notebook.
# Matching is a case-insensitive substring test against the item path, e.g.:
#   ignore_lakehouses = ['DataflowsStagingLakehouse', 'Storage_Comparision']
#   ignore_tables = ['nycgreentaxi_enriched', 'saleslt_productmodel_rewfd']
ignore_lakehouses = ['NYCTaxiLakeHouse']
ignore_tables = []



StatementMeta(, 10f70b56-aca4-4cc1-bcbf-3d2e600baeee, 13, Finished, Available)

In [12]:
# Walk every workspace and collect an abfss location for each table directory found
# under a lakehouse's Tables folder.
delta_paths = []
for workspace in workspaces:
    workspace_name = workspace['name']
    file_system_client = service_client.get_file_system_client(file_system=workspace_name)

    for path in file_system_client.get_paths(recursive=False):
        # Skip lakehouses on the ignore list (case-insensitive substring match).
        if any(skipped.lower() in path.name.lower() for skipped in ignore_lakehouses):
            continue

        # Only lakehouse directories are of interest.
        if not path.is_directory or '.Lakehouse' not in path.name:
            continue

        lakehouse_path = path.name
        tables_root = lakehouse_path + '/Tables'
        for table_path in file_system_client.get_paths(path=tables_root, recursive=False):
            # Skip tables on the ignore list (case-insensitive substring match).
            if any(skipped.lower() in table_path.name.lower() for skipped in ignore_tables):
                continue
            if not table_path.is_directory:
                continue

            # table_path.name is used as-is; it appears to already carry the
            # lakehouse and Tables prefix, so nothing is prepended here.
            delta_path = f"abfss://{workspace_name}@{account_name}.dfs.fabric.microsoft.com/{table_path.name}"
            delta_paths.append(delta_path)


StatementMeta(, 10f70b56-aca4-4cc1-bcbf-3d2e600baeee, 14, Finished, Available)

##### **Listing the OneLake Locations of All Tables**

In [None]:
# Show every collected table location, one per line (nothing is printed when there are none).
if delta_paths:
    print("\n".join(delta_paths))

##### **5. Getting Delta Details and Filtering Out Tables That Are Not in Delta Format**

In [14]:

# Accumulators: details of the Delta tables found, and the paths that could not be used.
deltaTablesDetails, notFoundTables = [], []


class DeltaTableDetail:
    """Plain record holding a table's name together with its location."""

    def __init__(self, tableName, location):
        """Store ``tableName`` and ``location`` on the instance unchanged."""
        self.tableName, self.location = tableName, location




StatementMeta(, 10f70b56-aca4-4cc1-bcbf-3d2e600baeee, 16, Finished, Available)

In [15]:
def getDeltaDetails(delta_path):
    """Check whether ``delta_path`` is a Delta table and return its details.

    Runs ``DESCRIBE DETAIL`` on the path. If the first row reports format "delta" and a
    non-null id, a DeltaTableDetail for the path is returned.

    Every other path — one that is not a Delta table, or one for which the query raised —
    is appended to the module-level ``notFoundTables`` list, which is reported later as
    "Not Found or Not Delta Tables".

    Args:
        delta_path: Location string of the candidate table.

    Returns:
        DeltaTableDetail on success, otherwise False.
    """
    try:
        tableDetail = spark.sql(f"DESCRIBE DETAIL '{delta_path}'").collect()
        if tableDetail[0].format == "delta" and tableDetail[0].id is not None:
            return DeltaTableDetail(delta_path, tableDetail[0].location)
    except Exception as e:
        # Best-effort probe: log the kind of failure so the cause is not lost, then
        # record the path and carry on with the remaining tables.
        print(f"Exception occurred: {delta_path} ({type(e).__name__})")
        notFoundTables.append(delta_path)
        return False

    # Query succeeded but the path is not a Delta table: record it as well, so the
    # "not found or not Delta" report is complete.
    notFoundTables.append(delta_path)
    return False


StatementMeta(, 10f70b56-aca4-4cc1-bcbf-3d2e600baeee, 17, Finished, Available)

In [None]:
# Probe all candidate paths concurrently using the light-process worker count.
max_workers = max_Workers_Lightprocess
details_by_path = runParallelTasks(getDeltaDetails, delta_paths, max_workers)

# Keep only the truthy results (paths that produced a table detail).
deltaTablesDetails = [found for found in details_by_path.values() if found]

# Report the Delta tables found first, then the paths that were not found or not Delta.
report_sections = (
    (f"All Delta Tables found: {len(deltaTablesDetails)}", [found.location for found in deltaTablesDetails]),
    (f"Not Found or Not Delta Tables: {len(notFoundTables)}", notFoundTables),
)
for heading, locations in report_sections:
    print(heading)
    for location in locations:
        print(f"Path: {location}")

##### **6. Optimize and Vacuum Execution**

In [17]:
# Build one task description per Delta table: where it lives and how many hours
# of history VACUUM should retain.
optimize_vacuum_tasks = []
for detail in deltaTablesDetails:
    optimize_vacuum_tasks.append(
        dict(location=detail.location, vacuumRetentionInHours=vacuumRetentionInHours)
    )


StatementMeta(, 10f70b56-aca4-4cc1-bcbf-3d2e600baeee, 19, Finished, Available)

In [18]:
def execOptimizeVacuum(task):
    """Run OPTIMIZE (with VORDER) and then VACUUM on a single table location.

    Args:
        task: dict with the keys "location" and "vacuumRetentionInHours".

    Returns:
        True when both statements ran without raising; False when either raised
        (the error is printed, and VACUUM is skipped if OPTIMIZE failed).

    Raises:
        KeyError: if ``task`` lacks one of the required keys.
    """
    location, retention_hours = task["location"], task["vacuumRetentionInHours"]

    try:
        print(f"Initiating OPTIMIZE on {location}")
        spark.sql(f"OPTIMIZE '{location}' VORDER")
        print(f"Completed OPTIMIZE on {location}")

        print(f"Initiating VACUUM on {location} with retention {retention_hours} hours")
        spark.sql(f"VACUUM '{location}' RETAIN {retention_hours} HOURS")
        print(f"Completed VACUUM on {location}")
    except Exception as err:
        print(f"Error occurred while processing {location}: {err}")
        return False
    else:
        return True



StatementMeta(, 10f70b56-aca4-4cc1-bcbf-3d2e600baeee, 20, Finished, Available)

In [None]:
# Running the optimize and vacuum tasks in parallel.
# ThreadPoolExecutor requires a positive worker count; the configured value may be
# fractional or zero depending on how it was computed, so clamp it to a whole number >= 1.
max_workers = max(1, int(min_Workers_Heavyprocess))

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # executor.map preserves input order, so results line up with optimize_vacuum_tasks.
    results = list(executor.map(execOptimizeVacuum, optimize_vacuum_tasks))

print("Optimize and Vacuum processes are completed.")

# execOptimizeVacuum returns False on failure; surface those tables instead of
# silently discarding the results.
failed_locations = [
    task["location"]
    for task, succeeded in zip(optimize_vacuum_tasks, results)
    if not succeeded
]
if failed_locations:
    print(f"Optimize/Vacuum failed for {len(failed_locations)} table(s):")
    for failed_location in failed_locations:
        print(f"Path: {failed_location}")