## Install Required Python Modules
*TODO*: Use `requirements.txt` and configure it at the server (Spark pool) level.

In [None]:
# This step was moved to pool-level package installation using "requirements.txt":
#   az synapse spark pool update with --library-requirements requirements.txt
# Concern: that statement takes a long time (~20 mins) to return its status.
# The earlier in-notebook install is kept below inside a string literal, so
# none of it is executed when this cell runs.
"""
pip install azure-storage-file-datalake
import pkg_resources
for d in pkg_resources.working_set:
     print(d)
"""


In [None]:
import json
from collections import defaultdict
from datetime import datetime, timezone

from azure.storage.filedatalake import DataLakeServiceClient
from notebookutils import mssparkutils


## Define Parameters to Enable connection to Storage

In [None]:
# Lookup from an octal permission digit (0-7) to its symbolic "rwx" short form,
# which is the form used when building ACL entries.
permissions_map = {0: "---", 1: "--x", 2: "-w-", 3: "-wx", 4: "r--", 5: "r-x", 6: "rw-", 7: "rwx"}
# UTC timestamp of this run, formatted as e.g. "2024-01-31T12:00:00Z".
# `datetime.utcnow()` is deprecated since Python 3.12 and returns a naive
# datetime; an explicitly UTC-aware "now" produces the same formatted string.
current_ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
# Parameters
# config_file = './config/sampleconfig.json' # use adls location
keyvault_ls_name = "keyvault_ls_secrets"  # linked service used to read secrets
storage_acct = "syngudast101"  # storage account name
data_container = "datalake"  # container whose directories receive the ACLs
data_path_prefix = ""  # prepended to every "<year>/<month>/" partition path

# Secrets based values
storage_access_key = mssparkutils.credentials.getSecretWithLS(keyvault_ls_name, "config-storage-access-key-new")
storage_acct_connection = f"DefaultEndpointsProtocol=https;AccountName={storage_acct};AccountKey={storage_access_key};EndpointSuffix=core.windows.net"


## Functions to read data from ADLS and update ACLs

In [None]:
# https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-directory-file-acl-python
def initialize_storage_account(storage_account_name, storage_account_key):
    """Create the Data Lake service client and store it in the global ``service_client``.

    Args:
        storage_account_name: Name of the storage account; used to build the
            ``https://<name>.dfs.core.windows.net`` endpoint URL.
        storage_account_key: Credential passed to ``DataLakeServiceClient``.

    Returns:
        The newly created client, which is also bound to the module-level
        ``service_client`` that ``download_file_from_directory`` reads.

    Raises:
        Exception: Any error raised while constructing the client is printed
            and then re-raised, so a stale or undefined ``service_client`` is
            never returned to the caller.
    """
    global service_client

    # Build the endpoint from the argument rather than from the notebook-level
    # `storage_acct` variable, so the function honours what the caller passes.
    account_url = f"https://{storage_account_name}.dfs.core.windows.net"

    try:
        service_client = DataLakeServiceClient(account_url=account_url, credential=storage_account_key)
    except Exception as e:
        print(e)
        raise

    return service_client


def download_file_from_directory(container, directory, file_name):
    """Download a JSON file through the global ``service_client`` and parse it.

    Args:
        container: File system (container) name.
        directory: Directory path inside the container.
        file_name: Name of the file within ``directory``.

    Returns:
        The value produced by ``json.loads`` on the file's contents, or
        ``None`` when navigating to or downloading the file raised (the error
        is printed in that case).
    """
    try:
        downloader = (
            service_client.get_file_system_client(file_system=container)
            .get_directory_client(directory)
            .get_file_client(file_name)
            .download_file()
        )
    except Exception as e:
        # Download problems are reported but not propagated.
        print(e)
        return None

    # Parsing happens outside the try, so malformed JSON still raises.
    return json.loads(downloader.readall())


# https://learn.microsoft.com/en-us/azure/storage/blobs/data-lake-storage-acl-python
def update_permission_recursively(service_client, container_name, directory_path, is_default_scope, user_type, user_id, permissions):
    """Apply one ACL entry to a directory recursively and print the resulting ACL.

    Args:
        service_client: Client exposing ``get_file_system_client``.
        container_name: File system (container) holding the directory.
        directory_path: Path of the directory to update.
        is_default_scope: When truthy, the entry is prefixed with ``default:``.
        user_type: First field of the ACL entry (the notebook passes ``'group'``).
        user_id: Second field of the ACL entry.
        permissions: Third field of the ACL entry, e.g. ``"r-x"``.

    Returns:
        None. Any exception raised along the way is printed, not propagated.
    """
    try:
        target_dir = service_client.get_file_system_client(
            file_system=container_name
        ).get_directory_client(directory_path)

        scope_prefix = "default:" if is_default_scope else ""
        acl_entry = f"{scope_prefix}{user_type}:{user_id}:{permissions}"

        target_dir.update_access_control_recursive(acl=acl_entry)

        # Read the ACL back from the directory to show what is now in effect.
        current_acl = target_dir.get_access_control()
        print(f"Permissions for {user_type}:{user_id} are:\n{current_acl['acl']}")
    except Exception as e:
        print(e)




## Read Config from ADLS

In [None]:
# Create the global Data Lake client, then load config.json from the root of
# the "config" container as a parsed JSON object.
initialize_storage_account(storage_acct, storage_access_key)
config = download_file_from_directory("config", "/", "config.json")

# download_file_from_directory prints the error and returns None when the
# download fails; stop here with a clear message instead of failing later with
# an opaque "'NoneType' object is not subscriptable".
if config is None:
    raise RuntimeError("Could not load config.json from the 'config' container; see the error printed above.")

## Evaluate overall ACLs needed in short form

In [None]:
# Assumption: Config contains all the perms needed for a given location. Incremental changes are not allowed.
# Evaluate the effective permissions requested.
# ad_perms maps (partition_path, group) -> octal permission digit (0-7).
ad_perms = defaultdict(int)
ad_set = set()
config_check_errors = []
for p_info in config["datalakeProperties"]:
    # Stamp every entry with this run's time; the config is written back later.
    p_info["lastUpdatedDatalake"] = current_ts
    partition = f"{p_info['year']}/{p_info['month']}"
    partition_path = f"{data_path_prefix}{partition}/"

    for perm in p_info["aclPermissions"]:
        for grp in perm["groups"]:
            ad_set.add(grp)
            a_type = perm["type"]
            # Combine with bitwise OR rather than addition: if a group is listed
            # more than once for the same type and partition, adding would
            # produce a wrong digit (write + write == read) or one outside 0-7.
            if a_type == "read":
                ad_perms[(partition_path, grp)] |= 4
            elif a_type == "write":
                ad_perms[(partition_path, grp)] |= 2
            elif a_type == "execute":
                ad_perms[(partition_path, grp)] |= 1
            else:
                config_check_errors.append(f"Invalid acl type value :'{a_type}' specifed for partition '{partition}' . Acl Type must be one among ['read', 'write', 'execute']")

# Show the computed (partition path, group) -> permission digit mapping.
print(ad_perms)


# Gather list of ADs and their ids - ids are needed for granting ACLs
# One Option - Requires APP ID - https://github.com/AzureAD/microsoft-authentication-library-for-python
# for now reading from Vault
ad_map = {ad: mssparkutils.credentials.getSecretWithLS(keyvault_ls_name, ad) for ad in ad_set}
# NOTE(review): this echoes values read from the key vault into the notebook
# output - confirm that is acceptable for these ids.
print(ad_map)

## Grant ACLs Recursively

In [None]:
# Assumption: ACL Grant statements are run after data copy step is complete. Otherwise we will run into `The specified path does not exist` errors.
# Apply each computed permission digit to its partition path for the group's id.
for (part_path, ad_name), perm_digit in ad_perms.items():
    if ad_name not in ad_map:
        # No id is known for this group: record the problem and move on.
        config_check_errors.append(f"{ad_name} is not a valid ActiveDirectory Group.")
        continue
    update_permission_recursively(
        service_client,
        data_container,
        part_path,
        False,
        'group',
        ad_map[ad_name],
        permissions_map[perm_digit],
    )

# Report every collected config problem at once, after the grants were attempted.
if config_check_errors:
    raise ValueError(f"Config file check failed. Errors are: {config_check_errors}")
print("ACL Statements generation and Active Directory Check Complete.")



## Update Config file with latest run time
- rename with timestamp and create a new file

In [None]:
# mssparkutils.fs.help()
# Archive the current config under a timestamped name, then write the in-memory
# config (with "lastUpdatedDatalake" refreshed) back to the original path.
# The account name comes from `storage_acct` instead of being repeated here, so
# the paths follow the parameter defined with the other storage settings.
config_root = f"abfss://config@{storage_acct}.dfs.core.windows.net"
source_config = f"{config_root}/config.json"
backup_config = f"{config_root}/config_{current_ts}.json"
mssparkutils.fs.mv(source_config, backup_config, overwrite=True)
mssparkutils.fs.put(source_config, json.dumps(config), overwrite=True)
