In [None]:
# ! pip install Office365-REST-Python-Client

In [None]:
# ! pip install azure-storage-file-datalake

In [None]:
import json
import os
import time
from urllib.parse import quote

import pandas as pd
import requests
from azure.storage.filedatalake import DataLakeServiceClient
from office365.runtime.auth.authentication_context import AuthenticationContext
from office365.sharepoint.client_context import ClientContext
from office365.sharepoint.files.file import File

In [None]:
class SharepointToAzureBlobConnector:
    """Copy files from a SharePoint site into Azure Data Lake Storage (user login).

    SharePoint is accessed through Office365-REST-Python-Client with a
    username/password login; uploads go through ``DataLakeServiceClient``
    authenticated with a storage-account access key.

    Every public method reports problems by printing them instead of raising,
    so a notebook run continues past a failed step.

    NOTE(review): a later cell in this notebook defines another class with the
    same name; once that cell runs, this definition is no longer reachable by
    name.
    """

    def __init__(self, username, password, sharepoint_subdomain, sharepoint_site_name, azure_account_name, azure_access_key, container_name):
        """Store the credentials and derive the SharePoint URLs.

        Args:
            username: SharePoint login name.
            password: SharePoint password.
            sharepoint_subdomain: The ``<subdomain>`` part of
                ``https://<subdomain>.sharepoint.com``.
            sharepoint_site_name: Site name used in ``/sites/<site>/``.
            azure_account_name: Storage account name used to build the
                ``.dfs.core.windows.net`` endpoint.
            azure_access_key: Credential passed to ``DataLakeServiceClient``.
            container_name: File system (container) that receives the uploads.
        """
        self.username = username
        self.password = password
        self.sharepoint_site_name = sharepoint_site_name
        # Tenant root URL; stored for callers, not read by the methods below.
        self.url = f"https://{sharepoint_subdomain}.sharepoint.com"
        # Site URL used for authentication and as the client context base.
        self.site = f"{self.url}/sites/{self.sharepoint_site_name}/"
        self.azure_storage_account_name = azure_account_name
        self.azure_storage_access_key = azure_access_key
        self.container_name = container_name

    def create_sharepoint_authentication(self):
        """Log in to SharePoint and return a ready-to-use ``ClientContext``.

        Returns:
            The authenticated ``ClientContext``, or ``None`` when the login is
            rejected or an exception occurs (the reason is printed).
        """
        ctx_auth = AuthenticationContext(self.site)
        try:
            if not ctx_auth.acquire_token_for_user(self.username, self.password):
                print("Failed to authenticate.")
                return None

            CTX = ClientContext(self.site, ctx_auth)
            # Loading the web object forces a round trip, proving the login works.
            web = CTX.web
            CTX.load(web)
            CTX.execute_query()
            print('Authenticated into SharePoint as:', web.properties['Title'])
            return CTX
        except Exception as e:
            print(f"Error during SharePoint authentication:\n{e}\n{ctx_auth.get_last_error()}")
            return None

    def sharepoint_folder_details(self, folder_in_sharepoint):
        """Print and return the names of the files inside a SharePoint folder.

        Args:
            folder_in_sharepoint: Folder URL handed to
                ``get_folder_by_server_relative_url``.

        Returns:
            A list of file names, or ``None`` when authentication or the
            lookup fails (the reason is printed).
        """
        try:
            CTX = self.create_sharepoint_authentication()
            if CTX is None:
                # Authentication already printed its own diagnostics.
                print("Error retrieving folder details:\nSharePoint authentication failed.")
                return None

            folder = CTX.web.get_folder_by_server_relative_url(folder_in_sharepoint)
            # ``folder.files`` holds the files of the folder, not its sub-folders.
            files = folder.files
            CTX.load(files)
            CTX.execute_query()

            folder_names = [sp_file.properties["Name"] for sp_file in files]
            print(f"Folder details:\n{folder_names}")
            return folder_names
        except Exception as e:
            print(f"Error retrieving folder details:\n{e}")
            return None

    def create_azure_datalake_authentication(self):
        """Build a ``DataLakeServiceClient`` for the configured storage account.

        Returns:
            The service client, or ``None`` if constructing it raised (the
            error is printed).
        """
        try:
            # Initialize the DataLakeServiceClient
            service_client = DataLakeServiceClient(
                account_url=f"https://{self.azure_storage_account_name}.dfs.core.windows.net",
                credential=self.azure_storage_access_key
            )
            print("Azure Data Lake Service Client authenticated successfully.")
            return service_client
        except Exception as e:
            print(f"Error during Azure Data Lake authentication:\n{e}")
            return None

    def load_sharepoint_file_to_azure_blob(self, sharepoint_file_path: str, azure_directory_name: str):
        """Download one SharePoint file and write it into the data lake.

        The upload is skipped when SharePoint authentication fails, when the
        download does not answer with HTTP 200, or when the data lake client
        cannot be created, so an error body is never stored as the file.

        Args:
            sharepoint_file_path: Path of the file below ``/sites/<site>/``,
                e.g. ``"Shared Documents/Folder/file.xlsx"``.
            azure_directory_name: Target directory inside the container.

        Returns:
            None. Progress and errors are printed.
        """
        try:
            # Create file parameters
            file_path = f"/sites/{self.sharepoint_site_name}/{sharepoint_file_path}"
            file_name = file_path.split("/")[-1]
            print(f"\nfile_path:\n{file_path}")
            print(f"file_name:\n{file_name}\n")

            # Load SharePoint file
            CTX = self.create_sharepoint_authentication()
            if CTX is None:
                print("Error during file upload:\nSharePoint authentication failed.")
                return

            file_response = File.open_binary(CTX, file_path)
            if file_response.status_code != 200:
                # Stop here: the response body is an error payload, not the file.
                print(f"Status: {file_response.status_code}, Request failed to sharepoint\n")
                return
            print("Status: 200,\nRequest to sharepoint was successfull!\n")

            # Authenticate Data Lake service client
            service_client = self.create_azure_datalake_authentication()
            if service_client is None:
                print("Error during file upload:\nAzure Data Lake authentication failed.")
                return

            file_system_client = service_client.get_file_system_client(file_system=self.container_name)
            directory_client = file_system_client.get_directory_client(azure_directory_name)

            # Create the file, append the whole payload at offset 0, then commit it.
            file_client = directory_client.create_file(file_name)
            file_content = file_response.content
            file_client.append_data(data=file_content, offset=0, length=len(file_content))
            file_client.flush_data(len(file_content))

            print("File successfully uploaded to Azure Data Lake Storage!")
        except Exception as e:
            print(f"Error during file upload:\n{e}")

In [None]:
# Build the username/password connector.
# Secrets come from environment variables so that no credential is stored in the
# notebook; os.environ[...] raises KeyError when a variable is not set.
SharepointToAzureConnectorClass = SharepointToAzureBlobConnector(
    username=os.environ["SHAREPOINT_USERNAME"],
    password=os.environ["SHAREPOINT_PASSWORD"],
    sharepoint_subdomain=os.environ["SHAREPOINT_SUBDOMAIN"],
    sharepoint_site_name=os.environ["SHAREPOINT_SITE_NAME"],
    # The fallback is the redacted placeholder from this cell; set the variable
    # (or edit the literal) to the real storage account name.
    azure_account_name=os.environ.get("AZURE_STORAGE_ACCOUNT_NAME", "adlsbauapp_xxxx"),
    azure_access_key=os.environ["AZURE_STORAGE_ACCESS_KEY"],
    container_name="bronze",
)

# List the files in the source folder to confirm the exact file names.
SharepointToAzureConnectorClass.sharepoint_folder_details(
    folder_in_sharepoint="Shared Documents/AzureBlobStorageConnection"
)

# Load file from sharepoint to Azure blob
SharepointToAzureConnectorClass.load_sharepoint_file_to_azure_blob(
    sharepoint_file_path="Shared Documents/AzureBlobStorageConnection/Fleet Mgmt Master Mapping Wkst.xlsx",
    azure_directory_name="sharepoint",
)

### Using a client ID and client secret with the Microsoft Graph API

In [None]:
class SharepointToAzureBlobConnector:
    """Copy files from a SharePoint document library into Azure Data Lake Storage.

    SharePoint is reached through the Microsoft Graph REST API using the OAuth2
    client-credentials grant (tenant ID, client ID, client secret). Uploads go
    through ``DataLakeServiceClient`` authenticated with a storage access key.

    The Graph token, the site ID and the drive ID are cached on the instance so
    repeated calls do not repeat the lookups.

    NOTE(review): an earlier cell in this notebook defines a class with the same
    name; running this cell replaces that definition in the kernel.
    """

    GRAPH_BASE_URL = "https://graph.microsoft.com/v1.0"
    # Document library preferred when a site exposes several drives.
    DEFAULT_DRIVE_NAME = "Documents"
    # Passed as ``timeout=`` to every HTTP call so a stalled connection cannot hang the kernel.
    REQUEST_TIMEOUT_SECONDS = 60
    # A cached token is renewed this many seconds before its reported expiry.
    TOKEN_REFRESH_MARGIN_SECONDS = 60

    def __init__(self, tenant_id, client_id, client_secret, sharepoint_subdomain, sharepoint_site_name,
                 azure_account_name, azure_access_key, container_name):
        """Store the configuration; no network call is made here.

        Args:
            tenant_id: Directory (tenant) ID used in the token endpoint URL.
            client_id: Application (client) ID.
            client_secret: Application secret.
            sharepoint_subdomain: The ``<subdomain>`` part of
                ``<subdomain>.sharepoint.com``.
            sharepoint_site_name: Site name used in ``/sites/<site>``.
            azure_account_name: Storage account name used to build the
                ``.dfs.core.windows.net`` endpoint.
            azure_access_key: Credential passed to ``DataLakeServiceClient``.
            container_name: File system (container) that receives the uploads.
        """
        self.tenant_id = tenant_id
        self.client_id = client_id
        self.client_secret = client_secret
        self.sharepoint_subdomain = sharepoint_subdomain
        self.sharepoint_site_name = sharepoint_site_name
        self.azure_storage_account_name = azure_account_name
        self.azure_storage_access_key = azure_access_key
        self.container_name = container_name

        # cache Graph auth + site/drive lookup
        self._access_token = None
        self._token_expires_at = None
        self._site_id = None
        self._drive_id = None

    # ------------------------------------------------------------------ GRAPH AUTH
    def get_graph_token(self):
        """Return a Graph access token, requesting a new one when needed.

        A cached token is reused until shortly before the expiry derived from
        the ``expires_in`` field of the token response; when that field is
        absent the token is reused indefinitely.

        Raises:
            requests.HTTPError: If the token endpoint answers with an error status.
        """
        cached_token = getattr(self, "_access_token", None)
        expires_at = getattr(self, "_token_expires_at", None)
        if cached_token and (expires_at is None or time.time() < expires_at):
            return cached_token

        url = f"https://login.microsoftonline.com/{self.tenant_id}/oauth2/v2.0/token"
        payload = {
            "client_id": self.client_id,
            "client_secret": self.client_secret,
            "scope": "https://graph.microsoft.com/.default",
            "grant_type": "client_credentials"
        }
        r = requests.post(url, data=payload, timeout=self.REQUEST_TIMEOUT_SECONDS)
        r.raise_for_status()
        token_response = r.json()

        self._access_token = token_response["access_token"]
        expires_in = token_response.get("expires_in")
        if expires_in:
            self._token_expires_at = time.time() + int(expires_in) - self.TOKEN_REFRESH_MARGIN_SECONDS
        else:
            self._token_expires_at = None
        return self._access_token

    def get_access_token(self):
        """Alias of :meth:`get_graph_token`, kept for existing callers."""
        return self.get_graph_token()

    # ------------------------------------------------------------------ GRAPH HELPERS
    def _graph_get(self, url):
        """GET ``url`` with the bearer token and raise on an HTTP error status."""
        headers = {"Authorization": f"Bearer {self.get_graph_token()}"}
        resp = requests.get(url, headers=headers, timeout=self.REQUEST_TIMEOUT_SECONDS)
        resp.raise_for_status()
        return resp

    def _graph_get_collection(self, url):
        """Return every item of a Graph collection, following ``@odata.nextLink`` pages."""
        items = []
        while url:
            page = self._graph_get(url).json()
            items.extend(page["value"])
            url = page.get("@odata.nextLink")
        return items

    def _fetch_drives(self):
        """Return the drive (document library) records of the site without printing."""
        return self._graph_get_collection(f"{self.GRAPH_BASE_URL}/sites/{self.get_site_id()}/drives")

    def _pick_drive_id(self, drives):
        """Choose the drive named ``DEFAULT_DRIVE_NAME``, else the first drive listed."""
        if not drives:
            raise LookupError("No document libraries (drives) found for the SharePoint site.")
        for drive in drives:
            if drive.get("name") == self.DEFAULT_DRIVE_NAME:
                return drive["id"]
        return drives[0]["id"]

    def _drive_item_url(self, relative_path, suffix):
        """Build the Graph URL for ``suffix`` (``content``/``children``) of a drive item.

        ``relative_path`` is relative to the root of the document library. It is
        percent-encoded so characters such as ``#`` or ``?`` in a file name do
        not truncate the URL. An empty path addresses the drive root itself.
        """
        site_id, drive_id = self.resolve_site_and_drive()
        root = f"{self.GRAPH_BASE_URL}/sites/{site_id}/drives/{drive_id}/root"
        cleaned = (relative_path or "").strip("/")
        if not cleaned:
            return f"{root}/{suffix}"
        return f"{root}:/{quote(cleaned)}:/{suffix}"

    # ------------------------------------------------------------------ GRAPH SITE + DRIVE
    def get_site_id(self):
        """Return the Graph site ID of the configured SharePoint site (cached)."""
        if not getattr(self, "_site_id", None):
            url = (f"{self.GRAPH_BASE_URL}/sites/"
                   f"{self.sharepoint_subdomain}.sharepoint.com:/sites/{self.sharepoint_site_name}")
            self._site_id = self._graph_get(url).json()["id"]
        return self._site_id

    def list_drives(self):
        """Print and return the drives (document libraries) of the site."""
        drives = self._fetch_drives()
        for d in drives:
            print(f"Drive name: {d['name']}  |  ID: {d['id']}")
        return drives

    def resolve_site_and_drive(self):
        """Return ``(site_id, drive_id)``, looking both up on first use.

        Raises:
            LookupError: If the site exposes no drives at all.
            requests.HTTPError: If a Graph request fails.
        """
        site_id = self.get_site_id()
        if not getattr(self, "_drive_id", None):
            self._drive_id = self._pick_drive_id(self._fetch_drives())
        return site_id, self._drive_id

    # ------------------------------------------------------------------ GRAPH FILES
    def download_file_from_sharepoint(self, sharepoint_file_path: str) -> bytes:
        """Download a file and return its bytes.

        Args:
            sharepoint_file_path: Path relative to the root of the document
                library, e.g. ``"Folder/file.xlsx"``.

        Raises:
            requests.HTTPError: If Graph answers with an error status.
        """
        return self._graph_get(self._drive_item_url(sharepoint_file_path, "content")).content

    def list_sharepoint_folder(self, folder_path: str = ""):
        """Print and return the children of a folder in the document library.

        Args:
            folder_path: Folder path relative to the library root; empty for
                the root itself.

        Returns:
            The list of item records returned by Graph, across all pages.
        """
        items = self._graph_get_collection(self._drive_item_url(folder_path, "children"))
        for item in items:
            print(("folder: " if item.get("folder") else "No folder"), item["name"])
        return items

    # ------------------------------------------------------------------ ADLS AUTH
    def create_azure_datalake_authentication(self):
        """Build and return a ``DataLakeServiceClient`` for the configured account."""
        service_client = DataLakeServiceClient(
            account_url=f"https://{self.azure_storage_account_name}.dfs.core.windows.net",
            credential=self.azure_storage_access_key
        )
        print("Azure Data Lake Service Client authenticated successfully.")
        return service_client

    # ------------------------------------------------------------------ MAIN TRANSFER
    def load_sharepoint_file_to_azure_blob(self, relative_file_path, azure_directory_name, return_file_name=False):
        """Download one file from SharePoint and write it into the data lake.

        Errors are printed rather than raised so that a loop over several files
        continues with the next one.

        Args:
            relative_file_path: File path relative to the document library root.
            azure_directory_name: Target directory inside the container.
            return_file_name: When true, print and return the uploaded file name.

        Returns:
            The file name when ``return_file_name`` is true and the upload
            succeeded; otherwise ``None``.
        """
        try:
            file_content = self.download_file_from_sharepoint(relative_file_path)

            service_client = self.create_azure_datalake_authentication()
            fs_client = service_client.get_file_system_client(file_system=self.container_name)
            dir_client = fs_client.get_directory_client(azure_directory_name)

            # Create the file, append the whole payload at offset 0, then commit it.
            file_name = relative_file_path.split("/")[-1]
            file_client = dir_client.create_file(file_name)
            file_client.append_data(file_content, offset=0, length=len(file_content))
            file_client.flush_data(len(file_content))
            print("File uploaded successfully")

            if return_file_name:
                print(file_name)
                return file_name
            return None

        except Exception as e:
            print(f"Error during file upload:\n{e}")
            return None

In [None]:
# Location of the JSON configuration file inside the "config" container.
# NOTE(review): META_STORAGE_ACCOUNTS, CONFIG_FILE, ENV_NAME, STORAGE_CONTAINER and
# `spark` are not defined in this notebook — presumably supplied by the runtime or
# a parent notebook; confirm before a fresh-kernel run.
config_path = f'abfss://config@{META_STORAGE_ACCOUNTS}.dfs.core.windows.net/{CONFIG_FILE}'

# Spark yields one row per line of the file; stitch the lines back into a single
# JSON document before parsing.
json_df = spark.read.text(config_path)
config_json_str = "".join([row.value for row in json_df.collect()])
config = json.loads(config_json_str)

# Environment-specific section name: "APP" followed by ENV_NAME minus its first
# three characters.
app_key = f"APP{ENV_NAME[3:]}"
app_registration = config["azure_app_directory"][app_key]
storage_settings = config["azure_blob_storage"][app_key]
sharepoint_settings = config["sharepoint"]

# Build the Graph-based connector from the loaded configuration.
connector = SharepointToAzureBlobConnector(
    tenant_id=config["azure_app_directory"]["tenantId"],
    client_id=app_registration["clientId"],
    client_secret=app_registration["client_secret"],
    sharepoint_subdomain=sharepoint_settings["subdomain"],
    sharepoint_site_name=sharepoint_settings["site"],
    azure_account_name=storage_settings["storage_account_name"],
    azure_access_key=storage_settings["storage_account_access"],
    container_name=STORAGE_CONTAINER,
)

In [None]:
# Load file from sharepoint to Azure blob
# Paths are relative to the root of the document library, because the connector
# places them after "root:/" in the Graph URL.
sharepoint_file = [
    "AzureBlobStorageConnection/Fleet Mgmt Master Mapping Wkst.xlsx",
    "AzureBlobStorageConnection/CDK LightSpeed Mapping.xlsx",
]

# The keyword must be lowercase `for`; a capitalised `For` is a SyntaxError.
for file_name in sharepoint_file:
    connector.load_sharepoint_file_to_azure_blob(
        relative_file_path=file_name,
        azure_directory_name=STORAGE_FOLDER,
        return_file_name=True,
    )