From e252bfb6b56ff22431857f5fabfa3c06f1c94872 Mon Sep 17 00:00:00 2001 From: Javier Torres Date: Tue, 23 Apr 2024 17:53:09 -0500 Subject: [PATCH 1/5] add access control metadata --- .../readers/microsoft_sharepoint/base.py | 84 +++++++++++++++++-- 1 file changed, 77 insertions(+), 7 deletions(-) diff --git a/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py b/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py index 09875f6a0c028..d1f3c39421238 100644 --- a/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py +++ b/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py @@ -203,6 +203,7 @@ def _download_files_and_extract_metadata( self, folder_id: str, download_dir: str, + current_folder_path: str, include_subfolders: bool = False, ) -> Dict[str, str]: """ @@ -237,13 +238,18 @@ def _download_files_and_extract_metadata( subfolder_metadata = self._download_files_and_extract_metadata( folder_id=item["id"], download_dir=sub_folder_download_dir, + current_folder_path=os.path.join( + current_folder_path, item["name"] + ), include_subfolders=include_subfolders, ) metadata.update(subfolder_metadata) elif "file" in item: - file_metadata = self._download_file(item, download_dir) + file_metadata = self._download_file( + item, download_dir, current_folder_path + ) metadata.update(file_metadata) return metadata else: @@ -276,6 +282,60 @@ def _download_file_by_url(self, item: Dict[str, Any], download_dir: str) -> str: return file_path + def _get_permissions_info(self, item: Dict[str, Any]) -> Dict[str, str]: + """ + Extracts the permissions information for the file. For more information, see: + https://learn.microsoft.com/en-us/graph/api/resources/permission?view=graph-rest-1.0. + + Args: + item (Dict[str, Any]): Dictionary containing file metadata. + + Returns: + Dict[str, str]: A dictionary containing the extracted permissions information. + """ + item_id = item.get("id") + permissions_info_endpoint = ( + f"{self._drive_id_endpoint}/{self._drive_id}/items/{item_id}/permissions" + ) + response = requests.get( + url=permissions_info_endpoint, + headers=self._authorization_headers, + ) + permissions = response.json() + + identity_sets = [] + for permission in permissions["value"]: + # user type permissions + granted_to = permission.get("grantedToV2", None) + if granted_to: + identity_sets.append(granted_to) + + # link type permissions + granted_to_identities = permission.get("grantedToIdentitiesV2", []) + for identity in granted_to_identities: + identity_sets.append(identity) + + # Extract the identity information from each identity set + # they can be 'application', 'device', 'user', 'group', 'siteUser' or 'siteGroup' + # 'siteUser' and 'siteGroup' are site-specific, 'group' is for Microsoft 365 groups + permissions_dict = {} + for identity_set in identity_sets: + for identity, identity_info in identity_set.items(): + id = identity_info.get("id") + display_name = identity_info.get("displayName") + ids_key = f"allowed_{identity}_ids" + display_names_key = f"allowed_{identity}_display_names" + + if ids_key not in permissions_dict: + permissions_dict[ids_key] = [] + if display_names_key not in permissions_dict: + permissions_dict[display_names_key] = [] + + permissions_dict[ids_key].append(id) + permissions_dict[display_names_key].append(display_name) + + return permissions_dict + def _extract_metadata_for_file(self, item: Dict[str, Any]) -> Dict[str, str]: """ Extracts metadata related to the file. @@ -287,21 +347,28 @@ def _extract_metadata_for_file(self, item: Dict[str, Any]) -> Dict[str, str]: - Dict[str, str]: A dictionary containing the extracted metadata. """ # Extract the required metadata for file. + metadata = self._get_permissions_info(item) + metadata.update( + { + "file_id": item.get("id"), + "file_name": item.get("name"), + "url": item.get("webUrl"), + "file_path": item.get("file_path"), + } + ) - return { - "file_id": item.get("id"), - "file_name": item.get("name"), - "url": item.get("webUrl"), - } + return metadata def _download_file( self, item: Dict[str, Any], download_dir: str, + sharepoint_folder_path: str, ): metadata = {} file_path = self._download_file_by_url(item, download_dir) + item["file_path"] = os.path.join(sharepoint_folder_path, item["name"]) metadata[file_path] = self._extract_metadata_for_file(item) return metadata @@ -341,7 +408,10 @@ def _download_files_from_sharepoint( ) return self._download_files_and_extract_metadata( - sharepoint_folder_id, download_dir, recursive + sharepoint_folder_id, + download_dir, + os.path.join(sharepoint_site_name, sharepoint_folder_path), + recursive, ) def _load_documents_with_metadata( From 4367961ef6424c0eae6925bf81848c2ced00e863 Mon Sep 17 00:00:00 2001 From: Javier Torres Date: Tue, 23 Apr 2024 18:01:17 -0500 Subject: [PATCH 2/5] version bump --- .../llama-index-readers-microsoft-sharepoint/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/pyproject.toml index 9372ddfc1884c..ed1df5d1fb109 100644 --- a/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/pyproject.toml @@ -29,7 +29,7 @@ license = "MIT" maintainers = ["arun-soliton"] name = "llama-index-readers-microsoft-sharepoint" readme = "README.md" -version = "0.1.7" +version = "0.2.1" [tool.poetry.dependencies] python = ">=3.8.1,<4.0" From 2fe4f7e2663c7b9d210b397a750ba43e2d3982cf Mon Sep 17 00:00:00 2001 From: Javier Torres Date: Wed, 24 Apr 2024 11:58:31 -0500 Subject: [PATCH 3/5] add attach_permision flag --- .../llama_index/readers/microsoft_sharepoint/base.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py b/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py index d1f3c39421238..a7a7faa713fac 100644 --- a/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py +++ b/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py @@ -31,6 +31,8 @@ class SharePointReader(BasePydanticReader): sharepoint_folder_id (Optional[str]): The ID of the SharePoint folder to download from. Overrides sharepoint_folder_path. file_extractor (Optional[Dict[str, BaseReader]]): A mapping of file extension to a BaseReader class that specifies how to convert that file to text. See `SimpleDirectoryReader` for more details. + attach_permission_metadata (bool): If True, the reader will attach permission metadata to the documents. Set to False if your vector store + only supports flat metadata (i.e. no nested fields or lists), or to avoid the additional API calls. """ client_id: str = None @@ -42,6 +44,7 @@ class SharePointReader(BasePydanticReader): file_extractor: Optional[Dict[str, Union[str, BaseReader]]] = Field( default=None, exclude=True ) + attach_permission_metadata: bool = True _authorization_headers = PrivateAttr() _site_id_with_host_name = PrivateAttr() @@ -347,7 +350,11 @@ def _extract_metadata_for_file(self, item: Dict[str, Any]) -> Dict[str, str]: - Dict[str, str]: A dictionary containing the extracted metadata. """ # Extract the required metadata for file. - metadata = self._get_permissions_info(item) + if self.attach_permission_metadata: + metadata = self._get_permissions_info(item) + else: + metadata = {} + metadata.update( { "file_id": item.get("id"), From 1828c0cdf1aa951e72c39f0aef774a60d1736b58 Mon Sep 17 00:00:00 2001 From: Javier Torres Date: Tue, 30 Apr 2024 14:04:13 -0500 Subject: [PATCH 4/5] exclude keys --- .../readers/microsoft_sharepoint/base.py | 27 ++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py b/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py index a7a7faa713fac..a4de5a20c8181 100644 --- a/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py +++ b/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/llama_index/readers/microsoft_sharepoint/base.py @@ -421,6 +421,28 @@ def _download_files_from_sharepoint( recursive, ) + def _exclude_access_control_metadata( + self, documents: List[Document] + ) -> List[Document]: + """ + Excludes the access control metadata from the documents for embedding and LLM calls. + + Args: + documents (List[Document]): A list of documents. + + Returns: + List[Document]: A list of documents with access control metadata excluded. + """ + for doc in documents: + access_control_keys = [ + key for key in doc.metadata if key.startswith("allowed_") + ] + + doc.excluded_embed_metadata_keys.extend(access_control_keys) + doc.excluded_llm_metadata_keys.extend(access_control_keys) + + return documents + def _load_documents_with_metadata( self, files_metadata: Dict[str, Any], @@ -448,7 +470,10 @@ def get_metadata(filename: str) -> Any: file_metadata=get_metadata, recursive=recursive, ) - return simple_loader.load_data() + docs = simple_loader.load_data() + if self.attach_permission_metadata: + docs = self._exclude_access_control_metadata(docs) + return docs def load_data( self, From 388057d351455b8928119dd1917906e785a96ab6 Mon Sep 17 00:00:00 2001 From: Javier Torres Date: Tue, 30 Apr 2024 14:06:06 -0500 Subject: [PATCH 5/5] version bump --- .../llama-index-readers-microsoft-sharepoint/pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/pyproject.toml index ed1df5d1fb109..1ed6491498f15 100644 --- a/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/pyproject.toml +++ b/llama-index-integrations/readers/llama-index-readers-microsoft-sharepoint/pyproject.toml @@ -29,7 +29,7 @@ license = "MIT" maintainers = ["arun-soliton"] name = "llama-index-readers-microsoft-sharepoint" readme = "README.md" -version = "0.2.1" +version = "0.2.2" [tool.poetry.dependencies] python = ">=3.8.1,<4.0"