Add unit tests for ConfluenceReader loader (#208)

Co-authored-by: Adam Quigley <aquigley@atlassian.com>
run-llama · Apr 21, 2023 · 568f941
1 parent 04267ff
commit 568f941
Show file tree

Hide file tree

Showing 5 changed files with 175 additions and 7 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,4 +1,5 @@
 *.egg-info/
 .modules
 
-**/__pycache__/
+**/__pycache__/
+.venv
diff --git a/README.md b/README.md
@@ -75,6 +75,16 @@ Finally, add your loader to the `loader_hub/library.json` file so that it may be
 Create a PR against the main branch. We typically review the PR within a day. To help expedite the process, it may be helpful to provide screenshots (either in the PR or in
 the README directly) showing your data loader in action!
 
+## Running tests
+
+```shell
+python3.9 -m venv .venv
+source .venv/bin/activate
+pip3 install -r test_requirements.txt
+
+python3 -m pytest tests
+```
+
 ## FAQ
 
 ### How do I test my loader before it's merged?

diff --git a/loader_hub/confluence/base.py b/loader_hub/confluence/base.py
@@ -71,18 +71,20 @@ def load_data(self, space_key: Optional[str] = None, page_ids: Optional[List[str
             # Don't just query all the pages since the number of pages can be very large
             # instead we can page through them
             start = 0
-            # page_limit should be min of 100 and limit
-            page_limit = min(100, limit)
             pages = []
             while True:
-                if len(pages) >= limit:
-                    break
-                pages_iter = self.confluence.get_all_pages_from_space(space_key, start=start, limit=page_limit)
+                pages_iter = self.confluence.get_all_pages_from_space(space_key, start=start, limit=limit)
+
                 if len(pages_iter) == 0:
                     break
-                start += page_limit
+
+                start += len(pages_iter)
                 pages.extend(pages_iter)
 
+                # no more to fetch
+                if len(pages_iter) < limit:
+                    break
+
             for page in pages:
                 doc = self.process_page(page, include_attachments, text_maker)
                 docs.append(doc)

diff --git a/test_requirements.txt b/test_requirements.txt
@@ -6,6 +6,8 @@ pytest-dotenv==0.5.2
 https://github.com/jerryjliu/gpt_index/archive/master.zip
 
 llama-index
+atlassian-python-api
+html2text
 
 # For linting
 # linting stubs

diff --git a/tests/tests_confluence/test_confluence_reader.py b/tests/tests_confluence/test_confluence_reader.py
@@ -0,0 +1,153 @@
+import pytest
+import unittest
+from unittest.mock import patch
+from loader_hub.confluence.base import ConfluenceReader, Document
+
+@pytest.fixture
+def mock_confluence():
+    with patch("atlassian.Confluence") as mock_confluence:
+        yield mock_confluence
+
+
+CONFLUENCE_BASE_URL = "https://example.atlassian.com/wiki"
+MOCK_OAUTH = {
+    "client_id": "your_client_id",
+    "token": {
+        "access_token": "your_access_token",
+        "token_type": "Bearer",
+    },
+}
+
+class TestConfluenceReader:
+
+    def test_confluence_reader_initialization(self, mock_confluence):
+
+        # Test with oauth2
+        ConfluenceReader(base_url=CONFLUENCE_BASE_URL, oauth2=MOCK_OAUTH)
+        mock_confluence.assert_called_once_with(url=CONFLUENCE_BASE_URL, oauth2=MOCK_OAUTH, cloud=True)
+
+        # Test without oauth2
+        with unittest.mock.patch.dict("os.environ", {"CONFLUENCE_USERNAME": "user", "CONFLUENCE_API_TOKEN": "api_token"}):
+            ConfluenceReader(base_url=CONFLUENCE_BASE_URL)
+            mock_confluence.assert_called_with(url=CONFLUENCE_BASE_URL, username="user", password="api_token", cloud=True)
+
+    def test_confluence_reader_load_data_invalid_args(self, mock_confluence):
+        confluence_reader = ConfluenceReader(base_url=CONFLUENCE_BASE_URL, oauth2=MOCK_OAUTH)
+        confluence_reader.confluence = mock_confluence
+
+        with pytest.raises(ValueError, match="Must specify at least one among `space_key`, `page_ids`, `label`, `cql` parameters."):
+            confluence_reader.load_data()
+
+
+    def test_confluence_reader_load_data_by_page_ids(self, mock_confluence):
+        mock_confluence.get_page_by_id.side_effect = [
+            {'id': '123', 'title': 'Page 123', 'body': {'storage': {'value': '<p>Content 123</p>'}}},
+            {'id': '456', 'title': 'Page 456', 'body': {'storage': {'value': '<p>Content 456</p>'}}}
+        ]
+
+        confluence_reader = ConfluenceReader(base_url=CONFLUENCE_BASE_URL, oauth2=MOCK_OAUTH)
+        confluence_reader.confluence = mock_confluence
+
+        mock_page_ids = ['123', '456']
+        documents = confluence_reader.load_data(page_ids=mock_page_ids)
+
+        assert len(documents) == 2
+        assert all(isinstance(doc, Document) for doc in documents)
+        assert documents[0].doc_id == "123"
+        assert documents[0].extra_info == { "title": "Page 123" }
+        assert documents[1].doc_id == "456"
+        assert documents[1].extra_info == { "title": "Page 456" }
+
+        assert mock_confluence.get_page_by_id.call_count == 2
+
+        assert mock_confluence.get_all_pages_from_space.call_count == 0
+        assert mock_confluence.get_all_pages_by_label.call_count == 0
+        assert mock_confluence.cql.call_count == 0
+        assert mock_confluence.get_page_child_by_type.call_count == 0
+
+    def test_confluence_reader_load_data_by_space_id(self, mock_confluence):
+        # one response with two pages
+        mock_confluence.get_all_pages_from_space.return_value = [
+            {
+                'id': '123',
+                'type': 'page',
+                'status': 'current',
+                'title': 'Page 123',
+                'body': {'storage': {'value': '<p>Content 123</p>'}}
+            },
+            {
+                'id': '456',
+                'type': 'page',
+                'status': 'current',
+                'title': 'Page 456',
+                'body': {'storage': {'value': '<p>Content 456</p>'}}
+            }
+        ]
+
+        confluence_reader = ConfluenceReader(base_url=CONFLUENCE_BASE_URL, oauth2=MOCK_OAUTH)
+        confluence_reader.confluence = mock_confluence
+
+        mock_space_key = 'spaceId123'
+        documents = confluence_reader.load_data(space_key=mock_space_key)
+
+        assert mock_confluence.get_all_pages_from_space.call_count == 1
+        assert mock_confluence.get_all_pages_from_space.call_args[0][0] == 'spaceId123'
+        assert mock_confluence.get_all_pages_from_space.call_args[1]['start'] == 0
+        assert mock_confluence.get_all_pages_from_space.call_args[1]['limit'] == 50
+
+        assert len(documents) == 2
+        assert all(isinstance(doc, Document) for doc in documents)
+        assert documents[0].doc_id == "123"
+        assert documents[0].extra_info == { "title": "Page 123" }
+        assert documents[1].doc_id == "456"
+        assert documents[1].extra_info == { "title": "Page 456" }
+
+        assert mock_confluence.get_page_by_id.call_count == 0
+        assert mock_confluence.get_all_pages_by_label.call_count == 0
+        assert mock_confluence.cql.call_count == 0
+        assert mock_confluence.get_page_child_by_type.call_count == 0
+
+    def test_confluence_reader_load_data_by_space_id_pagination(self, mock_confluence):
+        # two api responses with one page each
+        mock_confluence.get_all_pages_from_space.side_effect = [
+            [
+                {
+                'id': '123',
+                'type': 'page',
+                'status': 'current',
+                'title': 'Page 123',
+                'body': {'storage': {'value': '<p>Content 123</p>'}}
+                },
+            ],
+            [
+                {
+                'id': '456',
+                'type': 'page',
+                'status': 'current',
+                'title': 'Page 456',
+                'body': {'storage': {'value': '<p>Content 456</p>'}}
+                }
+            ],
+            []
+        ]
+
+        confluence_reader = ConfluenceReader(base_url=CONFLUENCE_BASE_URL, oauth2=MOCK_OAUTH)
+        confluence_reader.confluence = mock_confluence
+
+        mock_space_key = 'spaceId123'
+        mock_limit = 1 # fetch one page at a time
+        documents = confluence_reader.load_data(space_key=mock_space_key, limit=mock_limit)
+
+        assert mock_confluence.get_all_pages_from_space.call_count == 3
+
+        assert len(documents) == 2
+        assert all(isinstance(doc, Document) for doc in documents)
+        assert documents[0].doc_id == "123"
+        assert documents[0].extra_info == { "title": "Page 123" }
+        assert documents[1].doc_id == "456"
+        assert documents[1].extra_info == { "title": "Page 456" }
+
+        assert mock_confluence.get_page_by_id.call_count == 0
+        assert mock_confluence.get_all_pages_by_label.call_count == 0
+        assert mock_confluence.cql.call_count == 0
+        assert mock_confluence.get_page_child_by_type.call_count == 0