Skip to content
This repository has been archived by the owner on Mar 1, 2024. It is now read-only.

Commit

Permalink
Add unit tests for ConfluenceReader loader (#208)
Browse files Browse the repository at this point in the history

Co-authored-by: Adam Quigley <aquigley@atlassian.com>
  • Loading branch information
adamjq and Adam Quigley committed Apr 21, 2023
1 parent 04267ff commit 568f941
Show file tree
Hide file tree
Showing 5 changed files with 175 additions and 7 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
*.egg-info/
.modules

**/__pycache__/
**/__pycache__/
.venv
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,16 @@ Finally, add your loader to the `loader_hub/library.json` file so that it may be
Create a PR against the main branch. We typically review the PR within a day. To help expedite the process, it may be helpful to provide screenshots (either in the PR or in
the README directly) showing your data loader in action!

## Running tests

```shell
python3.9 -m venv .venv
source .venv/bin/activate
pip3 install -r test_requirements.txt

python3 -m pytest tests
```

## FAQ

### How do I test my loader before it's merged?
Expand Down
14 changes: 8 additions & 6 deletions loader_hub/confluence/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,18 +71,20 @@ def load_data(self, space_key: Optional[str] = None, page_ids: Optional[List[str
# Don't just query all the pages since the number of pages can be very large
# instead we can page through them
start = 0
# page_limit should be min of 100 and limit
page_limit = min(100, limit)
pages = []
while True:
if len(pages) >= limit:
break
pages_iter = self.confluence.get_all_pages_from_space(space_key, start=start, limit=page_limit)
pages_iter = self.confluence.get_all_pages_from_space(space_key, start=start, limit=limit)

if len(pages_iter) == 0:
break
start += page_limit

start += len(pages_iter)
pages.extend(pages_iter)

# no more to fetch
if len(pages_iter) < limit:
break

for page in pages:
doc = self.process_page(page, include_attachments, text_maker)
docs.append(doc)
Expand Down
2 changes: 2 additions & 0 deletions test_requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ pytest-dotenv==0.5.2
https://github.com/jerryjliu/gpt_index/archive/master.zip

llama-index
atlassian-python-api
html2text

# For linting
# linting stubs
Expand Down
153 changes: 153 additions & 0 deletions tests/tests_confluence/test_confluence_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
import pytest
import unittest
from unittest.mock import patch
from loader_hub.confluence.base import ConfluenceReader, Document

@pytest.fixture
def mock_confluence():
    """Yield a mock that replaces ``atlassian.Confluence`` while a test runs.

    The patch is entered before the test body executes and is undone when
    the fixture generator is resumed after the test finishes.
    """
    confluence_patcher = patch("atlassian.Confluence")
    with confluence_patcher as patched_confluence:
        yield patched_confluence


# Base URL handed to every ConfluenceReader built in this module.
CONFLUENCE_BASE_URL = "https://example.atlassian.com/wiki"

# Placeholder OAuth2 settings; the values are dummies and are only ever
# passed to the patched Confluence client.
MOCK_OAUTH = dict(
    client_id="your_client_id",
    token=dict(
        access_token="your_access_token",
        token_type="Bearer",
    ),
)

class TestConfluenceReader:
    """Unit tests for ``ConfluenceReader`` run against a mocked Confluence client.

    Every test receives the ``mock_confluence`` fixture, which patches
    ``atlassian.Confluence``.  The loading tests additionally assign that mock
    to ``reader.confluence`` so the canned responses configured on it are the
    ones the reader sees.
    """

    # Client methods that a given loading strategy is expected NOT to touch.
    _OTHER_LOOKUPS_FOR_SPACE = (
        "get_page_by_id",
        "get_all_pages_by_label",
        "cql",
        "get_page_child_by_type",
    )
    _OTHER_LOOKUPS_FOR_PAGE_IDS = (
        "get_all_pages_from_space",
        "get_all_pages_by_label",
        "cql",
        "get_page_child_by_type",
    )

    @staticmethod
    def _page(page_id, listed_in_space=False):
        """Build a fake page payload for ``page_id``.

        Args:
            page_id: Identifier used for the ``id``, title and body content.
            listed_in_space: When true, also include the ``type`` and
                ``status`` fields used by the space-listing fixtures.

        Returns:
            A dict shaped like the payloads the original tests spelled out
            literally.
        """
        page = {
            "id": page_id,
            "title": f"Page {page_id}",
            "body": {"storage": {"value": f"<p>Content {page_id}</p>"}},
        }
        if listed_in_space:
            page["type"] = "page"
            page["status"] = "current"
        return page

    @staticmethod
    def _make_reader(mock_confluence):
        """Create a reader and point its ``confluence`` attribute at the mock."""
        reader = ConfluenceReader(base_url=CONFLUENCE_BASE_URL, oauth2=MOCK_OAUTH)
        reader.confluence = mock_confluence
        return reader

    @staticmethod
    def _assert_pages_123_and_456(documents):
        """Assert ``documents`` holds exactly pages 123 and 456, in that order."""
        assert len(documents) == 2
        assert all(isinstance(doc, Document) for doc in documents)
        assert documents[0].doc_id == "123"
        assert documents[0].extra_info == { "title": "Page 123" }
        assert documents[1].doc_id == "456"
        assert documents[1].extra_info == { "title": "Page 456" }

    @staticmethod
    def _assert_not_called(mock_confluence, method_names):
        """Assert none of the named client methods were invoked."""
        for method_name in method_names:
            # The method name is the assertion message so a failure says which
            # lookup was unexpectedly used.
            assert getattr(mock_confluence, method_name).call_count == 0, method_name

    def test_confluence_reader_initialization(self, mock_confluence):
        """The client is constructed with oauth2, or with env credentials otherwise."""
        # Test with oauth2
        ConfluenceReader(base_url=CONFLUENCE_BASE_URL, oauth2=MOCK_OAUTH)
        mock_confluence.assert_called_once_with(url=CONFLUENCE_BASE_URL, oauth2=MOCK_OAUTH, cloud=True)

        # Test without oauth2
        env_credentials = {"CONFLUENCE_USERNAME": "user", "CONFLUENCE_API_TOKEN": "api_token"}
        with patch.dict("os.environ", env_credentials):
            ConfluenceReader(base_url=CONFLUENCE_BASE_URL)
            mock_confluence.assert_called_with(url=CONFLUENCE_BASE_URL, username="user", password="api_token", cloud=True)

    def test_confluence_reader_load_data_invalid_args(self, mock_confluence):
        """Calling ``load_data`` with no selector raises ``ValueError``."""
        confluence_reader = self._make_reader(mock_confluence)

        with pytest.raises(ValueError, match="Must specify at least one among `space_key`, `page_ids`, `label`, `cql` parameters."):
            confluence_reader.load_data()

    def test_confluence_reader_load_data_by_page_ids(self, mock_confluence):
        """Loading by page ids fetches each page once and uses no other lookup."""
        # One canned response per requested id, consumed in order.
        mock_confluence.get_page_by_id.side_effect = [
            self._page('123'),
            self._page('456'),
        ]

        confluence_reader = self._make_reader(mock_confluence)

        mock_page_ids = ['123', '456']
        documents = confluence_reader.load_data(page_ids=mock_page_ids)

        self._assert_pages_123_and_456(documents)

        assert mock_confluence.get_page_by_id.call_count == 2

        self._assert_not_called(mock_confluence, self._OTHER_LOOKUPS_FOR_PAGE_IDS)

    def test_confluence_reader_load_data_by_space_id(self, mock_confluence):
        """A space that fits in one response is loaded with a single API call."""
        # one response with two pages
        mock_confluence.get_all_pages_from_space.return_value = [
            self._page('123', listed_in_space=True),
            self._page('456', listed_in_space=True),
        ]

        confluence_reader = self._make_reader(mock_confluence)

        mock_space_key = 'spaceId123'
        documents = confluence_reader.load_data(space_key=mock_space_key)

        space_lookup = mock_confluence.get_all_pages_from_space
        assert space_lookup.call_count == 1
        assert space_lookup.call_args[0][0] == 'spaceId123'
        assert space_lookup.call_args[1]['start'] == 0
        # No limit was passed to load_data; 50 is the value this test expects
        # the reader to forward in that case.
        assert space_lookup.call_args[1]['limit'] == 50

        self._assert_pages_123_and_456(documents)

        self._assert_not_called(mock_confluence, self._OTHER_LOOKUPS_FOR_SPACE)

    def test_confluence_reader_load_data_by_space_id_pagination(self, mock_confluence):
        """With ``limit=1`` the reader pages through the space until an empty response."""
        # two api responses with one page each, then an empty one ending the loop
        mock_confluence.get_all_pages_from_space.side_effect = [
            [self._page('123', listed_in_space=True)],
            [self._page('456', listed_in_space=True)],
            [],
        ]

        confluence_reader = self._make_reader(mock_confluence)

        mock_space_key = 'spaceId123'
        mock_limit = 1  # fetch one page at a time
        documents = confluence_reader.load_data(space_key=mock_space_key, limit=mock_limit)

        space_calls = mock_confluence.get_all_pages_from_space.call_args_list
        assert len(space_calls) == 3
        # Beyond the call count, check that every request targets the same
        # space, forwards the limit, and advances the offset by the number of
        # pages already received.
        assert [call[0][0] for call in space_calls] == [mock_space_key] * 3
        assert [call[1]['start'] for call in space_calls] == [0, 1, 2]
        assert [call[1]['limit'] for call in space_calls] == [mock_limit] * 3

        self._assert_pages_123_and_456(documents)

        self._assert_not_called(mock_confluence, self._OTHER_LOOKUPS_FOR_SPACE)

0 comments on commit 568f941

Please sign in to comment.