# Build an inventory

This notebook focuses on building an inventory of URLs to be used as part of an indexing pipeline in subsequent steps.

## 1. Setup
Ensure that the required environment variables are set in the `../.env` file; the cell below loads them with `python-dotenv`.

### 1a. Load environment variables

In [None]:
! pip install python-dotenv --quiet
from dotenv import load_dotenv
load_dotenv('../.env')

## 2. Build the sitemap

### 2a. Configure the base URL and the allowed paths

In [None]:
# URL passed to is_url_valid() and fetch_sitemap() in step 2b.
# Left empty here; set it before running the cells below.
base_url = ''

# Paths handed to filter_urls() in step 2c; only URLs that contain one of
# these paths are kept. Add one string per entry.
allowed_paths = [
    # '/allowed-path-1',
]

### 2b. Fetch and parse the sitemap to generate a list of URLs

In [None]:
! pip install lxml --quiet
! pip install beautifulsoup4 --quiet
! pip install requests --quiet

%run -i ../utilities/sitemap.py

# Validate the base URL
is_url_valid(base_url)

# Fetch the sitemap as a list of URLs
sitemap = parse_sitemap(fetch_sitemap(url=base_url))

# Print a count of the URLs in the sitemap
print(f'The sitemap contains {len(sitemap)} URLs')

### 2c. Clean up the URLs as follows
- Remove duplicates
- Remove URLs that are not within the allowed paths
- Sort the URLs

In [None]:
# Filter and sort the URLs. Only include URLs that contain one of the allowed paths
# Keep only the URLs that contain one of the allowed paths, drop duplicates
# via a set, and return them as a sorted list.
sitemap = sorted(
    set(filter_urls(urls=sitemap, allowed_paths=allowed_paths))
)
print(f'After filtering, the sitemap contains {len(sitemap)} unique URLs')

### 2d. Store the inventory of urls to Azure Storage

In [None]:
! pip install azure-data-tables --quiet

%run -i ../utilities/storeInventory.py

create_inventory_from_urls(sitemap)   

### 2e. Clean up the inventory of URLs by deleting the table

In [None]:
# Optional cleanup: uncomment the call below to delete the inventory table.
# NOTE(review): delete_inventory_table is presumably defined by
# ../utilities/storeInventory.py, loaded in step 2d — confirm before use.
#delete_inventory_table()