This notebook uses Beautiful Soup to scrape Valve's video game Deadlock's changelog page to get the links to all patch notes (updates). Each patch note is extracted from the page by parsing the HTML and finding the tags that link to the individual patch notes. Based on the URL structure, loop through and extract the text data from each individual patch note. Store the extracted (raw) data in a .txt file. Data is first stored locally for initial development and then pushed to Google Cloud Storage in batch.

As of 24Nov2024, all patch notes are located in [this forum](https://forums.playdeadlock.com/forums/changelog.10/)

![Deadlock changelog menu](images/phase1-changelog-homepage.png)

In [9]:
import requests
from bs4 import BeautifulSoup
import re

def get_patch_note_links(page_num):
    # determine URL for the current page
    if page_num == 1:
        url = "https://forums.playdeadlock.com/forums/changelog.10/"
    else:
        url = f"https://forums.playdeadlock.com/forums/changelog.10/page-{page_num}"
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # extract all the thread links that are patch notes
    links = []
    for a_tag in soup.find_all('a', href=True):
        link = a_tag['href']
        if '/threads/' in link and 'update' in link:  # only look for valid patch note threads
            # normalize the URL by removing '/latest' and '/' if present
            normalized_link = re.sub(r'/latest$', '', link)
            normalized_link = re.sub(r'/$', '', normalized_link)
            full_url = f"https://forums.playdeadlock.com{normalized_link}"
            links.append(full_url)

    return links


# initialize
patch_note_links = set()  # use a set to avoid duplicates
page_num = 1

# store links of the previous page
prev_page_links = None 

# loop through pages until no new patch links are found
while True:
    print(f"Checking page {page_num}...")

    # get patch note links 
    current_page_links = get_patch_note_links(page_num)
    
    # compare with the previous page content
    if current_page_links == prev_page_links:
        print(f"Page {page_num} content is the same as page {page_num - 1}. Stopping loop.")
        break
    
    # update prev_content to current page content
    prev_page_links = current_page_links  
    
    # add new patch note links from current page
    patch_note_links.update(current_page_links)

    page_num += 1

# sort links by newest patch notes first
sorted_patch_note_links = sorted(patch_note_links, reverse=True)

# print all collected links
print("Collected Patch Note Links:")
for link in sorted_patch_note_links:
    print(link)


Checking page 1...
Checking page 2...
Checking page 3...
Checking page 4...
Page 4 content is the same as page 3. Stopping loop.
Collected Patch Note Links:
https://forums.playdeadlock.com/threads/11-21-2024-update.47476
https://forums.playdeadlock.com/threads/11-13-2024-update.46391
https://forums.playdeadlock.com/threads/11-10-2024-update.45689
https://forums.playdeadlock.com/threads/11-07-2024-update.44786
https://forums.playdeadlock.com/threads/11-01-2024-update.43705
https://forums.playdeadlock.com/threads/10-29-2024-update.42985
https://forums.playdeadlock.com/threads/10-27-2024-update.42492
https://forums.playdeadlock.com/threads/10-24-2024-update.40951
https://forums.playdeadlock.com/threads/10-18-2024-update.39630
https://forums.playdeadlock.com/threads/10-18-2024-update-2.39693
https://forums.playdeadlock.com/threads/10-15-2024-update.38925
https://forums.playdeadlock.com/threads/10-11-2024-update.37641
https://forums.playdeadlock.com/threads/10-10-2024-update.36958
https://f