This notebook uses Beautiful Soup to scrape Valve's video game Deadlock's changelog page to get the links to all patch notes (updates). Each patch note is extracted from the page by parsing the HTML and finding the tags that link to the individual patch notes. Based on the URL structure, loop through and extract the text data from each individual patch note. Store the extracted (raw) data in a .txt file. Data is first stored locally for initial development and then pushed to Google Cloud Storage in batch.

As of 24Nov2024, all patch notes are located in [this forum](https://forums.playdeadlock.com/forums/changelog.10/)

![Deadlock changelog menu](images/phase1-changelog-homepage.png)

# Extraction

In [21]:
import requests
import re
import os
import json
from datetime import datetime
from bs4 import BeautifulSoup

def get_patch_note_links(page_num):
    # determine URL for the current page
    if page_num == 1:
        url = "https://forums.playdeadlock.com/forums/changelog.10/"
    else:
        url = f"https://forums.playdeadlock.com/forums/changelog.10/page-{page_num}"
    
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # extract all the thread links that are patch notes
    links = []
    for a_tag in soup.find_all('a', href=True):
        link = a_tag['href']
        if '/threads/' in link and 'update' in link:  # only look for valid patch note threads
            # normalize the URL by removing '/latest' and '/' if present
            normalized_link = re.sub(r'/latest$', '', link)
            normalized_link = re.sub(r'/$', '', normalized_link)
            full_url = f"https://forums.playdeadlock.com{normalized_link}"
            links.append(full_url)

    return links


# initialize
patch_note_links = set()  # use a set to avoid duplicates
page_num = 1

# store links of the previous page
prev_page_links = None 

# loop through pages until no new patch links are found
while True:
    print(f"Checking page {page_num}...")

    # get patch note links 
    current_page_links = get_patch_note_links(page_num)
    
    # compare with the previous page content
    if current_page_links == prev_page_links:
        print(f"Page {page_num} content is the same as page {page_num - 1}. Stopping loop.")
        break
    
    # update prev_content to current page content
    prev_page_links = current_page_links  
    
    # add new patch note links from current page
    patch_note_links.update(current_page_links)

    page_num += 1

# sort links by newest patch notes first
sorted_patch_note_links = sorted(patch_note_links)

# print all collected links
print("Collected Patch Note Links:")
for link in sorted_patch_note_links:
    print(link)


Checking page 1...
Checking page 2...
Checking page 3...
Checking page 4...
Checking page 5...
Page 5 content is the same as page 4. Stopping loop.
Collected Patch Note Links:
https://forums.playdeadlock.com/threads/01-12-2025-update.53389
https://forums.playdeadlock.com/threads/01-17-2025-update.53607
https://forums.playdeadlock.com/threads/01-19-2025-update.53961
https://forums.playdeadlock.com/threads/01-27-2025-update.54590
https://forums.playdeadlock.com/threads/05-03-2024-update.427
https://forums.playdeadlock.com/threads/05-10-2024-update.689
https://forums.playdeadlock.com/threads/05-13-2024-update.807
https://forums.playdeadlock.com/threads/05-16-2024-update.902
https://forums.playdeadlock.com/threads/05-19-2024-update.1033
https://forums.playdeadlock.com/threads/05-23-2024-update.1245
https://forums.playdeadlock.com/threads/05-24-2024-update.1359
https://forums.playdeadlock.com/threads/05-30-2024-update.2514
https://forums.playdeadlock.com/threads/06-01-2024-update.3004
https

In [22]:
# poster information and datetime is stored in the 'data-lb-caption-desc' attribute
# an example looks like this "Yoshi · Nov 21, 2024 at 3:21 PM"
def extract_poster_info(poster_str):
    parts = poster_str.split('·')
    if len(parts) != 2:
        return None  
    poster = parts[0].strip()
    date_time_part = parts[1].strip()
    date, time = date_time_part.split(' at ')
    date_obj = datetime.strptime(date.strip(), '%b %d, %Y')
    formatted_date = date_obj.strftime('%Y-%m-%d')
    
    return {
        'poster': poster,
        'date': formatted_date,
        'time': time.strip()
    }

# example
poster_str = "Yoshi · Nov 21, 2024 at 3:21 PM"
info = extract_poster_info(poster_str)
print(info)

{'poster': 'Yoshi', 'date': '2024-11-21', 'time': '3:21 PM'}


In [23]:
# let's see what one patch note looks like first

def extract_patch_note_content(patch_note_url):
    response = requests.get(patch_note_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # find the user who posted the patch note
    user_div = soup.find('div', class_='message-userContent')
    if user_div and 'data-lb-caption-desc' in user_div.attrs:
        poster_info = extract_poster_info(user_div['data-lb-caption-desc'])
    else:
        poster_info = {"poster": "Unknown", "date": "Unknown", "time": "Unknown"}

    # find the patch note content and preserve newlines
    content_div = soup.find('div', class_='bbWrapper')
    if content_div:
        patch_note_content = content_div.get_text(separator='\n', strip=True)
    else:
        patch_note_content = "No content found."

    return {
        'poster': poster_info['poster'],
        'date': poster_info['date'],
        'time': poster_info['time'],
        'content': patch_note_content
    }

# example
first_patch_note_url = sorted_patch_note_links[0] 
patch_note_data = extract_patch_note_content(first_patch_note_url)

Confirm that the final line of patch notes matches what is on the website <br>
![Deadlock changelog menu](images/phase2-confirm-extracted-content-matches.png)

In [24]:
def save_patch_note(patch_note_data, patch_note_url):
    patch_note_id = f"{patch_note_data['date']}_{patch_note_url.split('.')[-1]}"
    folder = 'json-patch-notes'
    if not os.path.exists(folder):
        os.makedirs(folder)

    file_path = os.path.join(folder, f"{patch_note_id}.json")
    with open(file_path, 'w') as json_file:
        json.dump(patch_note_data, json_file, indent=4)
    print(f"Patch note saved: {file_path}")

In [25]:
# write contents for all patch notes to the json-patch-notes folder
for link in sorted_patch_note_links:
    print(f"Extracting content from: {link}")
    patch_note_data = extract_patch_note_content(link)
    save_patch_note(patch_note_data, link)

Extracting content from: https://forums.playdeadlock.com/threads/01-12-2025-update.53389
Patch note saved: json-patch-notes\2025-01-12_53389.json
Extracting content from: https://forums.playdeadlock.com/threads/01-17-2025-update.53607
Patch note saved: json-patch-notes\2025-01-17_53607.json
Extracting content from: https://forums.playdeadlock.com/threads/01-19-2025-update.53961
Patch note saved: json-patch-notes\2025-01-19_53961.json
Extracting content from: https://forums.playdeadlock.com/threads/01-27-2025-update.54590
Patch note saved: json-patch-notes\2025-01-27_54590.json
Extracting content from: https://forums.playdeadlock.com/threads/05-03-2024-update.427
Patch note saved: json-patch-notes\2024-05-03_427.json
Extracting content from: https://forums.playdeadlock.com/threads/05-10-2024-update.689
Patch note saved: json-patch-notes\2024-05-10_689.json
Extracting content from: https://forums.playdeadlock.com/threads/05-13-2024-update.807
Patch note saved: json-patch-notes\2024-05-13

In [26]:
def load_downloaded_patch_notes():
    if os.path.exists('downloaded_patch_notes.json'):
        with open('downloaded_patch_notes.json', 'r') as file:
            return json.load(file)
    return []

In [27]:
def save_downloaded_patch_notes(downloaded_patch_notes):
    with open('downloaded_patch_notes.json', 'w') as file:
        json.dump(downloaded_patch_notes, file, indent=4)

In [28]:
def check_for_new_patch_notes():
    # Load previously downloaded patch notes
    downloaded_patch_notes = load_downloaded_patch_notes()

    for link in sorted_patch_note_links:  # Assume this is the list of patch note URLs
        patch_note_id = link.split('.')[-1]

        # Skip if the patch note is already downloaded
        if patch_note_id in downloaded_patch_notes:
            print(f"Patch note {patch_note_id} already downloaded.")
            continue

        # Extract and save the new patch note
        patch_note_data = extract_patch_note_content(link)
        save_patch_note(patch_note_data, link)

        # Add the new patch note ID to the downloaded list
        downloaded_patch_notes.append(patch_note_id)

    # Save the updated list of downloaded patch notes
    save_downloaded_patch_notes(downloaded_patch_notes)

All patch notes are now pulled. Let's now start to format the data for each patch note.

In [29]:
# get only .json file names from json-patch-notes folder and store them in a list
json_patch_notes_folder_path = 'json-patch-notes'
existing_json_file_names = [file for file in os.listdir(json_patch_notes_folder_path) if file.endswith('.json')]

for existing_json_file in existing_json_file_names[:5]:
    print(existing_json_file)

2024-05-03_427.json
2024-05-10_689.json
2024-05-13_807.json
2024-05-16_902.json
2024-05-19_1033.json


In [30]:
# test extraction with just the first file
test_file = existing_json_file_names[0]
test_file

'2024-05-03_427.json'

In [31]:
with open(os.path.join(json_patch_notes_folder_path, test_file), 'r') as file:
    data = json.load(file)

data

{'poster': 'Yoshi',
 'date': '2024-05-03',
 'time': '1:02 PM',
 'content': 'General Changes:\n==\n- Added a Recommend A Friend button to the dashboard that you can use to send us requests for people to include in our playtesting\n- Added a Resources page to the dashboard which contains a browsable item shop\n- Added overhead text display when another hero uses active items\n- The hotkeys F1-F5 to change cameras to allied heroes now maps directly to the order of heroes on the top bar left to right\n- Added the Patron to the spectate-when-dead cycle if the enemy is in your base or everyone on your team is dead\n- Added support for Flex Items in the Hero Sandbox\n- Increased the range of the mouse sensitivity slider from 0.5->4.0 to 0.05->8.0\n- Increased the default framerate cap from 120 to 400\n- Improved UI display when endgame objectives are being attacked\n- Added music for when the base is under attack\n- Changed the local player icon on the the minimap to always be on top of enemy

In [32]:
data['content']

'General Changes:\n==\n- Added a Recommend A Friend button to the dashboard that you can use to send us requests for people to include in our playtesting\n- Added a Resources page to the dashboard which contains a browsable item shop\n- Added overhead text display when another hero uses active items\n- The hotkeys F1-F5 to change cameras to allied heroes now maps directly to the order of heroes on the top bar left to right\n- Added the Patron to the spectate-when-dead cycle if the enemy is in your base or everyone on your team is dead\n- Added support for Flex Items in the Hero Sandbox\n- Increased the range of the mouse sensitivity slider from 0.5->4.0 to 0.05->8.0\n- Increased the default framerate cap from 120 to 400\n- Improved UI display when endgame objectives are being attacked\n- Added music for when the base is under attack\n- Changed the local player icon on the the minimap to always be on top of enemy icons\n- Shop music will now only play for shops players can access\n- Pos

# Text Preprocessing

In [33]:
# separate content by \n
content = data['content']
content_lines = content.split('\n')

# remove '==' and empty strings
filtered_content = [line for line in content_lines if line.strip() and line != '==']

filtered_content

['General Changes:',
 '- Added a Recommend A Friend button to the dashboard that you can use to send us requests for people to include in our playtesting',
 '- Added a Resources page to the dashboard which contains a browsable item shop',
 '- Added overhead text display when another hero uses active items',
 '- The hotkeys F1-F5 to change cameras to allied heroes now maps directly to the order of heroes on the top bar left to right',
 '- Added the Patron to the spectate-when-dead cycle if the enemy is in your base or everyone on your team is dead',
 '- Added support for Flex Items in the Hero Sandbox',
 '- Increased the range of the mouse sensitivity slider from 0.5->4.0 to 0.05->8.0',
 '- Increased the default framerate cap from 120 to 400',
 '- Improved UI display when endgame objectives are being attacked',
 '- Added music for when the base is under attack',
 '- Changed the local player icon on the the minimap to always be on top of enemy icons',
 '- Shop music will now only play fo

It looks like there at minimum two categories -- 'General Changes' and 'Gameplay Changes'. For the scope of this work, I will only focus on Gameplay Changes for the heroes

In [34]:
# import the heroes from heroes.txt
with open('heroes.txt', 'r') as file:
    heroes = [line.strip() for line in file.readlines()]

heroes

['Abrams',
 'Bebop',
 'Calico',
 'Dynamo',
 'Grey Talon',
 'Haze',
 'Holliday',
 'Infernus',
 'Ivy',
 'Kelvin',
 'Lady Geist',
 'Lash',
 'McGinnis',
 'Mirage',
 'Mo & Krill',
 'Paradox',
 'Pocket',
 'Seven',
 'Sinclair',
 'Shiv',
 'Vindicta',
 'Viscous',
 'Vyper',
 'Warden',
 'Wraith',
 'Yamato']

In [35]:
# get all of the lines that mention a hero
hero_changes_lines = [line for line in content_lines if any(hero in line for hero in heroes)]

In [36]:
# store entries that start with a heroes name
hero_patch_notes = []

# flag any entries that contain a hero's name but does not start with the hero name
flagged_entries = []

for line in hero_changes_lines:
    if any(line.startswith(f"- {hero}") for hero in heroes): 
        hero_patch_notes.append(line)
    else:
        flagged_entries.append(line)

print("\nHero Patch Notes:")
for hero_patch_note in hero_patch_notes:
    print(hero_patch_note)

print("\nFlagged Entries:")
for flagged in flagged_entries:
    print(flagged)


Hero Patch Notes:
- Abrams: Base Health increased from 550 to 600
- Abrams: Siphon DPS increased from 24 to 35
- Bebop: Weapon no longer has horizontal/vertical recoil
- Dynamo: Singularity DPS increased from 48 to 60
- Dynamo: Singularity T3 Max HP DPS increased from 3.2% to 3.8%
- Grey Talon: Charge Shot damage increased from 95 to 105
- Grey Talon: Charge Shot T2 damage reduced from 80 to 70
- Grey Talon: Guided Owl damage increased from 200 to 300
- Haze: Bullet Dance bonus Fire Rate reduced from +30 to +20
- Haze: Smoke Bomb duration scaling from Spirit improved from 0.2 to 0.3
- Infernus: Catalyst no longer slows Infernus to 1.3 m/s during the cast delay
- Infernus: Concussive Combustion damage increased from 130 to 160
- Kelvin: Frost Grenade T3 bonus damage increased from +100 to +175
- Lady Geist: Blood Bomb damage increased from 80 to 100
- Lady Geist: Blood Bomb T2 damage increased from +65 to +70
- Lady Geist: Blood Bomb tooltip fixed to reference the correct self damage t

In [37]:
# add in the processed data back into the json file
# while I understand that this method can potentially cause data duplication, it is more structured and easier to query
# to some extent, I also do not want to overwrite or remove the original patch note data, and since patch notes are relatively frequent, the sizes should stay small

# add hero_patch_notes and flagged_entries as new keys
data['hero_patch_notes'] = hero_patch_notes
data['flagged_entries'] = flagged_entries

# write the updated data back to json file
with open(os.path.join(json_patch_notes_folder_path, test_file), 'w') as file:
    json.dump(data, file, indent=4)

print("Data successfully added to the JSON file")

Data successfully added to the JSON file


## Functionalize the text preprocessing for all json files in json-patch-notes

In [38]:
def import_heroes():
    # import the heroes from heroes.txt
    with open('heroes.txt', 'r') as file:
        heroes = [line.strip() for line in file.readlines()]

    return heroes

heroes = import_heroes()

In [39]:
def preprocess_content_for_hero_patch_notes_only(json_patch_notes_folder_path, json_file):
    
    with open(os.path.join(json_patch_notes_folder_path, json_file), 'r') as file:
        data = json.load(file)
    
    # separate content by \n
    content = data['content']
    content_lines = content.split('\n')
    
    # remove '==' and empty strings
    filtered_content = [line for line in content_lines if line.strip() and line != '==']

    # filter the patch notes by the mention of changes to heroes
    hero_changes_lines = [line for line in content_lines if any(hero in line for hero in heroes)]

    # store entries that start with a heroes name
    hero_patch_notes = []
    
    # flag any entries that contain a hero's name but does not start with the hero name
    flagged_entries = []
    for line in hero_changes_lines:
        if any(line.startswith(f"- {hero}") for hero in heroes): 
            hero_patch_notes.append(line)
        else:
            flagged_entries.append(line)

    # add hero_patch_notes and flagged_entries as new keys
    data['hero_patch_notes'] = hero_patch_notes
    data['flagged_entries'] = flagged_entries

    # write the updated data back to json file
    with open(os.path.join(json_patch_notes_folder_path, existing_json_file), 'w') as file:
        json.dump(data, file, indent=4)

    print(f"Data successfully added to {existing_json_file}")

    return None

In [40]:
json_patch_notes_folder_path = 'json-patch-notes'

existing_json_file_names = [file for file in os.listdir(json_patch_notes_folder_path) if file.endswith('.json')]

for existing_json_file in existing_json_file_names:
    preprocess_content_for_hero_patch_notes_only(json_patch_notes_folder_path, existing_json_file)

Data successfully added to 2024-05-03_427.json
Data successfully added to 2024-05-10_689.json
Data successfully added to 2024-05-13_807.json
Data successfully added to 2024-05-16_902.json
Data successfully added to 2024-05-19_1033.json
Data successfully added to 2024-05-23_1245.json
Data successfully added to 2024-05-24_1359.json
Data successfully added to 2024-05-30_2514.json
Data successfully added to 2024-06-01_3004.json
Data successfully added to 2024-06-06_4096.json
Data successfully added to 2024-06-07_4443.json
Data successfully added to 2024-06-13_5773.json
Data successfully added to 2024-06-14_6080.json
Data successfully added to 2024-06-16_6424.json
Data successfully added to 2024-06-20_7003.json
Data successfully added to 2024-06-23_7705.json
Data successfully added to 2024-06-27_8486.json
Data successfully added to 2024-07-04_9652.json
Data successfully added to 2024-07-11_10871.json
Data successfully added to 2024-07-18_11831.json
Data successfully added to 2024-07-23_1250