In [60]:
# Import libraries
import urllib
from bs4 import BeautifulSoup
import re
import datetime

# For this example, we will be working with app id 782330 within the Steam store
app_id = 782330

# Set the url we will be working with
url = "https://store.steampowered.com/app/" + str(app_id)

# Get the html from the url
with urllib.request.urlopen(url) as response:
    html = response.read()

# Make the html more readable
soup = BeautifulSoup(html)
#soup.prettify()

We need to obtain the user-defined genres for the game on this website

In [56]:
# Set dictionaries for where we can find game genres within html
dict_for_genres = {'class': 'app_tag'}
genres = []

# Run loop over text within html to get genre (which is found in the section defined by dict), remove tabs, new lines, then apply to list
for link in soup.find_all(attrs=dict_for_genres):
    genre = link.text
    genre = re.sub(r"[\n\t\s]*", "", genre)
    
    genres.append(genre)

# Remove last entry and print genres
genres.remove("+")
print(genres)

['Action', 'FPS', 'GreatSoundtrack', 'Gore', 'Demons', 'Violent', 'Fast-Paced', 'First-Person', 'Shooter', 'Singleplayer', 'Blood', 'Multiplayer', 'Sci-fi', 'Post-apocalyptic', 'Atmospheric', 'Mature', 'StoryRich', 'Adventure', 'Horror', 'Difficult']


In [57]:
# Set dictionaries for where we can find the game title within html
dict_for_name = {"class": "apphub_AppName"}

# There is only one class with this name, so we only need to find the text once and set the game title to the text found within
game_title = soup.find(attrs=dict_for_name).text
print(game_title)

DOOM Eternal


In [61]:
# Set dictionaries for where we can find the release date within html
dict_for_date = {"class":"date"}

# There is only one class with this name, so we only need to find the text once and set the release date to the text found within
release_date = soup.find(attrs=dict_for_date).text

# The date is given in a string, so we will want to convert to a datetime object for potential calculations later on
release_date = datetime.datetime.strptime(release_date, '%b %d, %Y')
print(release_date)

2020-03-19 00:00:00


In [75]:
# Set dictionaries for where we can find the developer within html
dict_for_developer = {"class":"summary column", "id":"developers_list"}

# There is only one class and id combination with this name, so we only need to find the text once and set the developer to the text found within
developer = soup.find(attrs=dict_for_developer).a.text
print(developer)

# Publisher is not unique, so we need to look at the text within the html to find it
# Here, we turn all the text in the html to a list
text_to_list = [text for text in soup.stripped_strings]

# The Publisher name always appears right after the string "Publisher", so we find that index and add 1 to obtain the location within this list
publisher_index = text_to_list.index("Publisher:") + 1

# Set publisher to the name based on the index found above
publisher = text_to_list[publisher_index]
print(publisher)

id Software
Bethesda Softworks


In [84]:
# Set dictionaries for where we can find the user review information within html
dict_for_positive_reviews = {"type":"hidden", "id":"review_summary_num_positive_reviews"}
dict_for_all_reviews = {"type":"hidden", "id":"review_summary_num_reviews"}

# Find the location of positive reviews, take the str from "value" and convert to int
positive_reviews_location = soup.find(attrs=dict_for_positive_reviews)
positive_reviews = positive_reviews_location["value"]
positive_reviews = int(positive_reviews)
print(positive_reviews)

# Find the location of all reviews, take the str from "value" and convert to int
all_reviews_location = soup.find(attrs=dict_for_all_reviews)
all_reviews = all_reviews_location["value"]
all_reviews = int(all_reviews)
print(all_reviews)

81585
91169


In [88]:
# Set dictionaries for where we can find the pricing information within html
dict_for_currency = {"itemprop":"priceCurrency"}
dict_for_price = {"itemprop":"price"}

# Find the location of currency reviews, take the str from "content"
currency_location = soup.find(attrs=dict_for_currency)
currency = currency_location["content"]
print(currency)

# Find the location of price, take the str from "content" and convert to float
price_location = soup.find(attrs=dict_for_price)
price = price_location["content"]
price = float(price)
print(price)

<class 'str'>
<class 'float'>


In [137]:
# Set dictionaries for  discount and if it is there where we can find the discount information within html
dict_for_discount = {"class":"game_area_purchase_game_wrapper", "class":"game_purchase_action", "class":"discount_original_price"}

# First, we look to see if a discount is available
search_for_discount = soup.find(attrs=dict_for_discount)
print(search_for_discount)

# If there is a discount, we obtain the discount data and set discount to that value, else we set discount to 0
if search_for_discount != None:
    discount_location = soup.find(attrs=dict_for_discount)
    discount_original_price = discount_location.text
    discount_original_price = re.sub(r"[$]", "", discount_original_price)
    discount_original_price = float(discount_original_price)
    discount_original_price = discount_original_price
    print(type(discount_original_price), discount_original_price)
else:
    discount_original_price = price


<div class="discount_original_price">$59.99</div>
<class 'float'> 59.99


In [142]:
# We can find the discount percent with a little arithmetic
discount = round(1 - (price / discount_original_price), 2)
print(discount)

0.67
