# Scraping Steam reviews of Baldur's Gate 3

In [2]:
# Import necessary packages
import requests
import pandas as pd
import time
from config import BG3_data_file_path

In [3]:
# Function to fetch reviews
def fetch_steam_reviews(appid, num_reviews=500):
    """Fetch up to ``num_reviews`` English reviews for a Steam app.

    Pages through the Steam store ``appreviews`` endpoint, 100 reviews per
    request, following the cursor that is returned with every page.

    Args:
        appid: Steam application id; inserted into the request URL.
        num_reviews: Maximum number of reviews to return.

    Returns:
        A list of review dicts exactly as delivered by the API, at most
        ``num_reviews`` long. The list is shorter when a request fails, the
        API reports an unsuccessful status, or no further reviews exist.

    Raises:
        requests.RequestException: Propagated from ``requests.get`` (for
            example on a connection error or when the timeout expires).
    """
    url = f"https://store.steampowered.com/appreviews/{appid}"
    # The query is passed via ``params`` so that requests URL-encodes every
    # value. Cursors handed back by Steam may contain '+', '/' or '=' and are
    # rejected by the API when pasted into the URL unencoded.
    params = {
        'json': 1,
        'filter': 'all',
        'language': 'english',
        'num_per_page': 100,
        'day_range': 365,
        'cursor': '*',  # '*' requests the first page
    }
    reviews = []

    while len(reviews) < num_reviews:
        # A timeout keeps a stalled connection from hanging the notebook.
        response = requests.get(url, params=params, timeout=30)

        if response.status_code != 200:
            print(f"Failed to get reviews: {response.status_code}")
            break

        data = response.json()

        if data.get('success') != 1:
            print("Failed to get reviews: API returned unsuccessful status")
            break

        batch = data.get('reviews') or []
        if not batch:
            # An empty page means there is nothing left to fetch; without
            # this check the loop would never terminate.
            break
        reviews += batch

        next_cursor = data.get('cursor')
        if not next_cursor or next_cursor == params['cursor']:
            # No way to advance to another page, so stop here.
            break
        params['cursor'] = next_cursor

        # Sleep to avoid Steam rate-limiting (only when another request follows)
        if len(reviews) < num_reviews:
            time.sleep(1)

    # The last page may overshoot the requested amount.
    return reviews[:num_reviews]

appid = '1086940' # Baldur's Gate 3
reviews = fetch_steam_reviews(appid)
print(f"Fetched {len(reviews)} reviews.") # Report how many reviews were retrieved (printed even if fetching stopped early)

Failed to get reviews: API returned unsuccessful status
Fetched 400 reviews.


In [22]:
# Build the reviews table; a 'review_text' column, if present, is renamed to 'review'
review_df = pd.DataFrame(reviews).rename(columns={'review_text': 'review'})
# Write the table to the CSV path taken from config, leaving out the row index
data_file_path = BG3_data_file_path
review_df.to_csv(data_file_path, index=False)
# Preview the first five rows
review_df.head()

Unnamed: 0,recommendationid,author,language,review,timestamp_created,timestamp_updated,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,steam_purchase,received_for_free,written_during_early_access,hidden_in_steam_china,steam_china_location
0,146197289,"{'steamid': '76561198209057388', 'num_games_ow...",english,my work is really cutting into my Baldur's Gat...,1694565640,1694565640,True,7830,2987,0.9809620380401612,0,True,False,False,True,
1,153859247,"{'steamid': '76561198412806684', 'num_games_ow...",english,You can convince bosses to kill themselves. 10/10,1702909919,1702909919,True,3605,900,0.9807387590408324,10,True,False,False,True,
2,143924257,"{'steamid': '76561198022720435', 'num_games_ow...",english,Possessed a guard and had them open a gate con...,1691783669,1691783669,True,3124,1268,0.9789522290229796,12,True,False,False,True,
3,146776206,"{'steamid': '76561198007607100', 'num_games_ow...",english,I have only one point of criticism:\nThis game...,1695386806,1701885967,True,1320,6,0.97847980260849,28,True,False,False,True,
4,143578821,"{'steamid': '76561198004904749', 'num_games_ow...",english,No microtransactions\nNo in-game purchases\nNo...,1691401740,1691401740,True,15452,51,0.9783660769462584,0,True,False,False,True,


In [12]:
# Summarise the dataframe: column names, non-null counts and dtypes
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   recommendationid             500 non-null    object
 1   author                       500 non-null    object
 2   language                     500 non-null    object
 3   review                       500 non-null    object
 4   timestamp_created            500 non-null    int64 
 5   timestamp_updated            500 non-null    int64 
 6   voted_up                     500 non-null    bool  
 7   votes_up                     500 non-null    int64 
 8   votes_funny                  500 non-null    int64 
 9   weighted_vote_score          500 non-null    object
 10  comment_count                500 non-null    int64 
 11  steam_purchase               500 non-null    bool  
 12  received_for_free            500 non-null    bool  
 13  written_during_early_access  500 no

In [14]:
# Show summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
review_df.describe()

Unnamed: 0,timestamp_created,timestamp_updated,votes_up,votes_funny,comment_count
count,500.0,500.0,500.0,500.0,500.0
mean,1697293000.0,1698477000.0,650.426,104.406,5.418
std,8068341.0,8393158.0,1692.239412,319.628663,21.035976
min,1689325000.0,1689353000.0,5.0,0.0,0.0
25%,1691524000.0,1691719000.0,9.0,0.0,0.0
50%,1692991000.0,1694564000.0,41.0,3.0,0.0
75%,1701094000.0,1702781000.0,478.25,36.5,1.0
max,1720615000.0,1720636000.0,15814.0,2987.0,255.0


In [34]:
# Boolean mask: True for every row whose review text already appeared in an earlier row
duplicates = review_df.duplicated(subset='review', keep='first')
duplicate_entries = review_df.loc[duplicates]
# Print the number of repeated reviews. A large count may point to a problem in the fetching step (e.g., a Steam hiccup)
print(duplicate_entries.shape[0])

5
