Code based on https://github.com/zhenzuo2/IS_590_Final/tree/master/Gather_Data

In [1]:
from tqdm import tqdm
import requests
import pandas as pd
import json
import time
import pickle

In [2]:
reviews_folder_directory = '../../review_data'

### Getting Game Details

In [42]:
# IDs for Dota 2, PUBG, CS:GO, Warframe, Rainbow Six: Siege
app_id_list = [570, 578080, 730, 230410, 359550]

In [43]:
# Download game details based on app id
def get_details(app_id: int, max_attempts: int = 10) -> dict:
    attempts = 0
    while attempts < max_attempts:
        response = requests.get("http://store.steampowered.com/api/appdetails?appids=" + str(app_id))
        if response.status_code != 429:
            return response.json()
        # If rate limited, wait and try again
        time.sleep((2 ** attempts) + random.random())
        attempts = attempts + 1
    print(f"Unable to retrieve product details for app_id = {app_id}")
    return None

In [44]:
# Download details for all games in the list
details = {}
for app_id in tqdm(app_id_list):
    d = get_details(app_id)
    if d is not None:
        details.update(d)
        
# Delete the "success" field, and create a dictionary of game data.
for key in details.keys():
    if details[key]['success'] :
        details[key] = details[key]['data']
    else:
        details[key] = None
        


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:03<00:00,  1.56it/s]


In [47]:
# Convert to dataframe and write to disk.
game_details = pd.DataFrame.from_dict(details, orient='index')
game_details.to_csv(f'{reviews_folder_directory}/game_details.csv', sep = '|', escapechar = '@', index = True)

### Downloading Game Reviews

In [6]:
def get_reviews(app_id: int, page: int, max_attempts: int = 10) -> dict:
    """Get a page of reviews for a given game.
    
    Parameters:
    -----------
    app_id : int
        app_id to get reviews of
    page : int
        page number to get reviews of. Page 0 is reviews 0 - 99, page 1 is reviews 100-199, etc.
    max_attempts: int, optional
        maximum number of retries if steam rejects the API call. 
        Total time to timeout = max_attempts * (max_attempts-1)
    
    Returns:
    --------
    list(dict())
        list of reviews, each review being a dictionary
        
    Documentation: https://partner.steamgames.com/doc/store/getreviews
    """
    attempts = 0
    # Each page is 100 reviews, so we offset by 100 each time
    offset = page * 100
    while attempts < max_attempts:
        # Get recent reviews, in english, with 100 per page.
        response = requests.get("http://store.steampowered.com/appreviews/"+str(app_id)+ 
                                "?json=1&filter=recent&language=english&num_per_page=100&start_offset=" + str(offset))
        if response.status_code == 200:
            return response.json()["reviews"]
        # If rate limited, wait and try again
        time.sleep((2 * attempts))
        attempts = attempts + 1
    print(f"Unable to retrieve reviews for app_id = {app_id}")
    return []

In [7]:
def flatten_author(review_json: dict) -> dict:
    """Flatten author data into the main attributes by side effect."""
    author_data = review_json.pop('author')
    author_data = {'author__' + key : value for key, value in author_data.items()}
    review_json.update(author_data)
    

In [9]:
# Download reviews for every game in list and add to a pandas df

# Try and resume from the last page downloaded - otherwise, start from 0.
try:
    with open(f"{reviews_folder_directory}/completed_pages.pkl","rb") as rf:
        completed_pages = pickle.load(rf)
    print(f'Completed pages: {completed_pages}')
except:
    print("No completed pages found, starting from zero.")
    completed_pages = {k : 0 for k in app_id_list}

# For each game, download reviews
for app_id in app_id_list:
    # Remove one to check the page that failed previously
    completed_pages[app_id] = max(0, completed_pages[app_id] - 1)
    
    # Try to add to the existing review dataframe, otherwise create a new one.
    try:
        game_reviews = pd.read_csv(f'{reviews_folder_directory}/{app_id}.csv', sep = '|', )
    except:
        game_reviews = pd.DataFrame(columns=['recommendationid', 'author__steamid', 'author__num_games_owned',
                                             'author__num_reviews', 'author__playtime_forever', 
                                             'author__playtime_last_two_weeks', 'author__last_played',
                                             'language', 'review', 'timestamp_created', 'timestamp_updated', 
                                             'voted_up', # If the review was voted positive
                                             'votes_up', 'votes_funny', 'comment_count',
                                             'steam_purchase', 'received_for_free', 
                                             'written_during_early_access'])

    # Loop until we no longer get a result
    with tqdm(desc=f"Downloading reviews for {app_id} ") as bar:
        try:
            # Get 500 reviews
            for i in range(500):
                current_reviews = get_reviews(app_id, completed_pages[app_id])
                if current_reviews == []:
                    break
                for review in current_reviews:
                    flatten_author(review)
                # Only keep the review information we save as columns
                current_reviews = [{k : review[k] for k in game_reviews.columns} for review in current_reviews]
                game_reviews = game_reviews.append(current_reviews)
                # Incremenet offset
                completed_pages[app_id] += 1
                bar.update(1)
                
        # If we keyboard interupt, stop getting the current review and instead save our data.
        except KeyboardInterrupt:
            pass
    
    # Set reccommendationid to neumeric (for dropping duplicates)
    length_with_dupes = game_reviews.shape[0]
    game_reviews['recommendationid'] = game_reviews['recommendationid'].apply(pd.to_numeric)
    #game_reviews.drop_duplicates('recommendationid', keep = 'last', inplace=True)
    
    print(f"Removed {length_with_dupes - game_reviews.shape[0]} duplicate elements.")
    
    
    # Save results 
    game_reviews.to_csv(f'{reviews_folder_directory}/{app_id}.csv', sep = '|', escapechar = '@', index = False)
        
print(completed_pages)
with open(f"{reviews_folder_directory}/completed_pages.pkl","wb") as wf:
    pickle.dump(completed_pages, wf)


Completed pages: {570: 64, 578080: 1250, 730: 1001, 230410: 13, 359550: 886}


Downloading reviews for 570 : 1it [00:00,  1.42it/s]


Removed 0 duplicate elements.


Downloading reviews for 578080 : 500it [12:02,  1.36s/it]


Removed 0 duplicate elements.


Downloading reviews for 730 : 500it [16:38,  1.75s/it]


Removed 0 duplicate elements.


Downloading reviews for 230410 : 1it [00:00,  1.55it/s]


Removed 0 duplicate elements.


Downloading reviews for 359550 : 2it [00:01,  1.37it/s]


Removed 0 duplicate elements.
{570: 64, 578080: 1749, 730: 1500, 230410: 13, 359550: 887}
