# Download Reviews

This script downloads game metadata and review data for use in later summarisation.

Sections:
* [Download Game Details](#game_details)
* [Download Game Reviews](#game_reviews)


Code based on https://github.com/zhenzuo2/IS_590_Final/tree/master/Gather_Data

Files generated:
* game_details.csv - metadata for each game pulled from the Steam description. E.g. `name`, `app_id`, `genres`, `release_date`.
* completed_pages.pkl - pickled dictionary tracking the number of pages of reviews that have been downloaded for each game. Used to resume download from the previous position.
* [app_id].csv - csv file storing reviews.

Known Limitations:
* Resuming a download assumes that few new reviews have been posted since the download started. While there is logic for handling duplicates, the script will not automatically pull reviews that have been posted since the last download.

In [None]:
import json
import pickle
import random
import time
from pathlib import Path
from pprint import pprint
from typing import Optional

import pandas as pd
import requests
from tqdm import tqdm

In [None]:
# Folder that receives every file this notebook generates.
reviews_folder_directory = Path('..') / 'review_data_2'

# When True the game details file is written to disk, overwriting any previous copy.
WRITE_GAME_DETAILS = True

# Steam app ids of the games to download.
app_id_list = [
    570,     # Dota 2
    578080,  # PUBG
    730,     # CS:GO
    230410,  # Warframe
    359550,  # Rainbow Six: Siege
]

### Download Game Details <a name="game_details"/>

In [None]:
# Download game details based on app id
def get_details(app_id: int, max_attempts: int = 10) -> Optional[dict]:
    """Fetch the Steam store details for a single game.

    Parameters:
    -----------
    app_id : int
        app_id to get the store details of
    max_attempts : int, optional
        maximum number of requests to make while Steam keeps answering with a
        retryable status (429 rate limiting or a 5xx server error).
        Attempt n (0-based) is followed by a wait of 2 ** n seconds plus up to
        one second of random jitter; no wait follows the final attempt.

    Returns:
    --------
    dict or None
        the decoded JSON body of a successful (HTTP 200) response, or None if
        the details could not be retrieved.
    """
    for attempt in range(max_attempts):
        response = requests.get(
            "http://store.steampowered.com/api/appdetails",
            params={"appids": app_id},
        )
        if response.status_code == 200:
            return response.json()

        # Only rate limiting and server-side errors are worth retrying; any
        # other status will not change by asking again.
        retryable = response.status_code == 429 or response.status_code >= 500
        if not retryable:
            break

        # Exponential backoff with jitter. Skip the wait after the last
        # attempt, since nothing follows it.
        if attempt < max_attempts - 1:
            time.sleep((2 ** attempt) + random.random())

    print(f"Unable to retrieve product details for app_id = {app_id}")
    return None

In [None]:
# Download details for all games in the list
details = {}
for app_id in tqdm(app_id_list):
    response_json = get_details(app_id)
    if response_json is None:
        continue
    # Each entry is wrapped as {"success": ..., "data": ...}. Keep only the
    # "data" payload of successful lookups: a None placeholder for a failed
    # lookup cannot be turned into a dataframe row by from_dict below.
    for key, entry in response_json.items():
        if entry['success']:
            details[key] = entry['data']
        else:
            print(f"Steam returned no details for app_id = {key}")

# Create a dataframe of game data, one row per game, indexed by the response key.
game_details = pd.DataFrame.from_dict(details, orient='index')
game_details

In [None]:
# Persist the game details table; skipped unless WRITE_GAME_DETAILS is set.
if WRITE_GAME_DETAILS:
    # Make sure the output folder exists before writing into it.
    reviews_folder_directory.mkdir(exist_ok=True, parents=True)
    game_details_directory = f'{reviews_folder_directory}/game_details.csv'
    game_details.to_csv(
        game_details_directory,
        index=True,
        sep='|',
        escapechar='@',
    )
    print(f"Wrote game details to {game_details_directory}")

In [None]:
# Build a lookup from Steam app id to the game's name, then show it.
game_name_map = {
    steam_appid: name
    for steam_appid, name in zip(game_details['steam_appid'], game_details['name'])
}
pprint(game_name_map)

### Download Game Reviews <a name="game_reviews"/>
This section downloads reviews from Steam. 

In [None]:
def get_reviews(app_id: int, page: int, max_attempts: int = 10) -> list:
    """Get a page of reviews for a given game.
    
    Parameters:
    -----------
    app_id : int
        app_id to get reviews of
    page : int
        page number to get reviews of. Page 0 is reviews 0 - 99, page 1 is reviews 100-199, etc.
    max_attempts: int, optional
        maximum number of requests to make if steam rejects the API call.
        Attempt n (0-based) is followed by a wait of 2 * n seconds, except the
        last one, so the total wait before giving up is
        (max_attempts - 1) * (max_attempts - 2) seconds.
    
    Returns:
    --------
    list(dict())
        list of reviews, each review being a dictionary. Empty if the page has
        no reviews or if every attempt failed.
        
    Documentation: https://partner.steamgames.com/doc/store/getreviews
    """
    # Get recent reviews, in english, with 100 per page.
    # Each page is 100 reviews, so we offset by 100 each time.
    # NOTE(review): the documentation linked above describes cursor-based
    # paging - confirm that start_offset is still honoured by the endpoint.
    query = {
        "json": 1,
        "filter": "recent",
        "language": "english",
        "num_per_page": 100,
        "start_offset": page * 100,
    }
    url = "http://store.steampowered.com/appreviews/" + str(app_id)

    for attempt in range(max_attempts):
        response = requests.get(url, params=query)
        if response.status_code == 200:
            # A successful response without a "reviews" entry is treated as an
            # empty page rather than an error.
            return response.json().get("reviews", [])
        # Any non-200 answer is retried after a linearly growing wait. Skip
        # the wait after the final attempt, since nothing follows it.
        if attempt < max_attempts - 1:
            time.sleep(2 * attempt)

    # Fall back to the bare id so a game without an entry in the name map
    # cannot turn this failure report into a KeyError.
    print(f"Unable to retrieve reviews for {game_name_map.get(app_id, app_id)} (app_id = {app_id})")
    return []

In [None]:
def flatten_author(review_json: dict) -> None:
    """In-place function to flatten author data into the main attributes.

    The nested review_json["author"] dictionary is removed, and each of its
    keys is added to review_json with "author__" prepended.

    Nothing is returned; review_json itself is modified.
    Raises KeyError if review_json has no "author" entry.
    """
    author_data = review_json.pop('author')
    # Build the prefixed mapping completely before merging, so a failure while
    # prefixing never leaves review_json half updated.
    prefixed = {}
    for field, value in author_data.items():
        prefixed['author__' + field] = value
    review_json.update(prefixed)
    

In [None]:
page_download_limit = 50 # Number of pages to download for each game before breaking.

# Columns stored for every review; any other field of a review is discarded.
review_columns = ['recommendationid', 'author__steamid', 'author__num_games_owned',
                  'author__num_reviews', 'author__playtime_forever',
                  'author__playtime_last_two_weeks', 'author__last_played',
                  'language', 'review', 'timestamp_created', 'timestamp_updated',
                  'voted_up', # If the review was voted positive
                  'votes_up', 'votes_funny', 'comment_count',
                  'steam_purchase', 'received_for_free',
                  'written_during_early_access']
bool_columns = ['voted_up', 'steam_purchase', 'received_for_free', 'written_during_early_access']

completed_pages_path = f"{reviews_folder_directory}/completed_pages.pkl"

# The review files and the progress file are written here even when
# WRITE_GAME_DETAILS is False, so make sure the folder exists.
reviews_folder_directory.mkdir(parents=True, exist_ok=True)

# Download reviews for every game in list and add to a pandas df

# Try and resume from the last page downloaded - otherwise, start from 0.
# NOTE: pickle.load must only be used on trusted files; this one is expected
# to have been written by a previous run of this script.
try:
    with open(completed_pages_path, "rb") as rf:
        completed_pages = pickle.load(rf)
    print("Loaded completed pages file. Starting completition:\n", {game_name_map.get(k, k): v for k,v in completed_pages.items()})
except FileNotFoundError:
    print("No completed pages found, starting from zero.")
    completed_pages = {}

# Games added to app_id_list after the progress file was written start from page 0.
for app_id in app_id_list:
    completed_pages.setdefault(app_id, 0)

# For each game, download reviews
interrupted = False
for app_id in app_id_list:
    # Fall back to the bare id for games whose details could not be downloaded.
    game_name = game_name_map.get(app_id, app_id)
    reviews_path = f'{reviews_folder_directory}/{app_id}.csv'

    # Remove one to check the page that failed previously
    completed_pages[app_id] = max(0, completed_pages[app_id] - 1)

    # Try to add to the existing review dataframe, otherwise create a new one.
    # Only a missing or empty file means "start fresh": any other read error
    # is raised, so an unreadable file is never silently overwritten below.
    # NOTE(review): the file is written with escapechar='@' but read back
    # without it - confirm the round trip preserves '@' in review text.
    try:
        game_reviews = pd.read_csv(reviews_path, sep = '|')
    except (FileNotFoundError, pd.errors.EmptyDataError):
        game_reviews = pd.DataFrame(columns=review_columns)

    game_reviews = game_reviews.astype({column: bool for column in bool_columns})
    # Only keep the review information we save as columns
    kept_columns = list(game_reviews.columns)

    # Pages are collected here and concatenated once after the loop, instead
    # of re-copying the whole dataframe for every page.
    new_pages = []

    # Loop until we no longer get a result
    with tqdm(desc=f"Downloading reviews for {game_name} (app_id = {app_id}) ") as bar:
        try:
            for _ in range(page_download_limit):
                current_reviews = get_reviews(app_id, completed_pages[app_id])
                if not current_reviews:
                    break
                for review in current_reviews:
                    flatten_author(review)
                page_rows = [{k : review[k] for k in kept_columns} for review in current_reviews]
                new_pages.append(pd.DataFrame(page_rows, columns=kept_columns))
                # Increment offset
                completed_pages[app_id] += 1
                bar.update(1)

        # If we keyboard interrupt, stop downloading, save what we have for
        # this game, and do not start on the remaining games.
        except KeyboardInterrupt:
            interrupted = True

    # Empty frames are left out of the concatenation.
    frames = [frame for frame in (game_reviews, *new_pages) if not frame.empty]
    if frames:
        game_reviews = pd.concat(frames, ignore_index=True)

    # Set recommendationid to numeric (for dropping duplicates): the existing
    # rows and the freshly downloaded rows must share one type to compare equal.
    length_with_dupes = game_reviews.shape[0]
    game_reviews['recommendationid'] = pd.to_numeric(game_reviews['recommendationid'])
    # The page re-checked on resume repeats reviews that are already stored;
    # keep the most recently downloaded copy of each.
    game_reviews = game_reviews.drop_duplicates('recommendationid', keep = 'last')

    print(f"Removed {length_with_dupes - game_reviews.shape[0]} duplicate elements.")

    # Save results
    game_reviews.to_csv(reviews_path, sep = '|', escapechar = '@', index = False)

    # Record progress only after the reviews are on disk, so the page counter
    # never runs ahead of the saved data if a later game fails.
    with open(completed_pages_path, "wb") as wf:
        pickle.dump(completed_pages, wf)

    if interrupted:
        break

print()
print("Download finished/interrupted. Completed pages:")
pprint({game_name_map.get(k, k): v for k,v in completed_pages.items()})

# Written once more so the progress file exists even if no game was processed.
with open(completed_pages_path, "wb") as wf:
    pickle.dump(completed_pages, wf)
