# Getting data

In [1]:
# library imports

import numpy as np
import pandas as pd
import csv
import json
import os
import requests
import datetime as dt
import statistics
import time

## Getting game IDs from SteamSpy API

In [2]:
def get_request(pages):
    """
    Return json-formatted response of a get request using number of pages provided

    Parameters
    --------
    pages: integer (number of pages from SteamSpy you want to get)

    Returns
    -------
    steamspy_data
        json-formatted response (dict-like)
    """
    steamspy_data_json = {}  # Dictionary to store the data

    for page in range(pages+1): #added 1 to get data for all selected pages
        url = f'https://steamspy.com/api.php?request=all&page={page}'
        response = None
        while response is None:
            try:
                response = requests.get(url)
                if response.status_code == 200:
                    steamspy_data_json.update(response.json())
                    print('Downloading page={} on {}'.format(page, time.asctime()))
                else:
                    print(f'Request for page {page} failed with status code: {response.status_code}')
            except requests.exceptions.RequestException as e:
                print(f"Request for page {page} failed with error: {e}")
            print('Sleeping for 70 seconds on {}'.format(time.asctime()))
            time.sleep(70)  # Delay for 70 seconds between requests
    return steamspy_data_json

In [3]:
pages = 64  # Number of pages
result_steamspy = get_request(pages)

Downloading page=0 on Wed Jun 14 01:02:40 2023
Sleeping for 70 seconds on Wed Jun 14 01:02:40 2023
Downloading page=1 on Wed Jun 14 01:03:51 2023
Sleeping for 70 seconds on Wed Jun 14 01:03:51 2023
Downloading page=2 on Wed Jun 14 01:05:01 2023
Sleeping for 70 seconds on Wed Jun 14 01:05:01 2023
Downloading page=3 on Wed Jun 14 01:06:12 2023
Sleeping for 70 seconds on Wed Jun 14 01:06:12 2023
Downloading page=4 on Wed Jun 14 01:07:22 2023
Sleeping for 70 seconds on Wed Jun 14 01:07:22 2023
Downloading page=5 on Wed Jun 14 01:08:33 2023
Sleeping for 70 seconds on Wed Jun 14 01:08:33 2023
Downloading page=6 on Wed Jun 14 01:09:44 2023
Sleeping for 70 seconds on Wed Jun 14 01:09:44 2023
Downloading page=7 on Wed Jun 14 01:10:54 2023
Sleeping for 70 seconds on Wed Jun 14 01:10:54 2023
Downloading page=8 on Wed Jun 14 01:12:05 2023
Sleeping for 70 seconds on Wed Jun 14 01:12:05 2023
Downloading page=9 on Wed Jun 14 01:13:15 2023
Sleeping for 70 seconds on Wed Jun 14 01:13:15 2023
Downloadin

In [6]:
# save results from get_request as json file
save_file = open('steamspy_all.json', 'w')
json.dump(result_steamspy, save_file, indent=6)
save_file.close()

#  parse SteamSpy data into dataframe
steamspy_all_appid = pd.DataFrame.from_dict(result_steamspy, orient='index')

# export steam_spy_all to csv
steamspy_all_appid.to_csv('/Users/sstefanovic/Documents/GitHub/LHL-Capstone_Project/data/steamspy_all_appid_raw.csv', index=False)

# generate sorted app_list from SteamSpy data
app_list = steamspy_all_appid[['appid', 'name']].sort_values('appid').reset_index(drop=True)

# export app_list to csv
app_list.to_csv('/Users/sstefanovic/Documents/GitHub/LHL-Capstone_Project/data/app_list.csv', index=False)

# instead read from stored csv
#app_list = pd.read_csv('../data/download/app_list.csv')

# display first few rows
app_list.head()

Unnamed: 0,appid,name
0,10,Counter-Strike
1,20,Team Fortress Classic
2,30,Day of Defeat
3,40,Deathmatch Classic
4,50,Half-Life: Opposing Force


## Next step - Getting all the data from SteamSpy

In [126]:
#getting numpy array from the app_list dataframe
list_of_appids = app_list.iloc[:,0].values

In [142]:
import csv
import requests
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

used_appid = []  # List to track used appids

steamspy_columns = [
    'appid', 'name', 'developer', 'publisher', 'score_rank', 'positive',
    'negative', 'userscore', 'owners', 'average_forever', 'average_2weeks',
    'median_forever', 'median_2weeks', 'price', 'initialprice', 'discount',
    'languages', 'genre', 'ccu', 'tags'
]


def check_appid(app_id):
    """
    Check if the given app_id is in the used_appid list.

    Parameters
    ----------
    app_id: int
        App ID to check.

    Returns
    -------
    bool
        True if the app_id is in the used_appid list, False otherwise.
    """
    return app_id in used_appid


def fetch_steamspy_data(app_id_list, output_file):
    """
    Perform get requests for a list of app IDs and write the output to a CSV file.

    Parameters
    --------
    app_id_list: list of integers
        List of app IDs to fetch data for.
    output_file: str
        Path to the output CSV file.

    Returns
    -------
    None
    """
    steamspy_games_data_json = []  # List to store the data

    def parse(app_id):
        url = f'https://steamspy.com/api.php?request=appdetails&appid={app_id}'
        response = None
        while response is None:
            try:
                response = requests.get(url)
                if response.status_code == 200:
                    game_data = response.json()
                    steamspy_games_data_json.append(game_data)
                    used_appid.append(app_id)  # Add app_id to the used_appid list
                else:
                    print(f'Request for app_id {app_id} failed with status code: {response.status_code}')
            except requests.exceptions.RequestException as e:
                print(f"Request for app_id {app_id} failed with error: {e}")
            time.sleep(1)  # Delay for 1 second between requests

            # Print progress after every 1000 requests
            if len(used_appid) % 1000 == 0:
                print('Processed {} requests on {}. Last app_id was {}'.format(len(used_appid), time.asctime(), app_id))

    # Use a ThreadPoolExecutor to parallelize the requests
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(parse, app_id) for app_id in app_id_list]
        for future in as_completed(futures):
            future.result()

    # Write the data to a CSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=steamspy_columns)
        writer.writeheader()
        writer.writerows(steamspy_games_data_json)

    print(f"Data written to {output_file} successfully.")

In [143]:
#getting data
fetch_steamspy_data(list_of_appids,'steamspy_data_raw########.csv')

Processed 1000 requests on Thu Jun 15 00:10:53 2023. Last app_id was 45700
Processed 1000 requests on Thu Jun 15 00:10:53 2023. Last app_id was 45710
Processed 1000 requests on Thu Jun 15 00:10:53 2023. Last app_id was 45730
Processed 1000 requests on Thu Jun 15 00:10:53 2023. Last app_id was 45720
Processed 2000 requests on Thu Jun 15 00:13:44 2023. Last app_id was 233700
Processed 2000 requests on Thu Jun 15 00:13:44 2023. Last app_id was 233720
Processed 3000 requests on Thu Jun 15 00:16:35 2023. Last app_id was 277910
Processed 3000 requests on Thu Jun 15 00:16:35 2023. Last app_id was 277930
Processed 4000 requests on Thu Jun 15 00:19:25 2023. Last app_id was 313870
Processed 4000 requests on Thu Jun 15 00:19:25 2023. Last app_id was 313960
Processed 6000 requests on Thu Jun 15 00:25:06 2023. Last app_id was 368180Processed 6000 requests on Thu Jun 15 00:25:06 2023. Last app_id was 368220

Processed 6000 requests on Thu Jun 15 00:25:06 2023. Last app_id was 368230
Processed 7000 r