In [54]:
import csv
import os
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [14]:
# Read the Steam Web API key from the environment so it never has to be
# hard-coded in the notebook.
api_key = os.environ.get("STEAM_API_KEY")

if api_key:
    # Confirm the key was found without echoing the secret itself: cell
    # output is saved with the notebook and would leak the key when shared.
    print("API Key loaded from the STEAM_API_KEY environment variable.")
else:
    print("API Key not found. Please set the STEAM_API_KEY environment variable.")

In [60]:
def get_genres_for_game(appid, timeout=10):
    """
    bs4 scraper for getting the genres for a game.
    Takes:
    - appid = Steam appid for the game of interest
    - timeout = seconds to wait for the store page before requests gives up
      (default 10)
    Returns:
    - list of lowercase strings representing game genres; an empty list when
      the page does not answer with HTTP 200 or has no "Genre:" section
    Raises:
    - requests.RequestException if the request fails or times out
    """
    url = f"https://store.steampowered.com/app/{appid}/"
    # Without a timeout requests waits indefinitely on a stalled connection,
    # which would hang a long scraping loop.
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    genre_label = soup.find('b', string="Genre:")
    if genre_label is None:
        return []

    # find_next returns None when no <span> follows the label; treat that
    # like a page without genres instead of failing on None.find_all.
    genre_span = genre_label.find_next('span')
    if genre_span is None:
        return []

    return [link.text.strip().lower() for link in genre_span.find_all('a')]

def get_owned_games(api_key, steam_id, timeout=10):
    """
    Retrieves the games owned by the supplied Steam ID.
    Takes:
    - api_key = API key for Steam API
    - steam_id = Steam ID of the player of interest
    - timeout = seconds to wait for the API before requests gives up
      (default 10)
    Returns:
    - a list of the games owned by the player; an empty list when the API
      does not answer with HTTP 200 or the response carries no 'games' entry
    Raises:
    - requests.RequestException if the request fails or times out
    """
    url = "https://api.steampowered.com/IPlayerService/GetOwnedGames/v0001/"
    # Let requests build and URL-encode the query string rather than
    # interpolating the key and ID into the URL by hand.
    params = {
        'key': api_key,
        'steamid': str(steam_id),
        'format': 'json',
        'include_appinfo': 1,
        'include_played_free_games': 1,
    }
    # A timeout keeps one stalled request from hanging a loop over many IDs.
    response = requests.get(url, params=params, timeout=timeout)

    if response.status_code != 200:
        return []

    return response.json().get('response', {}).get('games', [])
    
def create_playtime_dataframe(api_key, steam_ids):
    """
    Build a per-player table of played games.
    Takes:
    - api_key = API key for Steam API
    - steam_ids = list of Steam IDs to retrieve playtimes for
    Returns:
    Pandas DataFrame with one row per Steam ID and the columns steam_id and
    playtimes, where playtimes is a list of (appid, playtime) tuples limited
    to games with playtime > 0.
    """
    records = []

    # One progress-bar tick per Steam ID processed
    with tqdm(total=len(steam_ids)) as progress:
        for steam_id in steam_ids:
            played = []
            for game in get_owned_games(api_key, steam_id):
                # Games that were never played are left out
                if game['playtime_forever'] > 0:
                    played.append((game['appid'], game['playtime_forever']))

            records.append({'steam_id': steam_id, 'playtimes': played})
            progress.update(1)

    return pd.DataFrame(records)

def get_genres_for_appids(appids, csv_filename):
    """
    Write the genres present in a set of games to csv.
    Will create the csv if it does not exist, else will append to the existing csv.
    The header row is written whenever the file is missing or empty.
    Takes:
    - appids = a list of appids
    - csv_filename = filename of csv to write to
    Returns:
    - None; one row (appid, comma-separated genres) is appended per appid
    """
    # Write the header when the file is missing *or* present but empty, so
    # the CSV always starts with its column names.
    needs_header = (
        not os.path.exists(csv_filename)
        or os.path.getsize(csv_filename) == 0
    )

    # Open once in append mode instead of re-opening the file for every appid.
    with open(csv_filename, 'a', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)

        if needs_header:
            csv_writer.writerow(['appid', 'genres'])
            csvfile.flush()

        with tqdm(total=len(appids)) as pbar:
            for appid in appids:
                genres = get_genres_for_game(appid)
                csv_writer.writerow([appid, ', '.join(genres)])
                # Flush per row so completed rows are handed to the OS even
                # if a later request fails or the run is interrupted.
                csvfile.flush()
                pbar.update(1)

In [24]:
# Steam IDs of the sampled players, taken from the 'steam_id' column
steam_ids = (
    pd.read_csv('./data/buldars_gate_3_players_2023-11-04.csv')
    ['steam_id']
    .to_numpy()
)
steam_ids

array([76561199004154858, 76561199058333673, 76561198016380377, ...,
       76561198056016073, 76561198137532719, 76561197996547993])

In [44]:
# Fetch owned-game playtimes for every Steam ID in steam_ids.
# Slow: the recorded run took about 21 minutes for 2,602 IDs.
df_game_playtimes = create_playtime_dataframe(api_key, steam_ids)

100%|██████████| 2602/2602 [20:57<00:00,  2.07it/s] 


In [45]:
# Inspect the result: one row per Steam ID with its list of (appid, playtime) pairs
df_game_playtimes

Unnamed: 0,steam_id,playtimes
0,76561199004154858,"[(4500, 1493), (400, 323), (13560, 2), (15100,..."
1,76561199058333673,"[(15100, 7), (10090, 306), (17470, 605), (2233..."
2,76561198016380377,"[(240, 124), (4760, 29), (12810, 108), (10180,..."
3,76561198898124688,"[(107410, 583), (249130, 765), (264710, 3105),..."
4,76561198011137973,[]
...,...,...
2597,76561198137518354,"[(4000, 256), (9480, 43), (22320, 941), (550, ..."
2598,76561198115832615,"[(3320, 239), (4000, 83), (17480, 119), (22330..."
2599,76561198056016073,"[(50, 260), (70, 777), (2500, 4), (3320, 33), ..."
2600,76561198137532719,"[(280, 91), (20900, 1301), (17480, 394), (400,..."


In [56]:
# Collect every distinct appid that appears in any player's playtime list.
# Players with no recorded playtime have an empty list, which explode() turns
# into NaN; drop those so NaN is never treated as an appid to scrape.
unique_appids = (
    df_game_playtimes['playtimes']
    .apply(lambda x: [appid for appid, _ in x])
    .explode()
    .dropna()
    .unique()
)

In [63]:
# Scrape genres for every appid and write them to the appid -> genres CSV.
# Very slow: the recorded run took about 3 h 23 min for 18,079 appids.
get_genres_for_appids(unique_appids, 'data/appid_genre_map.csv')

100%|██████████| 18079/18079 [3:22:40<00:00,  1.49it/s]  


In [111]:
# Load the scraped appid -> genres table, keeping appids as strings
appid_genre_map = pd.read_csv('./data/appid_genre_map.csv', dtype={'appid': str})

# Turn each "a, b, c" genre string into a list of genre names
appid_genre_map['genres'] = appid_genre_map['genres'].str.split(', ')

# Lookup from appid (string) to that game's genres entry
appid_to_genres = {
    appid: genres
    for appid, genres in zip(appid_genre_map['appid'], appid_genre_map['genres'])
}

In [129]:
# Inspect the appid -> genre-list table
appid_genre_map

Unnamed: 0,appid,genres
0,4500,"[action, rpg]"
1,400,[action]
2,13560,[action]
3,15100,"[action, adventure]"
4,17390,"[action, adventure, casual, rpg, simulation, s..."
...,...,...
18074,2330500,"[casual, indie]"
18075,853450,"[action, indie, rpg, early access]"
18076,615250,"[action, strategy]"
18077,624550,"[action, adventure, rpg]"


In [132]:
# Distinct genre names across all games: explode() yields one row per genre,
# dropna() discards missing entries (presumably games with no genres recorded)
unique_genres = appid_genre_map['genres'].explode().dropna().unique()
unique_genres

array(['action', 'rpg', 'adventure', 'casual', 'simulation', 'strategy',
       'indie', 'free to play', 'massively multiplayer', 'racing',
       'sports', 'early access', 'animation & modeling',
       'design & illustration', 'photo editing', 'utilities',
       'video production', 'game development', 'audio production',
       'education', 'web publishing', 'software training', 'movie',
       'accounting'], dtype=object)

In [133]:
# Total playtime per genre for each player; a game's playtime is added to
# every genre listed for that game
player_genre_playtime = {}

for player, playtimes in zip(df_game_playtimes['steam_id'], df_game_playtimes['playtimes']):
    # Start every genre at zero so all players share the same columns
    totals = dict.fromkeys(unique_genres, 0)
    player_genre_playtime[player] = totals

    for appid, playtime in playtimes:
        genres = appid_to_genres[str(appid)]
        # Skip entries that are not a genre list (presumably NaN for games
        # with no genres recorded)
        if type(genres) is not list:
            continue
        for genre in genres:
            totals[genre] += playtime

# Players become rows, genres become columns
df_genre_playtimes = pd.DataFrame(player_genre_playtime).T
df_genre_playtimes.index.name = 'steam_id'

In [134]:
# Inspect the per-player genre totals (rows: steam_id, columns: genres)
df_genre_playtimes

Unnamed: 0_level_0,action,rpg,adventure,casual,simulation,strategy,indie,free to play,massively multiplayer,racing,...,photo editing,utilities,video production,game development,audio production,education,web publishing,software training,movie,accounting
steam_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
76561199004154858,284719,139819,111869,4658,145470,188425,73581,61985,13944,433,...,358,358,0,0,0,0,0,0,0,0
76561199058333673,21094,6021,23339,1251,782,6846,4261,3010,2621,53,...,329,329,0,0,0,0,0,0,0,0
76561198016380377,114234,138309,153019,87191,108409,124549,131727,76426,47784,10,...,732,4214,0,0,0,0,0,0,0,0
76561198898124688,169319,116134,92346,11562,8312,20518,29027,1667,18750,59,...,875,875,14,0,0,0,0,0,0,0
76561198011137973,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76561198137518354,393132,343848,222596,21058,86424,180815,162705,111029,114093,1449,...,1708,1708,0,2210,7,2210,2210,7,0,0
76561198115832615,343959,458307,407367,24989,77171,190699,328191,37815,100910,0,...,0,0,0,0,0,0,0,0,0,0
76561198056016073,461428,130835,180911,23759,35844,47386,86340,120395,12038,3369,...,1257,1257,5903,0,0,1454,0,1454,0,0
76561198137532719,37940,29016,12050,119459,185612,186959,150269,2076,188,8918,...,0,0,0,0,0,0,0,0,0,0


In [136]:
# Save the per-player genre playtime table to CSV for later use
df_genre_playtimes.to_csv('data/buldars_gate_3_player_genre_playtimes.csv')