## 0. Import packages

In [1]:
import pandas as pd
import gzip
from tqdm import tqdm
tqdm.pandas() #for progress_apply etc.

## 1. Load steam data

In [2]:
#read file line-by-line and parse json, returns dataframe
def parse_json(filename_gzipped_python_json, read_max=-1):
    #read gzipped content
    f=gzip.open(filename_gzipped_python_json,'r')
    
    #parse json
    parse_data = []
    for line in tqdm(f): #tqdm is for showing progress bar, always good when processing large amounts of data
        line = line.decode('utf-8')
        line = line.replace('true','True') #difference json/python
        line = line.replace('false','False')
        parsed_result = eval(line) #load python nested datastructure
        parse_data.append(parsed_result)
        if read_max !=-1 and len(parse_data) > read_max:
            print(f'Break reading after {read_max} records')
            break
    print(f"Reading {len(parse_data)} rows.")

    #create dataframe
    df= pd.DataFrame.from_dict(parse_data)
    return df

In [3]:
# filenames and path
steam_path = 'data/'
metadata_games = 'steam_games.json.gz' 
user_items = 'australian_users_items.json.gz'
user_reviews = 'australian_user_reviews.json.gz'
game_bundles = 'bundle_data.json.gz'
steam_reviews= 'steam_reviews.json.gz'
games_all = "gamesAll.json.gz"

Read only the required files for further use.

We only use the user_items dataset, which contains all the items in each user's library.

In [4]:
dataframes = {}
for dataset in [user_items]:
    print(f"----- {dataset}-----")
    df_metadata = parse_json(steam_path + dataset)
    dataframes[dataset.split(".")[0]] = df_metadata
    display(df_metadata.head(2))

----- australian_users_items.json.gz-----


88310it [00:58, 1516.35it/s]


Reading 88310 rows.


Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."


Now, the user_items dataset is loaded.
We see that only positive interactions are included.

In [5]:
def split_items(items):
    ids = []
    for i in items:
        ids.append(i["item_id"])
    return ids

Convert the dataframe to a usable format: one row per (user_id, item_id) pair.

In [6]:
user_items_df = dataframes["australian_users_items"]

# keep useful columns user_id and items 
user_items_df = user_items_df[["user_id", "items"]]

# split the items dictionary in a list of item ids
user_items_df["items"] = user_items_df["items"].apply(split_items)

# rename column items to item_id
user_items_df = user_items_df.rename(columns={'items': "item_id"})

# only keep users with at least one item in their inventory
user_items_df = user_items_df[user_items_df.item_id.apply(lambda x: len(x) > 0)]

# split the column user_id which is a list to multiple rows in the dataframe for the same user_id
user_items_df = user_items_df.explode("item_id").reset_index(drop=True)
user_items_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_items_df["items"] = user_items_df["items"].apply(split_items)


Unnamed: 0,user_id,item_id
0,76561197970982479,10
1,76561197970982479,20
2,76561197970982479,30
3,76561197970982479,40
4,76561197970982479,50
...,...,...
5153204,76561198329548331,346330
5153205,76561198329548331,373330
5153206,76561198329548331,388490
5153207,76561198329548331,521570


In [7]:
# Drop nan's and empty item_id lists from the dataset
interactions_df = user_items_df.dropna()
interactions_df

Unnamed: 0,user_id,item_id
0,76561197970982479,10
1,76561197970982479,20
2,76561197970982479,30
3,76561197970982479,40
4,76561197970982479,50
...,...,...
5153204,76561198329548331,346330
5153205,76561198329548331,373330
5153206,76561198329548331,388490
5153207,76561198329548331,521570


## 2. Store the loaded dataset as a pickle file for further use

In [8]:
import pickle
pickle.dump(interactions_df, open("pickle_dumps/interactions_df.p", "wb"))