## 0. Import packages

In [1]:
import ast
import gzip
import json
import pickle
from pathlib import Path

import pandas as pd
from tqdm import tqdm
tqdm.pandas()  # for progress_apply etc.

## 1. Load steam data

In [2]:
#read file line-by-line and parse json, returns dataframe
def parse_json(filename_gzipped_python_json, read_max=-1):
    #read gzipped content
    f=gzip.open(filename_gzipped_python_json,'r')
    
    #parse json
    parse_data = []
    for line in tqdm(f): #tqdm is for showing progress bar, always good when processing large amounts of data
        line = line.decode('utf-8')
        line = line.replace('true','True') #difference json/python
        line = line.replace('false','False')
        parsed_result = eval(line) #load python nested datastructure
        parse_data.append(parsed_result)
        if read_max !=-1 and len(parse_data) > read_max:
            print(f'Break reading after {read_max} records')
            break
    print(f"Reading {len(parse_data)} rows.")

    #create dataframe
    df= pd.DataFrame.from_dict(parse_data)
    return df

In [3]:
# filenames and path
steam_path = 'data/'
metadata_games = 'steam_games.json.gz' 
user_items = 'australian_users_items.json.gz'
user_reviews = 'australian_user_reviews.json.gz'
game_bundles = 'bundle_data.json.gz'
steam_reviews= 'steam_reviews.json.gz'

In [4]:
dataframes = {}
for dataset in [steam_reviews]:
    print(f"----- {dataset}-----")
    df_metadata = parse_json(steam_path + dataset)
    dataframes[dataset.split(".")[0]] = df_metadata
    display(df_metadata.head(2))

3885it [00:00, 38849.11it/s]

----- steam_reviews.json.gz-----


7793069it [03:19, 39006.95it/s]


Reading 7793069 rows.


Unnamed: 0,username,product_id,page_order,text,hours,recommended,products,date,early_access,page,compensation,found_funny,user_id
0,Chaos Syren,725280,0,This would not be acceptable as an entertainme...,0.1,True,41.0,2017-12-17,False,1,,,
1,₮ʜᴇ Wᴀʀᴛᴏɴ,328100,0,looks like a facebook game,51.1,True,769.0,2017-12-27,False,1,,,


Now the steam_reviews dataset is loaded.
We see that only positive interactions are included: `recommended` is `True` in every row displayed.

In [5]:
steam_reviews_df = dataframes["steam_reviews"]
steam_reviews_df = steam_reviews_df[["user_id", "product_id", "recommended"]]
steam_reviews_df = steam_reviews_df.rename(columns={'product_id': "item_id", "recommended": 'recommend'})
steam_reviews_df

Unnamed: 0,user_id,item_id,recommend
0,,725280,True
1,,328100,True
2,,328100,True
3,,35140,True
4,76561198007483075,35140,True
...,...,...,...
7793064,,252490,True
7793065,76561198089897928,252490,True
7793066,76561198048207033,252490,True
7793067,,252490,True


In [6]:
# Drop nan's from the dataset
interactions_df = steam_reviews_df.dropna()
interactions_df

Unnamed: 0,user_id,item_id,recommend
4,76561198007483075,35140,True
8,76561197970402776,707610,True
11,76561198060686749,328100,True
13,76561198023491401,35140,True
16,76561198115331805,35140,True
...,...,...,...
7793058,76561197962161824,252490,True
7793061,76561198010660367,252490,True
7793062,76561197983773018,252490,True
7793065,76561198089897928,252490,True


## 2. Store the loaded dataset as a pickle file for further use

In [7]:
import pickle
pickle.dump(interactions_df, open("pickle_dumps/interactions_df.p", "wb"))