In [1]:
import pandas as pd

In [2]:
# Script to get raw data. Executing it in this namespace makes the
# `get_raw_data` function used by the following cell available.
# NOTE(review): `exec` runs the script with full access to the notebook's
# namespace; only use this pattern with trusted, project-local scripts.
# The `with` block closes the file handle as soon as the source is read.
with open( "../raw_data/get_raw_data.py" ) as script_file:
    exec( script_file.read() )

In [3]:
# Load every raw table with `get_raw_data` (provided by the script
# executed above). The tables are read in the order listed below.
relative_path = "../raw_data"

app_list, app_data, app_reviews, user_reviews = (
    get_raw_data( table_name, relative_path )
    for table_name in ( "app_list", "app_data", "app_reviews", "user_reviews" )
)

# Placeholders for the cleaned tables; each one is filled in by the
# cleaning cells that follow.
cleaned_app_list = cleaned_app_data = cleaned_app_reviews = cleaned_user_reviews = None

In [4]:
# Start cleaning

# 1 - `app_list` data does not require any cleaning,
#     it is simply a mapping between app_ids and
#     app_names.
#     Take a copy instead of aliasing the raw frame, so that any later
#     in-place change to the cleaned table cannot alter `app_list`.
cleaned_app_list = app_list.copy()

# debug
cleaned_app_list

Unnamed: 0,app_id,app_title
0,5,Dedicated Server
1,7,Steam Client
2,8,winui2
3,10,Counter-Strike
4,20,Team Fortress Classic
...,...,...
104005,1456550,The Tower Of TigerQiuQiu Soapbubble
104006,1456850,Solicitude Wake-up Demo
104007,1457260,Masters of Puzzle - Halloween Edition: Undeadl...
104008,1457270,Masters of Puzzle - Halloween Edition: Pumpkin...


In [5]:
# 2 - `app_data` contains invalid entries: some apps are for testing or
#     reserved for internal usage, and the Steam API returns a False
#     status for them. The chain below
#       * discards the rows whose status equals False,
#       * drops the `status` column, which is no longer needed,
#       * keeps only apps of type `game` (other types such as trailer,
#         dlc, demo, mod etc. are not of any use to us),
#       * drops the `type` column, since every remaining row is a `game`.
cleaned_app_data = (
    app_data
    .loc[lambda frame: frame["status"] != False]
    .drop( columns = ["status"] )
    .loc[lambda frame: frame["type"] == "game"]
    .drop( columns = ["type"] )
)

#     debug
cleaned_app_data

Unnamed: 0,app_id,name,required_age,platforms,metacritic_score,category_ids,categories,genre_ids,genres,recommendations
3,10,Counter-Strike,0.0,windows:mac:linux,88.0,1:49:36:37:8,Multi-player:PvP:Online PvP:Shared/Split Scree...,1,Action,94916.0
4,20,Team Fortress Classic,0.0,windows:mac:linux,,1:49:36:37:8:44,Multi-player:PvP:Online PvP:Shared/Split Scree...,1,Action,3597.0
5,30,Day of Defeat,0.0,windows:mac:linux,79.0,1:8,Multi-player:Valve Anti-Cheat enabled,1,Action,2669.0
6,40,Deathmatch Classic,0.0,windows:mac:linux,,1:49:36:37:8:44,Multi-player:PvP:Online PvP:Shared/Split Scree...,1,Action,1244.0
7,50,Half-Life: Opposing Force,0.0,windows:mac:linux,,2:1:8:44,Single-player:Multi-player:Valve Anti-Cheat en...,1,Action,7560.0
...,...,...,...,...,...,...,...,...,...,...
103979,1455060,TERMINUS,0.0,windows,,2,Single-player,4:23,Casual:Indie,
103980,1455090,Good puzzle: Castles,0.0,windows,,2:22,Single-player:Steam Achievements,4:23:28,Casual:Indie:Simulation,
103983,1455420,Choco Pixel S,0.0,windows,,2:22:23,Single-player:Steam Achievements:Steam Cloud,4:23,Casual:Indie,
103984,1455430,Harvest Green,0.0,windows,,2,Single-player,25:4:23:3:28:2,Adventure:Casual:Indie:RPG:Simulation:Strategy,


In [6]:
# 3 - Similar to `app_data`, `app_reviews` contains some invalid rows.
#     The chain below
#       * discards the rows whose status equals False,
#       * drops the `status` column,
#       * removes the apps whose `total_reviews` is 0, since many apps
#         have no reviews at all.
cleaned_app_reviews = (
    app_reviews
    .loc[lambda frame: frame["status"] != False]
    .drop( columns = ["status"] )
    .loc[lambda frame: frame["total_reviews"] != 0]
)

#     debug
cleaned_app_reviews

Unnamed: 0,app_id,review_score,review_score_desc,total_positive,total_negative,total_reviews
3,10,9,Overwhelmingly Positive,19908,753,20661
4,20,8,Very Positive,1889,385,2274
5,30,8,Very Positive,1169,169,1338
6,40,8,Very Positive,548,131,679
7,50,9,Overwhelmingly Positive,4047,192,4239
...,...,...,...,...,...,...
103877,1450220,5,Mixed,11,6,17
103894,1450700,0,3 user reviews,3,0,3
103926,1452150,0,3 user reviews,3,0,3
103981,1454920,0,3 user reviews,3,0,3


In [7]:
# debug - display the raw `user_reviews` frame as loaded by `get_raw_data`.
user_reviews

Unnamed: 0,app_id,steam_id,recommendation_id,playtime_forever,playtime_at_review,language,timestamp_created,voted_up,votes_up,votes_funny,weighted_vote_score,comment_count,num_games_owned,num_reviews
0,10,76561198116800044,78348955,1025.0,1025.0,english,1603991029,True,0,0,0.000000,0,8,2
1,10,76561198897181049,78322946,1995.0,1954.0,english,1603948187,True,0,0,0.000000,0,8,2
2,10,76561198272044528,78318330,7070.0,7019.0,english,1603938044,True,0,0,0.000000,0,109,2
3,10,76561199042823737,78316198,614.0,498.0,english,1603933995,True,0,0,0.000000,0,3,3
4,10,76561198992482648,78307493,6143.0,6143.0,english,1603919460,True,1,0,0.523810,0,59,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4166536,2028850,76561198150653411,45832633,0.0,,english,1542323143,True,37,103,0.586540,0,62,15
4166537,2028850,76561198181450110,45645184,0.0,,english,1541451266,True,12,5,0.589577,0,156,45
4166538,2028850,76561198253602926,44055675,0.0,,english,1533945835,True,16,7,0.476056,0,63,14
4166539,2028850,76561197966220495,12309333,0.0,,english,1412139615,True,147,178,0.092377,8,333,2


In [8]:
# 4 - Group the individual user reviews by app.
#     The key is passed as a scalar rather than a one-element list:
#     with a length-one list, recent pandas versions yield 1-tuples such
#     as `(10,)` as group keys when the groups are iterated, whereas a
#     scalar key always yields the plain `app_id` value.
cleaned_user_reviews = user_reviews.groupby( "app_id" )

In [None]:
# Aggregate the grouped reviews into one row per app:
#   num_users - number of reviews with a known `playtime_forever`
#   playtime  - total `playtime_forever` over those reviews
#
# `count` and `sum` both skip NaN values, which is what the NaN check is
# meant to achieve. A comparison such as `p != float( "NaN" )` is always
# True (NaN never compares equal to anything), so it cannot be used to
# filter missing playtimes.
playtime_stats = cleaned_user_reviews["playtime_forever"].agg( ["count", "sum"] )

# The aggregated index holds one `app_id` per group.
app_ids       = playtime_stats.index.tolist()
num_users     = playtime_stats["count"].tolist()
user_playtime = playtime_stats["sum"].tolist()

user_data = pd.DataFrame( { "app_id" : app_ids, "num_users" : num_users, "playtime" : user_playtime } )

user_data