In [None]:
import os
import pandas as pd

In [None]:
# Script that provides the raw-data loader used by the cells below
# (they call `get_raw_data`).
# The script is read inside a `with` block so the file handle is closed
# as soon as the source has been read, and it is compiled with its own
# path so that any traceback names the script instead of "<string>".
# NOTE(review): `exec` runs the file's code in this notebook's namespace;
# only point it at a trusted script.
raw_data_script = "../raw_data/get_raw_data.py"
with open( raw_data_script ) as script_file:
    exec( compile( script_file.read(), raw_data_script, "exec" ) )

In [None]:
# Load every raw dataset through `get_raw_data`, the function
# provided by the script executed above. All datasets live in
# the same folder, given relative to this notebook.
relative_path = "../raw_data"

app_list = get_raw_data( "app_list", relative_path )
app_data = get_raw_data( "app_data", relative_path )
app_reviews = get_raw_data( "app_reviews", relative_path )
user_reviews = get_raw_data( "user_reviews", relative_path )

# Placeholders for the cleaned datasets; each one is filled in
# by the cleaning cells below.
cleaned_app_list = cleaned_app_data = cleaned_app_reviews = cleaned_user_reviews = None

In [None]:
# Start cleaning

# 1 - `app_list` data does not require any cleaning;
#     it is simply a mapping between app_ids and
#     app_names. The assignment below binds a second
#     name to the same object, it does not make a copy.
cleaned_app_list = app_list

# debug: show the result as the cell output
cleaned_app_list

In [None]:
# 2 - `app_data` contains some invalid values as we can see above.
#     Those rows belong to apps that are for testing or reserved for
#     internal usage, and the Steam API returns a False status for
#     them. The cleaning is done as a single chain:
#       a) keep only the rows whose status is not equal to False
#          (the comparison is element-wise, so rows with a missing
#          status, if there are any, are kept as well),
#       b) drop the `status` column, which carries no information
#          once the invalid rows are gone,
#       c) keep only the applications of type `game`; other types
#          such as trailer, dlc, demo, mod etc. are of no use to us,
#       d) drop the `type` column, since every remaining value
#          is `game`.
cleaned_app_data = (
    app_data
    .loc[ app_data["status"] != False ]
    .drop( columns = ["status"] )
    .loc[ lambda frame: frame["type"] == "game" ]
    .drop( columns = ["type"] )
)

#     debug
cleaned_app_data

In [None]:
# 3 - Similar to `app_data`, `app_reviews` contains some invalid
#     values. The cleaning is done as a single chain:
#       a) discard the rows whose status equals False,
#       b) remove the `status` column,
#       c) remove the apps that have no reviews at all
#          (`total_reviews` equal to 0), since there are many of them.
cleaned_app_reviews = (
    app_reviews
    .loc[ app_reviews["status"] != False ]
    .drop( columns = ["status"] )
    .loc[ lambda frame: frame["total_reviews"] != 0 ]
)

#     debug
cleaned_app_reviews

In [40]:
def total_hours( x ):
    """Return the sum of the values in `x`.

    Used as an aggregation function for the per-app `playtime_forever`
    values; the function name becomes the name of the output column.

    x: any object exposing a `.sum()` method (a pandas Series when
       called through `groupby(...).agg`).

    NOTE(review): the name says hours, but nothing here converts units;
    the result is in whatever unit `playtime_forever` is stored in
    (possibly minutes - confirm against the data source).
    """
    playtime_sum = x.sum()
    return playtime_sum

def num_users( x ):
    """Return the number of entries in `x`, measured by its index.

    Used as an aggregation function for the per-app review rows; the
    function name becomes the name of the output column.

    x: any object exposing an `.index` attribute (a pandas Series when
       called through `groupby(...).agg`).

    This counts rows, not distinct users; it equals the number of users
    only if each user appears once per app - presumably the case for
    reviews, verify against the raw data.
    """
    row_labels = x.index
    return len( row_labels )

# Collect the `playtime_forever` values of all the reviews that belong
# to one specific application, then compute two figures per application:
# the number of review rows (`num_users`) and the summed playtime
# (`total_hours`). `reset_index` turns `app_id` back into a regular column.
playtime_by_app = user_reviews.groupby( ["app_id"] )["playtime_forever"]
cleaned_user_reviews = playtime_by_app.agg( [ num_users, total_hours ] ).reset_index()

# debug
cleaned_user_reviews

Unnamed: 0,app_id,num_users,total_hours
0,10,1000.0,19124629.0
1,20,1000.0,6349770.0
2,30,1000.0,7296999.0
3,40,679.0,1374408.0
4,50,1000.0,1406570.0
...,...,...,...
48980,1450220,17.0,0.0
48981,1450700,3.0,0.0
48982,1452150,3.0,0.0
48983,1454920,3.0,0.0


In [38]:
# Save the cleaned data.

# Create the output folder if it is missing. `exist_ok = True` makes the
# call a no-op when the folder already exists, which avoids the race
# between a separate existence check and the creation itself.
os.makedirs( "../cleaned_data", exist_ok = True )

# `index = False` keeps the DataFrame index out of the CSV files.
cleaned_app_data.to_csv( "../cleaned_data/app_data.csv", index = False )
cleaned_app_reviews.to_csv( "../cleaned_data/app_reviews.csv", index = False )
# The aggregated user reviews are stored under the name `user_data.csv`.
cleaned_user_reviews.to_csv( "../cleaned_data/user_data.csv", index = False )