In [1]:
import pandas as pd
import ast
import json
import gzip

### Funciones

In [2]:
#Funcion que lee los archivos JSON

def leer(ruta):
    """Read a file with one record per line and return the records as a DataFrame.

    Every non-blank line is first parsed with ``ast.literal_eval`` (which
    accepts Python-literal syntax such as single-quoted strings). If that
    fails, the line is retried as strict JSON with ``json.loads`` so that
    records using ``null`` / ``true`` / ``false`` are also accepted.

    Lines that cannot be parsed by either parser are skipped instead of being
    appended as raw strings, because a stray string among the records breaks
    the tabular structure of the resulting frame.

    Parameters
    ----------
    ruta : str or path-like
        Path of the UTF-8 encoded text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed line.

    Warns
    -----
    UserWarning
        Emitted once when one or more lines had to be skipped.
    """
    import warnings

    registros = []
    omitidas = 0
    with open(ruta, 'r', encoding='UTF-8') as f:
        # Iterate lazily over the file handle; readlines() would load the
        # whole (potentially very large) file into memory first.
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no record; literal_eval would raise
                # SyntaxError on them.
                continue
            try:
                registros.append(ast.literal_eval(line))
            except (ValueError, SyntaxError, TypeError):
                try:
                    registros.append(json.loads(line))
                except ValueError:
                    # json.JSONDecodeError is a subclass of ValueError.
                    omitidas += 1

    if omitidas:
        warnings.warn(
            f"leer: {omitidas} unparseable line(s) skipped in {ruta}",
            stacklevel=2,
        )

    return pd.DataFrame(registros)

In [3]:
# Data-quality summary of a DataFrame
def datos(df):
    """Summarise dtype and null statistics for every column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the fields ``nombre_campo`` (column
        label), ``tipo_dato`` (dtype), ``%_No_Nulos`` and ``%_Nulos``
        (percentages rounded to two decimals) and ``Nulos`` (null count).
    """
    total_filas = len(df)
    columnas = list(df.columns)

    # Unrounded share of non-null values per column; both percentage fields
    # are derived from it and rounded independently.
    pct_presentes = [(df[nombre].count() / total_filas) * 100 for nombre in columnas]

    resumen = {
        "nombre_campo": columnas,
        "tipo_dato": [df[nombre].dtypes for nombre in columnas],
        "%_No_Nulos": [round(pct, 2) for pct in pct_presentes],
        "%_Nulos": [round(100 - pct, 2) for pct in pct_presentes],
        "Nulos": [df[nombre].isnull().sum() for nombre in columnas],
    }
    return pd.DataFrame(resumen)

### Review

In [4]:
# Load the user-reviews file through leer() and preview the first rows.
df_reviews=leer("../../../dato/australian_user_reviews.json")
df_reviews.head()

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


In [5]:
# Expand the "reviews" column: every entry is passed to json_normalize and the
# resulting rows are tagged with the user_id of the row they came from.

lista_review = []
# Walk the two columns in lockstep instead of calling ``.iloc[label]``: the
# labels yielded by ``.items()`` only coincide with positions on a default
# RangeIndex, and materialising a full row Series per user is needlessly slow.
for user_id, reviews in zip(df_reviews["user_id"], df_reviews["reviews"]):
    if not isinstance(reviews, list):
        # Entries that are not a list (e.g. NaN) have nothing to expand and
        # would make json_normalize raise.
        continue
    df_n_r = pd.json_normalize(reviews)
    df_n_r["user_id"] = user_id
    lista_review.append(df_n_r)

# The per-user index is kept on purpose (it restarts at 0 for every user).
df_reviews2 = pd.concat(lista_review)
df_reviews2.head()

Unnamed: 0,funny,posted,last_edited,item_id,helpful,recommend,review,user_id
0,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...,76561197970982479
1,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.,76561197970982479
2,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479
0,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...,js41637
1,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...,js41637


In [6]:
# Data cleaning: remove the unused columns, then any column that is entirely
# null, then exact duplicate rows.
# Reassignment with ``errors='ignore'`` (instead of ``inplace=True``) keeps the
# cell idempotent: re-running it no longer raises KeyError because the columns
# were already dropped by the previous run.

df_reviews2 = (
    df_reviews2
    .drop(columns=['funny', 'last_edited', 'helpful'], errors='ignore')
    .dropna(how='all', axis=1)
    .drop_duplicates()
)
df_reviews2.head()

Unnamed: 0,posted,item_id,recommend,review,user_id
0,"Posted November 5, 2011.",1250,True,Simple yet with great replayability. In my opi...,76561197970982479
1,"Posted July 15, 2011.",22200,True,It's unique and worth a playthrough.,76561197970982479
2,"Posted April 21, 2011.",43110,True,Great atmosphere. The gunplay can be a bit chu...,76561197970982479
0,"Posted June 24, 2014.",251610,True,I know what you think when you see this title ...,js41637
1,"Posted September 8, 2013.",227300,True,For a simple (it's actually not all that simpl...,js41637


In [7]:
# Per-column dtype and null summary of the cleaned reviews table.
datos(df_reviews2)

Unnamed: 0,nombre_campo,tipo_dato,%_No_Nulos,%_Nulos,Nulos
0,posted,object,100.0,0.0,0
1,item_id,object,100.0,0.0,0
2,recommend,object,100.0,0.0,0
3,review,object,100.0,0.0,0
4,user_id,object,100.0,0.0,0


### Items

In [8]:
# Because of its size, the JSON file will not be pushed to git and will be converted to parquet format.
# NOTE(review): the save cell at the end of this notebook writes the items table as CSV, not parquet — confirm which is intended.
df_items=leer("../../../dato/australian_users_items.json")
df_items.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [9]:
# Extract the data stored in the dictionaries of the "items" column: every
# entry is passed to json_normalize and the resulting rows are tagged with the
# user_id and items_count of the row they came from.
lista_items = []
# Walk the columns in lockstep instead of calling ``.iloc[label]`` twice per
# user: the labels yielded by ``.items()`` only coincide with positions on a
# default RangeIndex, and each ``.iloc`` call builds a full row Series.
for user_id, items_count, items in zip(
    df_items["user_id"], df_items["items_count"], df_items["items"]
):
    if not isinstance(items, list):
        # Entries that are not a list (e.g. NaN) have nothing to expand and
        # would make json_normalize raise.
        continue
    df_n_i = pd.json_normalize(items)
    df_n_i["user_id"] = user_id
    df_n_i["items_count"] = items_count
    lista_items.append(df_n_i)

# The per-user index is kept on purpose (it restarts at 0 for every user).
df_items2 = pd.concat(lista_items)
df_items2.head(5)

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,items_count
0,10,Counter-Strike,6.0,0.0,76561197970982479,277
1,20,Team Fortress Classic,0.0,0.0,76561197970982479,277
2,30,Day of Defeat,7.0,0.0,76561197970982479,277
3,40,Deathmatch Classic,0.0,0.0,76561197970982479,277
4,50,Half-Life: Opposing Force,0.0,0.0,76561197970982479,277


In [10]:
# Remove the unused columns, then rows in which every value is null, then
# exact duplicate rows.
# Reassignment with ``errors='ignore'`` (instead of ``inplace=True``) keeps the
# cell idempotent: re-running it no longer raises KeyError because the columns
# were already dropped by the previous run.
df_items2 = (
    df_items2
    .drop(columns=['item_name', 'playtime_2weeks'], errors='ignore')
    .dropna(how='all')
    .drop_duplicates()
)
df_items2.head()

Unnamed: 0,item_id,playtime_forever,user_id,items_count
0,10,6.0,76561197970982479,277
1,20,0.0,76561197970982479,277
2,30,7.0,76561197970982479,277
3,40,0.0,76561197970982479,277
4,50,0.0,76561197970982479,277


In [11]:
# Per-column dtype and null summary of the cleaned items table.
datos(df_items2)

Unnamed: 0,nombre_campo,tipo_dato,%_No_Nulos,%_Nulos,Nulos
0,item_id,object,100.0,0.0,0
1,playtime_forever,float64,100.0,0.0,0
2,user_id,object,100.0,0.0,0
3,items_count,int64,100.0,0.0,0


### Games

In [12]:
# Load the games file as line-delimited JSON (lines=True: one record per line)
# and inspect the last three rows.
df_games=pd.read_json("../../../dato/output_steam_games.json", lines=True)
df_games.tail(3)

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,0.0,610660.0,Laush Dmitriy Sergeevich
120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,0.0,658870.0,"xropi,stev3ns"
120444,,,Maze Run VR,,http://store.steampowered.com/app/681550/Maze_...,,"[Early Access, Adventure, Indie, Action, Simul...",http://steamcommunity.com/app/681550/reviews/?...,"[Single-player, Stats, Steam Leaderboards, HTC...",4.99,1.0,681550.0,


In [13]:
# Discard rows in which every column is null and renumber the index from 0.
df_games = df_games.dropna(how="all").reset_index(drop=True) 
df_games.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,0.0,761140.0,Kotoshiro
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,0.0,643980.0,Secret Level SRL
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,0.0,670290.0,Poolians.com
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,0.0,767400.0,彼岸领域
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,0.0,773570.0,


In [14]:
# Remove the columns that are not used further on.
# Reassignment with ``errors='ignore'`` (instead of ``inplace=True``) keeps the
# cell idempotent: re-running it no longer raises KeyError because the columns
# were already dropped by the previous run.
df_games = df_games.drop(
    columns=['title', 'url', 'early_access', 'reviews_url', 'specs'],
    errors='ignore',
)
df_games.head()

Unnamed: 0,publisher,genres,app_name,release_date,tags,price,id,developer
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,761140.0,Kotoshiro
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",Free To Play,643980.0,Secret Level SRL
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",Free to Play,670290.0,Poolians.com
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,2017-12-07,"[Action, Adventure, Casual]",0.99,767400.0,彼岸领域
4,,,Log Challenge,,"[Action, Indie, Casual, Sports]",2.99,773570.0,


In [15]:
# Store every present price as text so the column has a single value type
# (the previews above show both numbers and labels such as "Free To Play").
# A bare ``astype(str)`` would also turn missing prices into the literal
# string 'nan', hiding them from isnull()/dropna(); ``where`` restores the
# missing marker for those rows.
df_games['price'] = df_games['price'].astype(str).where(df_games['price'].notna())

### Guardar archivos

In [16]:
# Persist the three cleaned tables under ../data without writing the index.
# NOTE(review): the comment in the items-loading cell says that table would be
# converted to parquet, but it is written as CSV here while reviews go to
# parquet — confirm which format is intended.
df_items2.to_csv("../data/items.csv", index=False)
df_reviews2.to_parquet("../data/reviews.parquet", index=False)
df_games.to_csv("../data/games.csv", index=False)