In [55]:
import gzip
import ast
import json
import pandas as pd

In [56]:
pd.set_option('display.max_colwidth', 100)

In [57]:
def duplicados_de(df):
    df_columns = df.columns.tolist()
    for i in df.columns:
        print( 'Duplicados de ',i,': ', df.duplicated(subset=i,keep=False).sum())

----------------

----

# user_reviews

----------------

Carga de datos

In [58]:
datos = []

# Usamos with porque es una declaración de contexto en Python. 
# Se utiliza para asegurar que un recurso se abra y se cierre correctamente.
# Usamos gzip para abrir y descomprimir el archivo
with gzip.open("data/raws/user_reviews.json.gz", 'rt', encoding='utf-8') as archivo_descomprimido:
    
    for linea in archivo_descomprimido:
        # eval() evalua una cadena de texto (string) que contiene una expresión o código Python.
        # En este caso evalua cada los strings de cada línea y nos devuelve un diccionario por línea
        dato = eval(linea)

        datos.append(dato)

user_reviews_raw = pd.DataFrame(datos)
user_reviews_raw

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"[{'funny': '', 'posted': 'Posted November 5, 2011.', 'last_edited': '', 'item_id': '1250', 'help..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014.', 'last_edited': '', 'item_id': '251610', 'helpf..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.', 'last_edited': '', 'item_id': '248820', 'helpful'..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2013.', 'last_edited': '', 'item_id': '250320', 'he..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny', 'posted': 'Posted April 15, 2014.', 'last_edited'..."
...,...,...,...
25794,76561198306599751,http://steamcommunity.com/profiles/76561198306599751,"[{'funny': '', 'posted': 'Posted May 31.', 'last_edited': '', 'item_id': '261030', 'helpful': '0..."
25795,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{'funny': '', 'posted': 'Posted June 17.', 'last_edited': '', 'item_id': '730', 'helpful': '0 o..."
25796,76561198310819422,http://steamcommunity.com/profiles/76561198310819422,"[{'funny': '1 person found this review funny', 'posted': 'Posted June 23.', 'last_edited': '', '..."
25797,76561198312638244,http://steamcommunity.com/profiles/76561198312638244,"[{'funny': '', 'posted': 'Posted July 21.', 'last_edited': '', 'item_id': '233270', 'helpful': '..."


----------

Análisis de duplicados y nulos.

In [59]:
user_reviews_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


La función .info() nos dice que todos los valores de la tabla son no nulos.

Trabajaremos en una tabla user_reviews para no afectar los datos crudos.

In [60]:
user_reviews = user_reviews_raw

In [61]:
user_reviews['reviews'] = user_reviews['reviews'].apply(lambda x: json.dumps(x)) # convertimos los datos de reviews en str para poder usar duplicated()
print('Filas duplicadas: ',user_reviews.duplicated().sum())

Tenemos 313 filas duplicadas que eliminaremos.

In [63]:
user_reviews = user_reviews.drop_duplicates().reset_index(drop=True)

In [64]:
duplicados_de(user_reviews)

Duplicados de  user_id :  2
Duplicados de  user_url :  2
Duplicados de  reviews :  28


In [65]:
(
    user_reviews[user_reviews.duplicated(subset='user_id',keep=False)]
)

Unnamed: 0,user_id,user_url,reviews
83,76561198094224872,http://steamcommunity.com/profiles/76561198094224872,[]
8865,76561198094224872,http://steamcommunity.com/profiles/76561198094224872,"[{""funny"": """", ""posted"": ""Posted April 30."", ""last_edited"": """", ""item_id"": ""72850"", ""helpful"": ""..."


Podemos observar que tenemos reviews en blanco marcados por '[]'. Procederemos a eliminarlos

In [66]:
user_reviews = user_reviews[user_reviews.reviews !='[]'].reset_index(drop=True)

In [67]:
print( 'Duplicados de user_id: ', user_reviews.duplicated(subset='user_id',keep=False).sum())
print( 'Duplicados de user_url: ', user_reviews.duplicated(subset='user_url',keep=False).sum())
print( 'Duplicados de reviews: ', user_reviews.duplicated(subset='reviews',keep=False).sum())


Duplicados de user_id:  0
Duplicados de user_url:  0
Duplicados de reviews:  0


La tabla ya no posee datos nulos ni duplicados.

In [68]:
user_reviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"[{""funny"": """", ""posted"": ""Posted November 5, 2011."", ""last_edited"": """", ""item_id"": ""1250"", ""help..."
1,js41637,http://steamcommunity.com/id/js41637,"[{""funny"": """", ""posted"": ""Posted June 24, 2014."", ""last_edited"": """", ""item_id"": ""251610"", ""helpf..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{""funny"": """", ""posted"": ""Posted February 3."", ""last_edited"": """", ""item_id"": ""248820"", ""helpful""..."
3,doctr,http://steamcommunity.com/id/doctr,"[{""funny"": """", ""posted"": ""Posted October 14, 2013."", ""last_edited"": """", ""item_id"": ""250320"", ""he..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{""funny"": ""3 people found this review funny"", ""posted"": ""Posted April 15, 2014."", ""last_edited""..."
...,...,...,...
25453,76561198306599751,http://steamcommunity.com/profiles/76561198306599751,"[{""funny"": """", ""posted"": ""Posted May 31."", ""last_edited"": """", ""item_id"": ""261030"", ""helpful"": ""0..."
25454,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{""funny"": """", ""posted"": ""Posted June 17."", ""last_edited"": """", ""item_id"": ""730"", ""helpful"": ""0 o..."
25455,76561198310819422,http://steamcommunity.com/profiles/76561198310819422,"[{""funny"": ""1 person found this review funny"", ""posted"": ""Posted June 23."", ""last_edited"": """", ""..."
25456,76561198312638244,http://steamcommunity.com/profiles/76561198312638244,"[{""funny"": """", ""posted"": ""Posted July 21."", ""last_edited"": """", ""item_id"": ""233270"", ""helpful"": ""..."


---------

----

# users_items

----

In [69]:
# Abre el archivo .gz y lo descomprime
datos = []
with gzip.open('data/raws/users_items.json.gz', 'rt', encoding='utf-8') as archivo_descomprimido:
    for linea in archivo_descomprimido:
        dato = eval(linea)
        datos.append(dato)

users_items_raw = pd.DataFrame(datos)

In [70]:
users_items_raw

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"[{'item_id': '10', 'item_name': 'Counter-Strike', 'playtime_forever': 6, 'playtime_2weeks': 0}, ..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strike', 'playtime_forever': 0, 'playtime_2weeks': 0}, ..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchestra: Ostfront 41-45', 'playtime_forever': 923, 'pla..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strike', 'playtime_forever': 0, 'playtime_2weeks': 0}, ..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defeat: Source', 'playtime_forever': 1131, 'playtime_2w..."
...,...,...,...,...,...
88305,76561198323066619,22,76561198323066619,http://steamcommunity.com/profiles/76561198323066619,"[{'item_id': '413850', 'item_name': 'CS:GO Player Profiles', 'playtime_forever': 0, 'playtime_2w..."
88306,76561198326700687,177,76561198326700687,http://steamcommunity.com/profiles/76561198326700687,"[{'item_id': '11020', 'item_name': 'TrackMania Nations Forever', 'playtime_forever': 0, 'playtim..."
88307,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClown77xX,[]
88308,76561198329548331,7,76561198329548331,http://steamcommunity.com/profiles/76561198329548331,"[{'item_id': '304930', 'item_name': 'Unturned', 'playtime_forever': 677, 'playtime_2weeks': 677}..."


In [71]:
users_items_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      88310 non-null  object
 1   items_count  88310 non-null  int64 
 2   steam_id     88310 non-null  object
 3   user_url     88310 non-null  object
 4   items        88310 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.4+ MB


Podemos ver que todos los datos de user_items son no nulos.

----

Análisis de duplicados y nulos.

In [72]:
users_items = users_items_raw
users_items['items'] = users_items['items'].apply(lambda x: json.dumps(x)) # convertimos los datos de reviews en str para poder usar duplicated()

In [73]:
users_items_raw.at[0, 'items']

'[{"item_id": "10", "item_name": "Counter-Strike", "playtime_forever": 6, "playtime_2weeks": 0}, {"item_id": "20", "item_name": "Team Fortress Classic", "playtime_forever": 0, "playtime_2weeks": 0}, {"item_id": "30", "item_name": "Day of Defeat", "playtime_forever": 7, "playtime_2weeks": 0}, {"item_id": "40", "item_name": "Deathmatch Classic", "playtime_forever": 0, "playtime_2weeks": 0}, {"item_id": "50", "item_name": "Half-Life: Opposing Force", "playtime_forever": 0, "playtime_2weeks": 0}, {"item_id": "60", "item_name": "Ricochet", "playtime_forever": 0, "playtime_2weeks": 0}, {"item_id": "70", "item_name": "Half-Life", "playtime_forever": 0, "playtime_2weeks": 0}, {"item_id": "130", "item_name": "Half-Life: Blue Shift", "playtime_forever": 0, "playtime_2weeks": 0}, {"item_id": "300", "item_name": "Day of Defeat: Source", "playtime_forever": 4733, "playtime_2weeks": 0}, {"item_id": "240", "item_name": "Counter-Strike: Source", "playtime_forever": 1853, "playtime_2weeks": 0}, {"item_

In [74]:
print('Filas duplicadas: ',users_items.duplicated().sum())

Filas duplicadas:  657


Tenemos 657 filas duplicadas que eliminaremos.

In [75]:
users_items = users_items.drop_duplicates().reset_index(drop=True)

In [76]:
duplicados_de(users_items)

Duplicados de  user_id :  53
Duplicados de  items_count :  87355
Duplicados de  steam_id :  55
Duplicados de  user_url :  53
Duplicados de  items :  18857


Podemos observar que tenemos 53 user_id duplicados y 53 user_url duplicados.

In [77]:
users_items[users_items.duplicated(subset='user_id',keep=False)].sort_values('user_id').head(5)

Unnamed: 0,user_id,items_count,steam_id,user_url,items
4075,76561198050680344,125,76561198050680344,http://steamcommunity.com/profiles/76561198050680344,"[{""item_id"": ""4000"", ""item_name"": ""Garry's Mod"", ""playtime_forever"": 4128, ""playtime_2weeks"": 12..."
14958,76561198050680344,125,76561198050680344,http://steamcommunity.com/profiles/76561198050680344,"[{""item_id"": ""4000"", ""item_name"": ""Garry's Mod"", ""playtime_forever"": 4128, ""playtime_2weeks"": 12..."
32714,76561198063371165,45,76561198063371165,http://steamcommunity.com/profiles/76561198063371165,"[{""item_id"": ""3320"", ""item_name"": ""Insaniquarium! Deluxe"", ""playtime_forever"": 390, ""playtime_2w..."
32713,76561198063371165,45,76561198063371165,http://steamcommunity.com/profiles/76561198063371165,"[{""item_id"": ""3320"", ""item_name"": ""Insaniquarium! Deluxe"", ""playtime_forever"": 390, ""playtime_2w..."
35104,76561198064484479,336,76561198064484479,http://steamcommunity.com/profiles/76561198064484479,"[{""item_id"": ""10"", ""item_name"": ""Counter-Strike"", ""playtime_forever"": 1436, ""playtime_2weeks"": 0..."


In [78]:
print(
    users_items[users_items.duplicated(subset='user_id',keep=False)].sort_values('user_id').at[4075,'items']
)
print(
    users_items[users_items.duplicated(subset='user_id',keep=False)].sort_values('user_id').at[14958,'items']
)

[{"item_id": "4000", "item_name": "Garry's Mod", "playtime_forever": 4128, "playtime_2weeks": 128}, {"item_id": "17300", "item_name": "Crysis", "playtime_forever": 39, "playtime_2weeks": 0}, {"item_id": "17330", "item_name": "Crysis Warhead", "playtime_forever": 0, "playtime_2weeks": 0}, {"item_id": "17340", "item_name": "Crysis Wars", "playtime_forever": 0, "playtime_2weeks": 0}, {"item_id": "6060", "item_name": "STAR WARS\u2122 Battlefront\u2122 II", "playtime_forever": 654, "playtime_2weeks": 0}, {"item_id": "34900", "item_name": "Bad Rats", "playtime_forever": 267, "playtime_2weeks": 0}, {"item_id": "6020", "item_name": "STAR WARS\u2122 Jedi Knight: Jedi Academy\u2122", "playtime_forever": 108, "playtime_2weeks": 0}, {"item_id": "500", "item_name": "Left 4 Dead", "playtime_forever": 48, "playtime_2weeks": 0}, {"item_id": "550", "item_name": "Left 4 Dead 2", "playtime_forever": 1414, "playtime_2weeks": 0}, {"item_id": "223530", "item_name": "Left 4 Dead 2 Beta", "playtime_forever": 

Podemos ver que estas filas difieren en la cantidad de horas que jugó un usuario. Luego de desanidar esta columna se decidirá que hacer con los datos duplicados en el campo user_id.

Exploraremos si existen listas vacias en los datos de la columna items.

In [79]:
users_items[users_items['items'] == '[]']

Unnamed: 0,user_id,items_count,steam_id,user_url,items
9,Wackky,0,76561198039117046,http://steamcommunity.com/id/Wackky,[]
11,76561198079601835,0,76561198079601835,http://steamcommunity.com/profiles/76561198079601835,[]
31,hellom8o,0,76561198117222320,http://steamcommunity.com/id/hellom8o,[]
38,starkillershadow553,0,76561198059648579,http://steamcommunity.com/id/starkillershadow553,[]
54,darkenkane,0,76561198058876001,http://steamcommunity.com/id/darkenkane,[]
...,...,...,...,...,...
87641,76561198316380182,0,76561198316380182,http://steamcommunity.com/profiles/76561198316380182,[]
87642,76561198316970597,0,76561198316970597,http://steamcommunity.com/profiles/76561198316970597,[]
87643,76561198318100691,0,76561198318100691,http://steamcommunity.com/profiles/76561198318100691,[]
87650,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClown77xX,[]


Podemos observar que tenemos usuarios que no han consumido ningun item por lo que items_count es 0 e items posee '[]'. Eliminaremos estos usuarios del df principal pero guardaremos esta información en una tabla.

In [80]:
users_0items_count = users_items[users_items['items'] == '[]']

In [81]:
users_items = users_items[users_items['items'] !='[]'].reset_index(drop=True)

revisemos nuevamente los duplicados

In [82]:
duplicados_de(users_items)

Duplicados de  user_id :  53


Duplicados de  items_count :  70641
Duplicados de  steam_id :  53
Duplicados de  user_url :  53
Duplicados de  items :  2143


Tenemos 53 duplicados de los que nos ocuparemos en un análisis posterior. Los valores duplicados de items_count e items son esperados así que los mantendremos.

---------

-----

# steam_games

Cargamos los datos del archivo steam_games.json.gz

In [83]:
datos = []

# Usamos with porque es una declaración de contexto en Python. 
# Se utiliza para asegurar que un recurso se abra y se cierre correctamente.
# Usamos gzip para abrir y descomprimir el archivo games_ruta
with gzip.open("data/raws/steam_games.json.gz", 'rt', encoding='utf-8') as archivo_descomprimido:
    for linea in archivo_descomprimido:
        # Cargamos cada línea como un objeto JSON
        obj_json = json.loads(linea)
        # Los agregamos a la lista 'datos'
        datos.append(obj_json)
# creamos un df a partir de la lista 'datos'
gamesRaw = pd.DataFrame(datos)

Exploramos esos datos

In [84]:
gamesRaw.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'discount_price', 'specs', 'price',
       'early_access', 'id', 'metascore', 'developer', 'user_id', 'steam_id',
       'items', 'items_count'],
      dtype='object')

In [85]:
gamesRaw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   publisher       24083 non-null  object 
 1   genres          28852 non-null  object 
 2   app_name        32133 non-null  object 
 3   title           30085 non-null  object 
 4   url             32135 non-null  object 
 5   release_date    30068 non-null  object 
 6   tags            31972 non-null  object 
 7   reviews_url     32133 non-null  object 
 8   discount_price  225 non-null    float64
 9   specs           31465 non-null  object 
 10  price           30758 non-null  object 
 11  early_access    32135 non-null  object 
 12  id              32133 non-null  object 
 13  metascore       2677 non-null   object 
 14  developer       28836 non-null  object 
 15  user_id         88310 non-null  object 
 16  steam_id        88310 non-null  object 
 17  items           88310 non-nul

-----

Vamos a revisar la cantidad de datos nulos

In [86]:
(
    gamesRaw
    .isna()
    .sum()
)

publisher          96362
genres             91593
app_name           88312
title              90360
url                88310
release_date       90377
tags               88473
reviews_url        88312
discount_price    120220
specs              88980
price              89687
early_access       88310
id                 88312
metascore         117768
developer          91609
user_id            32135
steam_id           32135
items              32135
items_count        32135
dtype: int64

Podemos notar que las las columnas 'user_id', 'steam_id','items' e 'items_count' poseen la misma cantidad de datos nulos.

veamos la cantidad de filas que poseen nulos en todas sus columnas

In [87]:
cant_filas_solo_nulos = gamesRaw[gamesRaw.isna().all(axis=1)].shape[0]
print (cant_filas_solo_nulos, 'filas con todos sus valores nulos')

0 filas con todos sus valores nulos


Revisaremos ahora las filas que no tienen valores nulos en user_id

In [88]:
gamesRaw[gamesRaw['user_id'].notna()].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88310 entries, 0 to 88309
Data columns (total 19 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   publisher       0 non-null      object 
 1   genres          0 non-null      object 
 2   app_name        0 non-null      object 
 3   title           0 non-null      object 
 4   url             0 non-null      object 
 5   release_date    0 non-null      object 
 6   tags            0 non-null      object 
 7   reviews_url     0 non-null      object 
 8   discount_price  0 non-null      float64
 9   specs           0 non-null      object 
 10  price           0 non-null      object 
 11  early_access    0 non-null      object 
 12  id              0 non-null      object 
 13  metascore       0 non-null      object 
 14  developer       0 non-null      object 
 15  user_id         88310 non-null  object 
 16  steam_id        88310 non-null  object 
 17  items           88310 non-null 

Podemos ver que todas las filas no contienen valores no nulos en user_id, poseen valores nulos en las primeras 15 columnas de la tabla. Por lo que entendemos que estamos en presencia de dos tablas independientes unidas.

-------

Revisaremos si la tabla items obtenida del archivo 'users_items.json.gz' es la que se encuentrea anexada a stam_games

In [89]:
gamesRaw.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'discount_price', 'specs', 'price',
       'early_access', 'id', 'metascore', 'developer', 'user_id', 'steam_id',
       'items', 'items_count'],
      dtype='object')

In [90]:
users_items.columns

Index(['user_id', 'items_count', 'steam_id', 'user_url', 'items'], dtype='object')

In [91]:
items_anexados_a_steam_games = gamesRaw[['user_id','items_count','steam_id','items']]
items_anexados_a_steam_games = items_anexados_a_steam_games.dropna()
print('users_items_raw = items_anexados_a_steam_games?', users_items_raw.equals(items_anexados_a_steam_games))
print('users_items_raw.user_id = items_anexados_a_steam_games.user_id?', users_items_raw.user_id.equals(items_anexados_a_steam_games.user_id))
print('users_items_raw.items_count = items_anexados_a_steam_games.items_count?', users_items_raw.items_count.equals(items_anexados_a_steam_games.items_count))
print('users_items_raw.steam_id = items_anexados_a_steam_games.steam_id?', users_items_raw.steam_id.equals(items_anexados_a_steam_games.steam_id))
print('users_items_raw.items = items_anexados_a_steam_games.items?', users_items_raw['items'].equals(items_anexados_a_steam_games['items']))


users_items_raw = items_anexados_a_steam_games? False
users_items_raw.user_id = items_anexados_a_steam_games.user_id? True
users_items_raw.items_count = items_anexados_a_steam_games.items_count? False
users_items_raw.steam_id = items_anexados_a_steam_games.steam_id? True
users_items_raw.items = items_anexados_a_steam_games.items? False


Las columnas items tienen valores nulos?

In [92]:
users_items_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      88310 non-null  object
 1   items_count  88310 non-null  int64 
 2   steam_id     88310 non-null  object
 3   user_url     88310 non-null  object
 4   items        88310 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.4+ MB


In [93]:
items_anexados_a_steam_games.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 88310 entries, 0 to 88309
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   user_id      88310 non-null  object 
 1   items_count  88310 non-null  float64
 2   steam_id     88310 non-null  object 
 3   items        88310 non-null  object 
dtypes: float64(1), object(3)
memory usage: 3.4+ MB


No poseen valores nulos pero podemos notar que difieren en el tipo de dato.

In [94]:
print("El tipo de la columna 'users_items.items_count' es ", users_items_raw.items_count.dtypes)
print("El tipo de la columna 'items_anexados_a_steam_games.items_count' es ", items_anexados_a_steam_games.items_count.dtypes)

El tipo de la columna 'users_items.items_count' es  int64
El tipo de la columna 'items_anexados_a_steam_games.items_count' es  float64


Cambiaremos el tipo de 'items_anexados_a_steam_games.items_count' para hacer nuevamente la comparación.

In [95]:
users_items_raw_comparacion = users_items_raw
users_items_raw_comparacion.items_count = users_items_raw.items_count.astype("float64")
print("El tipo de la columna 'users_items_raw_comparacion.items_count' es ", users_items_raw.items_count.dtypes)

El tipo de la columna 'users_items_raw_comparacion.items_count' es  float64


In [96]:
print('users_items.items_count = items_anexados_a_steam_games.items_count?', users_items_raw.items_count.equals(items_anexados_a_steam_games.items_count))

users_items.items_count = items_anexados_a_steam_games.items_count? True


Hemos confirmado que la tabla items es la que se encuentra anexada a games por lo que procederemos a eliminarla.

In [97]:
gamesRaw.columns

Index(['publisher', 'genres', 'app_name', 'title', 'url', 'release_date',
       'tags', 'reviews_url', 'discount_price', 'specs', 'price',
       'early_access', 'id', 'metascore', 'developer', 'user_id', 'steam_id',
       'items', 'items_count'],
      dtype='object')

----

In [98]:
steam_games_df = gamesRaw[~gamesRaw['user_id'].notna()] #  eliminamos las filas donde 'user_id' posee valores distintos de NaN
steam_games_df.drop(columns=['user_id', 'steam_id','items', 'items_count'], inplace=True)
steam_games_df = steam_games_df.rename(columns={'id':'item_id'})
steam_games_df.reset_index(drop=True, inplace=True)
steam_games_df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  steam_games_df.drop(columns=['user_id', 'steam_id','items', 'items_count'], inplace=True)


Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,discount_price,specs,price,early_access,item_id,metascore,developer
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_Summoner_Kitty/,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,4.49,[Single-player],4.99,False,761140,,Kotoshiro
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironbound/,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game, Trading Card Game, Turn-Based, Fantasy, Tactical...",http://steamcommunity.com/app/643980/reviews/?browsefilter=mostrecent&p=1,,"[Single-player, Multi-player, Online Multi-Player, Cross-Platform Multiplayer, Steam Achievement...",Free To Play,False,643980,,Secret Level SRL


In [99]:
steam_games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32135 entries, 0 to 32134
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   publisher       24083 non-null  object 
 1   genres          28852 non-null  object 
 2   app_name        32133 non-null  object 
 3   title           30085 non-null  object 
 4   url             32135 non-null  object 
 5   release_date    30068 non-null  object 
 6   tags            31972 non-null  object 
 7   reviews_url     32133 non-null  object 
 8   discount_price  225 non-null    float64
 9   specs           31465 non-null  object 
 10  price           30758 non-null  object 
 11  early_access    32135 non-null  object 
 12  item_id         32133 non-null  object 
 13  metascore       2677 non-null   object 
 14  developer       28836 non-null  object 
dtypes: float64(1), object(14)
memory usage: 3.7+ MB


In [100]:
(120444-32135) / 120444

0.7331955099465312

Al eliminar las columnas 'user_id', 'steam_id', 'items' y 'items_count', hemos reducido un 75% del dataframe games.

---------

Analisis de los valores duplicados de steam_games_df

In [101]:
# (
#     steam_games_df
#     .duplicated()
#     .sum()
# )

In [102]:
(
    steam_games_df
    .duplicated(subset='item_id')
    .sum()
)


2

In [103]:
steam_games_df[steam_games_df.duplicated(subset='item_id',keep=False)]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,discount_price,specs,price,early_access,item_id,metascore,developer
74,,,,,http://store.steampowered.com/,,,,14.99,,19.99,False,,,
13894,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History, Singleplayer, First-Person, Shooter, Story Rich,...",http://steamcommunity.com/app/612880/reviews/?browsefilter=mostrecent&p=1,,"[Single-player, Steam Achievements, Full controller support, Steam Cloud]",59.99,False,612880.0,86.0,Machine Games
14573,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/Wolfenstein_II_The_New_Colossus/,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History, Singleplayer, First-Person, Shooter, Story Rich,...",http://steamcommunity.com/app/612880/reviews/?browsefilter=mostrecent&p=1,,"[Single-player, Steam Achievements, Full controller support, Steam Cloud]",59.99,False,612880.0,86.0,Machine Games
30961,"Warner Bros. Interactive Entertainment, Feral Interactive (Mac)","[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,Batman: Arkham City - Game of the Year Edition,http://store.steampowered.com/app/200260,2012-09-07,"[Action, Open World, Batman, Adventure, Stealth, Third Person, Superhero, Singleplayer, Beat 'em...",,,"[Single-player, Steam Achievements, Steam Trading Cards, Partial Controller Support, Steam Cloud]",19.99,False,,91.0,"Rocksteady Studios,Feral Interactive (Mac)"


In [104]:
# steam_games_df = steam_games_df.dropna(subset='item_id') esta mas abajo,

In [105]:
fila1 = steam_games_df.loc[13894]
fila2 = steam_games_df.loc[14573]

columnas_diferentes = fila1 != fila2 # me devuelve true/false 
columnas_diferentes = columnas_diferentes[columnas_diferentes] # filtra las columnas que son true

print("Columnas en las que difieren las filas:")
print(columnas_diferentes.index.tolist()) # Hace una lista con los nombres de las columnas diferentes

Columnas en las que difieren las filas:
['url', 'discount_price']


In [106]:
print (steam_games_df.at[13894,'discount_price'])
print (steam_games_df.at[14573,'discount_price'])

nan
nan


In [107]:
print (steam_games_df.at[13894,'url'])
print (steam_games_df.at[14573,'url'])

http://store.steampowered.com/app/612880/
http://store.steampowered.com/app/612880/Wolfenstein_II_The_New_Colossus/


Hemos probado ambos links y llevan al mismo sitio por lo que solo dejaremos un registro

In [108]:
steam_games_df = steam_games_df.drop([74,14573])

Observando la tabla podemos deducir que el id del juego se encuentra en la url por lo que se completará con el id 200260 el valor faltante en la fila 30961

In [109]:
steam_games_df.at[30961,'item_id'] = 30961

In [110]:
steam_games_df = steam_games_df.reset_index(drop=True)


In [111]:
steam_games_df

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,discount_price,specs,price,early_access,item_id,metascore,developer
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_Summoner_Kitty/,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",http://steamcommunity.com/app/761140/reviews/?browsefilter=mostrecent&p=1,4.49,[Single-player],4.99,False,761140,,Kotoshiro
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironbound/,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game, Trading Card Game, Turn-Based, Fantasy, Tactical...",http://steamcommunity.com/app/643980/reviews/?browsefilter=mostrecent&p=1,,"[Single-player, Multi-player, Online Multi-Player, Cross-Platform Multiplayer, Steam Achievement...",Free To Play,False,643980,,Secret Level SRL
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_Pool_3D__Poolians/,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Indie, Multiplayer]",http://steamcommunity.com/app/670290/reviews/?browsefilter=mostrecent&p=1,,"[Single-player, Multi-player, Online Multi-Player, In-App Purchases, Stats]",Free to Play,False,670290,,Poolians.com
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400/reviews/?browsefilter=mostrecent&p=1,0.83,[Single-player],0.99,False,767400,,彼岸领域
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_Challenge/,,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570/reviews/?browsefilter=mostrecent&p=1,1.79,"[Single-player, Full controller support, HTC Vive, Oculus Rift, Tracked Motion Controllers, Room...",2.99,False,773570,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32128,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colony_On_Mars/,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640/reviews/?browsefilter=mostrecent&p=1,1.49,"[Single-player, Steam Achievements]",1.99,False,773640,,"Nikita ""Ghost_RUS"""
32129,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGistICAL_South_Africa/,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530/reviews/?browsefilter=mostrecent&p=1,4.24,"[Single-player, Steam Achievements, Steam Cloud, Stats, Steam Leaderboards]",4.99,False,733530,,Sacada
32130,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russian_Roads/,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660/reviews/?browsefilter=mostrecent&p=1,1.39,"[Single-player, Steam Achievements, Steam Trading Cards]",1.99,False,610660,,Laush Dmitriy Sergeevich
32131,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_2__Directions/,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmospheric, Relaxing]",http://steamcommunity.com/app/658870/reviews/?browsefilter=mostrecent&p=1,,"[Single-player, Steam Achievements, Steam Cloud]",4.99,False,658870,,"xropi,stev3ns"


-------

Análisis de valores nulos de steam_games_df sin las columnas anexadas.

In [112]:
(
    steam_games_df
    .isnull()
    .sum()
)

publisher          8051
genres             3282
app_name              1
title              2049
url                   0
release_date       2066
tags                162
reviews_url           1
discount_price    31909
specs               669
price              1377
early_access          0
item_id               0
metascore         29457
developer          3298
dtype: int64

In [113]:
steam_games_df[steam_games_df.item_id.isna()]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,discount_price,specs,price,early_access,item_id,metascore,developer


Eliminaresmos este registro ya que la mayoría de sus valores son nulos.

In [114]:
steam_games_df = steam_games_df.dropna(subset='item_id')

Mas adelante, en la creación de tablas para los endpoint se tomaran decisiones pertinentes sobres los valores nulos que presenta este df en las restantes columnas.

In [115]:
steam_games_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32133 entries, 0 to 32132
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   publisher       24082 non-null  object 
 1   genres          28851 non-null  object 
 2   app_name        32132 non-null  object 
 3   title           30084 non-null  object 
 4   url             32133 non-null  object 
 5   release_date    30067 non-null  object 
 6   tags            31971 non-null  object 
 7   reviews_url     32132 non-null  object 
 8   discount_price  224 non-null    float64
 9   specs           31464 non-null  object 
 10  price           30756 non-null  object 
 11  early_access    32133 non-null  object 
 12  item_id         32133 non-null  object 
 13  metascore       2676 non-null   object 
 14  developer       28835 non-null  object 
dtypes: float64(1), object(14)
memory usage: 3.7+ MB


In [116]:
steam_games_df.item_id = steam_games_df.item_id.astype("int")


------

Los Datasets users_items y raw_steam_games_df estan relacionados. Entendemos que si un usuario no esta registrado y no ha consumido ningun item no debería realizar comentarios del mismo. Por lo que revisaremos si en el df users_reviews existen usuarios que no se encuentran en users_items.

In [117]:
users_items.head(2)

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"[{""item_id"": ""10"", ""item_name"": ""Counter-Strike"", ""playtime_forever"": 6, ""playtime_2weeks"": 0}, ..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{""item_id"": ""10"", ""item_name"": ""Counter-Strike"", ""playtime_forever"": 0, ""playtime_2weeks"": 0}, ..."


In [118]:
user_reviews.head(2)

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"[{""funny"": """", ""posted"": ""Posted November 5, 2011."", ""last_edited"": """", ""item_id"": ""1250"", ""help..."
1,js41637,http://steamcommunity.com/id/js41637,"[{""funny"": """", ""posted"": ""Posted June 24, 2014."", ""last_edited"": """", ""item_id"": ""251610"", ""helpf..."


In [119]:
users_de_users_items = list(users_items.user_id.drop_duplicates())
users_de_user_reviews = list(user_reviews.user_id.drop_duplicates())
print (f' cantidad de users de users_items: ',len(users_de_users_items))
print (f' cantidad de users de user_reviews: ',len(users_de_user_reviews))
# Convierte las listas en conjuntos
users_de_user_reviews_set = set(users_de_user_reviews)
users_de_users_items_set = set(users_de_users_items)

# Encuentra los elementos en lista1 que no están en lista2
usuarios_faltantes = users_de_user_reviews_set - users_de_users_items_set

print("Elementos en users_de_user_reviews que no están en users_de_users_items:", usuarios_faltantes)

usuarios_faltantes
len(usuarios_faltantes)

 cantidad de users de users_items:  70912
 cantidad de users de user_reviews:  25458
Elementos en users_de_user_reviews que no están en users_de_users_items: {'76561198040259612', '76561197972017423', '76561197985644714', '76561197994033447', 'Sir_Javier', '76561198056735890', '76561198062741031', 'punkinoz', 'DankHades', '76561198093250521', '76561198071383193', 'ddoogg', '76561198104700278', 'teriyakichiken', '76561198027260537', 'pleasedontvotefortrump', '76561198041366429', '76561198063116217', '76561198055544059', '76561198056772858', '76561198049392366', '76561197970931849', 'Mortwolf', 'pvt_pineapple', '76561198070475810', '76561197980834629', '76561198078987534', '76561197986709258', '76561198057988695', '76561197999539608', '76561198076293018', 'thewhk', '76561198073916938', '76561198195842698', '76561198007309128', '76561198003607025', 'heavy_back', '76561198087375527', 'D3V3D', 'Rawex', 'zrustz16', 'VenomVenti', '76561198071986441', 'jamesreesonswingspanis6ft', '765611979715

2845

Efectivamente tenemos usuarios no registrados en users_items que se encuentran en user_reviews. Procederemos a eliminar estos usuarios para este primer mvp pero guardaremos los ids para informar de los 2845 usuarios que se encuentran en esta situación.

# info para el informe

In [120]:
user_reviews = user_reviews[~(user_reviews.user_id.isin(usuarios_faltantes))]
user_reviews

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970982479,"[{""funny"": """", ""posted"": ""Posted November 5, 2011."", ""last_edited"": """", ""item_id"": ""1250"", ""help..."
1,js41637,http://steamcommunity.com/id/js41637,"[{""funny"": """", ""posted"": ""Posted June 24, 2014."", ""last_edited"": """", ""item_id"": ""251610"", ""helpf..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{""funny"": """", ""posted"": ""Posted February 3."", ""last_edited"": """", ""item_id"": ""248820"", ""helpful""..."
3,doctr,http://steamcommunity.com/id/doctr,"[{""funny"": """", ""posted"": ""Posted October 14, 2013."", ""last_edited"": """", ""item_id"": ""250320"", ""he..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{""funny"": ""3 people found this review funny"", ""posted"": ""Posted April 15, 2014."", ""last_edited""..."
...,...,...,...
25452,JustMielThings,http://steamcommunity.com/id/JustMielThings,"[{""funny"": """", ""posted"": ""Posted May 20."", ""last_edited"": """", ""item_id"": ""570"", ""helpful"": ""No r..."
25453,76561198306599751,http://steamcommunity.com/profiles/76561198306599751,"[{""funny"": """", ""posted"": ""Posted May 31."", ""last_edited"": """", ""item_id"": ""261030"", ""helpful"": ""0..."
25454,Ghoustik,http://steamcommunity.com/id/Ghoustik,"[{""funny"": """", ""posted"": ""Posted June 17."", ""last_edited"": """", ""item_id"": ""730"", ""helpful"": ""0 o..."
25455,76561198310819422,http://steamcommunity.com/profiles/76561198310819422,"[{""funny"": ""1 person found this review funny"", ""posted"": ""Posted June 23."", ""last_edited"": """", ""..."


In [121]:
user_reviews.to_csv("data/raw_user_reviews.csv",index=False)
steam_games_df.to_csv("data/raw_steam_games.csv",index=False)
users_items.to_csv("data/raw_user_items.csv",index=False)