# Feature Engineering

In [22]:
import sys
import os
import pandas as pd
import numpy as np
import scipy as sp
import textblob
import sklearn
from textblob import TextBlob
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

print(f"System version: {sys.version}")
print(f"pandas version: {pd.__version__}")
print(f"numpy version: {np.__version__}")
print(f"scipy version: {sp.__version__}")
print(f"scikit-learn version: {sklearn.__version__}")
%load_ext autoreload
%autoreload 2
import warnings
warnings.filterwarnings("ignore")

System version: 3.11.8 (main, Mar 12 2024, 11:52:02) [GCC 12.2.0]
pandas version: 2.2.1
numpy version: 1.26.4
scipy version: 1.12.0
scikit-learn version: 1.4.1.post1


## Extracción

En esta sección, extraemos los datos de los archivos steam_games, user_items y user_reviews que estan en formato parquet.

In [23]:
# Cargamos los archivos parquet
def read_parquet_files(parquet_files):
    dataframes = {}
    for name in parquet_files:
        dataframes[name] = pd.read_parquet(f'../dataset/{name}.parquet', engine='pyarrow')
    return dataframes


parquet_files = ['steam_games', 'user_items', 'user_reviews']
dataframes = read_parquet_files(parquet_files)

# Convertimos a df.
df_steam_games = dataframes['steam_games']
df_user_items = dataframes['user_items']
df_user_reviews = dataframes['user_reviews']

Vamos a añadir una columna ‘sentiment_analysis’ al dataset ‘user_reviews’ usando NLP para analizar el sentimiento de las reseñas de los juegos. Esto nos permitirá entender las opiniones de los usuarios. Las reseñas se calificarán de la siguiente manera:

0: Negativa (insatisfacción, disgusto, decepción)
1: Neutral (indiferencia, objetividad, sin emoción)
2: Positiva (satisfacción, gusto, admiración)

Crearemos una función **`analisis_sentimiento`** usando TextBlob para analizar el sentimiento de las reseñas de los juegos. Esta función se basará en la polaridad, que varía entre -1 y 1, para determinar si una reseña es negativa, neutra o positiva.

In [24]:
def analisis_sentimiento(review):
    # Si la reseña está ausente, retorna 1 (neutral)
    if pd.isnull(review):
        return 1

    # Calcula la polaridad de la reseña usando TextBlob
    polarity = TextBlob(review).sentiment.polarity

    # Retorna 0 (malo) si la polaridad es menor que 0, 2 (positivo) si la polaridad es mayor que 0, y 1 (neutral) en caso contrario
    if polarity < 0:
        return 0
    elif polarity > 0:
        return 2
    else:
        return 1

- Aplicamos la función a la columna review.

In [25]:
df_user_reviews.sample(5)

Unnamed: 0,item_id,recommend,review,user_id,posted_year
28889,383870,False,"The ending sucked, there was tis massive build...",Aquilla96,unknown
47759,230410,True,A grindfest better than Destiny.,Nightxlash,2014
25310,214490,True,"crazy spoopy.Lots of fun, good stuff.",76561198022701298,2014
40078,242920,True,"Highly addictive game, 10/10 amazing. Beautifu...",76561197998718499,2014
161,351570,True,"I liked it, the zombies were pretty bad though.",therealmorty,2015


In [26]:
df_user_reviews['sentiment_analysis'] = df_user_reviews['review'].apply(analisis_sentimiento)
df_user_reviews[['review','sentiment_analysis']].sample(5)

Unnamed: 0,review,sentiment_analysis
37715,Very enjoyable. Very similar to the games 'STA...,2
5264,This game was so good now its so ♥♥♥♥ing gay t...,2
32246,The Best Film EVERRRRR,2
2607,The original GunZ is much better.Don't waste y...,0
47103,game is hard :),0


## Creación de Conjuntos de Datos para los Endpoints de la API

Nuestro propósito en esta sección es establecer varios conjuntos de datos, actuando como bases de datos pseudo, para las funciones de los endpoints de la API. Esto nos permitirá recuperar los datos requeridos de manera rápida y eficaz, sin la necesidad de cargar toda la información, optimizando así el rendimiento de la API.

## Creación de la Base de Datos para los Endpoints 1 y 2
####    (Endpoints de la API def PlayTimeGenre( genero : str ): y def UserForGenre( genero : str ):)
Para formar un único conjunto de datos que sirva como pseudo base de datos para los endpoints, es necesario fusionar df_steam_games y df_user_items. De esta forma, consolidamos toda la información requerida en un solo lugar. Las columnas necesarias son: item_id, genres, release_year de df_steam_games y item_id, user_id, playtime_forever de df_user_items.

1. Seleccionamos únicamente las columnas requeridas:
```python
steam_games_columns = ['item_id','genres','release_year']
user_items_columns = ['item_id','user_id', 'playtime_forever']
```
2. Creamos subconjuntos de los DataFrames con solo las columnas necesarias:
```python
df_games_subset = df_steam_games[steam_games_columns]
df_items_subset = df_user_items[user_items_columns]
```

In [27]:
steam_games_columns = ['item_id','genres','release_year']
user_items_columns = ['item_id','user_id', 'playtime_forever']

df_games_subset = df_steam_games[steam_games_columns]
df_items_subset = df_user_items[user_items_columns]


df_endpoints1_2 = pd.merge(df_games_subset, df_items_subset, on='item_id')
df_endpoints1_2.head(5)
df_endpoints1_2.shape

(15255102, 5)

In [28]:
# Seleccionamos los 10 géneros mas frecuentes
top_10_popular_genres = ['Action', 'Adventure', 'Indie', 'Strategy', 'RPG', 'Simulation', 'Casual', 'Massively Multiplayer', 'Racing', 'Sports']

# Filtramos por las condiciones establecidas
df_endpoints1_2 = df_endpoints1_2[(df_endpoints1_2['release_year'] != 'unknown') & (df_endpoints1_2['playtime_forever'] > 0)].reset_index(drop=True)
df_endpoints1_2.head()
print(df_endpoints1_2.shape)

(10511145, 5)


In [29]:
df_endpoints1_2['release_year'] = df_endpoints1_2['release_year'].astype('int16')
df_endpoints1_2['playtime_forever'] = df_endpoints1_2['playtime_forever'].astype('float32')
df_endpoints1_2.memory_usage(deep=True)

Index                     132
item_id             657310551
genres              681639958
release_year         21022290
user_id             739967023
playtime_forever     42044580
dtype: int64

- Por último, creamos una tabla pivote que tenga como índice user_id y release_year, como columnas genres y como valores únicos la suma de playtime_forever.

In [30]:
df_endpoints1_2 = df_endpoints1_2.pivot_table(index=['user_id', 'release_year'], columns='genres', values='playtime_forever', aggfunc='sum', fill_value=0)
df_endpoints1_2

Unnamed: 0_level_0,genres,Action,Adventure,Animation &amp; Modeling,Audio Production,Casual,Design &amp; Illustration,Early Access,Education,Free to Play,Indie,...,RPG,Racing,Simulation,Software Training,Sports,Strategy,Utilities,Video Production,Web Publishing,unknown
user_id,release_year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
--000--,2006,15.416667,15.416667,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,15.416667,...,0.000000,0.000000,15.416667,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
--000--,2009,88.816666,88.816666,0.0,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
--000--,2010,0.366667,0.000000,0.0,0.0,0.000000,0.0,0.0,0.000000,0.366667,0.366667,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0
--000--,2011,108.699997,108.699997,0.0,0.0,0.000000,0.0,0.0,0.000000,46.049999,30.616665,...,62.649998,46.049999,11.083333,0.0,0.000000,11.083333,0.0,0.0,0.0,0.0
--000--,2012,1822.516724,37.150002,0.0,0.0,30.016666,0.0,0.0,0.000000,10.500000,37.700001,...,29.516666,0.000000,0.000000,0.0,7.683333,1796.400024,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zzzmidmiss,2010,7.783334,0.166667,0.0,0.0,3.916667,0.0,0.0,0.683333,4.550000,7.950000,...,0.000000,0.000000,3.233333,0.0,3.233333,3.400000,0.0,0.0,0.0,0.0
zzzmidmiss,2011,38.366665,38.366665,0.0,0.0,1.250000,0.0,0.0,0.000000,0.266667,1.750000,...,37.599998,0.266667,0.000000,0.0,0.000000,1.150000,0.0,0.0,0.0,0.0
zzzmidmiss,2012,98.366669,61.650005,0.0,0.0,6.083333,0.0,0.1,0.000000,22.549999,51.316666,...,45.500000,0.000000,6.450000,0.0,0.000000,15.383334,0.0,0.0,0.0,0.0
zzzmidmiss,2013,1.633333,1.750000,0.0,0.0,0.283333,0.0,0.0,0.000000,0.166667,1.750000,...,0.166667,0.000000,0.000000,0.0,0.000000,1.466667,0.0,0.0,0.0,0.0


In [31]:
df_endpoints1_2.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 660506 entries, ('--000--', 2006) to ('zzzmidmiss', 2014)
Data columns (total 22 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Action                     660506 non-null  float32
 1   Adventure                  660506 non-null  float32
 2   Animation &amp; Modeling   660506 non-null  float32
 3   Audio Production           660506 non-null  float32
 4   Casual                     660506 non-null  float32
 5   Design &amp; Illustration  660506 non-null  float32
 6   Early Access               660506 non-null  float32
 7   Education                  660506 non-null  float32
 8   Free to Play               660506 non-null  float32
 9   Indie                      660506 non-null  float32
 10  Massively Multiplayer      660506 non-null  float32
 11  Photo Editing              660506 non-null  float32
 12  RPG                        660506 non-null  float32
 13 

# Base de datos para los Endpoints 3, 4 y 5

- ##### def UsersRecommend( año : int ): Devuelve el top 3 de juegos MÁS recomendados por usuarios para el año dado. (reviews.recommend = True y comentarios positivos/neutrales)

- ##### def UsersWorstDeveloper( año : int ): Devuelve el top 3 de desarrolladoras con juegos MENOS recomendados por usuarios para el año dado. (reviews.recommend = False y comentarios negativos)

- ##### def sentiment_analysis( empresa desarrolladora : str ): Según la empresa desarrolladora, se devuelve un diccionario con el nombre de la desarrolladora como llave y una lista con la cantidad total de registros de reseñas de usuarios que se encuentren categorizados con un análisis de sentimiento como valor.

In [32]:
# Seleccionamos las columnas necesarias
steam_games_columnas = ['item_id', 'item_name', 'developer']
user_reviews_columnas = ['item_id', 'recommend','sentiment_analysis','posted_year']

df_games_subset = df_steam_games[steam_games_columnas]
df_reviews_subset = df_user_reviews[user_reviews_columnas]


df_endpoints3_4_5 = pd.merge(df_games_subset, df_reviews_subset, on='item_id')
df_endpoints3_4_5.head()

Unnamed: 0,item_id,item_name,developer,recommend,sentiment_analysis,posted_year
0,282010,Carmageddon Max Pack,Stainless Games Ltd,True,1,unknown
1,282010,Carmageddon Max Pack,Stainless Games Ltd,True,1,unknown
2,282010,Carmageddon Max Pack,Stainless Games Ltd,True,1,unknown
3,70,Half-Life,Valve,True,0,2015
4,70,Half-Life,Valve,True,0,2011


- Para optimizar el uso de recursos, aplicaremos un filtro a los registros de juegos basándonos en dos criterios: que tengan una reseña con un posted_year válido y que el developer sea reconocido. Adicionalmente, prescindiremos de la columna item_id debido a que no es necesaria para nuestro análisis.

In [33]:
df_endpoints3_4_5 = df_endpoints3_4_5[(df_endpoints3_4_5['posted_year'] != 'unknown') & (df_endpoints3_4_5['developer'] != 'unknown') ].reset_index(drop=True)
df_endpoints3_4_5.drop('item_id',axis=1, inplace=True)
df_endpoints3_4_5.head()

Unnamed: 0,item_name,developer,recommend,sentiment_analysis,posted_year
0,Half-Life,Valve,True,0,2015
1,Half-Life,Valve,True,0,2011
2,Half-Life,Valve,True,0,2014
3,Half-Life,Valve,True,2,2013
4,Half-Life,Valve,True,0,2013


In [34]:
# Optimizacion de uso de memoria para la segunda fuente de datos
df_endpoints3_4_5['sentiment_analysis'] = df_endpoints3_4_5['sentiment_analysis'].astype('int8')
df_endpoints3_4_5['posted_year'] = df_endpoints3_4_5['posted_year'].astype('int16')
df_endpoints3_4_5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 152619 entries, 0 to 152618
Data columns (total 5 columns):
 #   Column              Non-Null Count   Dtype 
---  ------              --------------   ----- 
 0   item_name           152619 non-null  object
 1   developer           152619 non-null  object
 2   recommend           152619 non-null  bool  
 3   sentiment_analysis  152619 non-null  int8  
 4   posted_year         152619 non-null  int16 
dtypes: bool(1), int16(1), int8(1), object(2)
memory usage: 2.9+ MB


# Sistemas de Recomendacion

Usaremos el filtrado colaborativo para el sistema de recomendación. Hay dos enfoques:

- Basado en usuarios: se recomiendan ítems a usuarios basándose en las calificaciones de usuarios similares.
- Basado en ítems: se recomiendan los ítems más similares que el usuario aún no ha evaluado.

### Agragando los Ratings

Como no tenemos **REALMENTE** una valoración o *rating* del 1 al 5 (como podríamos tener por ejemplo al valorar películas), se decide crear uno  a partir del análisis de sentimiento y las recomendaciones del usuario. Como criterio usaremos el análisis de sentimiento como el factor principal, y la recomendación como el factor secundario, para un rango del 1 al 5:


In [37]:
# Seleccionamos las columnas necesarias
steam_games_columns = ['item_id', 'item_name']
user_reviews_columns = ['item_id','user_id', 'recommend','sentiment_analysis']
user_items_columns = ['user_id','item_id']

df_games_subset = df_steam_games[steam_games_columns]
df_reviews_subset = df_user_reviews[user_reviews_columns]
df_items_subset = df_user_items[user_items_columns]

df_juegos_usuarios = pd.merge(df_games_subset, df_items_subset, on='item_id')
df_juegos_usuarios.head()

Unnamed: 0,item_id,item_name,user_id
0,282010,Carmageddon Max Pack,UTNerd24
1,282010,Carmageddon Max Pack,I_DID_911_JUST_SAYING
2,282010,Carmageddon Max Pack,76561197962104795
3,282010,Carmageddon Max Pack,r3ap3r78
4,282010,Carmageddon Max Pack,saint556


In [38]:
df_features = pd.merge(df_juegos_usuarios, df_reviews_subset, on=["user_id", "item_id"])
df_features.drop_duplicates(inplace=True)
df_features.reset_index(drop=True, inplace=True)
df_features.head()

Unnamed: 0,item_id,item_name,user_id,recommend,sentiment_analysis
0,282010,Carmageddon Max Pack,InstigatorAU,True,1
1,70,Half-Life,EizanAratoFujimaki,True,0
2,70,Half-Life,GamerFag,True,0
3,70,Half-Life,76561198020928326,True,0
4,70,Half-Life,Bluegills,True,2


Para mantener la información útil y reducir la dimensionalidad, solo consideraremos usuarios con al menos 5 reviews y juegos con 10 o más reviews.

In [39]:
# Agrupamos el dataframe por item_name para contar los juegos
count = df_features.groupby("item_name").size()

# Seleccionamos solo los juegos que tengan al menos 10 reviews.
df_features = df_features.loc[df_features["item_name"].isin(count[count >= 10].index), :]
df_features.head()

Unnamed: 0,item_id,item_name,user_id,recommend,sentiment_analysis
1,70,Half-Life,EizanAratoFujimaki,True,0
2,70,Half-Life,GamerFag,True,0
3,70,Half-Life,76561198020928326,True,0
4,70,Half-Life,Bluegills,True,2
5,70,Half-Life,76561198071955492,True,0


In [40]:
# Agrupamos el dataframe por user_id para contar el número de juegos puntuados por usuarios.
conteo = df_features.groupby("user_id").size()

# Seleccionamos solo los usuarios que hayan dejado reviews a al menos 5 juegos
df_features = df_features.loc[df_features["user_id"].isin(conteo[conteo >= 5].index), :]
df_features.head()

Unnamed: 0,item_id,item_name,user_id,recommend,sentiment_analysis
2,70,Half-Life,GamerFag,True,0
5,70,Half-Life,76561198071955492,True,0
8,70,Half-Life,meeeedie,True,1
10,70,Half-Life,KewlKatzz,True,2
13,70,Half-Life,76561197994404698,True,1


Creamos una funcion para aplicar el puntaje del rating según el criterio establecido:
* **1** si el análisis de sentimiento es negativo ya sea que este recomendado o no (True o False)
* **2** si el análisis de sentimiento es neutral y no es recomendado (False)
* **3** si el análisis de sentimiento es neutral pero es recomendado (True)
* **4** si el análisis de sentimiento es positivo y no es recomendado (False)
* **5** si el análisis de sentimiento es positivo y es recomendado (True)

In [41]:
def calcula_rating(row):
    '''
    Calcula una calificación basada en el análisis de sentimientos y la recomendación de review de juegos realizado por los usuarios.

    Parámetros:
    row (dict): Un diccionario que contiene las siguientes claves:
        - "sentiment_analysis" (int): La puntuación del análisis de sentimientos (0, 1 o 2).
        - "recommend" (bool): Indica si las reseñas recomiendan.

    Devuelve:
        int o None: La calificación calculada como un número entero entre 1 y 5, o None si las entradas son inválidas.
    '''
    if row["sentiment_analysis"] == 0 and not row["recommend"]:
        return 1
    elif row["sentiment_analysis"] == 0 and row["recommend"]:
        return 1
    elif row["sentiment_analysis"] == 1 and not row["recommend"]:
        return 2
    elif row["sentiment_analysis"] == 1 and row["recommend"]:
        return 3
    elif row["sentiment_analysis"] == 2 and not row["recommend"]:
        return 4
    elif row["sentiment_analysis"] == 2 and row["recommend"]:
        return 5
    else:
        return None

In [42]:
# Aplicamos la función get_rating al df_features
ratings = df_features.apply(lambda row: calcula_rating(row), axis=1)

# Creamos el dataframe df_ratings con la nueva columna de ratings
df_ratings = df_features[['item_id', 'item_name', 'user_id']].assign(rating=ratings)
df_ratings.head()

Unnamed: 0,item_id,item_name,user_id,rating
2,70,Half-Life,GamerFag,1
5,70,Half-Life,76561198071955492,1
8,70,Half-Life,meeeedie,3
10,70,Half-Life,KewlKatzz,5
13,70,Half-Life,76561197994404698,3


* Observamos la distribución de los ratings

In [43]:
df_ratings.groupby(["rating"])["user_id"].count()

rating
1    1920
2     185
3    1680
4     284
5    4239
Name: user_id, dtype: int64

- Usamos StandardScaler para normalizar los ratings, lo que es crucial para nuestro sistema de recomendación basado en filtrado colaborativo. Esto evita que usuarios o ítems con ratings más altos afecten desproporcionadamente las recomendaciones.

In [44]:
# Inicializamos StandardScaler
scaler = StandardScaler()

# Creamos un array con los ratings
rating_array = df_ratings['rating'].values.reshape(-1, 1)

# Normalizamos
normalized_rating  =scaler.fit_transform(rating_array)

# Creamos un df con los ratings normalizados
df_norm = df_ratings.copy()
df_norm['rating'] = normalized_rating
df_norm.head()

Unnamed: 0,item_id,item_name,user_id,rating
2,70,Half-Life,GamerFag,-1.565118
5,70,Half-Life,76561198071955492,-1.565118
8,70,Half-Life,meeeedie,-0.347209
10,70,Half-Life,KewlKatzz,0.870699
13,70,Half-Life,76561197994404698,-0.347209


## Creando la matriz usuarios/ratings


In [45]:
df_matrix = df_norm.pivot_table(index=['user_id'], columns=['item_name'], values='rating').fillna(0)
df_matrix.head()

item_name,100% Orange Juice,8BitMMO,A Bird Story,A Story About My Uncle,APB Reloaded,ARK: Survival Evolved,ARMA: Cold War Assault,Ace of Spades: Battle Builder,AdVenture Capitalist,Age of Empires II HD,...,Worms Revolution,XCOM: Enemy Unknown,XCOM® 2,Yet Another Zombie Defense,You Have to Win the Game,Zombie Army Trilogy,how do you Do It?,the static speaks my name,theHunter Classic,theHunter: Primal
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-SEVEN-,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
091263,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1011001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12345678901234567890123456567890,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.565118,0.0
1234567io9872345678765432,0.0,0.0,0.0,0.0,-0.956164,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Para calcular la "Sparcity" de nuestra matriz empleamos la siguiente función:

In [46]:
def get_sparsity(df):
  """
  Devuelve el sparsity de una matriz como df.

  Args:
    df: La matriz en la que se desea calcular el sparsity.

  Returns:
    El sparsity de la matriz `df`.
  """

  num_zeros = (df == 0).sum()
  num_elements = df.size

  sparsity = (1 - num_zeros.sum() / num_elements)*100

  return f'Sparsity: {round(sparsity, 2)}%'

In [47]:
get_sparsity(df_matrix)

'Sparsity: 1.09%'

Dada la dispersión de la matriz, usaremos sparse de SciPy para mejorar la eficiencia y reducir la memoria, almacenando solo los valores no cero.

In [48]:
matrix_sparse = sp.sparse.csr_matrix(df_matrix.values)
matrix_sparse

<1391x548 sparse matrix of type '<class 'numpy.float64'>'
	with 8308 stored elements in Compressed Sparse Row format>

### Matriz de Similitud: Similitud de coseno

Creamos dos matrices de similitud utilizando la similitud del coseno para medir la similitud entre los juegos item_sim_matrix y entre los usuarios user_sim_matrix.

In [51]:
matriz_similitud_usuarios = cosine_similarity(matrix_sparse)
print(matriz_similitud_usuarios.shape)

(1391, 1391)


In [52]:
matriz_similitud_items = cosine_similarity(matrix_sparse.T)
print(matriz_similitud_items.shape)

(548, 548)


In [53]:
# DataFrame de la similitud de usuarios
df_similitud_usuarios = pd.DataFrame(matriz_similitud_usuarios, index = df_matrix.index, columns = df_matrix.index)
df_similitud_usuarios.head()

user_id,-SEVEN-,091263,1011001,12345678901234567890123456567890,1234567io9872345678765432,12779,131312,1337lolroflmao,1626466724893520,17101710,...,yoshipowerz,yotuic,you_re_ded,youngbenaffleck,zaaikbr,zachwgtv,zakbot,zaukster,zayyntt,zyr0n1c
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-SEVEN-,1.0,-0.230133,-0.411551,0.345894,-0.234051,0.0,0.0,-0.166343,0.0,0.0,...,0.0,-0.270148,0.122628,0.037896,0.0,0.0,0.084326,0.0,0.0,0.103689
091263,-0.230133,1.0,0.202183,-0.231632,0.183274,0.0,0.0,0.130255,0.0,0.0,...,0.113761,0.132716,-0.452289,0.0,0.269786,0.0,-0.05647,0.0,-0.351671,0.0
1011001,-0.411551,0.202183,1.0,-0.25988,0.04163,0.0,0.0,0.14614,0.0,0.0,...,0.0,0.297801,-0.092134,-0.125551,-0.168389,0.0,-0.063357,0.0,0.0,-0.214538
12345678901234567890123456567890,0.345894,-0.231632,-0.25988,1.0,-0.235576,0.0,0.0,-0.167427,0.0,0.0,...,0.0,-0.341179,0.105554,0.0,0.0,0.0,0.072585,0.0,0.0,0.0
1234567io9872345678765432,-0.234051,0.183274,0.04163,-0.235576,1.0,0.0,0.0,0.132473,0.0,0.0,...,0.0,0.216126,-0.083517,-0.042733,0.0,0.0,-0.057431,0.0,-0.079344,0.07755


In [55]:
# DataFrame de la similitud de usuarios
df_similitud_items = pd.DataFrame(matriz_similitud_items, index = df_matrix.columns, columns = df_matrix.columns)
df_similitud_items.head()

item_name,100% Orange Juice,8BitMMO,A Bird Story,A Story About My Uncle,APB Reloaded,ARK: Survival Evolved,ARMA: Cold War Assault,Ace of Spades: Battle Builder,AdVenture Capitalist,Age of Empires II HD,...,Worms Revolution,XCOM: Enemy Unknown,XCOM® 2,Yet Another Zombie Defense,You Have to Win the Game,Zombie Army Trilogy,how do you Do It?,the static speaks my name,theHunter Classic,theHunter: Primal
item_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100% Orange Juice,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.016656,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.193398,0.0,0.0,0.0
8BitMMO,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Bird Story,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A Story About My Uncle,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
APB Reloaded,0.0,0.0,0.0,0.0,1.0,-0.012329,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Carga
Finalmente, en esta sección cargamos nuestros datos transformados para los endpoints que se consumirán en la API a su destino final. Optamos por almacenarlos en formato parquet con compresion snappy para reducir su tamaño de almacenamiento.

In [57]:
# Nombres correspondientes a cada DataFrame
dfs = [df_endpoints1_2, df_endpoints3_4_5, df_similitud_usuarios, df_similitud_items, df_matrix, df_user_reviews]
names = ['df_endpoints1_2.parquet', 'df_endpoints3_4_5.parquet', 'df_similitud_usuarios.parquet', 'df_similitud_items.parquet', 'matrix.parquet', 'analisis_sentimiento.parquet']

for dfs, n in zip(dfs, names):
    # Definimos la ruta del directorio
    #folder_path = f'../data/processed/'
    folder_path = f'../dataset/'


    # Verificamos si el folder_path existe
    if not os.path.exists(folder_path):
        # Si no existe, lo creamos
        os.makedirs(folder_path)

    # Definimos la ruta completa del archivo
    path = os.path.join(folder_path, n)

    # Guardamos el DataFrame como un archivo parquet
    dfs.to_parquet(path, engine='pyarrow', compression='zstd')

    print(f"'{n}' fue guardado correctamente en '{folder_path}'")

'df_endpoints1_2.parquet' fue guardado correctamente en '../dataset/'
'df_endpoints3_4_5.parquet' fue guardado correctamente en '../dataset/'
'df_similitud_usuarios.parquet' fue guardado correctamente en '../dataset/'
'df_similitud_items.parquet' fue guardado correctamente en '../dataset/'
'matrix.parquet' fue guardado correctamente en '../dataset/'
'analisis_sentimiento.parquet' fue guardado correctamente en '../dataset/'
