In [2]:
import pandas as pd
import numpy as np
from wordcloud import WordCloud, STOPWORDS

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel
from pprint import pprint

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the project's CSV tables from the local data/ directory.
df_genres = pd.read_csv('data/df_genres.csv')
# Item/genre pairs: later cells read its 'item_id' and 'genres' columns.
df_item_genre = pd.read_csv('data/df_item_genre.csv')
# Game catalogue: later cells read its 'item_id' and 'title' columns.
df_items = pd.read_csv('data/raw_steam_games.csv')
# User reviews, including the 'sentiment_analysis' column used to pick items.
df_reviews = pd.read_csv('data/df_reviews.csv')
# NOTE(review): df_genres, df_users_items and df_users are not used in the
# visible part of the notebook — confirm whether they are still needed.
df_users_items = pd.read_csv('data/df_users_items.csv')
df_users = pd.read_csv('data/df_users.csv')

In [4]:
# Display the reviews table to inspect its columns (pandas truncates the rendered rows).
df_reviews

Unnamed: 0,user_id,item_id,helpful,recommend,posted_date,year,sentiment_analysis
0,76561197970982479,1250,No ratings yet,True,2011-11-05,2011.0,2
1,76561197970982479,22200,No ratings yet,True,2011-07-15,2011.0,2
2,76561197970982479,43110,No ratings yet,True,2011-04-21,2011.0,2
3,js41637,251610,15 of 20 people (75%) found this review helpful,True,2014-06-24,2014.0,2
4,js41637,227300,0 of 1 people (0%) found this review helpful,True,2013-09-08,2013.0,0
...,...,...,...,...,...,...,...
52229,76561198310819422,570,1 of 1 people (100%) found this review helpful,True,,,1
52230,76561198312638244,233270,No ratings yet,True,,,2
52231,76561198312638244,130,No ratings yet,True,,,2
52232,76561198312638244,70,No ratings yet,True,,,2


In [5]:
# Keep only the reviews whose sentiment_analysis value is 2
# (presumably the "positive" class — TODO confirm the label encoding).
sist_recomendacion_df = df_reviews.loc[df_reviews['sentiment_analysis'].eq(2)]

In [38]:
# Number of distinct items present in the filtered reviews.
sist_recomendacion_df['item_id'].unique().size

2748

In [43]:
# Distinct item ids of the filtered reviews, in order of first appearance.
itemss = [*sist_recomendacion_df['item_id'].unique()]

In [22]:
# Collapse the item/genre pairs into one row per item, with that item's
# genres joined into a single comma-separated string.
genero_grouped = (
    df_item_genre
    .groupby('item_id')['genres']
    .agg(', '.join)
    .reset_index()
)


In [46]:
# Restrict the per-item genre table to the items collected in `itemss`.
# Note: this rebinds sist_recomendacion_df to a genre table.
sist_recomendacion_df = genero_grouped.loc[genero_grouped['item_id'].isin(itemss)]

In [49]:
# Attach each item's title from the game catalogue. An inner join (the merge
# default) drops items that have no matching row in df_items.
sist_recomendacion_df = sist_recomendacion_df.merge(
    right=df_items[['item_id', 'title']],
    how='inner',
    on='item_id',
)

In [79]:
# Normalise titles: title-case first, then trim surrounding whitespace.
sist_recomendacion_df['title'] = (
    sist_recomendacion_df['title']
    .str.title()
    .str.strip()
)

In [58]:
# Text document per item: its genres followed by its title, separated by one space.
sist_recomendacion_df['texto'] = sist_recomendacion_df['genres'].str.cat(
    sist_recomendacion_df['title'], sep=" "
)

In [80]:
# Inspect the assembled table (item_id, genres, title, texto).
sist_recomendacion_df

Unnamed: 0,item_id,genres,title,texto
0,10,action,Counter-Strike,action Counter-Strike
1,20,action,Team Fortress Classic,action Team Fortress Classic
2,30,action,Day Of Defeat,action Day of Defeat
3,40,action,Deathmatch Classic,action Deathmatch Classic
4,50,action,Half-Life: Opposing Force,action Half-Life: Opposing Force
...,...,...,...,...
2273,516040,"indie, strategy",Everything Is Peachy,"indie, strategy Everything is Peachy"
2274,520550,"indie, simulation",American Truck Simulator - Steering Creations ...,"indie, simulation American Truck Simulator - S..."
2275,521340,"casual, indie",True Or False,"casual, indie True or False"
2276,521990,"action, indie",Galactic Storm,"action, indie Galactic Storm"


In [87]:
# NOTE(review): `vectorizer` and `stopwords` are not used anywhere in the
# visible notebook; they are kept only in case other cells reference them.
vectorizer = CountVectorizer()
stopwords = STOPWORDS

# TF-IDF vectoriser that drops common, uninformative English stop words.
tf = TfidfVectorizer(stop_words='english')

# One TF-IDF feature row per item, built from its "genres + title" text.
tfidf_matrix = tf.fit_transform(sist_recomendacion_df['texto'])

# Pairwise similarity between all items. TfidfVectorizer L2-normalises rows by
# default, so the linear kernel (dot product) equals the cosine similarity.
cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

# Number of recommendations stored per item.
n = 5

# Row i of `cosine_similarities` corresponds to the i-th ROW of the frame, so
# titles are read by position. This avoids mixing index labels and positions,
# which would silently break if the frame's index were not 0..len-1.
titles = sist_recomendacion_df['title'].to_numpy()

# Maps each title to the list of its `n` most similar titles.
# Items sharing the same title share one key; the last one processed wins.
results = {}
for pos, title in enumerate(titles):
    # Positions of all items ordered from most to least similar.
    similar_indices = cosine_similarities[pos].argsort()[::-1]
    # Exclude the item itself explicitly instead of assuming it ranks first:
    # with ties (identical text) or an all-zero vector it may not be first.
    similar_items = [f"{titles[i]}" for i in similar_indices if i != pos][:n]
    results[f"{title}"] = similar_items

In [88]:
# @app.get('/recomendacion/{titulo}')

def recomendacion(titulo:str):
    '''Return the stored recommendations for a game title.

    The input is normalised with ``.title().strip()`` and then looked up as an
    exact key in the module-level ``results`` dictionary.

    Parameters
    ----------
    titulo : str
        Title to look up; letter casing and surrounding whitespace are
        normalised before the lookup.

    Returns
    -------
    dict or set
        ``{'titulo': <normalised title>, 'lista recomendada': <list>}`` when
        the title is a key of ``results``; otherwise a one-element set holding
        a "not found" message.
    '''
    titulo = titulo.title().strip()

    # Exact key lookup. A regex/substring test on the title column would let
    # partial titles (or an empty string) through and then fail with KeyError,
    # and would raise on regex metacharacters in the input.
    if titulo in results:
        data = {'titulo':titulo , 'lista recomendada': results[titulo]}
    else:
        mensaje = "El item ingresado: {}, no se encuentra en la base de datos.".format(titulo)
        # NOTE(review): this is a one-element set, not a dict; kept as-is so
        # existing callers see the same value — confirm whether a dict was intended.
        data = {mensaje}
    return data

In [95]:
# Smoke-test the recommender with a few titles, including one with trailing
# spaces and one with unusual casing.
for consulta in (
    'Half-Life: Opposing Force   ',
    'Galactic STORM',
    'Everything Is Peachy',
    'Day Of Defeat',
):
    print(recomendacion(consulta))


{'titulo': 'Half-Life: Opposing Force', 'lista recomendada': ['Half-Life', 'Half-Life 2', 'Half-Life: Before', 'Half-Life 2: Episode One', 'Half-Life 2: Episode Two']}
{'titulo': 'Galactic Storm', 'lista recomendada': ['Galactic Civilizations Iii', 'Spore™ Galactic Adventures', 'Conflict Desert Storm™', 'Galcon 2: Galactic Conquest', 'Galactic Civilizations® Ii: Ultimate Edition']}
{'titulo': 'Everything Is Peachy', 'lista recomendada': ['Z', 'On My Own', 'War, The Game', 'Rush', 'Spirits']}
{'titulo': 'Day Of Defeat', 'lista recomendada': ['Day Of Defeat: Source', 'Dino D-Day', 'Garbage Day', 'The Fifth Day', "Day One : Garry'S Incident"]}


In [94]:
# Same lookup with mixed casing and a trailing space: recomendacion normalises
# its input with .title().strip() before searching.
print(recomendacion('Day Of DefeAt '))


{'titulo': 'Day Of Defeat', 'lista recomendada': ['Day Of Defeat: Source', 'Dino D-Day', 'Garbage Day', 'The Fifth Day', "Day One : Garry'S Incident"]}


In [96]:
# Persist the recommender input table as CSV, omitting the DataFrame index.
sist_recomendacion_df.to_csv('data/sist_recomendacion_df_item.csv',index=False)