In [1]:
import pandas as pd
# Load the movie catalogue (movieId, title, genres); the path is relative to the notebook's working directory.
filmes = pd.read_csv("./Projeto_IA/movies.csv")

In [2]:
# Preview the first rows to confirm the catalogue loaded with the expected columns.
filmes.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
import re

def limpar_titulo(titulo):
    """Return ``titulo`` with every character removed that is not an ASCII
    letter, a digit or a space.

    Punctuation such as parentheses, commas, colons and hyphens is dropped
    (not replaced by a space), so "Ant-Man (2015)" becomes "AntMan 2015".
    Accented letters are removed as well because only a-z / A-Z are kept.
    """
    padrao_invalido = re.compile("[^a-zA-Z0-9 ]")
    return padrao_invalido.sub("", titulo)

In [4]:
# Add a punctuation-free copy of each title; this is the text the search index is built on.
filmes["titulo_limpo"] = filmes["title"].map(limpar_titulo)

In [5]:
# Display the catalogue to check the new titulo_limpo column next to the original title.
filmes

Unnamed: 0,movieId,title,genres,titulo_limpo
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Index titles by single words and by adjacent word pairs (ngram_range=(1,2)).
vetorizar = TfidfVectorizer(ngram_range=(1,2))

# Learn the vocabulary from the cleaned titles and build the TF-IDF matrix (one row per movie).
tfidf = vetorizar.fit_transform(filmes["titulo_limpo"])

In [7]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def procurar(titulo, quantidade=5):
    """Return the catalogue rows whose titles best match ``titulo``.

    The query is cleaned with ``limpar_titulo``, projected into the fitted
    TF-IDF space and compared with every movie title by cosine similarity.

    Parameters
    ----------
    titulo : str
        Free-text title typed by the user.
    quantidade : int, default 5
        Maximum number of matches to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``quantidade`` rows of ``filmes``, ordered from the most to the
        least similar title, so ``.iloc[0]`` is always the best match.
    """
    titulo = limpar_titulo(titulo)
    fila_vec = vetorizar.transform([titulo])
    similar = cosine_similarity(fila_vec, tfidf).flatten()

    # Never ask for more rows than exist; argpartition raises on an
    # out-of-bounds kth.
    k = min(quantidade, similar.size)
    if k <= 0:
        return filmes.iloc[0:0]

    # argpartition only guarantees that the k largest scores land in the last
    # k slots; it does not order them. Sort that small slice explicitly so the
    # best match really comes first.
    candidatos = np.argpartition(similar, -k)[-k:]
    ordem = np.argsort(-similar[candidatos], kind="stable")
    return filmes.iloc[candidatos[ordem]]

In [8]:
import ipywidgets as widgets
from IPython.display import display

# Text box where the user types a movie title.
entrada_filme = widgets.Text(
    value='',
    description='Titulo Filme:',
    disabled=False
)
# Output area that por_tipo below clears and fills with the search results.
lista_filme = widgets.Output()

def por_tipo(dado):
    """Widget callback: refresh the search results for the typed title.

    ``dado`` is the change notification passed by ``observe``; its ``"new"``
    entry holds the current text. The output area is always cleared first,
    and a search is only run once the text is longer than 5 characters.
    """
    with lista_filme:
        lista_filme.clear_output()
        texto = dado["new"]
        if len(texto) <= 5:
            # Too short to search: leave the area empty.
            return
        display(procurar(texto))

# Call por_tipo every time the text box's value changes.
entrada_filme.observe(por_tipo, names='value')


# Render the input box followed by the results area.
display(entrada_filme, lista_filme)

Text(value='', description='Titulo Filme:')

Output()

In [9]:
# Movie used as the seed for the step-by-step recommendation walkthrough below.
movie_id = 89745

# Catalogue row(s) for the seed movie.
filme = filmes.loc[filmes["movieId"].eq(movie_id)]

In [10]:
# Load the user ratings; the cells below read its userId, movieId and rating columns.
ratings = pd.read_csv("./Projeto_IA/ratings.csv")

In [11]:
# Users who rated the seed movie strictly above 4: the "similar" audience.
usuario_similar = ratings.loc[
    ratings["movieId"].eq(movie_id) & ratings["rating"].gt(4),
    "userId"
].unique()

In [12]:
# Every movie those similar users also rated strictly above 4 (one entry per rating).
usuario_similar_recs = ratings.loc[
    ratings["userId"].isin(usuario_similar) & ratings["rating"].gt(4),
    "movieId"
]

In [13]:
# Share of the similar users that liked each movie.
usuario_similar_recs = usuario_similar_recs.value_counts().div(len(usuario_similar))

# Keep only movies liked by more than 10% of the similar users.
usuario_similar_recs = usuario_similar_recs.loc[usuario_similar_recs.gt(.10)]

In [14]:
# High ratings (strictly above 4) from any user for the candidate movies.
todos_usuarios = ratings.loc[
    ratings["movieId"].isin(usuario_similar_recs.index) & ratings["rating"].gt(4)
]

In [15]:
# Share of those users that liked each candidate movie (baseline popularity).
todos_usuarios_recs = todos_usuarios["movieId"].value_counts().div(
    len(todos_usuarios["userId"].unique())
)

In [16]:
# Put both shares side by side, aligned on movieId: "similar" for the
# similar-user share, "todos" for the baseline share.
porcentagem_rec = pd.concat(
    [usuario_similar_recs, todos_usuarios_recs],
    axis=1,
    keys=["similar", "todos"],
)

In [17]:
# Inspect the two shares per candidate movie before scoring them.
porcentagem_rec

Unnamed: 0,similar,todos
89745,1.000000,0.040459
58559,0.573393,0.148256
59315,0.530649,0.054931
79132,0.519715,0.132987
2571,0.496687,0.247010
...,...,...
47610,0.103545,0.022770
780,0.103380,0.054723
88744,0.103048,0.010383
1258,0.101226,0.083887


In [18]:
# Score = how much more the similar users like a movie than the baseline does.
porcentagem_rec["pontuacao"] = porcentagem_rec["similar"].div(porcentagem_rec["todos"])

In [19]:
# Highest score first.
porcentagem_rec = porcentagem_rec.sort_values(by="pontuacao", ascending=False)

In [20]:
# Attach title and genres to the ten best-scoring movies (the index of porcentagem_rec is the movieId).
pd.merge(porcentagem_rec.head(10), filmes, left_index=True, right_on="movieId")

Unnamed: 0,similar,todos,pontuacao,movieId,title,genres,titulo_limpo
17067,1.0,0.040459,24.716368,89745,"Avengers, The (2012)",Action|Adventure|Sci-Fi|IMAX,Avengers The 2012
20513,0.103711,0.005289,19.610199,106072,Thor: The Dark World (2013),Action|Adventure|Fantasy|IMAX,Thor The Dark World 2013
25058,0.241054,0.012367,19.49177,122892,Avengers: Age of Ultron (2015),Action|Adventure|Sci-Fi,Avengers Age of Ultron 2015
19678,0.216534,0.012119,17.867419,102125,Iron Man 3 (2013),Action|Sci-Fi|Thriller|IMAX,Iron Man 3 2013
16725,0.215043,0.012052,17.843074,88140,Captain America: The First Avenger (2011),Action|Adventure|Sci-Fi|Thriller|War,Captain America The First Avenger 2011
16312,0.175447,0.010142,17.299824,86332,Thor (2011),Action|Adventure|Drama|Fantasy|IMAX,Thor 2011
21348,0.287608,0.016737,17.183667,110102,Captain America: The Winter Soldier (2014),Action|Adventure|Sci-Fi|IMAX,Captain America The Winter Soldier 2014
25071,0.214049,0.012856,16.649399,122920,Captain America: Civil War (2016),Action|Sci-Fi|Thriller,Captain America Civil War 2016
25061,0.136017,0.008573,15.865628,122900,Ant-Man (2015),Action|Adventure|Sci-Fi,AntMan 2015
14628,0.242876,0.015517,15.651921,77561,Iron Man 2 (2010),Action|Adventure|Sci-Fi|Thriller|IMAX,Iron Man 2 2010


In [21]:
def encontrar_filmes_similares(movie_id, nota_minima=4, fracao_minima=.10, top_n=10):
    """Recommend movies favoured by the users who liked ``movie_id``.

    A user "likes" a movie when their rating is strictly greater than
    ``nota_minima``. Each candidate movie is scored as

        share of similar users who liked it / share of all users who liked it

    so movies that are merely popular with everyone are pushed down.

    Parameters
    ----------
    movie_id : int
        ``movieId`` of the seed movie.
    nota_minima : float, default 4
        A rating must be strictly above this value to count as a like.
    fracao_minima : float, default 0.10
        A candidate must be liked by strictly more than this share of the
        similar users to be considered.
    top_n : int, default 10
        Number of best-scoring candidates to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``pontuacao``, ``title`` and ``genres``, best score first.
        Empty when nobody liked the seed movie or no candidate passes the
        ``fracao_minima`` filter.
    """
    colunas_saida = ["pontuacao", "title", "genres"]
    gostou = ratings["rating"] > nota_minima

    # Users who liked the seed movie.
    usuario_similar = ratings[(ratings["movieId"] == movie_id) & gostou]["userId"].unique()
    if len(usuario_similar) == 0:
        # Nothing to base a recommendation on; avoid the 0/0 arithmetic below.
        return pd.DataFrame(columns=colunas_saida)

    # Share of those users that liked each other movie.
    usuario_similar_recs = ratings[ratings["userId"].isin(usuario_similar) & gostou]["movieId"]
    usuario_similar_recs = usuario_similar_recs.value_counts() / len(usuario_similar)
    usuario_similar_recs = usuario_similar_recs[usuario_similar_recs > fracao_minima]
    if usuario_similar_recs.empty:
        return pd.DataFrame(columns=colunas_saida)

    # Baseline: share of everyone who liked any candidate that liked each one.
    todos_usuarios = ratings[ratings["movieId"].isin(usuario_similar_recs.index) & gostou]
    todos_usuarios_recs = todos_usuarios["movieId"].value_counts() / len(todos_usuarios["userId"].unique())

    porcentagem_rec = pd.concat([usuario_similar_recs, todos_usuarios_recs], axis=1)
    porcentagem_rec.columns = ["similar", "todos"]

    porcentagem_rec["pontuacao"] = porcentagem_rec["similar"] / porcentagem_rec["todos"]
    porcentagem_rec = porcentagem_rec.sort_values("pontuacao", ascending=False)

    # The index of porcentagem_rec is the movieId, hence left_index / right_on.
    recomendados = porcentagem_rec.head(top_n).merge(filmes, left_index=True, right_on="movieId")
    return recomendados[colunas_saida]

In [22]:
import ipywidgets as widgets
from IPython.display import display

# Text box where the user types a movie title (replaces the earlier search box of the same name).
entrada_filme = widgets.Text(
    value='',
    description='Título Filme:',
    disabled=False
)
# Output area that por_tipo below clears and fills with the recommendations.
lista_recomendacao = widgets.Output()

def por_tipo(data):
    """Widget callback: show recommendations for the typed title.

    ``data`` is the change notification passed by ``observe``; its ``"new"``
    entry holds the current text. The output area is always cleared first.
    Once the text is longer than 5 characters, the first row returned by
    ``procurar`` is taken as the intended movie and its recommendations are
    displayed.
    """
    with lista_recomendacao:
        lista_recomendacao.clear_output()
        texto = data["new"]
        if len(texto) <= 5:
            # Too short to search: leave the area empty.
            return
        primeiro = procurar(texto).iloc[0]
        display(encontrar_filmes_similares(primeiro["movieId"]))

# Call por_tipo every time the text box's value changes.
entrada_filme.observe(por_tipo, names='value')

# Render the input box followed by the recommendations area.
display(entrada_filme, lista_recomendacao)

Text(value='', description='Título Filme:')

Output()