Importar Dados

In [4]:
# Importar Pacotes
import pandas as pd
import numpy as np

In [5]:
# Load the movie catalogue from CSV and preview the first rows.
# low_memory=False disables chunked parsing, so column dtypes are inferred
# from the whole file at once.
movies = pd.read_csv(
    'movies.csv',
    low_memory=False,
)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# Load the user ratings from CSV and preview the first rows.
# low_memory=False disables chunked parsing, so column dtypes are inferred
# from the whole file at once.
ratings = pd.read_csv(
    'ratings.csv',
    low_memory=False,
)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


Pré-processamento dos Dados

In [7]:
# Keep only the movie columns used downstream (genres is discarded).
movie_columns = ['movieId', 'title']
movies = movies[movie_columns]
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [8]:
# Keep only the rating columns used downstream (timestamp is discarded).
rating_columns = ['userId', 'movieId', 'rating']
ratings = ratings[rating_columns]
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0


In [9]:
# Count how many ratings each movie received.
total_ratings = (
    ratings.groupby('movieId')['rating']
    .count()
    .reset_index(name='total_ratings')
)

# Attach the count to every movie by movieId. The left join keeps movies that
# have no ratings; their total_ratings is NaN, which is why the column is float.
movies = pd.merge(movies, total_ratings, how='left', on='movieId')
movies.head()

Unnamed: 0,movieId,title,total_ratings
0,1,Toy Story (1995),68997.0
1,2,Jumanji (1995),28904.0
2,3,Grumpier Old Men (1995),13134.0
3,4,Waiting to Exhale (1995),2806.0
4,5,Father of the Bride Part II (1995),13154.0


In [10]:
# Drop every row that still contains a missing value. For movies this also
# removes titles that received no ratings (NaN total_ratings after the left
# join). Reassigning instead of using inplace=True avoids mutating a frame that
# was derived by column selection, which can raise SettingWithCopyWarning.
movies = movies.dropna()
ratings = ratings.dropna()

In [11]:
# Keep only movies that received at least 1000 ratings.
is_popular = movies['total_ratings'].ge(1000)
movies = movies[is_popular]
movies.shape

(4397, 3)

In [12]:
# Keep only the ratings whose movie survived the 1000-ratings filter.
kept_movie_ids = movies['movieId']
ratings = ratings.loc[ratings['movieId'].isin(kept_movie_ids)]
ratings.shape

(28528258, 3)

In [13]:
# Show, for each user, how many non-null entries remain in every column.
ratings_by_user = ratings.groupby('userId')
ratings_by_user.count()

Unnamed: 0_level_0,movieId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,133,133
2,52,52
3,143,143
4,27,27
5,33,33
...,...,...
200944,275,275
200945,104,104
200946,23,23
200947,61,61


In [14]:
# Number of ratings given by each user.
ratings_count = ratings.groupby('userId')['rating'].count()

# IDs of the users with strictly more than 50 ratings.
is_active_user = ratings_count.gt(50)
y = ratings_count.loc[is_active_user].index

print(y)

Index([     1,      2,      3,      9,     10,     13,     15,     16,     17,
           18,
       ...
       200936, 200937, 200939, 200940, 200942, 200943, 200944, 200945, 200947,
       200948],
      dtype='int64', name='userId', length=123638)


In [15]:
# Keep only the ratings made by the users listed in y (more than 50 ratings).
from_active_user = ratings['userId'].isin(y)
ratings = ratings[from_active_user]
ratings.shape

(26096025, 3)

In [16]:
# Summarise the filtered movie table: index range, column dtypes, non-null counts and memory use.
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4397 entries, 0 to 86162
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   movieId        4397 non-null   int64  
 1   title          4397 non-null   object 
 2   total_ratings  4397 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 137.4+ KB


In [17]:
# Summarise the filtered ratings table: index range, column dtypes and memory use.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26096025 entries, 0 to 32000203
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 796.4 MB


In [18]:
# Join every rating with its movie's title and rating count.
# This is an inner join on movieId, not a concatenation.
ratings_and_movies = pd.merge(ratings, movies, how='inner', on='movieId')
ratings_and_movies.head()

Unnamed: 0,userId,movieId,rating,title,total_ratings
0,1,17,4.0,Sense and Sensibility (1995),22251.0
1,3,17,5.0,Sense and Sensibility (1995),22251.0
2,15,17,4.5,Sense and Sensibility (1995),22251.0
3,28,17,4.0,Sense and Sensibility (1995),22251.0
4,29,17,4.0,Sense and Sensibility (1995),22251.0


In [19]:
# Count the missing values in each column of the merged table.
ratings_and_movies.isna().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0
title,0
total_ratings,0


In [20]:
# Keep a single rating per (userId, movieId) pair; the first occurrence wins.
rating_key = ['userId', 'movieId']
ratings_and_movies = ratings_and_movies.drop_duplicates(subset=rating_key, keep='first')
ratings_and_movies.shape

(26096025, 5)

In [21]:
# Drop movieId: from here on movies are identified by their title.
ratings_and_movies = ratings_and_movies.drop(columns='movieId')
ratings_and_movies.head()

Unnamed: 0,userId,rating,title,total_ratings
0,1,4.0,Sense and Sensibility (1995),22251.0
1,3,5.0,Sense and Sensibility (1995),22251.0
2,15,4.5,Sense and Sensibility (1995),22251.0
3,28,4.0,Sense and Sensibility (1995),22251.0
4,29,4.0,Sense and Sensibility (1995),22251.0


In [22]:
# Build the title x user rating matrix: one row per title, one column per
# userId, cell value = rating (NaN where the user did not rate the title).
#
# pivot() raises ValueError when a (title, userId) pair occurs more than once.
# The earlier de-duplication is keyed on movieId, so two different movies that
# share a title can still produce such repeats. Guard against that here by
# keeping only the first rating of each repeated pair. When there are no
# repeats the input is used untouched, so the result is unchanged.
pivot_input = ratings_and_movies
repeated_pair = pivot_input.duplicated(subset=['title', 'userId'], keep='first')
if repeated_pair.any():
    pivot_input = pivot_input.loc[~repeated_pair]
movies_pivot = pivot_input.pivot(index='title', columns='userId', values='rating')
movies_pivot.head()

userId,1,2,3,9,10,13,15,16,17,18,...,200936,200937,200939,200940,200942,200943,200944,200945,200947,200948
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",,,,,,,,,,,...,,,,,,,,,,
(500) Days of Summer (2009),,,,,,,,,,,...,,,,,,,,,3.0,
*batteries not included (1987),,,,,,,,,,,...,,,,,,,,,,
...And Justice for All (1979),,,,,,,,,,,...,,,,,,,,,,
10 Cloverfield Lane (2016),,,,,,,,,,,...,,,,,,,,4.0,,


In [23]:
# Replace missing ratings (title not rated by that user) with zero.
# NOTE(review): the fill is done in place, presumably to avoid materialising a
# second copy of the large dense title x user frame — confirm before changing.
movies_pivot.fillna(0, inplace=True)
movies_pivot.head()

userId,1,2,3,9,10,13,15,16,17,18,...,200936,200937,200939,200940,200942,200943,200944,200945,200947,200948
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
*batteries not included (1987),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...And Justice for All (1979),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0


In [24]:
# Convert the dense title x user frame into a SciPy CSR sparse matrix
# (rows stay in the same order as movies_pivot.index).
from scipy.sparse import csr_matrix
movies_sparse = csr_matrix(movies_pivot.to_numpy())

In [25]:
# Build a brute-force nearest-neighbours index over the movie rating vectors.
# n_neighbors=5 is the library default, spelled out because it sets how many
# titles each query returns; the distance metric is left at its default.
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(n_neighbors=5, algorithm='brute')
model_knn.fit(movies_sparse)

Fazer previsões de filmes com o modelo treinado

In [26]:
# Recommend movies similar to a reference title.
# The title's row is taken from the pivot and passed as a single query vector
# (one row, one column per user). The reference title itself is among the
# returned neighbours.
query_title = 'Toy Story (1995)'
query_vector = movies_pivot.filter(items=[query_title], axis=0).to_numpy().reshape(1, -1)
distances, sugestions = model_knn.kneighbors(query_vector)
for neighbour_rows in sugestions:
    print(movies_pivot.index[neighbour_rows])

Index(['Toy Story (1995)', 'Toy Story 2 (1999)', 'Aladdin (1992)',
       'Lion King, The (1994)', 'Bug's Life, A (1998)'],
      dtype='object', name='title')
