# **Treinamento do modelo**

**Importar Dados**

In [3]:
# Importar Pacotes
import pandas as pd
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import joblib

In [4]:
# Load the movie catalogue from movies.csv and preview the first rows.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
movies = pd.read_csv('movies.csv', low_memory = False)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
# Load the user ratings from ratings.csv and preview the first rows.
# low_memory=False makes pandas read the file in one pass instead of in chunks.
ratings = pd.read_csv('ratings.csv', low_memory = False)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,17,4.0,944249077
1,1,25,1.0,944250228
2,1,29,2.0,943230976
3,1,30,5.0,944249077
4,1,32,5.0,943228858


**Pré-processamento dos Dados**

In [6]:
# Keep only the movie identifier and title columns.
movie_columns = ['movieId', 'title']
movies = movies.loc[:, movie_columns]
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [7]:
# Keep only who rated, what was rated and the rating itself.
rating_columns = ['userId', 'movieId', 'rating']
ratings = ratings.loc[:, rating_columns]
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,17,4.0
1,1,25,1.0
2,1,29,2.0
3,1,30,5.0
4,1,32,5.0


In [8]:
# Count the non-null ratings each movie received, as a two-column frame
# (movieId, total_ratings).
total_ratings = (
    ratings.groupby('movieId')['rating']
    .count()
    .rename('total_ratings')
    .reset_index()
)

# Attach the count to every movie. The left join keeps movies that have no
# ratings at all; their total_ratings is NaN.
movies = pd.merge(movies, total_ratings, how='left', on='movieId')
movies.head()

Unnamed: 0,movieId,title,total_ratings
0,1,Toy Story (1995),68997.0
1,2,Jumanji (1995),28904.0
2,3,Grumpier Old Men (1995),13134.0
3,4,Waiting to Exhale (1995),2806.0
4,5,Father of the Bride Part II (1995),13154.0


In [9]:
# Drop every row that has a missing value in either table.
movies = movies.dropna()
ratings = ratings.dropna()

In [10]:
# Keep only movies that received at least 1000 ratings.
has_enough_ratings = movies['total_ratings'].ge(1000)
movies = movies.loc[has_enough_ratings]
movies.shape

(4397, 3)

In [11]:
# Keep only the ratings whose movie is still present in `movies`.
kept_movie_ids = movies['movieId']
is_kept_movie = ratings['movieId'].isin(kept_movie_ids)
ratings = ratings.loc[is_kept_movie]
ratings.shape

(28528258, 3)

In [12]:
# Inspect how many ratings each user has: one row per userId, with the
# non-null count of every remaining column.
ratings.groupby('userId').count()

Unnamed: 0_level_0,movieId,rating
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,133,133
2,52,52
3,143,143
4,27,27
5,33,33
...,...,...
200944,275,275
200945,104,104
200946,23,23
200947,61,61


In [13]:
# Number of ratings given by each user (indexed by userId).
ratings_count = ratings.groupby('userId')['rating'].count()

# userIds of the users who gave strictly more than 100 ratings.
is_active_user = ratings_count.gt(100)
y = ratings_count.loc[is_active_user].index

print(y)

Index([     1,      3,     10,     16,     18,     20,     28,     29,     33,
           34,
       ...
       200928, 200930, 200933, 200936, 200937, 200940, 200943, 200944, 200945,
       200948],
      dtype='int64', name='userId', length=76549)


In [14]:
# Keep only the ratings made by the users listed in `y`, i.e. users with more
# than 100 ratings (the threshold used when `y` was built).
from_active_user = ratings['userId'].isin(y)
ratings = ratings.loc[from_active_user]
ratings.shape

(22709318, 3)

In [15]:
# Print a summary of the filtered movies frame (columns, dtypes, non-null counts, memory usage).
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4397 entries, 0 to 86162
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   movieId        4397 non-null   int64  
 1   title          4397 non-null   object 
 2   total_ratings  4397 non-null   float64
dtypes: float64(1), int64(1), object(1)
memory usage: 137.4+ KB


In [16]:
# Print a summary of the filtered ratings frame (columns, dtypes, memory usage).
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Index: 22709318 entries, 0 to 32000203
Data columns (total 3 columns):
 #   Column   Dtype  
---  ------   -----  
 0   userId   int64  
 1   movieId  int64  
 2   rating   float64
dtypes: float64(1), int64(2)
memory usage: 693.0 MB


In [17]:
# Join each rating with its movie's title and rating count. This is an inner
# join on movieId, so only ratings with a matching movie are kept.
ratings_and_movies = pd.merge(ratings, movies, how='inner', on='movieId')
ratings_and_movies.head()

Unnamed: 0,userId,movieId,rating,title,total_ratings
0,1,17,4.0,Sense and Sensibility (1995),22251.0
1,3,17,5.0,Sense and Sensibility (1995),22251.0
2,28,17,4.0,Sense and Sensibility (1995),22251.0
3,29,17,4.0,Sense and Sensibility (1995),22251.0
4,43,17,5.0,Sense and Sensibility (1995),22251.0


In [18]:
# Count the missing values in each column of the joined table.
ratings_and_movies.isna().sum()

Unnamed: 0,0
userId,0
movieId,0
rating,0
title,0
total_ratings,0


In [19]:
# Keep a single row per (userId, movieId) pair; the first occurrence wins.
pair_key = ['userId', 'movieId']
ratings_and_movies = ratings_and_movies.drop_duplicates(subset=pair_key, keep='first')
ratings_and_movies.shape

(22709318, 5)

In [20]:
# Remove the movieId column; the following cells identify movies by title.
ratings_and_movies = ratings_and_movies.drop(columns='movieId')
ratings_and_movies.head()

Unnamed: 0,userId,rating,title,total_ratings
0,1,4.0,Sense and Sensibility (1995),22251.0
1,3,5.0,Sense and Sensibility (1995),22251.0
2,28,4.0,Sense and Sensibility (1995),22251.0
3,29,4.0,Sense and Sensibility (1995),22251.0
4,43,5.0,Sense and Sensibility (1995),22251.0


In [21]:
# Collapse to one row per (title, userId) by averaging the ratings of that pair.
mean_rating_per_pair = ratings_and_movies.groupby(['title', 'userId'])['rating'].mean()
ratings_and_movies = mean_rating_per_pair.reset_index()

In [22]:
# Reshape to a title x user table: one row per title, one column per userId,
# cells holding the rating (NaN where the user did not rate the title).
movies_pivot = pd.pivot(
    ratings_and_movies,
    index='title',
    columns='userId',
    values='rating',
)
movies_pivot.head()

userId,1,3,10,16,18,20,28,29,33,34,...,200928,200930,200933,200936,200937,200940,200943,200944,200945,200948
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",,,,,,,3.0,,,,...,,,,,,,,,,
(500) Days of Summer (2009),,,,,,,4.0,,,,...,,2.5,4.0,,,,,,,
*batteries not included (1987),,,,,,,,,,,...,,,,,,,,,,
...And Justice for All (1979),,,,,,,,,,,...,,,,,,,,,,
10 Cloverfield Lane (2016),,,,,,,,,,,...,,,,,,,,,4.0,


In [23]:
# Replace missing ratings (title/user pairs with no rating) with zero.
# The fill is done in place on the existing pivot table.
movies_pivot.fillna(0, inplace=True)
movies_pivot.head()

userId,1,3,10,16,18,20,28,29,33,34,...,200928,200930,200933,200936,200937,200940,200943,200944,200945,200948
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
(500) Days of Summer (2009),0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,2.5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
*batteries not included (1987),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...And Justice for All (1979),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10 Cloverfield Lane (2016),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0


In [24]:
# Store the pivot values as a CSR sparse matrix; rows follow the order of
# movies_pivot.index.
movies_sparse = csr_matrix(movies_pivot.to_numpy())

In [25]:
# Build a brute-force nearest-neighbours index over the movie rows. The
# distance metric is left at the scikit-learn default.
model_knn = NearestNeighbors(algorithm='brute')
model_knn = model_knn.fit(movies_sparse)  # fit() returns the estimator itself
model_knn

In [26]:
# Persist the fitted model, the pivot table (its index holds the titles) and
# the sparse matrix so that other code can reload them.
joblib.dump(value=model_knn, filename='model_knn.pkl')
joblib.dump(value=movies_pivot, filename='movies_pivot.pkl')
joblib.dump(value=movies_sparse, filename='movies_sparse.pkl')

['movies_sparse.pkl']



---



# **Uso do modelo treinado para um usuário**
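Esta seção pode ser executada em qualquer local do projeto, desde que os arquivos gerados no treinamento (`model_knn.pkl`, `movies_pivot.pkl` e `movies_sparse.pkl`) e o arquivo `movies.csv` estejam disponíveis.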

Importações

In [27]:
# Importar as dependências
import pandas as pd
import numpy as np
import joblib

In [28]:
# Reload the artifacts written by the training section, in the same order.
# NOTE(review): joblib files are pickles; only load files you produced yourself.
model_knn, movies_pivot, movies_sparse = (
    joblib.load(artifact_path)
    for artifact_path in ('model_knn.pkl', 'movies_pivot.pkl', 'movies_sparse.pkl')
)

# The full movie catalogue is needed to translate movieId values into titles.
movies = pd.read_csv('movies.csv', low_memory=False)

Função para obter filmes recomendados

In [29]:
# Function to recommend movies
def recommend_movies(user_movies, user_ratings_dict, n_recommendations=10):
    """Recommend titles similar to the movies a user has rated.

    For every rated movie, its nearest neighbours are fetched from the
    module-level ``model_knn`` and each neighbour accumulates a score of
    ``rating / distance``. The titles with the highest total score are
    returned.

    Relies on the module-level ``movies_pivot`` (titles as index),
    ``movies_sparse`` (queried by the row positions of ``movies_pivot``) and
    ``model_knn``.

    Args:
        user_movies: Titles rated by the user. Titles that are not present in
            ``movies_pivot.index`` are ignored instead of raising.
        user_ratings_dict: Mapping from title to the rating the user gave it.
        n_recommendations: Maximum number of titles to return.

    Returns:
        A list of titles ordered by descending score. Titles the user already
        rated are never returned, and the list may be shorter than
        ``n_recommendations`` (or empty) when too few neighbours were scored.

    Raises:
        KeyError: If a title found in ``movies_pivot`` has no entry in
            ``user_ratings_dict``.
    """
    # Only titles known to the model can be used as query points.
    known_movies = [movie for movie in user_movies if movie in movies_pivot.index]
    if not known_movies or n_recommendations <= 0:
        return []

    # Row positions of the rated movies
    movie_indices = [movies_pivot.index.get_loc(movie) for movie in known_movies]

    # Ask for one extra neighbour because the query movie is normally returned
    # as its own neighbour; never ask for more rows than exist.
    n_neighbors = min(n_recommendations + 1, movies_pivot.shape[0])
    distances, indices = model_knn.kneighbors(movies_sparse[movie_indices], n_neighbors=n_neighbors)

    # One accumulated score per movie row
    recommendations = np.zeros(movies_pivot.shape[0])

    # Lower bound for distances so identical rating vectors (distance 0) give
    # a very large finite score instead of a division by zero.
    min_distance = 1e-9

    for i, movie in enumerate(known_movies):
        rating = user_ratings_dict[movie]
        for distance, movie_index in zip(distances[i], indices[i]):
            # Skip the query movie itself wherever it appears in the result;
            # with tied distances it is not guaranteed to come first.
            if movie_index == movie_indices[i]:
                continue
            recommendations[movie_index] += rating / max(distance, min_distance)

    # Never recommend something the user has already rated.
    recommendations[movie_indices] = 0

    # Rank by score and keep only rows that actually received a score.
    ranked_indices = np.argsort(recommendations)[::-1]
    recommended_indices = ranked_indices[recommendations[ranked_indices] > 0][:n_recommendations]

    # Convert row positions back to titles
    recommended_movies = movies_pivot.index[recommended_indices].tolist()

    return recommended_movies

In [30]:
# Load the ratings given by the user we want recommendations for and
# preview the first rows.
user_ratings = pd.read_csv('ratings_user.csv', low_memory=False)
user_ratings.head()

Unnamed: 0,movieId,rating,timestamp
0,4306,9.0,944249077
1,8360,9.0,944250228


In [31]:
# Keep only the movie identifier and the rating; the timestamp is dropped.
user_rating_columns = ['movieId', 'rating']
user_ratings = user_ratings.loc[:, user_rating_columns]
user_ratings.head()

Unnamed: 0,movieId,rating
0,4306,9.0
1,8360,9.0


In [32]:
# Titles of the catalogue movies whose movieId appears in the user's ratings
# (listed in catalogue row order).
is_rated_by_user = movies['movieId'].isin(user_ratings['movieId'])
user_movies = movies.loc[is_rated_by_user, 'title'].tolist()
user_movies

['Shrek (2001)', 'Shrek 2 (2004)']

In [33]:
# Build the title -> rating mapping by joining the user's ratings with the
# movie catalogue on movieId. Zipping the two tables positionally would pair a
# title with the wrong rating whenever ratings_user.csv and movies.csv list the
# movies in a different order.
rated_titles = user_ratings.merge(movies[['movieId', 'title']], on='movieId', how='inner')
user_ratings_dict = dict(zip(rated_titles['title'], rated_titles['rating']))
user_ratings_dict

{'Shrek (2001)': 9.0, 'Shrek 2 (2004)': 9.0}

Usar o modelo para fazer recomendações

In [34]:
# Run the recommender for this user and print a header followed by one
# recommended title per line.
recommended_movies = recommend_movies(user_movies, user_ratings_dict)
print("\n".join(["Filmes recomendados:", *map(str, recommended_movies)]))

Filmes recomendados:
Ice Age (2002)
Shrek the Third (2007)
Madagascar (2005)
Charlie and the Chocolate Factory (2005)
Spider-Man 2 (2004)
Cars (2006)
Chronicles of Narnia: The Lion, the Witch and the Wardrobe, The (2005)
Ice Age 2: The Meltdown (2006)
Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (2002)
Pirates of the Caribbean: Dead Man's Chest (2006)
