In [1]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [2]:
# Load the four dataset tables (links, movies, ratings, tags) from CSV.
# The generator yields one DataFrame per file, in the same order as the names
# on the left-hand side.
links, movies, ratings, tags = (
    pd.read_csv(f'../../datasets/{table_name}.csv')
    for table_name in ('links', 'movies', 'ratings', 'tags')
)

In [3]:
# Preview the ratings table. The frame is loaded under the name `ratings`;
# `ratings_all` is not defined anywhere in this notebook and raises NameError
# on a fresh kernel.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [3]:
def change_string(s):
    """Convert a pipe-delimited label string into a space-separated one.

    Spaces and hyphens inside the labels are removed, so every label collapses
    into a single token (``"Sci-Fi|Film-Noir"`` becomes ``"SciFi FilmNoir"``).

    Parameters
    ----------
    s : str
        Labels separated by ``|``.

    Returns
    -------
    str
        The same labels separated by single spaces.
    """
    # One pass over the characters: delete ' ' and '-', turn '|' into ' '.
    return s.translate({ord(' '): None, ord('-'): None, ord('|'): ' '})

In [4]:
# Turn every movie's genre list into one whitespace-separated "document".
movie_genres = movies['genres'].map(change_string).tolist()

# Create the three estimators up front; they are fitted in pipeline order below.
count_vect = CountVectorizer()
tfidf_transformer = TfidfTransformer()
neigh = NearestNeighbors(metric='euclidean', n_neighbors=7, n_jobs=-1)

# Raw token counts -> TF-IDF weights -> nearest-neighbour index.
X_train_counts = count_vect.fit_transform(movie_genres)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
neigh.fit(X_train_tfidf)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [5]:
# Sanity check: query the genre index with an arbitrary genre combination.
test = change_string("Adventure|Comedy|Fantasy|Crime")

# Only transform() is called here (no re-fitting), so the query is encoded with
# the vocabulary and IDF weights learned from the movie genres.
predict = count_vect.transform([test])
X_tfidf2 = tfidf_transformer.transform(predict)
# With return_distance=True, kneighbors returns a (distances, indices) pair.
res = neigh.kneighbors(X_tfidf2, return_distance=True)
# res[1][0] holds the neighbour row positions for the single query row; they are
# positional, hence .iloc.
# NOTE(review): assumes `movies` still has the row order it had when `neigh`
# was fitted - confirm no cell reorders or filters it in between.
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres
29183,127040,Fantomas (Fantômas) (1964),Adventure|Comedy|Crime|Fantasy
29184,127042,Fantomas Unleashed (Fantômas se déchaîne) (1965),Adventure|Comedy|Crime|Fantasy
29185,127044,Fantomas vs. Scotland Yard (Fantômas contre Sc...,Adventure|Comedy|Crime|Fantasy
12784,60074,Hancock (2008),Action|Adventure|Comedy|Crime|Fantasy
19038,94015,Mirror Mirror (2012),Adventure|Comedy|Fantasy
38217,148886,Dinosaur Island (1994),Adventure|Comedy|Fantasy
16496,82854,Gulliver's Travels (2010),Adventure|Comedy|Fantasy


In [6]:
# Prepare the tags: bring them to the same "a|b|c" format the genres use.
# Left-join every tag row onto its movie and drop incomplete rows, which also
# removes the movies that have no tag at all.
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId').dropna()

# Group by movieId, not by title: titles are not guaranteed to be unique, and
# grouping by title merges the tags of different movies that share a title and
# then assigns them to only one of those movies.
grouped = movies_with_tags.groupby('movieId')
tags_by_movie = grouped['tag'].agg(
    # Strip spaces and hyphens so each tag becomes a single token, then join
    # all tags of the movie with '|' (row order inside a group is preserved).
    lambda movie_tags: '|'.join(
        str(t).replace(' ', '').replace('-', '') for t in movie_tags
    )
)
titles_by_movie = grouped['title'].first()

# Parallel lists: movies_list[i] is the title whose tag string is
# tag_strings_list[i]. They are ordered by title as before; movies that share
# a title now get one entry each.
title_order = titles_by_movie.sort_values(kind='stable').index
movies_list = titles_by_movie.loc[title_order].tolist()
tag_strings_list = tags_by_movie.loc[title_order].tolist()

# A single vectorised lookup replaces the per-group boolean scan of `movies`.
# Movies without tags get an empty string, as before.
movies['tags'] = movies['movieId'].map(tags_by_movie).fillna("")

  0%|          | 0/45935 [00:00<?, ?it/s]

In [7]:
# Build the nearest-neighbour index over the movie tags.
# Each movie's "a|b|c" tag string becomes one whitespace-separated "document".
movie_tags = movies['tags'].map(change_string).tolist()

# Create the three estimators up front; they are fitted in pipeline order below.
count_vect_tags = CountVectorizer()
tfidf_transformer_tags = TfidfTransformer()
neigh_tags = NearestNeighbors(metric='euclidean', n_neighbors=7, n_jobs=-1)

# Raw token counts -> TF-IDF weights -> nearest-neighbour index.
X_train_counts_tags = count_vect_tags.fit_transform(movie_tags)
X_train_tfidf_tags = tfidf_transformer_tags.fit_transform(X_train_counts_tags)
neigh_tags.fit(X_train_tfidf_tags)

NearestNeighbors(metric='euclidean', n_jobs=-1, n_neighbors=7)

In [8]:
# Sanity check: query the tag index with arbitrary tags.
test = change_string('pixar|pixar|fun')

# Only transform() is called here (no re-fitting), so the query is encoded with
# the vocabulary and IDF weights learned from the movie tags.
predict = count_vect_tags.transform([test])
X_tfidf2 = tfidf_transformer_tags.transform(predict)

# With return_distance=True, kneighbors returns a (distances, indices) pair.
res = neigh_tags.kneighbors(X_tfidf2, return_distance=True)
# res[1][0] holds the neighbour row positions for the single query row; they are
# positional, hence .iloc.
# NOTE(review): assumes `movies` still has the row order it had when
# `neigh_tags` was fitted - confirm no cell reorders or filters it in between.
movies.iloc[res[1][0]]

Unnamed: 0,movieId,title,genres,tags
2271,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,animation|Disney|Pixar|insects|KevinSpacey|opp...
3028,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Pixar|sequelbetterthanoriginal|abandonment|ani...
14509,72356,Partly Cloudy (2009),Animation|Children|Comedy|Fantasy,Pixar|shortfilm|Pixar|shortfilm|Pixar|memasa's...
4791,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,funny|Pixar|Comedy|funny|Pixar|animated|animat...
41760,157296,Finding Dory (2016),Adventure|Animation|Comedy,adventure|animation|pixar|animation|computeran...
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,animated|buddymovie|Cartoon|cgi|comedy|compute...
11048,45517,Cars (2006),Animation|Children|Comedy,redemption|villainnonexistentornotneededforgoo...
