In [1]:
import pandas as pd
import numpy as np

In [2]:
links = pd.read_csv('../input/links.csv')
movies = pd.read_csv('../input/movies.csv')
ratings = pd.read_csv('../input/ratings.csv')
tags = pd.read_csv('../input/tags.csv')

In [3]:
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [4]:
tags.head(3)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992


In [5]:
tags['tag'] = tags['tag'].apply(lambda x: x + '|')

In [6]:
tags = pd.DataFrame(tags.groupby('movieId')['tag'].sum())

In [7]:
tags.head(3)

Unnamed: 0_level_0,tag
movieId,Unnamed: 1_level_1
1,pixar|pixar|fun|
2,fantasy|magic board game|Robin Williams|game|
3,moldy|old|


In [8]:
movies_tags = movies.merge(tags, on='movieId', how='left')
movies_tags['tag'] = movies_tags['tag'].fillna('')

In [9]:
movies_tags.head(3)

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,pixar|pixar|fun|
1,2,Jumanji (1995),Adventure|Children|Fantasy,fantasy|magic board game|Robin Williams|game|
2,3,Grumpier Old Men (1995),Comedy|Romance,moldy|old|


Совместим жанры и тэги в одну колонку "description"

In [10]:
movies_tags['description'] = movies_tags.apply(lambda x: x['genres'].replace('|', ' ') + ' ' + 
                                               x['tag'].replace('|', ' '), axis=1)
movies_tags = movies_tags.drop('genres', axis=1)
movies_tags = movies_tags.drop('tag', axis=1)

In [11]:
movies_tags.head(3)

Unnamed: 0,movieId,title,description
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy pi...
1,2,Jumanji (1995),Adventure Children Fantasy fantasy magic board...
2,3,Grumpier Old Men (1995),Comedy Romance moldy old


In [12]:
movies_list = []
description_list = []

for mov, desc in movies_tags[['title', 'description']].values:
    movies_list.append(mov)
    description_list.append(desc.replace('r:', ' ').replace('.', ' ').replace('-', ' ').replace(':', ' '))

In [13]:
movies_tags.shape[0] == len(description_list)

True

In [14]:
description_list[:10]

['Adventure Animation Children Comedy Fantasy pixar pixar fun ',
 'Adventure Children Fantasy fantasy magic board game Robin Williams game ',
 'Comedy Romance moldy old ',
 'Comedy Drama Romance ',
 'Comedy pregnancy remake ',
 'Action Crime Thriller ',
 'Comedy Romance remake ',
 'Adventure Children ',
 'Action ',
 'Action Adventure Thriller ']

Теперь преобразуем наш description_list в признаки и добавим их к фильмам

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [16]:
coutn_v = CountVectorizer()

In [17]:
X_train = coutn_v.fit_transform(description_list)

In [18]:
X_train.toarray(), X_train.toarray().shape

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64), (9742, 1748))

In [19]:
tfidf = TfidfTransformer()
X_train_col = tfidf.fit_transform(X_train)

In [20]:
X_train_col.toarray(), X_train_col.toarray().shape

(array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]]), (9742, 1748))

In [21]:
from tqdm import tqdm_notebook
for i in tqdm_notebook(range(X_train_col.shape[1])):
    col_name = 'd{}'.format(i)
    movies_tags[col_name] = pd.Series(X_train_col.toarray()[:, i])

HBox(children=(IntProgress(value=0, max=1748), HTML(value='')))




In [22]:
movies_tags = movies_tags.drop('description', axis=1)

In [23]:
movies_tags.head(3)

Unnamed: 0,movieId,title,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15,d16,d17,d18,d19,d20,d21,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31,d32,d33,d34,d35,d36,d37,...,d1708,d1709,d1710,d1711,d1712,d1713,d1714,d1715,d1716,d1717,d1718,d1719,d1720,d1721,d1722,d1723,d1724,d1725,d1726,d1727,d1728,d1729,d1730,d1731,d1732,d1733,d1734,d1735,d1736,d1737,d1738,d1739,d1740,d1741,d1742,d1743,d1744,d1745,d1746,d1747
0,1,Toy Story (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.149231,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.114624,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.310561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men (1995),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Получили таблицу признаков из описания фильмов. Теперь найдём наиболее похожие фильмы к фильму Джуманжи

In [24]:
train_data = movies_tags.iloc[:, 2:]

In [25]:
test_data = movies_tags[movies_tags['title'] == 'Jumanji (1995)'].iloc[:, 2:]

In [26]:
from sklearn.neighbors import NearestNeighbors

In [27]:
neighbor = NearestNeighbors(n_neighbors=5, n_jobs=-1, metric='euclidean')

In [28]:
neighbor.fit(train_data)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=-1, n_neighbors=5, p=2, radius=1.0)

In [29]:
predict = neighbor.kneighbors(test_data, return_distance=True)

In [30]:
predict
movies.iloc[predict[1][0]]

Unnamed: 0,movieId,title,genres
1,2,Jumanji (1995),Adventure|Children|Fantasy
9692,184471,Tomb Raider (2018),Action|Adventure|Fantasy
6254,46972,Night at the Museum (2006),Action|Comedy|Fantasy|IMAX
767,1009,Escape to Witch Mountain (1975),Adventure|Children|Fantasy
1617,2161,"NeverEnding Story, The (1984)",Adventure|Children|Fantasy


## Теперь сделаем рейтинг по схожим пользователям поставивших хорошие оценки

Хорошие оценки будут те, которые выше их среднего рейтинга

In [32]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


In [33]:
ratings.shape

(100836, 4)

In [34]:
mean_rating = ratings.groupby('userId')['rating'].mean()

In [35]:
ratings['mean_rating'] = ratings['userId'].apply(lambda x: mean_rating[x])

In [36]:
ratings['good_rating'] = ratings.apply(lambda x: x['rating'] if x['mean_rating'] <= 
                                       x['rating'] else np.NaN, axis=1)

In [37]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,mean_rating,good_rating
0,1,1,4.0,964982703,4.366379,
1,1,3,4.0,964981247,4.366379,
2,1,6,4.0,964982224,4.366379,
3,1,47,5.0,964983815,4.366379,5.0
4,1,50,5.0,964982931,4.366379,5.0


Теперь занулим все строчки где рейтинг ниже среднего и уберем лишние колонки

In [38]:
ratings = ratings[ pd.isnull( ratings['good_rating'] ) == 0 ]
ratings = ratings.drop('mean_rating', axis=1)
ratings = ratings.drop('good_rating', axis=1)
ratings = ratings.drop('timestamp', axis=1)

In [39]:
ratings.head(3)

Unnamed: 0,userId,movieId,rating
3,1,47,5.0
4,1,50,5.0
6,1,101,5.0


In [40]:
ratings.shape

(54732, 3)

In [41]:
movies_ratings = ratings.merge(movies, on='movieId', how='left')
movies_ratings = movies_ratings.drop('movieId', axis=1)
movies_ratings = movies_ratings.drop('rating', axis=1)
movies_ratings = movies_ratings.drop('genres', axis=1)
movies_ratings.head(3) # оставили в таблице только то что нам пригодится

Unnamed: 0,userId,title
0,1,Seven (a.k.a. Se7en) (1995)
1,1,"Usual Suspects, The (1995)"
2,1,Bottle Rocket (1996)


In [42]:
movies_ratings['userId'] = movies_ratings['userId'].apply(lambda x: str(x) + ' ')
movies_userid = movies_ratings.groupby('title')['userId'].sum()
movies_userid = movies_userid.reset_index(name='userId')

In [43]:
movies_userid.head()

Unnamed: 0,title,userId
0,'71 (2014),610
1,'Hellboy': The Seeds of Creation (2004),332
2,'Salem's Lot (2004),345
3,'Til There Was You (1997),345
4,"'burbs, The (1989)",12 172 217 307 520 555 561


Получили название фильма и пользователей поставивших им хорошие оценки, теперь делаем что и в предыдущей части и выводим похожие фильмы к "Хэнког"

In [44]:
movies_list = []
users_list = []

for mov, user in movies_userid[['title', 'userId']].values:
    movies_list.append(mov)
    users_list.append(user)

In [45]:
movies_userid.shape[0] == len(users_list)

True

In [46]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [47]:
coutn_v = CountVectorizer()

In [48]:
X_train = coutn_v.fit_transform(users_list)

In [49]:
X_train.toarray(), X_train.toarray().shape

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 1, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64), (7138, 601))

In [50]:
tfidf = TfidfTransformer()
X_train_col = tfidf.fit_transform(X_train)

In [51]:
X_train_col.toarray(), X_train_col.toarray().shape

(array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.38072426, ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]]), (7138, 601))

In [52]:
from tqdm import tqdm_notebook
for i in tqdm_notebook(range(X_train_col.shape[1])):
    col_name = 'd{}'.format(i)
    movies_userid[col_name] = pd.Series(X_train_col.toarray()[:, i])

HBox(children=(IntProgress(value=0, max=601), HTML(value='')))




In [53]:
movies_userid = movies_userid.drop('userId', axis=1)

In [54]:
movies_userid.head(3)

Unnamed: 0,title,d0,d1,d2,d3,d4,d5,d6,d7,d8,d9,d10,d11,d12,d13,d14,d15,d16,d17,d18,d19,d20,d21,d22,d23,d24,d25,d26,d27,d28,d29,d30,d31,d32,d33,d34,d35,d36,d37,d38,...,d561,d562,d563,d564,d565,d566,d567,d568,d569,d570,d571,d572,d573,d574,d575,d576,d577,d578,d579,d580,d581,d582,d583,d584,d585,d586,d587,d588,d589,d590,d591,d592,d593,d594,d595,d596,d597,d598,d599,d600
0,'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
train_data = movies_userid.iloc[:, 1:]

In [56]:
test_data = movies_userid[movies_userid['title'] == 'Jumanji (1995)'].iloc[:, 1:]

In [57]:
neighbor.fit(train_data)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=-1, n_neighbors=5, p=2, radius=1.0)

In [58]:
result = neighbor.kneighbors(test_data, return_distance=True)

In [59]:
result
movies.iloc[result[1][0]]

Unnamed: 0,movieId,title,genres
3364,4572,Black Rain (1989),Action|Crime|Drama
2797,3739,Trouble in Paradise (1932),Comedy|Romance
3587,4917,MacArthur (1977),Drama
3582,4911,Jabberwocky (1977),Adventure|Comedy|Fantasy
3051,4085,Beverly Hills Cop (1984),Action|Comedy|Crime|Drama


Интересно, получили совершенно другие фильмы