In [68]:
import pandas as pd
import numpy as np

from tqdm import tqdm_notebook

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

%matplotlib inline

In [71]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

1) найдём пользователя оценки которого будем предсказывать, для этого выберем одного пльзователя, который поставил много оценок и посмотрим на его средний балл, чтобы случайно не взять того, кто поставил везде одинаковые оценки

In [90]:
cnt_estimation = ratings.groupby('userId', as_index=False).count().sort_values('rating', ascending=False)
cnt_estimation.columns = ['userId1','movieId','rating_cnt','timestamp']
mean_estimation = ratings.groupby('userId', as_index=False).mean().sort_values('rating', ascending=False)
mean_estimation.columns = ['userId2','movieId','rating_mean','timestamp']
answer = cnt_estimation.join(mean_estimation, on=None,
    how='left',
    lsuffix='userId1',
    rsuffix='userId2',
    sort=False,
) 
answer[answer['rating_cnt']<500].head(20)[['userId1','rating_cnt', 'rating_mean']]

# искусственно отсечём тех трудяг которые поставили оценку больше 500-ста фильмам и возьмём пользователя под номеров  368

Unnamed: 0,userId1,rating_cnt,rating_mean
56,57,476,3.392857
380,381,474,3.542194
367,368,469,2.842217
508,509,467,3.216274
468,469,465,3.673118
559,560,458,3.567686
461,462,455,3.406593
291,292,446,3.30157
20,21,443,3.260722
596,597,443,3.977427


2) теперь фильтруем таблицу только по одному пользователю, чтобы определить какие фильмы будем предсказывать

In [292]:
ratings_368 = ratings[ratings['userId']==368]
ratings_275 = ratings[ratings['userId']==275] # и его тоже проверим

In [92]:
ratings_368.head()

Unnamed: 0,userId,movieId,rating,timestamp
55396,368,3,3.0,971273951
55397,368,6,4.0,971275527
55398,368,10,3.0,971276726
55399,368,16,4.0,971275668
55400,368,21,3.0,975828964


3) делаем tf idf и обогатим жанры годом выпуска и тегами

In [102]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [204]:
all_tags = {}
for i in tags.iterrows():
    try:
        all_tags[i[1]['movieId']] = all_tags[i[1]['movieId']]+'|'+i[1]['tag']
    except:
        all_tags[i[1]['movieId']] =i[1]['tag']
movie_tags = pd.DataFrame(pd.Series(all_tags))
movie_tags['moveId'] = movie_tags.index
movie_tags.columns = [['tag','moveId']]

# формируем df т.к. каждый тег указан в отдельной строке, то немного извратимся

# теперь соединяем нашу таблицу с тегами с таблицей с фильмами

movie_join_tag = movies.join(movie_tags, on=None,
    how='left',
    lsuffix='movieId',
    rsuffix='movieId',
    sort=False,
)

In [205]:
movie_join_tag.columns = ['movieId', 'title', 'genres', 'tag', 'moveId'] # даем нормальные названия, потому что\
# по умолчанию какие то скобочки приплелись
movie_join_tag.fillna(value = '',inplace=True) # удаляем Nan иначе столбцы не сконкатенируются

In [206]:
movie_join_tag['year']= str(movie_join_tag['title']).split('(')[1].split(')')[0] # добываем год
movie_join_tag['idf'] = movie_join_tag['year']+'|'+movie_join_tag['genres']+'|'+movie_join_tag['tag']

In [207]:
movie_join_tag.head() # любуемся на результат

Unnamed: 0,movieId,title,genres,tag,moveId,year,idf
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,1995,1995|Adventure|Animation|Children|Comedy|Fantasy|
1,2,Jumanji (1995),Adventure|Children|Fantasy,pixar|pixar|fun,1.0,1995,1995|Adventure|Children|Fantasy|pixar|pixar|fun
2,3,Grumpier Old Men (1995),Comedy|Romance,fantasy|magic board game|Robin Williams|game,2.0,1995,1995|Comedy|Romance|fantasy|magic board game|R...
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,moldy|old,3.0,1995,1995|Comedy|Drama|Romance|moldy|old
4,5,Father of the Bride Part II (1995),Comedy,,,1995,1995|Comedy|


In [60]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

осталось дело за малым, превратить в матрицу, соединить с фильмами которые отметил выбранный нами пользователь, разделить и предсказать

In [208]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [209]:
movie_genres = []
for i in movie_join_tag.idf.values:
    movie_genres.append(i.replace(' ', '').replace('-', '').replace('|', ' '))

In [210]:
movie_genres

['1995 Adventure Animation Children Comedy Fantasy ',
 '1995 Adventure Children Fantasy pixar pixar fun',
 '1995 Comedy Romance fantasy magicboardgame RobinWilliams game',
 '1995 Comedy Drama Romance moldy old',
 '1995 Comedy ',
 '1995 Action Crime Thriller pregnancy remake',
 '1995 Comedy Romance ',
 '1995 Adventure Children remake',
 '1995 Action ',
 '1995 Action Adventure Thriller ',
 '1995 Comedy Drama Romance ',
 '1995 Comedy Horror politics president',
 '1995 Adventure Animation Children ',
 '1995 Drama ',
 '1995 Action Adventure Romance politics president',
 '1995 Crime Drama ',
 '1995 Drama Romance Mafia',
 '1995 Comedy JaneAusten',
 '1995 Comedy ',
 '1995 Action Comedy Crime Drama Thriller ',
 '1995 Comedy Crime Thriller ',
 '1995 Crime Drama Horror Mystery Thriller Hollywood',
 '1995 Action Crime Thriller serialkiller',
 '1995 Drama SciFi ',
 '1995 Drama Romance ',
 '1995 Drama alcoholism',
 '1995 Children Drama Shakespeare',
 '1995 Drama Romance ',
 '1995 Adventure Drama Fan

In [211]:
## Альтернативный способ
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(movie_genres)

In [212]:
tfidf_vectorizer.vocabulary_

{'1995': 7,
 'adventure': 23,
 'animation': 55,
 'children': 185,
 'comedy': 218,
 'fantasy': 354,
 'pixar': 761,
 'fun': 384,
 'romance': 844,
 'magicboardgame': 603,
 'robinwilliams': 836,
 'game': 389,
 'drama': 306,
 'moldy': 649,
 'old': 724,
 'action': 14,
 'crime': 242,
 'thriller': 1014,
 'pregnancy': 779,
 'remake': 824,
 'horror': 476,
 'politics': 769,
 'president': 782,
 'mafia': 601,
 'janeausten': 536,
 'mystery': 675,
 'hollywood': 471,
 'serialkiller': 885,
 'scifi': 872,
 'alcoholism': 31,
 'shakespeare': 890,
 'innetflixqueue': 504,
 'kidnapping': 562,
 'highschool': 462,
 'teacher': 993,
 'timetravel': 1016,
 'bradpitt': 146,
 'brucewillis': 154,
 'mindfuck': 641,
 'postapocalyptic': 772,
 'twistending': 1045,
 'animalmovie': 54,
 'pigs': 760,
 'villainnonexistentornotneededforgoodstory': 1064,
 'deathpenalty': 271,
 'nun': 717,
 'war': 1075,
 'twins': 1043,
 'chickflick': 183,
 'funny': 385,
 'paulrudd': 747,
 'quotable': 804,
 'seenmorethanonce': 882,
 'emma': 327,

In [226]:
matrix = pd.DataFrame(X_train_tfidf.toarray())
matrix[['movieId', 'title']]=movies[['movieId', 'title']]

In [268]:
matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1099,1100,1101,1102,1103,1104,1105,1106,movieId,title
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.135915,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,Toy Story (1995)
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.048316,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,Jumanji (1995)
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.058946,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,Grumpier Old Men (1995)
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.07141,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4,Waiting to Exhale (1995)
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.456119,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,Father of the Bride Part II (1995)


In [227]:
pd.DataFrame(X_train_tfidf.toarray()).shape

(9742, 1107)

In [228]:
movies.shape # на всякий проверил, что обе таблицы одного разммера (число строк в смысле)

(9742, 3)

In [238]:
ratings_368.head()

Unnamed: 0,userId,movieId,rating,timestamp
55396,368,3,3.0,971273951
55397,368,6,4.0,971275527
55398,368,10,3.0,971276726
55399,368,16,4.0,971275668
55400,368,21,3.0,975828964


In [245]:
ratings_368.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 469 entries, 55396 to 55864
Data columns (total 4 columns):
userId       469 non-null int64
movieId      469 non-null int64
rating       469 non-null float64
timestamp    469 non-null int64
dtypes: float64(1), int64(3)
memory usage: 38.3 KB


In [246]:
matrix[['movieId',7,8]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9742 entries, 0 to 9741
Data columns (total 3 columns):
movieId    9742 non-null int64
7          9742 non-null float64
8          9742 non-null float64
dtypes: float64(2), int64(1)
memory usage: 228.4 KB


In [269]:
df = ratings_368.join(matrix, on=None,
    how='left',
    lsuffix='movieId',
    rsuffix='movieId',
    sort=False,
)

# не знаю почему, но этот метод не хочет работать... вмето значений из матрицы одни наны

In [270]:
df.head()

Unnamed: 0,userId,movieIdmovieId,rating,timestamp,0,1,2,3,4,5,...,1099,1100,1101,1102,1103,1104,1105,1106,movieIdmovieId.1,title
55396,368,3,3.0,971273951,,,,,,,...,,,,,,,,,,
55397,368,6,4.0,971275527,,,,,,,...,,,,,,,,,,
55398,368,10,3.0,971276726,,,,,,,...,,,,,,,,,,
55399,368,16,4.0,971275668,,,,,,,...,,,,,,,,,,
55400,368,21,3.0,975828964,,,,,,,...,,,,,,,,,,


In [271]:
df[['movieIdmovieId',7]].head() # возможно дело в одинаковых названиях столбцов

Unnamed: 0,movieIdmovieId,movieIdmovieId.1,7
55396,3,,
55397,6,,
55398,10,,
55399,16,,
55400,21,,


In [328]:
df = ratings_368.merge(matrix, how = 'left', left_on='movieId', right_on='movieId') # а вот этот норм

In [267]:
df[['movieId',7]].head()

Unnamed: 0,movieId,7
0,3,0.058946
1,6,0.078417
2,10,0.202789
3,16,0.269549
4,21,0.216832


4) теперь осталось самое простое надеюсь предсказать оценку

In [272]:
from sklearn.linear_model import LogisticRegression # предсказывать нужно всего 5 значений, так что логистическая
from sklearn.model_selection import train_test_split

In [353]:
def user_predict(user_id): # обернём весь труд в функцию, чтобы можно было удобно проверять кого угодно
    ratings_us = ratings[ratings['userId']==user_id]
    df = ratings_us.merge(matrix, how = 'left', left_on='movieId', right_on='movieId')
    X = df.drop([ 'userId','movieId','rating', 'timestamp','title'], axis='columns')
    X = X.astype('float64')
    y = df['rating'].astype('int64')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    # обучаем модель
    model = LogisticRegression()
    model.fit( X_train, y_train )
    predictions = model.predict_proba( X_test )
    return model.score(X_test, y_test)

In [355]:
import warnings
warnings.filterwarnings("ignore")
for i in answer['userId1'][:40]:
    print(user_predict(i)) 

0.40925925925925927
0.4838709677419355
0.4597156398104265
0.3485254691689008
0.5111111111111111
0.4444444444444444
0.44841269841269843
0.4139344262295082
0.5022421524663677
0.4881516587677725
0.5
0.4854368932038835
0.41836734693877553
0.29743589743589743
0.3915343915343915
0.31382978723404253
0.34806629834254144
0.6022727272727273
0.5202312138728323
0.3869047619047619
0.3532934131736527
0.39215686274509803
0.3972602739726027
0.4520547945205479
0.5241379310344828
0.44680851063829785
0.3602941176470588
0.4307692307692308
0.43846153846153846
0.3700787401574803
0.43902439024390244
0.45901639344262296
0.6
0.3275862068965517
0.3652173913043478
0.40350877192982454
0.27358490566037735
0.6730769230769231
0.41346153846153844
0.4807692307692308


В итоге, на кого как работает, кто-то предсказуем до 0.6, кто-то нет