In [71]:
import pandas as pd
import numpy as np
from datetime import datetime

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer
from sklearn.neighbors import KNeighborsRegressor

In [45]:
links = pd.read_csv('E:\\Python\\RSML-2\\RSML_HW1\\links.csv')
movies = pd.read_csv('E:\\Python\\RSML-2\\RSML_HW1\\movies.csv')
ratings = pd.read_csv('E:\\Python\\RSML-2\\RSML_HW1\\ratings.csv')
tags = pd.read_csv('E:\\Python\\RSML-2\\RSML_HW1\\tags.csv')

In [46]:
tags.head(20)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
5,2,89774,Tom Hardy,1445715205
6,2,106782,drugs,1445715054
7,2,106782,Leonardo DiCaprio,1445715051
8,2,106782,Martin Scorsese,1445715056
9,7,48516,way too long,1169687325


In [47]:
movies.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [48]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-','').split('|'))

In [49]:
movie_genres = [change_string(g) for g in movies.genres.values]

In [50]:
movie_genres[:20]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller',
 'Comedy Drama Romance',
 'Comedy Horror',
 'Adventure Animation Children',
 'Drama',
 'Action Adventure Romance',
 'Crime Drama',
 'Drama Romance',
 'Comedy',
 'Comedy',
 'Action Comedy Crime Drama Thriller']

In [51]:
# добавим тэги
movies_with_tags = movies.join(tags.set_index('movieId'), on='movieId')

In [52]:
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,336.0,pixar,1139046000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,474.0,pixar,1137207000.0
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,567.0,fun,1525286000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,fantasy,1528844000.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,62.0,magic board game,1528844000.0


In [53]:
movies_with_tags.tag.unique()

array(['pixar', 'fun', 'fantasy', ..., 'star wars', 'gintama', 'remaster'],
      dtype=object)

In [54]:
movies_with_tags.dropna(inplace=True)

In [55]:
tag_strings = []
movies2 = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([str(s).replace(' ', '').replace('-', '') for s in group.tag.values]))
    movies2.append(movie)

HBox(children=(FloatProgress(value=0.0, max=1572.0), HTML(value='')))




In [56]:
movies_tags = pd.DataFrame(list(zip(movies2,tag_strings)), columns=['title', 'tag'])
movies_genres_tags = movies.join(movies_tags.set_index('title'), on='title')
movies_genres_tags['genres'] = movies_genres_tags['genres'].apply(change_string)
movies_genres_tags = movies_genres_tags.dropna()
movies_genres_tags.head()

Unnamed: 0,movieId,title,genres,tag
0,1,Toy Story (1995),Adventure Animation Children Comedy Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure Children Fantasy,fantasy magicboardgame RobinWilliams game
2,3,Grumpier Old Men (1995),Comedy Romance,moldy old
4,5,Father of the Bride Part II (1995),Comedy,pregnancy remake
6,7,Sabrina (1995),Comedy Romance,remake


# Строим рекомедацию на жанрах

In [57]:
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(movies_genres_tags.genres.values)

In [73]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [59]:
neig_reg = KNeighborsRegressor(n_neighbors=10, n_jobs=-1, metric='manhattan')
neig_reg.fit(X_train_counts, X_train_tfidf)

KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [60]:
test = change_string('Adventure|Comedy|Fantasy|Crime')

predict = count_vect.transform([test])
X_tfidf = tfidf_transformer.transform(predict)

predicted_movies = neig_reg.kneighbors(X_tfidf, return_distance=True)
predicted_movies

(array([[1.76082037, 1.76082037, 1.76082037, 1.76082037, 1.76082037,
         1.76542145, 1.96404785, 2.05017702, 2.05017702, 2.05017702]]),
 array([[ 624, 1516,  939,  875,  945, 1262,  217,  653, 1296,  275]],
       dtype=int64))

# Строим рекомедацию на тегах

In [61]:
movies_genres_tags.tag

0                                    pixar pixar fun
1          fantasy magicboardgame RobinWilliams game
2                                          moldy old
4                                   pregnancy remake
6                                             remake
                            ...                     
9681                      Comedy funny RachelMcAdams
9692    adventure AliciaVikander videogameadaptation
9709                 JoshBrolin RyanReynolds sarcasm
9710                           EmiliaClarke starwars
9732                   anime comedy gintama remaster
Name: tag, Length: 1574, dtype: object

In [62]:
count_vect_tags = CountVectorizer()
X_train_counts2 = count_vect_tags.fit_transform(movies_genres_tags.tag.values)

In [63]:
tfidf_tags_transformer = TfidfTransformer()
X_train_tfidf2 = tfidf_tags_transformer.fit_transform(X_train_counts2)

In [64]:
neig_reg_tags = KNeighborsRegressor(n_neighbors=10, n_jobs=-1, metric='manhattan')
neig_reg_tags.fit(X_train_counts2, X_train_tfidf2)

KNeighborsRegressor(metric='manhattan', n_jobs=-1, n_neighbors=10)

In [65]:
test_tags = 'mafia funny L.A.'

predict_tags = count_vect_tags.transform([test_tags])
X_tfidf2_tags = tfidf_tags_transformer.transform(predict_tags)

res_tags = neig_reg_tags.kneighbors(X_tfidf2_tags, return_distance=True)
res_tags

(array([[0.9269027 , 0.9269027 , 0.9269027 , 0.9269027 , 0.9269027 ,
         0.9269027 , 0.9269027 , 1.41232319, 1.41232319, 2.0730973 ]]),
 array([[ 528,  301,  180,    7,  322,  526,  398,  673,  990, 1558]],
       dtype=int64))

In [86]:
tfidf_transformer3 = TfidfVectorizer()
tfidf_transformer3.fit(movies_genres_tags['genres'])

TfidfVectorizer()

In [87]:
with_ratings = movies_genres_tags.merge(ratings)

In [96]:
for_model_df = with_ratings[with_ratings['userId']==75]
for_model_df

Unnamed: 0,movieId,title,genres,tag,userId,rating,timestamp
1597,47,Seven (a.k.a. Se7en) (1995),Mystery Thriller,mystery twistending serialkiller,75,2.0,1158968272
3859,260,Star Wars: Episode IV - A New Hope (1977),Action Adventure SciFi,classic spaceaction action scifi EPIC greatsou...,75,5.0,1158989756
8265,515,"Remains of the Day, The (1993)",Drama Romance,Butler Housekeeper,75,2.0,1158989888
8809,541,Blade Runner (1982),Action SciFi Thriller,scifi robots androids artificialintelligence a...,75,4.0,1158989785
9585,589,Terminator 2: Judgment Day (1991),Action SciFi,apocalypse ArnoldSchwarzenegger nuclearwar sci...,75,4.5,1158989827
9970,592,Batman (1989),Action Crime Thriller,superhero,75,3.5,1158989896
12754,903,Vertigo (1958),Drama Mystery Romance Thriller,falling AlfredHitchcock Atmospheric imdbtop250...,75,1.0,1158968143
12920,908,North by Northwest (1959),Action Adventure Mystery Romance Thriller,MountRushmore AlfredHitchcock imdbtop250,75,4.5,1158967045
13075,912,Casablanca (1942),Drama Romance,startofabeautifulfriendship,75,4.0,1158967840
13405,920,Gone with the Wind (1939),Drama Romance War,CivilWar,75,3.0,1158967791


In [101]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [102]:
X, y = tfidf_transformer3.transform(for_model_df['genres']), for_model_df['rating']

In [103]:
from sklearn.model_selection import train_test_split

In [104]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

In [105]:
lr.fit(X_train, y_train)

LinearRegression()

In [106]:
lr.predict(X_test)

array([3.06072253, 3.84845852, 3.01175606, 2.94917055, 4.25259453,
       3.20127224, 1.74701464, 1.6962854 , 3.03518835, 2.04355448])

In [107]:
y_test

35820    4.0
39171    1.0
12754    1.0
9970     3.5
39720    4.0
22168    4.0
17301    4.5
18781    4.0
15809    4.5
8809     4.0
Name: rating, dtype: float64

# Оценить RMSE на тестовой выборке

In [108]:
from sklearn.metrics import mean_squared_error

In [109]:
mean_squared_error(y_test, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [10, 39]