<a href="https://colab.research.google.com/github/polina-minaeva/Content-Based-Recommendation-System/blob/main/2_%D0%A0%D0%B5%D0%BA%D0%BE%D0%BC%D0%B5%D0%BD%D0%B4%D0%B0%D1%86%D0%B8%D0%B8_%D0%BD%D0%B0_%D0%BE%D1%81%D0%BD%D0%BE%D0%B2%D0%B5_%D1%81%D0%BE%D0%B4%D0%B5%D1%80%D0%B6%D0%B0%D0%BD%D0%B8%D1%8F.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Построим рекомендательную систему на основе содержания на датасете MoveiLens

1. Загрузка данных

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from datetime import datetime
from tqdm.notebook import tqdm

In [2]:
!wget 'https://drive.google.com/uc?id=1m0rwReR09achL0xTM6QPoN4tykz5bOMx' -O MovieLens.zip

--2024-04-10 14:51:26--  https://drive.google.com/uc?id=1m0rwReR09achL0xTM6QPoN4tykz5bOMx
Resolving drive.google.com (drive.google.com)... 74.125.126.138, 74.125.126.139, 74.125.126.100, ...
Connecting to drive.google.com (drive.google.com)|74.125.126.138|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1m0rwReR09achL0xTM6QPoN4tykz5bOMx [following]
--2024-04-10 14:51:26--  https://drive.usercontent.google.com/download?id=1m0rwReR09achL0xTM6QPoN4tykz5bOMx
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 64.233.183.132, 2607:f8b0:4001:c0b::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|64.233.183.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 847695 (828K) [application/octet-stream]
Saving to: ‘MovieLens.zip’


2024-04-10 14:51:28 (75.5 MB/s) - ‘MovieLens.zip’ saved [847695/847695]



In [3]:
!unzip MovieLens.zip

Archive:  MovieLens.zip
  inflating: links.csv               
  inflating: movies.csv              
  inflating: ratings.csv             
  inflating: tags.csv                


In [4]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [5]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
ratings.head() #каждая простановка оценки

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
tags.head() #каждая простановка тега

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


2. Построение Tfidf-векторов для фильмов по их жанрам и тегам

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
#приведем все жанры к единому виду
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|'))

In [8]:
movie_genres = [change_string(g) for g in movies.genres.values]
movie_genres[:10]

['Adventure Animation Children Comedy Fantasy',
 'Adventure Children Fantasy',
 'Comedy Romance',
 'Comedy Drama Romance',
 'Comedy',
 'Action Crime Thriller',
 'Comedy Romance',
 'Adventure Children',
 'Action',
 'Action Adventure Thriller']

In [9]:
movies['genres_chang'] = movie_genres

In [None]:
movies.head()

Unnamed: 0,movieId,title,genres,genres_chang
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy Drama Romance
4,5,Father of the Bride Part II (1995),Comedy,Comedy


In [None]:
movies.shape

(9742, 4)

In [10]:
movies_with_tags = movies.merge(tags, on='movieId')
movies_with_tags.head()

Unnamed: 0,movieId,title,genres,genres_chang,userId,tag,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy,336,pixar,1139045764
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy,474,pixar,1137206825
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy,567,fun,1525286013
3,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy,62,fantasy,1528843929
4,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy,62,magic board game,1528843932


In [11]:
movies_with_tags.dropna(inplace=True)

In [12]:
#теперь займемся тегами, приведем их тоже к единому виду
def change_string(s):
    return str(s).replace(' ', '').replace('-', '').lower()

In [13]:
tag_strings = []
movies_2 = []

for movie, group in tqdm(movies_with_tags.groupby('title')):
    tag_strings.append(' '.join([change_string(s) for s in group.tag.values]))
    movies_2.append(movie)

  0%|          | 0/1572 [00:00<?, ?it/s]

In [None]:
tag_strings[:10]

['artistic funny humorous inspiring intelligent quirky romance zooeydeschanel',
 'lawyers',
 'creepy suspense',
 'shakespearesortof',
 'dogs remake',
 'disney',
 'terrorism',
 'court claustrophobic confrontational earnest gooddialogue greatscreenplay gritty motivational thoughtprovoking',
 'stranded',
 'markruffalo']

In [14]:
movies_3 = pd.DataFrame(data={'title': movies_2, 'tags': tag_strings})
movies_3.head()

Unnamed: 0,title,tags
0,(500) Days of Summer (2009),artistic funny humorous inspiring intelligent ...
1,...And Justice for All (1979),lawyers
2,10 Cloverfield Lane (2016),creepy suspense
3,10 Things I Hate About You (1999),shakespearesortof
4,101 Dalmatians (1996),dogs remake


In [15]:
#добавим в таблицу измененные теги
movies_4 = movies.merge(movies_3, on='title')
movies_4.head()

Unnamed: 0,movieId,title,genres,genres_chang,tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy,pixar pixar fun
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy,fantasy magicboardgame robinwilliams game
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance,moldy old
3,5,Father of the Bride Part II (1995),Comedy,Comedy,pregnancy remake
4,7,Sabrina (1995),Comedy|Romance,Comedy Romance,remake


In [16]:
#соединим теги и жанры в общий признак
movies_4['genres_tags'] = movies_4['genres_chang'] + movies_4['tags']
movies_4.head()

Unnamed: 0,movieId,title,genres,genres_chang,tags,genres_tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure Animation Children Comedy Fantasy,pixar pixar fun,Adventure Animation Children Comedy Fantasypix...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure Children Fantasy,fantasy magicboardgame robinwilliams game,Adventure Children Fantasyfantasy magicboardga...
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy Romance,moldy old,Comedy Romancemoldy old
3,5,Father of the Bride Part II (1995),Comedy,Comedy,pregnancy remake,Comedypregnancy remake
4,7,Sabrina (1995),Comedy|Romance,Comedy Romance,remake,Comedy Romanceremake


In [17]:
#получим вектора
tfidf_tg = TfidfVectorizer()
X_train = tfidf_tg.fit_transform(movies_4['genres_tags'])
X_train

<1574x2090 sparse matrix of type '<class 'numpy.float64'>'
	with 5734 stored elements in Compressed Sparse Row format>

In [18]:
import pandas as pd

In [19]:
#выведем датафрейм с векторами
feats = pd.DataFrame(X_train.toarray())
feats.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2080,2081,2082,2083,2084,2085,2086,2087,2088,2089
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
feats.columns = feats.columns.astype(str) #переведем числовые данные в строки, чтобы потом без проблем вложить данные в модель лин. регрессии
feats.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2080,2081,2082,2083,2084,2085,2086,2087,2088,2089
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


3. Сборка всех признаков для каждого факта взаимодействия пользователя с фильмом

In [21]:
#у нас есть фильмы и их вектора
feats['title'] = movies_4['title']
feats['movieId'] = movies_4['movieId']
feats.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2082,2083,2084,2085,2086,2087,2088,2089,title,movieId
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Toy Story (1995),1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Jumanji (1995),2
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Grumpier Old Men (1995),3
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Father of the Bride Part II (1995),5
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Sabrina (1995),7


In [None]:
#у нас есть оценки пользователей фильмам, это будет целевая переменная, которую предсказываем – y
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [23]:
#а это факты каждого взаимодействия пользователя с фильмами, к ним мы присоединим признаки фильмов - их Tfidf-вектора и средние оценки
edr = ratings[['userId', 'movieId', 'rating']]
edr.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [None]:
edr.shape

(100836, 2)

In [24]:
feats2 = edr.merge(feats, how='left', on='movieId')
feats2.head()

Unnamed: 0,userId,movieId,rating,0,1,2,3,4,5,6,...,2081,2082,2083,2084,2085,2086,2087,2088,2089,title
0,1,1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Toy Story (1995)
1,1,3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Grumpier Old Men (1995)
2,1,6,4.0,,,,,,,,...,,,,,,,,,,
3,1,47,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Seven (a.k.a. Se7en) (1995)
4,1,50,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Usual Suspects, The (1995)"


In [25]:
#вот средний рейтинг фильма, это тоже признак
mov_rat = ratings.groupby(['movieId']).mean()
mov_rat['rating']

movieId
1         3.920930
2         3.431818
3         3.259615
4         2.357143
5         3.071429
            ...   
193581    4.000000
193583    3.500000
193585    3.500000
193587    3.500000
193609    4.000000
Name: rating, Length: 9724, dtype: float64

In [26]:
feats3 = feats2.merge(mov_rat['rating'], on='movieId')
feats3.head()

Unnamed: 0,userId,movieId,rating_x,0,1,2,3,4,5,6,...,2082,2083,2084,2085,2086,2087,2088,2089,title,rating_y
0,1,1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Toy Story (1995),3.92093
1,5,1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Toy Story (1995),3.92093
2,7,1,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Toy Story (1995),3.92093
3,15,1,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Toy Story (1995),3.92093
4,17,1,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Toy Story (1995),3.92093


In [27]:
#и также будем указывать среднюю оценку каждого пользователя
us_rat = ratings.groupby(['userId']).mean()
us_rat['rating']

userId
1      4.366379
2      3.948276
3      2.435897
4      3.555556
5      3.636364
         ...   
606    3.657399
607    3.786096
608    3.134176
609    3.270270
610    3.688556
Name: rating, Length: 610, dtype: float64

In [28]:
feats4 = feats3.merge(us_rat['rating'], on='userId')
feats4 = feats4.rename(columns={'rating_x': 'rating', 'rating_y': 'aver_rating', 'rating': 'aver_user_rat'})
feats4.head()

Unnamed: 0,userId,movieId,rating,0,1,2,3,4,5,6,...,2083,2084,2085,2086,2087,2088,2089,title,aver_rating,aver_user_rat
0,1,1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Toy Story (1995),3.92093,4.366379
1,1,3,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Grumpier Old Men (1995),3.259615,4.366379
2,1,6,4.0,,,,,,,,...,,,,,,,,,3.946078,4.366379
3,1,47,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Seven (a.k.a. Se7en) (1995),3.975369,4.366379
4,1,50,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"Usual Suspects, The (1995)",4.237745,4.366379


4. Модель линейной регрессии

In [29]:
#избавимся от лишних признаков
feats4 = feats4.drop(['title', 'userId', 'movieId'], axis=1)

In [30]:
feats4.head()

Unnamed: 0,rating,0,1,2,3,4,5,6,7,8,...,2082,2083,2084,2085,2086,2087,2088,2089,aver_rating,aver_user_rat
0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.92093,4.366379
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.259615,4.366379
2,4.0,,,,,,,,,,...,,,,,,,,,3.946078,4.366379
3,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.975369,4.366379
4,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.237745,4.366379


In [31]:
feats4.dropna(inplace=True) #избавимся от NaNов

In [None]:
feats4.shape

(48289, 2093)

In [32]:
y = feats4['rating'] #определим целевую переменную

In [33]:
feats4 = feats4.drop(['rating'], axis=1)

In [34]:
X = feats4 #определим X

In [35]:
from sklearn.preprocessing import StandardScaler

In [36]:
scaler = StandardScaler()

In [37]:
X = scaler.fit_transform(X) #проведем масштабирование данных

In [38]:
from sklearn.model_selection import train_test_split

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) #разделим выборку

In [40]:
from sklearn.linear_model import LinearRegression

In [41]:
reg = LinearRegression().fit(X_train, y_train) #обучим модель

In [42]:
pred = reg.predict(X_test) #получим прогнозы

In [43]:
from sklearn.metrics import mean_squared_error
from math import sqrt

sqrt(mean_squared_error(y_test, pred)) #и посчитаем ошибку

2662691247689.2524