### Использовать dataset [MovieLens](https://grouplens.org/datasets/movielens/latest/)
- Построить рекомендации (регрессия, предсказываем оценку) на фичах:
    - TF-IDF на тегах и жанрах
    - Средние оценки (+ median, variance, etc.) пользователя и фильма
- Оценить RMSE на тестовой выборке

In [1]:
# import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns

from datetime import datetime
from math import sqrt
from tqdm import tqdm_notebook
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer, CountVectorizer
from sklearn.metrics import accuracy_score, roc_auc_score, precision_score, recall_score, mean_squared_error
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# from pylab import rcParams
# rcParams['figure.figsize'] = (14,7)

import warnings
warnings.simplefilter('ignore')

# import matplotlib
# matplotlib.style.use('ggplot')

# sns.set()
%matplotlib inline
# %config InlineBackend.figure_format = 'retina'

# !ls *.csv

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')
# links = pd.read_csv('links.csv')

In [3]:
display(movies.head(3))
display(ratings.head(3))
display(tags.head(3))

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224


Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992


In [4]:
def change_string(s):
    return ' '.join(s.replace(' ', '').replace('-', '').split('|')) # Убираем лишние символы (применим для 'Жанры')


def extract_year(row):
    match = re.search('\(\D*\d{4}\D*\)', row)
    if match:
        return int(re.search('\d{4}', match[0])[0]) # Получить новую фичу 'Год'
    return 0


def movies_preprocessing(movies):
    movies['genres_'] = [change_string(i) for i in movies['genres'].values] # Обработка 'Жанры'
    movies['year'] = movies['title'].apply(extract_year) # Добавляем фичу 'Год'
    movies.drop('genres', axis='columns', inplace=True) # Удаляем не обработанный столбец 'Жанры'
    movies.columns = ['movieId', 'title', 'genres', 'year'] # Переименовываем колонки
    movies = movies[movies['year']!=0] # Через срез оставим только корректные данные, по фиче 'Год' нули не учитываем 
    movies.drop('title', axis='columns', inplace=True) # Удалил 'Название фильмов' они не нужны для дальнейшей обработки
    return movies


def tags_preprocessing(tags):
    tags['tag'] = tags['tag'].str.lower() # Приведем к нижнему регистру
    groupby_tags = tags.groupby('movieId')[['tag']].agg(' '.join) # Группируем 'Тэги' через объединение через пробел
    groupby_tags.columns = ['tags'] # Убираем lvl после группировки (путем преопределения названия колонки)
    tags = pd.merge(tags, groupby_tags, left_on='movieId', right_index=True) # Полученный результат группировки объединем с данными файла 'Тэги'
    tags = tags.drop(['tag', 'timestamp', 'userId'], axis=1) # Удаляем лишние колонки
    tags = tags.drop_duplicates(subset=['movieId'], keep='first') # Удаляем дубликаты
    return tags

#### TF-IDF на 'genres'

In [5]:
movies = movies_preprocessing(movies)
display(movies.head())

Unnamed: 0,movieId,genres,year
0,1,Adventure Animation Children Comedy Fantasy,1995
1,2,Adventure Children Fantasy,1995
2,3,Comedy Romance,1995
3,4,Comedy Drama Romance,1995
4,5,Comedy,1995


In [6]:
def tfidf_genres(movies):
    
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(movies['genres'])

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) # Получаем векторное представление
    
    for key, value in count_vect.vocabulary_.items():
        movies[key+'_genres'] = X_train_tfidf.toarray().T[value] # Записываем полученный результат
    
    movies.drop('genres', axis='columns', inplace=True) # Удаляем 'Жанры' т.к. теперь у нас есть векторное представление
    movies = pd.get_dummies(movies, columns=['year']) # get_dummies для 'Года' 
    
    return movies

In [7]:
movies = tfidf_genres(movies)
movies.head()

Unnamed: 0,movieId,adventure_genres,animation_genres,children_genres,comedy_genres,fantasy_genres,romance_genres,drama_genres,action_genres,crime_genres,...,year_2009,year_2010,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018
0,1,0.41683,0.516249,0.504865,0.267511,0.483001,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,2,0.512335,0.0,0.620541,0.0,0.593668,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,3,0.0,0.0,0.0,0.570835,0.0,0.821065,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,4,0.0,0.0,0.0,0.504938,0.0,0.726281,0.466426,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


#### TF-IDF на 'tags'

In [8]:
tags = tags_preprocessing(tags)
display(tags.head())

Unnamed: 0,movieId,tags
0,60756,funny highly quotable will ferrell comedy funn...
3,89774,boxing story mma tom hardy
6,106782,drugs leonardo dicaprio martin scorsese stock ...
9,48516,way too long leonardo dicaprio suspense twist ...
10,431,al pacino gangster mafia


In [9]:
def tfidf_tags(tags):
    
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(tags['tags'])

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) # Получаем векторное представление
    
    for key, value in count_vect.vocabulary_.items():
        tags[key+'_tags'] = X_train_tfidf.toarray().T[value] # Записываем полученный результат
    
    tags.drop('tags', axis='columns', inplace=True) # Удаляем 'Тэги' т.к. теперь у нас есть векторное представление
    
    return tags

In [10]:
tags = tfidf_tags(tags)
display(tags.head())

Unnamed: 0,movieId,funny_tags,highly_tags,quotable_tags,will_tags,ferrell_tags,comedy_tags,boxing_tags,story_tags,mma_tags,...,uncomfortable_tags,unsettling_tags,sinbad_tags,cinema_tags,really_tags,wesley_tags,snipes_tags,picture_tags,70mm_tags,austere_tags
0,60756,0.469966,0.214121,0.207271,0.567675,0.590822,0.137833,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,89774,0.0,0.0,0.0,0.0,0.0,0.0,0.400252,0.39037,0.529518,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,106782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,48516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Получим дополнительные фичи для target (по условию задачи)

In [11]:
rating_count = ratings.groupby('movieId', sort=False)['rating'].count()
rating_mean = ratings.groupby('movieId', sort=False)[['rating']].mean()

#### Объеденим предобработанные данные в один dataframe

In [12]:
def merge_data(tags, movies, rating_count, rating_mean):
    data = tags.merge(movies, on='movieId')
    data = pd.merge(data, pd.DataFrame(rating_count).rename(columns={'rating':'rating_count'}), left_on='movieId', right_index=True)
    data = pd.merge(data, rating_mean.rename(columns={'rating':'mean_rating'}), left_on='movieId', right_index=True)
    return data

In [13]:
data = merge_data(tags, movies, rating_count, rating_mean)
data.head()

Unnamed: 0,movieId,funny_tags,highly_tags,quotable_tags,will_tags,ferrell_tags,comedy_tags,boxing_tags,story_tags,mma_tags,...,year_2011,year_2012,year_2013,year_2014,year_2015,year_2016,year_2017,year_2018,rating_count,mean_rating
0,60756,0.469966,0.214121,0.207271,0.567675,0.590822,0.137833,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,28,3.553571
1,89774,0.0,0.0,0.0,0.0,0.0,0.0,0.400252,0.39037,0.529518,...,1,0,0,0,0,0,0,0,11,3.727273
2,106782,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,1,0,0,0,0,0,54,3.916667
3,48516,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,107,4.252336
4,431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,42,3.738095


#### Разбиваем данные:
    1. X и y
    2. train и test

In [14]:
X = data.drop(columns=['mean_rating'])
y = data['mean_rating'] # Предсказываем среднее значение

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#### Стандартизируем данные

In [16]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#### Построим несколько Регрессионных моделей

In [17]:
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, BaggingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from mlxtend.regressor import StackingCVRegressor, StackingRegressor

In [18]:
# model_LR = LinearRegression()
model_ABR = AdaBoostRegressor()
model_BR = BaggingRegressor()
model_GBR = GradientBoostingRegressor()
# model_RFR = RandomForestRegressor(n_estimators=27, max_depth=8, min_samples_leaf=5, max_features=0.5, n_jobs=-1)
# model_SGDR = SGDRegressor(alpha = 0.01, max_iter = 1000)

model_meta_RFR = RandomForestRegressor(n_estimators=27, max_depth=8, min_samples_leaf=5, max_features=0.5, n_jobs=-1)

In [19]:
stack = StackingCVRegressor(regressors=(
#     model_LR, 
    model_ABR, 
    model_BR, 
    model_GBR, 
#     model_RFR, 
#     model_SGDR
),meta_regressor=model_meta_RFR,use_features_in_secondary=True)

In [20]:
stack.fit(X_train, y_train)

StackingCVRegressor(cv=5,
                    meta_regressor=RandomForestRegressor(bootstrap=True,
                                                         criterion='mse',
                                                         max_depth=8,
                                                         max_features=0.5,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=5,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         n_estimators=27,
                                                         n_jobs=-1,
                                                      

In [21]:
def rmse(y_train, y_pred):
     return np.sqrt(mean_squared_error(y_train, y_pred))

def cv_rmse(model, X=X_train, y=y_train):    
    return np.sqrt(-cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=kf))

kf = KFold(n_splits=10, shuffle=True, random_state=555)

In [22]:
models = {
#     'LinearRegression': model_LR,
    'AdaBoostRegressor': model_ABR,
    'BaggingRegressor': model_BR,
    'GradientBoostingRegressor': model_GBR, 
#     'RandomForestRegressor': model_RFR,
#     'SGDRegressor': model_SGDR, 
    'StackingCVRegressor_meta_RandomForestRegressor': model_meta_RFR
}

In [29]:
predictions = {}
predictions_s = {}

predictions_t = {}
predictions_ss = {}

scores = {}
scores_s = {}

scores_t = {}
scores_ss = {}

# model.fit(X_train, y_train)
# y_pred = model.predict(X_test)
# print(accuracy_score(y_test, y_pred))

for name, model in models.items():
    start = datetime.now()
    print('[{}] Running {}'.format(start, name))
    
    model.fit(X_train, y_train)
    
    predictions[name] = np.expm1(model.predict(X_train))
    predictions_t[name] = np.expm1(model.predict(X_test))
    
    predictions_s[name] = model.predict(X_train)
    predictions_ss[name] = model.predict(X_test)
    
    score = cv_rmse(model, X=X_train, y=y_train)
    score_t = cv_rmse(model, X=X_test, y=y_test)
    
    scores[name] = (score.mean(), score.std())
    scores_t[name] = (score_t.mean(), score_t.std())
    
    scores_s[name] = (r2_score(y_train, model.predict(X_train)), explained_variance_score(y_train, model.predict(X_train)))
    scores_ss[name] = (r2_score(y_test, model.predict(X_test)), explained_variance_score(y_test, model.predict(X_test)))    
    
    end = datetime.now()
    print('[{}] Finished Running {} in {:.2f}s'.format(end, name, (end - start).total_seconds()))
    print('[{}] {} _score:\ntrain --- RMSE: {:.6f} / Std: {:.6f} / r2_score: {:.6f} / explained_variance_score: {:.6f}'.format(
        datetime.now(), name, scores[name][0], scores[name][1], scores_s[name][0], scores_s[name][1]))
    print('test --- RMSE: {:.6f} / Std: {:.6f} / r2_score: {:.6f} / explained_variance_score: {:.6f}\n'.format(
        scores_t[name][0], scores_t[name][1], scores_ss[name][0],scores_ss[name][1]))

print('\n[END]')

[2020-02-09 18:26:26.888474] Running AdaBoostRegressor
[2020-02-09 18:26:38.046349] Finished Running AdaBoostRegressor in 11.16s
[2020-02-09 18:26:38.046694] AdaBoostRegressor _score:
train --- RMSE: 0.500458 / Std: 0.044155 / r2_score: 0.152785 / explained_variance_score: 0.173344
test --- RMSE: 0.507388 / Std: 0.066877 / r2_score: 0.118769 / explained_variance_score: 0.129857

[2020-02-09 18:26:38.046772] Running BaggingRegressor
[2020-02-09 18:26:48.278542] Finished Running BaggingRegressor in 10.23s
[2020-02-09 18:26:48.278700] BaggingRegressor _score:
train --- RMSE: 0.478443 / Std: 0.051553 / r2_score: 0.853960 / explained_variance_score: 0.854426
test --- RMSE: 0.523150 / Std: 0.083694 / r2_score: 0.104104 / explained_variance_score: 0.105145

[2020-02-09 18:26:48.278926] Running GradientBoostingRegressor
[2020-02-09 18:27:48.251983] Finished Running GradientBoostingRegressor in 59.97s
[2020-02-09 18:27:48.252140] GradientBoostingRegressor _score:
train --- RMSE: 0.464602 / Std:

In [None]:
pass