# 1. Modules and functions

In [27]:
import ssl
# NOTE(review): the next line replaces the factory used for default HTTPS contexts
# with the *unverified* one, so TLS certificate checks are skipped for code that
# relies on that default for the rest of the kernel session.
# Presumably a workaround for a certificate error while downloading something --
# TODO confirm it is still needed and remove it otherwise.
ssl._create_default_https_context = ssl._create_unverified_context

import shap
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from itertools import islice, cycle, product

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

from lightfm.data import Dataset
from lightfm import LightFM

from catboost import CatBoostClassifier

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.float_format', lambda x: '%.3f' % x)

# 2. Main

## 2.1 Load Data

In [30]:
# Raw viewing log: one row per (date, user_id, movie_id) with the minutes watched.
interactions = pd.read_parquet(path='interactions.parquet', engine='pyarrow')
print(interactions.shape[0])
interactions.head(n=5)

1800000


Unnamed: 0,year,month,day,user_id,movie_id,watch_duration_minutes
0,2022,10,1,58073,a6889772-f1f4-45bc-9663-85b46fc8499d,4.8
1,2022,10,1,63698,c829f262-ddf1-46b7-a896-a7efc205028c,15.0
2,2022,10,1,66655,91f9d892-a508-4962-91e9-abacd93e0830,51.917
3,2022,10,1,66655,58b805e1-a65c-49f4-b302-865cacaaed8a,22.317
4,2022,10,1,67981,dbc44c07-46e2-4fb0-b57c-8b5172421683,117.0


In [9]:
# Movie catalogue (title, genres, duration, ...), keyed by movie_id.
# NOTE(review): the file name is spelled 'metdata' -- it is kept as is because this
# is the path the notebook actually loads; confirm before renaming the file.
movies_md = pd.read_parquet(path='movies_metdata.parquet', engine='pyarrow')
print(movies_md.shape[0])
movies_md.head(n=5)

226260


Unnamed: 0,movie_id,title,entity_type,genres,actors,director,country,release_world,age_rating,duration
0,395a85a9-2200-4a29-a878-97753c471f79,Свиридовы,Серия,"[""Мелодрамы""]","[""Андрей Мерзликин"",""Алексей Горбунов"",""Елена ...","[""Эльдар Салаватов""]","[""Россия""]",2013-12-15,16.0,49.0
1,6fd5f7b5-8278-4d9a-b03a-54a8bf0bc7c8,Охотники за головами,Серия,"[""Детективы"",""Драмы""]","[""Филипп Янковский"",""Елизавета Боярская"",""Конс...","[""Иван Шурховецкий""]","[""Россия""]",2014-03-03,16.0,48.0
2,b239c99e-d1ea-4c01-b83f-6b5469a6b70a,Молодое поколение,Серия,"[""Комедии"",""Мелодрамы""]","[""Юн Пак"",""Пак Ын-бин"",""Хан Е-ри"",""Ким Мин-сок...","[""Ким Сан-хо"",""Ли Тхэ-гон""]","[""Корея Южная""]",2016-07-22,16.0,65.0
3,9dc02ab1-7bd2-45c2-9b8c-ae29e9813ccf,Художницы,Серия,"[""Документальное""]","[""Катарина Лопаткина""]","[""Школа Masters""]","[""Россия""]",2019-02-01,18.0,27.0
4,54b3b5fe-7d2a-47ba-9a1d-4dc3a3a8ff96,Веселая Астрология,Серия,"[""Мультфильмы""]","[""Стивен Боунс"",""Хашир Хайзал Хаилми"",""Логандр...","[""Кен Фунг""]","[""Малайзия""]",2017-10-01,6.0,3.0


## 2.2 Train/test split

## 2.3 Data preparation using LightFM Dataset (first-level model)

In [10]:
# minimal_duration = 300
# good_movies = interactions.groupby('movie_id')['watch_duration_minutes'].sum().reset_index(drop=False)
# good_movies

In [15]:
# Keep only the interactions whose movie_id is present in the metadata table.
interactions_filtered = interactions.loc[interactions['movie_id'].isin(movies_md['movie_id'])]

# The lengths below are numbers of interaction rows (views), not numbers of
# distinct films, so the report is labelled accordingly.
print(f'''Number of interactions in initial dataset: {len(interactions)}
Number of interactions in filtered dataset: {len(interactions_filtered)}
Number of dropped interactions: {len(interactions) - len(interactions_filtered)}''')

Number of films in initial dataset: 1800000 
Number of filmns in filtered dataset: 1782240
Number of dropped films: 17760


In [16]:
# One row per distinct user_id, in order of first appearance, with a fresh 0..n-1 index.
users = interactions.loc[:, ['user_id']].drop_duplicates(ignore_index=True)
print(f'Number of unique users: {len(users)}')

Number of unique users: 127060


In [17]:
# movie_id -> title lookup, used to print human-readable recommendations.
item_name_mapper = dict(zip(movies_md['movie_id'], movies_md['title']))

# Attach each movie's duration to every interaction and drop incomplete rows
# (the left join leaves NaN in `duration` for movies without a known duration).
duration = movies_md[['movie_id','duration']]
interactions = pd.merge(interactions_filtered, duration, how='left', on='movie_id')
interactions = interactions.dropna()

# A zero (or negative) duration would turn the ratio below into inf/NaN, and a
# single inf would dominate the summed popularity score, so such rows are excluded.
# `.copy()` makes the result an independent frame before a column is added to it.
interactions = interactions.loc[interactions['duration'] > 0].copy()

# Despite its name, `watch_percentage` is a fraction of the movie duration
# (e.g. 0.104), and it can exceed 1 when more minutes were watched than the
# movie lasts.
interactions['watch_percentage'] = interactions['watch_duration_minutes'] / interactions['duration']
interactions.head(5)

Unnamed: 0,year,month,day,user_id,movie_id,watch_duration_minutes,duration,watch_percentage
1,2022,10,1,63698,c829f262-ddf1-46b7-a896-a7efc205028c,15.0,144.0,0.104
4,2022,10,1,67981,dbc44c07-46e2-4fb0-b57c-8b5172421683,117.0,107.0,1.093
5,2022,10,1,68523,7822a15d-0856-4f97-b7fc-6041584aa40b,109.5,121.0,0.905
6,2022,10,1,68680,595c19b8-a22f-4beb-ab9f-5431fd3929ea,7.0,131.0,0.053
7,2022,10,1,68680,9992a169-0f6f-42ba-aebc-815d2fa689e0,114.0,100.0,1.14


In [19]:
def compute_popularity(
    df: pd.DataFrame, 
    item_id: str, max_candidates: int,
    value_col: str = 'watch_percentage') -> np.ndarray:
    
    """
    Rank items by their total score and return the ids of the top ones.

    The score of an item is the SUM (not the mean) of `value_col` over all of
    its rows in `df`, so both the number of views and how much of the item
    was watched contribute to its rank.

    Parameters
    ----------
    df : interactions table containing the `item_id` and `value_col` columns.
    item_id : name of the column that identifies an item.
    max_candidates : maximum number of item ids to return.
    value_col : name of the column that is summed per item
        (defaults to 'watch_percentage', the column used originally).

    Returns
    -------
    np.ndarray with at most `max_candidates` item ids, ordered from the
    highest total score to the lowest.
    """
    
    # Use the pandas group-by sum directly rather than passing the Python builtin
    # `sum` to `agg`, which recent pandas versions deprecate.
    totals = df.groupby(item_id)[value_col].sum()
    popular_titles = totals.sort_values(ascending=False).head(max_candidates).index.values

    return popular_titles

In [20]:
ITEM_COLUMN = 'movie_id'
USER_COLUMN = 'user_id'

# Popularity baseline: ids of the 20 movies with the largest summed watch ratio.
base_recommendations = compute_popularity(interactions,ITEM_COLUMN,20)

# user_id -> list of movie_ids the user has already interacted with.
# `to_dict` has to be *called*; without the parentheses `known_items` would be
# bound to the method object instead of the dictionary.
known_items = interactions_filtered.groupby(USER_COLUMN)[ITEM_COLUMN].apply(list).to_dict()

In [22]:
def fit(
    data: pd.DataFrame,
    item_col: str,
    max_candidates: int = 20) -> np.ndarray:
    """
    "Fit" the popularity baseline by delegating to `compute_popularity`.

    Parameters
    ----------
    data : interactions table passed through to `compute_popularity`.
    item_col : name of the column that identifies an item.
    max_candidates : maximum number of item ids to return (default 20).

    Returns
    -------
    Whatever `compute_popularity` returns for these arguments: an array of
    item ids, most popular first.
    """
    return compute_popularity(data, item_col, max_candidates)

In [23]:
# Top-20 popular movie ids that will be offered to every user.
recommendations = fit(data=interactions, item_col=ITEM_COLUMN, max_candidates=20)

In [24]:
def recommend(
    users: pd.DataFrame,
    recommendations: np.ndarray
    ) -> pd.DataFrame:
    """
    Attach the same list of recommended items to every user.

    Parameters
    ----------
    users : table with one row per user; it is not modified.
    recommendations : sequence of item ids (e.g. the array returned by `fit`).

    Returns
    -------
    A deep copy of `users` with an extra 'rekkos' column; every row holds
    `recommendations`.
    """
    output = users.copy(deep = True)
    # One reference to `recommendations` per row. The row count comes from the
    # frame itself, so no particular user-id column name is required.
    # All rows share the very same object -- do not mutate it in place.
    output['rekkos'] = [recommendations] * len(output)

    return output

In [28]:
# Preview the per-user recommendation table (first five users).
recommend(users=users, recommendations=recommendations).head(n=5)

Unnamed: 0,user_id,rekkos
0,58073,"[e088637d-967f-478b-8158-98b90aded146, 42f5cb3..."
1,63698,"[e088637d-967f-478b-8158-98b90aded146, 42f5cb3..."
2,66655,"[e088637d-967f-478b-8158-98b90aded146, 42f5cb3..."
3,67981,"[e088637d-967f-478b-8158-98b90aded146, 42f5cb3..."
4,68523,"[e088637d-967f-478b-8158-98b90aded146, 42f5cb3..."


In [29]:
# Show the titles behind the recommended movie ids, most popular first.
for movie_id in base_recommendations:
    print(item_name_mapper[movie_id])

Сердце Пармы
Вышка
Барбоскины Team
Три тысячи лет желаний
Гринч
Три кота и море приключений
Босс-молокосос
Один дома
Либерея: Охотники за сокровищами
Пес-самурай и город кошек
Щенячий патруль в кино
Начать сначала
Красная Шапочка
Ирония судьбы в Голливуде
Зверопой 2
Любовники
Холодное сердце
Холодное сердце 2
Гадкий я
Один дома 2: Затерянный в Нью-Йорке
