# MovieLens Dataset

### Импорты, создание спарк-сессии <a name='intro'></a>

In [10]:
# Enable the autoreload extension; mode 2 re-imports edited modules before
# each cell is executed, so changes to the project code are picked up live.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

10-Feb-20 19:01:45, matplotlib.pyplot, DEBUG: Loaded backend module://ipykernel.pylab.backend_inline version unknown.


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
import logging
import os
import re
import sys
from collections import Counter
from datetime import datetime
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf
from pyspark.sql.types import DoubleType, StringType, StructType, StructField, TimestampType

from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Put the parent of the current working directory on the import path (once),
# presumably so the project package imported in the next cell can be resolved.
parent_dir = os.path.dirname(os.getcwd())
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

In [4]:
from sponge_bob_magic.datasets.movielens import MovieLens
from sponge_bob_magic.data_preparator.data_preparator import DataPreparator
from sponge_bob_magic.metrics.metrics import HitRateMetric, NDCGMetric
from sponge_bob_magic.models.neuromf_recommender import NeuroMFRecommender
from sponge_bob_magic.splitters.user_log_splitter import RandomUserLogSplitter

from sponge_bob_magic.constants import DEFAULT_CONTEXT
from pyspark.sql.functions import lit

from sponge_bob_magic.utils import  get_spark_session

In [5]:
# Show the full contents of every cell in pandas DataFrames (no truncation of
# wide columns). `None` is the supported "unlimited" value: the legacy `-1`
# sentinel was deprecated in pandas 1.0 and is rejected by pandas 2.x.
pd.set_option("display.max_colwidth", None)

In [6]:
# Obtain the Spark session through the project helper `get_spark_session`.
spark = get_spark_session()
# Bare last expression: the notebook displays the session object.
spark

## Подготовка данных <a name='data-preparator'></a>

In [7]:
# Load the MovieLens "1m" dataset through the project loader.
data = MovieLens("1m")

# Turn the ratings table into a Spark DataFrame and add a constant "context"
# column filled with the library's default context value.
ratings_sdf = spark.createDataFrame(data.ratings)
log = ratings_sdf.withColumn("context", lit(DEFAULT_CONTEXT))

# Print a short preview of the ratings, users and items tables.
data.info()

ratings


Unnamed: 0,user_id,item_id,relevance,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968



users


Unnamed: 0,user_id,gender,age,occupation,zip_code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117



items


Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance





In [8]:
# Split the interaction log into train / test_input / test parts.
# item_test_size=10 with a fixed seed; presumably 10 held-out items per user
# (TODO confirm). `user_test_size` is left at its default (500 was tried before).
# NOTE(review): the meaning of the two positional `True` flags is not visible
# here; pass them by keyword once confirmed against the splitter signature.
splitter = RandomUserLogSplitter(True, True, item_test_size=10, seed=1234)

# NOTE(review): `_split_quantity` is a private method; prefer the splitter's
# public entry point if it offers an equivalent one.
train, test_input, test = splitter._split_quantity(log)

# Row counts of the three resulting frames, shown as the cell output.
train.count(), test_input.count(), test.count()

(939809, 939809, 60400)

## NeuroMF (NeuroMFRecommender)

In [9]:
# Hyperparameters of the NeuroMF recommender, gathered in one place so they
# are easy to inspect and tweak.
nmf_params = dict(learning_rate=0.01, epochs=10, embedding_dimension=100)
nmf = NeuroMFRecommender(**nmf_params)

In [11]:
%%time

nmf.fit(
    log=train,
    user_features=None,
    item_features=None
)

10-Feb-20 19:01:52, root, DEBUG: Проверка датафреймов
10-Feb-20 19:01:53, root, DEBUG: Предварительная стадия обучения (pre-fit)
10-Feb-20 19:02:05, root, DEBUG: Основная стадия обучения (fit)
10-Feb-20 19:02:05, root, DEBUG: Индексирование данных
10-Feb-20 19:02:05, root, DEBUG: Составление батча:
10-Feb-20 19:02:05, root, DEBUG: -- Запись
10-Feb-20 19:02:07, root, DEBUG: -- Считывание
10-Feb-20 19:02:07, root, DEBUG: Обучение модели
10-Feb-20 19:02:07, root, DEBUG: -- Эпоха 0
10-Feb-20 19:02:31, root, DEBUG: -- Текущее значение: 0.9151
10-Feb-20 19:02:31, root, DEBUG: -- Эпоха 1
10-Feb-20 19:02:54, root, DEBUG: -- Текущее значение: 0.4272
10-Feb-20 19:02:54, root, DEBUG: -- Эпоха 2
10-Feb-20 19:03:17, root, DEBUG: -- Текущее значение: 0.2570
10-Feb-20 19:03:17, root, DEBUG: -- Эпоха 3
10-Feb-20 19:03:40, root, DEBUG: -- Текущее значение: 0.2250
10-Feb-20 19:03:40, root, DEBUG: -- Эпоха 4
10-Feb-20 19:04:03, root, DEBUG: -- Текущее значение: 0.2135
10-Feb-20 19:04:03, root, DEBUG: -- 

CPU times: user 3min 27s, sys: 5.1 s, total: 3min 32s
Wall time: 4min 14s


In [12]:
%%time

recs = nmf.predict(
    k=10,
    users=test.select('user_id').distinct(),
    items=test.select('item_id').distinct(),
    context='no_context',
    log=train,
    user_features=None,
    item_features=None,
    filter_seen_items=True
)

10-Feb-20 19:06:39, root, DEBUG: Проверка датафреймов
10-Feb-20 19:06:44, root, DEBUG: Индексирование данных
10-Feb-20 19:06:44, root, DEBUG: Предсказание модели
10-Feb-20 19:06:44, root, DEBUG: -- Запись
10-Feb-20 19:06:45, root, DEBUG: -- Считывание
10-Feb-20 19:06:45, root, DEBUG: Поиск ближайших айтемов с помощью annoy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,
10-Feb-20 19:06:58, root, DEBUG: Обратное преобразование индексов
10-Feb-20 19:06:59, root, DEBUG: Преобразование отрицательных relevance


CPU times: user 13.3 s, sys: 361 ms, total: 13.6 s
Wall time: 47.1 s


In [13]:
%%time

hr = HitRateMetric()
print(hr.calculate(recs, test, k=10))

0.304635761589404
CPU times: user 34.1 ms, sys: 12.2 ms, total: 46.3 ms
Wall time: 2.63 s


In [14]:
%%time

ndcg = NDCGMetric()
print(ndcg.calculate(recs, test, k=10))

0.04386336461293571
CPU times: user 68.3 ms, sys: 17.5 ms, total: 85.8 ms
Wall time: 4.1 s
