# MovieLens Dataset

### Импорты, создание спарк-сессии <a name='intro'></a>

In [1]:
# Load the autoreload extension and reload imported modules before every cell
# execution, so edits to local source files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
import logging
import os
import re
import sys
from collections import Counter
from datetime import datetime
from time import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf
from pyspark.sql.types import DoubleType, StringType, StructType, StructField, TimestampType

from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Optional: uncomment to pin the Python interpreter that PySpark should use.
# os.environ["PYSPARK_PYTHON"] = "/anaconda3/envs/sbm/bin/python"

In [4]:
# Put the parent of the current working directory on the import path
# (appended once only) so that packages living next to the notebook folder
# can be imported by the cells below.
parent_dir = os.path.dirname(os.getcwd())
already_importable = parent_dir in sys.path
if not already_importable:
    sys.path.append(parent_dir)

In [5]:
from sponge_bob_magic.data_loader.datasets import download_movielens
from sponge_bob_magic.data_preparator.data_preparator import DataPreparator
from sponge_bob_magic.metrics.metrics import HitRateMetric, NDCGMetric
from sponge_bob_magic.models.neuromf_recommender import NeuroMFRecommender
from sponge_bob_magic.splitters.user_log_splitter import RandomUserLogSplitter

In [6]:
# Show the full contents of every column in pandas DataFrames (no truncation).
# pandas >= 1.0 expects None for "unlimited"; the legacy sentinel -1 is
# deprecated there and rejected by newer releases. Older pandas only accepts an
# integer and raises ValueError for None, so fall back to -1 in that case.
try:
    pd.options.display.max_colwidth = None
except ValueError:
    pd.options.display.max_colwidth = -1

In [7]:
# Spark resources for the local session.
spark_memory = "4g"
spark_cores = "*"

# Base directory under which Spark keeps its scratch space ("<user_home>/tmp").
# It can be overridden with the NOTEBOOK_TMP_HOME environment variable; the
# default is derived from the current user's home directory instead of being
# hard-coded to one developer's machine.
user_home = os.environ.get(
    "NOTEBOOK_TMP_HOME",
    os.path.join(os.path.expanduser("~"), "recommends", "tmp"),
)

# Build (or reuse) a local Spark session with Hive support enabled.
spark = (
    SparkSession
    .builder
    .config('spark.driver.memory', spark_memory)
    .config('spark.local.dir', os.path.join(user_home, "tmp"))
    .master(f'local[{spark_cores}]')
    .enableHiveSupport()
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

In [8]:
# Quieten the py4j logger: only warnings and above are emitted.
spark_logger = logging.getLogger(name='py4j')
spark_logger.setLevel(logging.WARNING)

In [9]:
# Configure the root logger to print DEBUG-and-above messages to the stream
# handler's default stream with a timestamped format.
logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s, %(name)s, %(levelname)s: %(message)s',
                              datefmt='%d-%b-%y %H:%M:%S')

# Re-running this cell must not stack another handler on the root logger
# (that would print every message several times), so drop the handler left
# behind by a previous run before attaching a fresh one.
for stale_handler in [h for h in logger.handlers if h.get_name() == "notebook-stream"]:
    logger.removeHandler(stale_handler)

hdlr = logging.StreamHandler()
hdlr.set_name("notebook-stream")
hdlr.setFormatter(formatter)
logger.addHandler(hdlr)
logger.setLevel(logging.DEBUG)

## Подготовка данных <a name='data-preparator'></a>

In [10]:
# Directory that holds the datasets, relative to the notebook's working directory.
path_data = '../sponge_bob_magic/data/'

# makedirs(exist_ok=True) creates missing parent directories too and does not
# fail if the directory already exists, so no separate existence check is needed.
os.makedirs(path_data, exist_ok=True)

# Download the ml-100k dataset only when its folder is not present yet.
if not os.path.exists(os.path.join(path_data, "ml-100k")):
    download_movielens(path_data, "ml-100k")

# Path of the log file that is loaded below.
path_log = os.path.join(path_data, "ml-100k", "u.data")

In [11]:
# Data preparator bound to the Spark session; used below to load the log file.
dp = DataPreparator(spark)

In [12]:
# Presumably maps the preparator's field names (keys) to the column names
# found in the input file (values) — verify against DataPreparator.
log_columns = {'user_id': 'userId', 'item_id': 'movieId'}

# NOTE(review): path_log points at ml-100k/u.data, while the userId/movieId
# header names used here and the recorded row count of this frame (1000209)
# look like a different MovieLens file — confirm that the path and the read
# options match the data that is actually loaded.
df = dp.transform_log(
    path_log,
    format_type='csv',
    columns_names=log_columns,
    date_format=None,
    header=True,
)

In [13]:
# Preview the first three rows of the prepared log.
df.show(3)

+-------+-------+-------------------+----------+---------+
|user_id|item_id|          timestamp|   context|relevance|
+-------+-------+-------------------+----------+---------+
|      1|   1193|1999-05-01 00:00:00|no_context|      1.0|
|      1|    661|1999-05-01 00:00:00|no_context|      1.0|
|      1|    914|1999-05-01 00:00:00|no_context|      1.0|
+-------+-------+-------------------+----------+---------+
only showing top 3 rows



In [14]:
# Total number of rows in the prepared log.
df.count()

1000209

In [16]:
# NOTE(review): the meaning of the two positional booleans is not visible
# from here, and `_split_quantity` is a private method — confirm both and
# prefer a public entry point if the splitter offers one.
splitter = RandomUserLogSplitter(spark, True, True)
train, test_input, test = splitter._split_quantity(df)

# Row counts of the three returned frames, shown as the cell output.
tuple(part.count() for part in (train, test_input, test))

(994169, 994169, 6040)

## NeuroMF

In [17]:
# Hyperparameters of the recommender, kept together so they are easy to tune.
neuromf_params = {
    'learning_rate': 0.01,
    'epochs': 10,
    'embedding_dimension': 100,
}
ncf = NeuroMFRecommender(spark, **neuromf_params)

In [18]:
%%time

# Presumably the preparation step that precedes training; it receives the
# training log only (no user or item features).
# NOTE(review): `_pre_fit` is a private method — confirm whether a public
# fit method should be called instead.
ncf._pre_fit(
    log=train,
    user_features=None,
    item_features=None
)

CPU times: user 31.7 ms, sys: 7.64 ms, total: 39.3 ms
Wall time: 13.1 s


In [19]:
%%time

# Train the model on the training log (no user or item features); the
# recorded log output shows one loss value per epoch.
# NOTE(review): `_fit_partial` is a private method — confirm whether a public
# fit method should be called instead.
ncf._fit_partial(
    log=train,
    user_features=None,
    item_features=None
)

29-Jan-20 15:53:17, root, DEBUG: Индексирование данных
29-Jan-20 15:53:17, root, DEBUG: Составление батча:
29-Jan-20 15:53:17, root, DEBUG: -- Запись
29-Jan-20 15:53:19, root, DEBUG: -- Считывание
29-Jan-20 15:53:19, root, DEBUG: Обучение модели
29-Jan-20 15:53:19, root, DEBUG: -- Эпоха 0
29-Jan-20 15:53:45, root, DEBUG: -- Текущее значение: 0.9162
29-Jan-20 15:53:45, root, DEBUG: -- Эпоха 1
29-Jan-20 15:54:10, root, DEBUG: -- Текущее значение: 0.4249
29-Jan-20 15:54:10, root, DEBUG: -- Эпоха 2
29-Jan-20 15:54:34, root, DEBUG: -- Текущее значение: 0.2546
29-Jan-20 15:54:34, root, DEBUG: -- Эпоха 3
29-Jan-20 15:54:59, root, DEBUG: -- Текущее значение: 0.2233
29-Jan-20 15:54:59, root, DEBUG: -- Эпоха 4
29-Jan-20 15:55:24, root, DEBUG: -- Текущее значение: 0.2125
29-Jan-20 15:55:24, root, DEBUG: -- Эпоха 5
29-Jan-20 15:55:48, root, DEBUG: -- Текущее значение: 0.2071
29-Jan-20 15:55:48, root, DEBUG: -- Эпоха 6
29-Jan-20 15:56:12, root, DEBUG: -- Текущее значение: 0.2024
29-Jan-20 15:56:12,

CPU times: user 3min 40s, sys: 4.71 s, total: 3min 44s
Wall time: 4min 17s


In [20]:
%%time

recs = ncf.predict(
    k=10,
    users=test.select('user_id').distinct(),
    items=test.select('item_id').distinct(),
    context='no_context',
    log=train,
    user_features=None,
    item_features=None,
    filter_seen_items=True
)

29-Jan-20 16:51:12, root, DEBUG: Проверка датафреймов
29-Jan-20 16:51:15, root, DEBUG: Индексирование данных
29-Jan-20 16:51:15, root, DEBUG: Предсказание модели
29-Jan-20 16:51:15, root, DEBUG: -- Запись
29-Jan-20 16:51:16, root, DEBUG: -- Считывание
29-Jan-20 16:51:16, root, DEBUG: Поиск ближайших айтемов с помощью annoy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,
29-Jan-20 16:51:27, root, DEBUG: Обратное преобразование индексов
29-Jan-20 16:51:28, root, DEBUG: Преобразование отрицательных relevance


CPU times: user 11.8 s, sys: 252 ms, total: 12.1 s
Wall time: 40.5 s


In [21]:
%%time

hr = HitRateMetric(spark)
print(hr.calculate(recs, test, k=10))

0.0402317880794702
CPU times: user 6.1 ms, sys: 2.12 ms, total: 8.23 ms
Wall time: 2.22 s


In [22]:
%%time

ndcg = NDCGMetric(spark)
print(ndcg.calculate(recs, test, k=10))

0.022395286988700216
CPU times: user 46.9 ms, sys: 11 ms, total: 57.9 ms
Wall time: 2.85 s
