# Scenario Integration Test

Лаборатория по искусственному интеллекту, Сбербанк. 

Авторы: 
[Борис Шминке](<mailto:Shminke.B.A@omega.sbrf.ru>), 
[Роза Айсина](<mailto:Aysina.R.M@omega.sbrf.ru>). 

О чем: вызов сценариев с разными моделями.
В качестве данных используется датасет MovieLens (версия `ml-latest-small`, около 100 тысяч оценок). 

## Содержание

1. [Импорты, создание спарк-сессии](#intro)
2. [Загрузка данных](#data-loader)
3. [Подготовка данных](#data-preparator)
4. [Сценарии с разными моделями](#scenario)
    1. [Получение сценария через фабрику](#get-scenario)
    2. [Обучение сценария](#fit-scenario)
    3. [Получение рекомендаций](#predict-scenario)

## Импорты, создание спарк-сессии <a name="intro"></a>

In [1]:
# Enable the autoreload extension in mode 2, so imported modules are reloaded
# before each cell is executed.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
import logging
import os
import sys
from datetime import datetime

import matplotlib.pyplot as plt
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql import functions as sf

In [3]:
# Put the parent of the current working directory on the import path
# (presumably so that the `sponge_bob_magic` package can be imported when the
# notebook is run from a sub-folder of the repository — confirm).
parent_dir, _ = os.path.split(os.getcwd())
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

In [4]:
from sponge_bob_magic.data_loader.datasets import download_movielens
from sponge_bob_magic.data_preparator.data_preparator import DataPreparator

from sponge_bob_magic.splitters import log_splitter
from sponge_bob_magic.splitters import user_log_splitter
from sponge_bob_magic.metrics import metrics

from sponge_bob_magic.models.popular_recomennder import PopularRecommender
from sponge_bob_magic.models.als_recommender import ALSRecommender
from sponge_bob_magic.models.knn_recommender import KNNRecommender

from sponge_bob_magic.scenarios.main_scenario.main_factory import MainScenarioFactory

In [5]:
# Show full cell contents in pandas DataFrames instead of truncating them.
# `None` is the supported "no limit" value: `-1` was deprecated in pandas 1.0
# and is rejected with a ValueError by pandas 2.x.
pd.options.display.max_colwidth = None

In [6]:
# Local Spark session shared by the cells below.
spark_memory = "5g"  # value for spark.driver.memory
spark_cores = "*"  # substituted into the master URL: local[*]
# expanduser("~") also resolves the home directory when the HOME variable is
# not set (e.g. on Windows), where os.environ["HOME"] raises KeyError.
user_home = os.path.expanduser("~")

spark = (
    SparkSession
    .builder
    .config("spark.driver.memory", spark_memory)
    # Spark scratch files go to <home>/tmp.
    .config("spark.local.dir", os.path.join(user_home, "tmp"))
    .master(f"local[{spark_cores}]")
    .enableHiveSupport()
    .getOrCreate()
)

# Display the session summary as the cell output.
spark

In [7]:
# Raise the threshold of the "py4j" logger so that only warnings and more
# severe records pass (logging.WARNING is the documented name of this level).
spark_logger = logging.getLogger("py4j")
spark_logger.setLevel(logging.WARNING)

In [8]:
# Root logger: print every record (DEBUG and above) with a timestamp.
logger = logging.getLogger()
formatter = logging.Formatter("%(asctime)s, %(name)s, %(levelname)s: %(message)s",
                              datefmt="%d-%b-%y %H:%M:%S")

# Re-running this cell used to attach one more StreamHandler each time, so
# every message was printed several times. The handler is therefore tagged
# with a name and reused when it is already attached to the root logger.
hdlr = next(
    (h for h in logger.handlers if h.get_name() == "notebook_stream"),
    None,
)
if hdlr is None:
    hdlr = logging.StreamHandler()
    hdlr.set_name("notebook_stream")
    logger.addHandler(hdlr)
hdlr.setFormatter(formatter)
logger.setLevel(logging.DEBUG)

## Загрузка данных <a name="data-loader"></a>

In [9]:
# Datasets are kept in <home>/data. expanduser("~") is used instead of
# os.environ["HOME"], which raises KeyError when HOME is not set.
path_data = os.path.join(os.path.expanduser("~"), "data")

# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs(path_data, exist_ok=True)

# Download only when the dataset folder is not present yet.
if not os.path.exists(os.path.join(path_data, "ml-latest-small")):
    download_movielens(path_data, "ml-latest-small")

## Подготовка данных <a name="data-preparator"></a>

In [10]:
# Path to ratings.csv inside the ml-latest-small dataset folder.
path_log = os.path.join(path_data, "ml-latest-small", "ratings.csv")

In [11]:
# Mapping passed as `columns_names`: keys are the names used in the resulting
# log, values are the column names found in ratings.csv.
ratings_columns = dict(
    user_id="userId",
    item_id="movieId",
    timestamp="timestamp",
)

# Read the CSV (with a header row) into the interaction log.
dp = DataPreparator(spark)
log = dp.transform_log(
    path_log,
    format_type="csv",
    columns_names=ratings_columns,
    date_format=None,
    header=True,
)

In [12]:
# Preview the first three rows of the prepared log.
log.show(3)

+-------+-------+-------------------+----------+---------+
|user_id|item_id|          timestamp|   context|relevance|
+-------+-------+-------------------+----------+---------+
|      1|      1|2000-07-30 22:45:03|no_context|      1.0|
|      1|      3|2000-07-30 22:20:47|no_context|      1.0|
|      1|      6|2000-07-30 22:37:04|no_context|      1.0|
+-------+-------+-------------------+----------+---------+
only showing top 3 rows



In [13]:
# Total number of rows in the log.
log.count()

100836

In [14]:
# Number of NULL values per column: when() yields a value only for rows where
# the column is NULL, and count() counts just the non-NULL results.
null_counts = [
    sf.count(sf.when(sf.col(name).isNull(), name)).alias(name)
    for name in log.columns
]
log.select(null_counts).show()

+-------+-------+---------+-------+---------+
|user_id|item_id|timestamp|context|relevance|
+-------+-------+---------+-------+---------+
|      0|      0|        0|      0|        0|
+-------+-------+---------+-------+---------+



In [15]:
# Number of distinct values in every column of the log.
distinct_counts = [
    sf.countDistinct(sf.col(name)).alias(name)
    for name in log.columns
]
log.agg(*distinct_counts).show()

+-------+-------+---------+-------+---------+
|user_id|item_id|timestamp|context|relevance|
+-------+-------+---------+-------+---------+
|    610|   9724|    85043|      1|        1|
+-------+-------+---------+-------+---------+



In [16]:
# Earliest and latest interaction timestamps in the log.
timestamp_col = sf.col("timestamp")
log.agg(sf.min(timestamp_col), sf.max(timestamp_col)).show()

+-------------------+-------------------+
|     min(timestamp)|     max(timestamp)|
+-------------------+-------------------+
|1996-03-29 21:36:55|2018-09-24 17:27:30|
+-------------------+-------------------+



In [17]:
# Cached 1000-row subset of the log (limit() is applied without an explicit
# ordering). NOTE(review): df_short is not referenced in the cells visible
# here — confirm it is still needed.
df_short = log.limit(1000).cache()

In [18]:
def plot_result_value(results):
    """Draw a bar chart of the ``value`` column, one bar per row of *results*.

    Every bar is labelled with the row's ``"alpha, beta"`` parameter pair.

    Args:
        results: pandas DataFrame with two-level columns that provides
            ``results["params"]["alpha"]``, ``results["params"]["beta"]`` and
            ``results["value"]`` (e.g. a frame built from
            ``study.trials_dataframe()``). The frame is not modified.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    params = results["params"]
    # Keep the tick labels in a local Series: storing them in a new
    # "value_name" column would silently alter the caller's DataFrame.
    labels = (params["alpha"].astype(str)
              .str.cat(params["beta"].astype(str), sep=", "))

    plt.figure(figsize=(7, 5))
    ax = results["value"].plot(kind="bar", xticks=range(len(results)), rot=0)
    ax.set_xticklabels(labels.values)

    plt.xlabel(r"$(\alpha, \beta)$" + " пары")
    plt.ylabel("Значение метрики")
    plt.title("Результаты эксперимента")

    plt.show()

## Сценарии с разными моделями <a name="scenario"></a>

### Получение сценария через фабрику <a name="get-scenario"></a>

In [19]:
# One instance of each recommender model, all bound to the same Spark session.
popular_recommender = PopularRecommender(spark)
als_recommender = ALSRecommender(spark)
knn_recommender = KNNRecommender(spark)

In [20]:
# Candidate train/test splitters; one of them is handed to the scenario
# factory.
# NOTE(review): the meaning of the two positional `True` flags is not visible
# in this notebook — consider passing them by keyword.

# Split of the whole log around a fixed date (2007-01-01).
log_bydate_splitter = log_splitter.LogSplitByDateSplitter(
    spark, True, True,
    datetime(2007, 1, 1)
)

# Random split of the whole log, test_size=0.3, fixed seed.
log_random_splitter = log_splitter.LogSplitRandomlySplitter(
    spark, True, True,
    test_size=0.3, seed=1234
)

# Cold-users splitter, test_size=0.3.
log_cold_splitter = log_splitter.ColdUsersExtractingSplitter(
    spark, True, True,
    test_size=0.3
)

# Per-user random split, test_size=0.3, fixed seed.
user_random_splitter = user_log_splitter.RandomUserLogSplitter(
    spark, True, True,
    test_size=0.3, seed=1234
)

# Per-user split by time.
# NOTE(review): confirm that `seed` is meaningful for a by-time split.
user_bydate_splitter = user_log_splitter.ByTimeUserLogSplitter(
    spark, True, True,
    test_size=0.3, seed=1234
)


In [37]:
# Assemble the scenario factory: the per-user random splitter, the KNN
# recommender as the main model and the popularity recommender as fallback.
# HitRate is passed as `criterion` (presumably the optimisation target —
# confirm); the `metrics` list is passed alongside it.
factory = MainScenarioFactory(
    spark,
    splitter=user_random_splitter,
    recommender=knn_recommender,
    criterion=metrics.HitRateMetric(spark),
    metrics=[
        metrics.NDCGMetric(spark),
        metrics.PrecisionMetric(spark),
        metrics.MAPMetric(spark),
        metrics.RecallMetric(spark),
        # Surprisal is the only metric that also receives the log.
        metrics.Surprisal(spark, log),
    ],
    fallback_recommender=popular_recommender,
)

In [38]:
# Obtain the scenario object from the configured factory.
scenario = factory.get()

### Обучение сценария <a name="fit-scenario"></a>

In [23]:
# Accumulator for the trial tables collected across runs with different
# recommenders. It is created only when it does not exist yet: a fresh kernel
# needs the name before the first concatenation, while re-running this cell
# must not discard results gathered so far.
if "results" not in globals():
    results = None

In [24]:
# Mean number of log rows per item: count rows for every item_id, then average
# those counts.
rows_per_item = log.groupBy("item_id").count()
avg_num_users = (
    rows_per_item
    .agg(sf.mean(sf.col("count")).alias("mean"))
    .first()["mean"]
)

avg_num_users

10.369806663924312

In [25]:
# Hyper-parameter search spaces, one per recommender. Every entry names a
# distribution ("type") and gives its arguments ("args").
popular_grid = dict(
    alpha=dict(type="int", args=[0, 10]),
    beta=dict(type="int", args=[0, 10]),
)

als_grid = dict(
    rank=dict(type="discrete_uniform", args=[10, 100, 10]),
)

# NOTE(review): the lower bound for num_neighbours is 0 — confirm that zero
# neighbours is a valid setting for the KNN recommender.
knn_grid = dict(
    shrink=dict(type="discrete_uniform", args=[10, 50, 10]),
    num_neighbours=dict(type="discrete_uniform", args=[0, 10, 1]),
)


In [40]:
# Run the parameter search over `knn_grid` on the full log with two trials.
# The grid has to match the recommender the scenario was built with.
# NOTE(review): k=10 is presumably the length of the recommendation list used
# for the metrics — confirm.
best_params = scenario.research(
    knn_grid,
    log,
    k=10,
    n_trials=2
)

10-Jan-20 14:14:10, root, DEBUG: Деление лога на обучающую и тестовую выборку
10-Jan-20 14:14:20, root, DEBUG: Длина трейна и теста: (70856, 28628)
10-Jan-20 14:14:21, root, DEBUG: Количество пользователей в трейне и тесте: 610, 610
10-Jan-20 14:14:22, root, DEBUG: Количество объектов в трейне и тесте: 8532, 4950
10-Jan-20 14:14:22, root, DEBUG: Обучение и предсказание дополнительной модели
10-Jan-20 14:14:22, root, DEBUG: Проверка датафреймов
10-Jan-20 14:14:22, root, DEBUG: Предварительная стадия обучения (pre-fit)
10-Jan-20 14:14:23, root, DEBUG: Среднее количество items у каждого user: 117
10-Jan-20 14:14:26, root, DEBUG: Основная стадия обучения (fit)
10-Jan-20 14:14:26, root, DEBUG: Проверка датафреймов
10-Jan-20 14:14:28, root, DEBUG: Количество items после фильтрации: 127
10-Jan-20 14:14:30, root, DEBUG: Пре-фит модели
10-Jan-20 14:15:01, root, DEBUG: -------------
10-Jan-20 14:15:01, root, DEBUG: Оптимизация параметров
10-Jan-20 14:15:01, root, DEBUG: Максимальное количество п

In [41]:
# Put the trials of the latest study on top of the results accumulated so far.
# `results` must already be defined before this cell runs; None is fine as the
# initial value because pd.concat drops None entries.
results = pd.concat([scenario.study.trials_dataframe(), results], axis=0)

results

Unnamed: 0_level_0,datetime_complete,datetime_start,number,params,params,params,params,params,state,system_attrs,user_attrs,user_attrs,user_attrs,user_attrs,user_attrs,value
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,alpha,beta,num_neighbours,rank,shrink,Unnamed: 9_level_1,_number,MAP@k,Precision@k,Recall@K,Surprisal@K,nDCG@k,Unnamed: 16_level_1
0,2020-01-10 14:16:13.222048,2020-01-10 14:15:02.038705,0,,,2.0,,50.0,TrialState.COMPLETE,0,0.058702,0.277377,0.108356,2.366873,0.315359,0.796721
1,2020-01-10 14:17:06.055240,2020-01-10 14:16:13.386894,1,,,6.0,,50.0,TrialState.COMPLETE,1,0.066039,0.301639,0.121816,2.050082,0.335183,0.803279
0,2020-01-10 14:00:22.481347,2020-01-10 13:59:42.637781,0,,,,60.0,,TrialState.COMPLETE,0,0.02132,0.103443,0.065801,2.378971,0.102969,0.570492
1,2020-01-10 14:01:05.922637,2020-01-10 14:00:22.484975,1,,,,100.0,,TrialState.COMPLETE,1,0.013965,0.06623,0.049989,2.673296,0.063081,0.411475
0,2020-01-10 13:55:12.442364,2020-01-10 13:54:47.507182,0,8.0,7.0,,,,TrialState.COMPLETE,0,0.040025,0.22377,0.079841,1.321193,0.252684,0.72459
1,2020-01-10 13:55:31.983976,2020-01-10 13:55:12.445910,1,1.0,9.0,,,,TrialState.COMPLETE,1,0.040025,0.22377,0.079841,1.321193,0.252684,0.72459
2,2020-01-10 13:55:50.463493,2020-01-10 13:55:31.988342,2,0.0,7.0,,,,TrialState.COMPLETE,2,0.040025,0.22377,0.079841,1.321193,0.252684,0.72459
3,2020-01-10 13:56:09.206799,2020-01-10 13:55:50.466856,3,1.0,3.0,,,,TrialState.COMPLETE,3,0.040025,0.22377,0.079841,1.321193,0.252684,0.72459
4,2020-01-10 13:56:27.184263,2020-01-10 13:56:09.210281,4,5.0,3.0,,,,TrialState.COMPLETE,4,0.040025,0.22377,0.079841,1.321193,0.252684,0.72459
5,2020-01-10 13:56:45.794654,2020-01-10 13:56:27.188808,5,0.0,5.0,,,,TrialState.COMPLETE,5,0.040025,0.22377,0.079841,1.321193,0.252684,0.72459


### Получение рекомендаций <a name="predict-scenario"></a>

In [42]:
# Produce recommendations with the parameters returned by the search.
# users=None / items=None: no explicit user or item subset is passed
# (presumably defaults are derived from the log — confirm).
recs = scenario.production(
    best_params, 
    log,
    users=None, 
    items=None,
    k=10
)

10-Jan-20 14:18:29, root, DEBUG: Проверка датафреймов
10-Jan-20 14:18:29, root, DEBUG: Предварительная стадия обучения (pre-fit)
10-Jan-20 14:19:46, root, DEBUG: Основная стадия обучения (fit)
10-Jan-20 14:20:37, root, DEBUG: Проверка датафреймов
10-Jan-20 14:20:38, root, DEBUG: Выделение дефолтных юзеров
10-Jan-20 14:20:38, root, DEBUG: Выделение дефолтных айтемов


In [43]:
# Preview the recommendations (show() prints the first 20 rows by default).
recs.show()

+-------+-------+------------------+----------+
|item_id|user_id|         relevance|   context|
+-------+-------+------------------+----------+
|   1580|    195|11.499230672798609|no_context|
|   2115|    195|  8.99529232689553|no_context|
|   2683|    195| 8.067385610963404|no_context|
|   2959|    195| 7.578516529357971|no_context|
|   2628|    195| 7.495957558985156|no_context|
|   2997|    195| 7.376587494014001|no_context|
|   1265|    195| 6.870196201535873|no_context|
|    541|    195|  6.82382219478528|no_context|
|   1240|    195| 6.534130351660293|no_context|
|    356|    195| 6.152749812022566|no_context|
|   6539|    223| 9.439693235979506|no_context|
|   2959|    223|  7.64093444541947|no_context|
|   1210|    223|  6.79125386042567|no_context|
|   4993|    223| 6.160575178364514|no_context|
|    356|    223| 5.832570913130105|no_context|
|   5952|    223| 4.727428693405704|no_context|
|   1580|    223| 4.308394621811742|no_context|
|   1196|    223|3.9040472686378402|no_c