# Задание 1: Подготовка данных

Для каждого алгоритма преобразуйте данные для BackTest'а:

1. Соедините исторические данные и предсказания
2. Преобразуйте данные (нормализация количества заказов на трафик + группировка)

In [1]:
from typing import List, Dict, Any, Tuple, Union, Optional

import pandas as pd

import matplotlib.pyplot as plt

import warnings
# Silence every warning for the rest of the notebook session.
warnings.filterwarnings("ignore")

import logging
# Set up default root logging, then create the notebook's named logger and
# let it emit INFO-level messages (used while joining the data frames).
logging.basicConfig()
logger = logging.getLogger("back_test")
logger.setLevel(logging.INFO)

In [2]:
from enum import Enum
from typing import List


class BackTestAlgo(str, Enum):
    """String-valued enumeration of back-test algorithm names."""

    epsilon_greedy_sum = "epsilon_greedy_sum"

    @classmethod
    def to_list(cls) -> List[str]:
        """Return the values of all members, in definition order."""
        return [member.value for member in cls]


class BackTestLevel(str, Enum):
    """String-valued enumeration of the columns a back test can be grouped on."""

    group_1 = "group_1"
    item_id = "item_id"
    sku_id = "sku_id"

    @classmethod
    def to_list(cls) -> List[str]:
        """Return the values of all members, in definition order."""
        return [member.value for member in cls]


class BackTestMetric(str, Enum):
    """String-valued enumeration of the metric columns a back test can use."""

    revenue = "revenue"
    margin = "margin"
    orders_num = "orders_num"

    @classmethod
    def to_list(cls) -> List[str]:
        """Return the values of all members, in definition order."""
        return [member.value for member in cls]


In [3]:
# strftime pattern of the date strings handled in this notebook (e.g. "20231008").
DS_FMT = "%Y%m%d"


def get_ds_list(ds_ranges: List[Union[Tuple[str, str], str]]) -> List[str]:
    """Expand date ranges and single dates into one flat list of date strings.

    Args:
        ds_ranges: Each item is either a ``(start, end)`` tuple, which is
            expanded to every calendar day from ``start`` to ``end``
            (both ends included), or a single date string, which is kept
            exactly as given.

    Returns:
        The date strings in input order. Days produced from a tuple are
        formatted with ``DS_FMT``.

    Raises:
        ValueError: If an item is neither a tuple nor a string.
    """
    ds_list: List[str] = []
    for ds_range in ds_ranges:
        if isinstance(ds_range, tuple):
            start, end = ds_range
            # Extend the accumulator in place; rebinding `ds_list` here would
            # drop the dates collected so far and duplicate the current range.
            ds_list.extend(
                day.strftime(DS_FMT) for day in pd.date_range(start, end)
            )
        elif isinstance(ds_range, str):
            ds_list.append(ds_range)
        else:
            raise ValueError(
                "You should set `ds_range` as in example: "
                "('20220801', '20220802') or "
                "[('20220801', '20220802'), '20220804']"
            )
    return ds_list

In [4]:
# Back-test configuration for this notebook.
# NOTE(review): ALGO and ALGO_PARAMS are not referenced by any code visible
# in this part of the notebook - presumably they are consumed by later
# tasks; confirm before changing or removing them.
ALGO = BackTestAlgo.epsilon_greedy_sum
# Grouping level handed to get_ab_df.
LVL = BackTestLevel.group_1
ALGO_PARAMS = {
    "epsilon": 0.01,
    "do_show_intersection": True,
}
# Date ranges in DS_FMT format; expanded into individual days by get_ds_list.
DS_RANGES = [("20231008", "20231021")]
# Metric columns handed to get_ab_df.
METRICS = [BackTestMetric.orders_num]

In [5]:
# Load the historical data set.
ab_df = pd.read_parquet("../data/simulated_data.parquet")
# Cast `ds` to str so it can be matched against the string dates produced
# by get_ds_list and joined with the `ds` column of the prediction frames.
ab_df["ds"] = ab_df["ds"].astype(str)
# Preview the first rows.
ab_df.head()

Unnamed: 0,group_1,sku_id,ab_test_id,markup,revenue,traffic,orders_num,ds
0,group_1000,sku_100000,ab_100000,0.01,2539.41,0.05,1.0,20231008
1,group_1001,sku_100001,ab_100001,0.02,6057.44,0.05,3.0,20231008
2,group_1002,sku_100002,ab_100002,0.01,541.35,0.05,8.0,20231008
3,group_1002,sku_100003,ab_100003,0.06,697.7,0.05,0.0,20231008
4,group_1001,sku_100004,ab_100004,0.01,1413.99,0.05,0.0,20231008


In [6]:
# Load one frame per algorithm (algo_1 .. algo_5), keyed by algorithm name.
test_dfs = {}
for algo_id in range(1, 6):
    algo_name = f"algo_{algo_id}"
    algo_df = pd.read_parquet(f"../data/algo/{algo_name}.parquet")
    # Same string cast as for the historical data, so `ds` is comparable.
    algo_df["ds"] = algo_df["ds"].astype(str)
    test_dfs[algo_name] = algo_df

In [7]:
# Preview the first rows of the algo_1 frame.
test_dfs["algo_1"].head()

Unnamed: 0,group_1,markup,ds
0,group_1000,0.06,20231008
1,group_1001,0.04,20231008
2,group_1002,0.06,20231008
3,group_1003,0.05,20231008
4,group_1004,0.06,20231008


In [8]:
def combine_history_and_predictions(
    df: pd.DataFrame,
    test_dfs: Dict[str, pd.DataFrame],
) -> pd.DataFrame:
    """Attach the markup of every algorithm to the historical data.

    For each entry of ``test_dfs`` the ``markup`` column is renamed to
    ``<algo_name>_markup`` and the frame is left-joined onto ``df`` on all
    columns the two frames then have in common.

    Args:
        df: Historical data; every row is kept.
        test_dfs: Mapping of algorithm name to its frame of markups.

    Returns:
        ``df`` extended with the non-shared columns of every frame in
        ``test_dfs`` (normally one ``<algo_name>_markup`` column each).
        Rows without a matching entry get NaN in the added columns.
    """
    for algo_name, test_df in test_dfs.items():
        renamed = test_df.rename(columns={"markup": f"{algo_name}_markup"})
        # Take the shared columns in `df`'s column order: iterating a set
        # intersection would make the key order (and the log line) depend on
        # string hash randomisation.
        test_cols = set(renamed.columns)
        common_cols = [col for col in df.columns if col in test_cols]
        logger.info("Start joining %s df on %s", algo_name, common_cols)
        df = df.merge(renamed, how="left", on=common_cols)
    return df

In [9]:
def postprocess(
    df: pd.DataFrame,
    lvl: str,
    metrics: List[BackTestMetric],
    algo_names: List[str],
) -> pd.DataFrame:
    """
    Возвращается преобразованный датасет, исходя из того, на каком уровне проводится BackTest,
    усредняются значения наценок на выбранном уровне
    """
    agg_funcs = {metric: "sum" for metric in metrics}
    agg_funcs.update({f"{algo_name}_markup": "mean" for algo_name in algo_names})
    agg_funcs.update({"markup": "mean", "traffic": "mean"})
    df = df.groupby(["ds", lvl, "ab_test_id"]).agg(agg_funcs).reset_index()
    for metric in metrics:
        df[metric] = df[metric] / df["traffic"]
    agg_funcs = {metric: "mean" for metric in metrics}
    agg_funcs.update({f"{algo_name}_markup": "mean" for algo_name in algo_names})
    df = df.groupby(["ds", lvl, "markup"]).agg(agg_funcs).reset_index()
    df = df.round(2)
    return df

In [10]:
def get_ab_df(
    df: pd.DataFrame,
    ds_list: List[str],
    lvl: BackTestLevel,
    metrics: List[BackTestMetric],
    test_dfs: Dict[str, pd.DataFrame],
) -> pd.DataFrame:
    """Prepare the data for the back test.

    The pipeline has three stages:
        1. keep only the rows whose ``ds`` is in ``ds_list``;
        2. join the historical markups with the markups of every algorithm;
        3. aggregate the result to the requested level.
    """
    in_requested_dates = df["ds"].isin(ds_list)
    joined = combine_history_and_predictions(
        df=df[in_requested_dates],
        test_dfs=test_dfs,
    )
    return postprocess(
        df=joined,
        lvl=lvl,
        metrics=metrics,
        algo_names=list(test_dfs),
    )

In [11]:
# Expand the configured date ranges into individual day strings.
ds_list = get_ds_list(ds_ranges=DS_RANGES)

# Build the back-test data set: filter by date, join the algorithm markups
# and aggregate to the configured level.
result_1_df = get_ab_df(
    df=ab_df,
    ds_list=ds_list,
    lvl=LVL,
    metrics=METRICS,
    test_dfs=test_dfs,
)
# Make sure the metric column is labelled with the plain string "orders_num".
result_1_df = result_1_df.rename(columns={BackTestMetric.orders_num: "orders_num"})
# Preview the first rows.
result_1_df.head()

INFO:back_test:Start joining algo_1 df on ['group_1', 'ds']
INFO:back_test:Start joining algo_2 df on ['group_1', 'ds']
INFO:back_test:Start joining algo_3 df on ['group_1', 'ds']
INFO:back_test:Start joining algo_4 df on ['group_1', 'ds']
INFO:back_test:Start joining algo_5 df on ['group_1', 'ds']


Unnamed: 0,ds,group_1,markup,orders_num,algo_1_markup,algo_2_markup,algo_3_markup,algo_4_markup,algo_5_markup
0,20231008,group_1000,0.01,40020.0,0.06,0.02,0.01,0.02,0.05
1,20231008,group_1000,0.01,37880.0,0.06,0.02,0.01,0.02,0.05
2,20231008,group_1000,0.01,30280.0,0.06,0.02,0.01,0.02,0.05
3,20231008,group_1000,0.03,36500.0,0.06,0.02,0.01,0.02,0.05
4,20231008,group_1000,0.03,27240.0,0.06,0.02,0.01,0.02,0.05


In [12]:
# Save the prepared data set as CSV, without the index column.
result_1_df.to_csv("../data/homework_2_1_solution.csv", index=False)