# 1. Amazon petshop
*Данные взяты из репозитория [petshop-root-cause-analysis](https://github.com/amazon-science/petshop-root-cause-analysis), распространяемого под лицензией Creative Commons Attribution 4.0 International (CC BY 4.0).*

In [1]:
from itertools import islice
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd

In [63]:
ALL_FILES = {
    'low': Path('/home/jupyter/datasphere/s3/storage/amazon_petshop/low_traffic/noissue/metrics.csv'),
    'high': Path('/home/jupyter/datasphere/s3/storage/amazon_petshop/high_traffic/noissue/metrics.csv'),
    'temporal': Path('/home/jupyter/datasphere/s3/storage/amazon_petshop/temporal_traffic1/noissue/metrics.csv'),
}

BASE_PREPARED_PATH = Path('/home/jupyter/datasphere/s3/storage/prepared/amazon_petshop')

In [64]:
def calculate_rps(df: pd.DataFrame) -> pd.DataFrame:
    """Вычисляет RPS для всех колонок с метрикой requests"""
    # Находим все колонки с requests
    requests_cols = [col for col in df.columns if col.endswith(' | requests | Sum')]
    
    for col in requests_cols:
        # Вычисляем RPS = diff(value) / diff(time)
        delta_seconds = df.index.to_series().diff().dt.total_seconds()
        rps = df[col] / delta_seconds
        
        # Заменяем исходную колонку на RPS
        df[col.replace('Sum', 'rps')] = rps
        df.drop(col, axis=1, inplace=True)
    
    return df


In [65]:
def load_df(filename: str | Path) -> pd.DataFrame:
    """Загружает датафрейм из csv, форматирует названия колонок, удаляет лишнее"""
    df = pd.read_csv(filename,
                     header=[0, 1, 2, 3],
                     )
    df = df.loc[:, df.columns.get_level_values(1) != 'availability']
    df = df.loc[:, ~(
        (df.columns.get_level_values(1) == 'latency') &  # Если метрика latency
        (df.columns.get_level_values(2) != 'p99')        # И перцентиль НЕ p99
    )]
    df.columns = df.columns.map(lambda x: ' | '.join(islice(x, len(x) - 1)))
    df = df.rename(columns={df.columns[0]: 'timestamp'})
    df['timestamp'] = pd.to_datetime(df['timestamp'], unit='s')
    df = df.set_index('timestamp')
    return df

In [66]:
for slug, path in ALL_FILES.items():
    df = load_df(path)
    df = calculate_rps(df)
    ts_delta = df.index.to_series().diff()
    assert ts_delta.nunique() == 1, 'Нерегулярный ряд'
    df.to_csv(BASE_PREPARED_PATH / f'{slug}__{int(df.index.min().timestamp())}-{int(df.index.max().timestamp())}__{int(ts_delta[1].total_seconds())}s.csv', index=False)


# 2. SURF Machine Metric Dataset [2019-12-29, 2020-08-07]
*Данные взяты из датасета [SURF Machine Metric Dataset](https://zenodo.org/records/4459519), распространяемого под лицензией Creative Commons Attribution 4.0 International (CC BY 4.0).*

In [2]:
ALL_PATHS = {
    'memory_usage': Path('/home/jupyter/datasphere/s3/storage/surf_machine_data/processed-surf-dataset/((node_memory_MemTotal%20-%20node_memory_MemFree%20-%20node_memory_Cached)%20%2F%20(node_memory_MemTotal))%20*%20100'),
    'swap_usage': Path('/home/jupyter/datasphere/s3/storage/surf_machine_data/processed-surf-dataset/((node_memory_SwapTotal%20-%20node_memory_SwapFree)%20%2F%20(node_memory_SwapTotal))%20*%20100'),
    'node_load': Path('/home/jupyter/datasphere/s3/storage/surf_machine_data/processed-surf-dataset/node_load5'),
}
BASE_PREPARED_PATH = Path('/home/jupyter/datasphere/s3/storage/prepared/surf')

In [16]:
for metric, path in ALL_PATHS.items():
    parquet_files = (f for f in path.iterdir() if f.name.endswith('.parquet'))
    all_dataframes = []
    for file in parquet_files:
        df = pd.read_parquet(file)
        df.index = pd.to_datetime(df.index, unit='s')
        all_dataframes.append(df.resample('5min').mean())

    combined_df = pd.concat(all_dataframes).sort_index()
    combined_df.to_csv(BASE_PREPARED_PATH / f'{metric}__{int(df.index.min().timestamp())}-{int(df.index.max().timestamp())}__300s.csv', index=False)

In [3]:
for metric, path in ALL_PATHS.items():
    parquet_files = (f for f in path.iterdir() if f.name.endswith('.parquet'))
    all_dataframes = []
    for file in parquet_files:
        df = pd.read_parquet(file)
        df.index = pd.to_datetime(df.index, unit='s')
        all_dataframes.append(df)

    combined_df = pd.concat(all_dataframes).sort_index()
    combined_df.to_csv(BASE_PREPARED_PATH / f'{metric}__{int(df.index.min().timestamp())}-{int(df.index.max().timestamp())}__15s.csv', index=False)

In [5]:
for metric, path in ALL_PATHS.items():
    parquet_files = (f for f in path.iterdir() if f.name.endswith('.parquet'))
    all_dataframes = []
    for file in parquet_files:
        df = pd.read_parquet(file)
        df.index = pd.to_datetime(df.index, unit='s')
        all_dataframes.append(df.resample('1min').mean())

    combined_df = pd.concat(all_dataframes).sort_index()
    combined_df.to_csv(BASE_PREPARED_PATH / f'{metric}__{int(df.index.min().timestamp())}-{int(df.index.max().timestamp())}__60s.csv', index=False)

# Результат
1. amazon petshop:
    - https://storage.yandexcloud.net/sssemion-diplom-storage/prepared/amazon_petshop/high__1693922400-1694098800__300s.csv
    - https://storage.yandexcloud.net/sssemion-diplom-storage/prepared/amazon_petshop/low__1692608400-1692784800__300s.csv
    - https://storage.yandexcloud.net/sssemion-diplom-storage/prepared/amazon_petshop/temporal__1682193300-1682688600__300s.csv
    
1. surf machine metrics:
    - https://storage.yandexcloud.net/sssemion-diplom-storage/prepared/surf/memory_usage__1589839200-1589925585__300s.csv
    - https://storage.yandexcloud.net/sssemion-diplom-storage/prepared/surf/node_load__1596751200-1596837585__300s.csv
    - https://storage.yandexcloud.net/sssemion-diplom-storage/prepared/surf/swap_usage__1589839200-1589925585__300s.csv