In [170]:
import datetime as dt

In [250]:
import json
from datetime import timedelta

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

import torch
from chronos import ChronosPipeline

from sklearn.metrics import mean_absolute_error, mean_absolute_error, mean_absolute_percentage_error

In [251]:
import plotly.graph_objects as go

layout_params = {
    'font_color': '#000000',
    'font_family': 'Avenir Next',
    'font_size': 11,
    'margin': {'l': 0, 'r': 0, 't': 80, 'b': 0},
    'paper_bgcolor': '#FFFFFF',
    'plot_bgcolor': '#f5f5f5'
}

In [252]:
pipeline = ChronosPipeline.from_pretrained(
  "amazon/chronos-t5-tiny",
  device_map="mps",
  torch_dtype=torch.bfloat16,
)

In [253]:
# load data

PATH = "data"

sales_df = pd.read_csv(f"{PATH}/shop_sales.csv")
dates_df = pd.read_csv(f"{PATH}/shop_sales_dates.csv")

# preprocces
train = sales_df.merge(dates_df[['date', 'date_id', 'year', 'wm_yr_wk']], on='date_id')
# del dates_df, sales_df

train['store_item_id'] = train.item_id
train.item_id = train.item_id.apply(lambda x: x.split('_')[-1])

train.date = pd.to_datetime(train.date)
train = train.rename(columns={'cnt': 'sales'})

In [169]:
def get_historical_data(item_store_ids: list, data_path: str):
    
    # load data
    train = pd.read_csv(f"{data_path}/shop_sales.csv")
    dates_df = pd.read_csv(f"{data_path}/shop_sales_dates.csv")

    # preprocces
    train = train.merge(dates_df[['date', 'date_id', 'year', 'wm_yr_wk']], on='date_id')
    train['store_item_id'] = train.item_id
    train.item_id = train.item_id.apply(lambda x: x.split('_')[-1])
    
    # get ids train data
    train = train[train.store_item_id.isin(item_store_ids)]
    
    return train

In [None]:
# def preprocess_data(train: pd.DataFrame(), prediction_type: str):
    
#     if prediction_type == 'week':
#         train = train.groupby(['wm_yr_wk', 'store_item_id'], as_index=False).

In [199]:
def get_validation_dates(train: pd.DataFrame(), prediction_length, prediction_type: None):
    
    train.date = pd.to_datetime(train.date)
    end_date = train['date'].max()
    start_date = end_date - dt.timedelta(days=prediction_length-1)
    date_range = pd.date_range(start_date, end_date)

    return date_range

In [195]:
# valid = train[train.date.isin(date_range)].copy()
# train = train[~train.date.isin(date_range)]

In [217]:
def make_predictions(prediction_length: int, train: pd.DataFrame(), date_range: list):
    
    all_predicts = []
    for store_item_id in train.store_item_id.unique():
        context = torch.tensor(df_train["sales"].tolist())
        forecast = pipeline.predict(context, prediction_length)  
        low, median, high = np.quantile(forecast[0].numpy(), [0.1, 0.5, 0.9], axis=0)
         
        # result = {
        #     'store_item_id': store_item_id,
        #     'date': date_range.tolist(),
        #     'low': low.tolist(),
        #     'median': median.tolist(),
        #     'high': high.tolist()
        # }
        # all_predicts.append(result) 
        
        data = {
            'date': date_range.tolist(), 
            'low': low.tolist(),
            'median': median.tolist(),
            'high': high.tolist()
            }
        result = pd.DataFrame.from_dict(data)
        result['store_item_id'] = store_item_id
        
        all_predicts.append(result) 
        
    all_predicts = pd.concat(all_predicts)    
          
    return all_predicts

In [236]:

store_item_ids = train.store_item_id.unique()
data_path = "data"
prediction_length = 30

train = get_historical_data(item_store_ids=store_item_ids, data_path=data_path)
date_range = get_validation_dates(train=train, prediction_length=prediction_length, prediction_type=None)

valid = train[train.date.isin(date_range)].copy()
train = train[~train.date.isin(date_range)]

result = make_predictions(prediction_length=prediction_length, train=train, date_range=date_range)


In [237]:
valid = valid.merge(result, on=['store_item_id', 'date'])

In [238]:
mean_absolute_error(valid['cnt'], valid['median'])

np.float64(15.784673581219623)

In [239]:
for store_item_id in train.store_item_id.unique():
    test = valid[valid.store_item_id==store_item_id].copy()
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=test['date'], y=test['cnt'], mode='lines', name='test'))
    fig.add_trace(go.Scatter(x=test['date'], y=test['median'], mode='lines', name='pred'))
    fig.add_trace(go.Scatter(x=test['date'], y=test['low'], mode='lines', name='low'))
    fig.add_trace(go.Scatter(x=test['date'], y=test['high'], mode='lines', name='high'))
    fig.update_layout(**layout_params, width=700).show()

In [None]:
import json
from datetime import timedelta

import numpy as np
import pandas as pd

from tqdm.notebook import tqdm

import torch
from chronos import ChronosPipeline


def get_historical_data(item_store_ids: list, data_path: str):
    
    # load data
    train = pd.read_csv(f"{data_path}/shop_sales.csv")
    dates_df = pd.read_csv(f"{data_path}/shop_sales_dates.csv")

    # preprocces
    train = train.merge(dates_df[['date', 'date_id', 'year', 'wm_yr_wk']], on='date_id')
    train['store_item_id'] = train.item_id
    train.item_id = train.item_id.apply(lambda x: x.split('_')[-1])
    
    # get ids train data
    train = train[train.store_item_id.isin(item_store_ids)]
    
    return train


def get_validation_dates(train: pd.DataFrame(), prediction_length, prediction_type: None):
    
    train.date = pd.to_datetime(train.date)
    end_date = train['date'].max()
    start_date = end_date - dt.timedelta(days=prediction_length-1)
    date_range = pd.date_range(start_date, end_date)

    return date_range


def make_predictions(prediction_length: int, train: pd.DataFrame(), date_range: list):
    
    all_predicts = []
    
    pipeline = ChronosPipeline.from_pretrained(
        "amazon/chronos-t5-tiny",
        device_map="mps", #"cpu"
        torch_dtype=torch.bfloat16,
        )
    
    for store_item_id in train.store_item_id.unique():
        context = torch.tensor(df_train["sales"].tolist())
        forecast = pipeline.predict(context, prediction_length)  
        low, median, high = np.quantile(forecast[0].numpy(), [0.1, 0.5, 0.9], axis=0)
         
        # result = {
        #     'store_item_id': store_item_id,
        #     'date': date_range.tolist(),
        #     'low': low.tolist(),
        #     'median': median.tolist(),
        #     'high': high.tolist()
        # }
        # all_predicts.append(result) 
        
        data = {
            'date': date_range.tolist(), 
            'low': low.tolist(),
            'median': median.tolist(),
            'high': high.tolist()
            }
        result = pd.DataFrame.from_dict(data)
        result['store_item_id'] = store_item_id
        
        all_predicts.append(result) 
        
    all_predicts = pd.concat(all_predicts)    
          
    return all_predicts

def run():
    store_item_ids = train.store_item_id.unique()
    data_path = "data"
    prediction_length = 30

    train = get_historical_data(item_store_ids=store_item_ids, data_path=data_path)
    date_range = get_validation_dates(train=train, prediction_length=prediction_length, prediction_type=None)

    valid = train[train.date.isin(date_range)].copy()
    train = train[~train.date.isin(date_range)]

    result = make_predictions(prediction_length=prediction_length, train=train, date_range=date_range)


         date       low     median        high store_item_id
0  2015-12-23  5.909453   8.991706   15.104337   STORE_2_085
1  2015-12-24  5.909453  11.520624   16.807574   STORE_2_085
2  2015-12-25  4.712000   9.985982   15.199442   STORE_2_085
3  2015-12-26  3.968455   7.003156   15.000587   STORE_2_085
4  2015-12-27  3.916580   7.500294   14.006312   STORE_2_085
..        ...       ...        ...         ...           ...
25 2016-01-17  0.000001   0.720725   79.351621   STORE_2_090
26 2016-01-18  0.000001   0.720725   72.072319   STORE_2_090
27 2016-01-19  0.000001   0.360363   84.396689   STORE_2_090
28 2016-01-20  0.000001   0.720725  123.964391   STORE_2_090
29 2016-01-21  0.000001   1.441448  125.333760   STORE_2_090

[150 rows x 5 columns]


In [None]:
# Для меня для теста 
import json
import pandas as pd
import numpy as np
import torch
from datetime import timedelta
from tqdm.notebook import tqdm
from chronos import ChronosPipeline

class SalesPredictor:
    def __init__(self, data_path: str, prediction_length: int, store_item_ids: list):
        self.store_item_ids = store_item_ids
        self.data_path = data_path
        self.prediction_length = prediction_length
        self.pipeline = ChronosPipeline.from_pretrained(
            "amazon/chronos-t5-tiny",
            device_map="mps", # на маке mps, но надо будет поменять на "cpu"
            torch_dtype=torch.bfloat16,
        )

    def get_historical_data(self) -> pd.DataFrame:
        """Загрузка и предварительная обработка исторических данных."""
        
        #### TO DO - заменить подрузгу данных
        
        # train = pd.read_csv(f"{self.data_path}/shop_sales.csv")
        # dates_df = pd.read_csv(f"{self.data_path}/shop_sales_dates.csv")
        
        train = train.merge(dates_df[['date', 'date_id', 'year', 'wm_yr_wk']], on='date_id')
        train['store_item_id'] = train.item_id
        train.item_id = train.item_id.apply(lambda x: x.split('_')[-1])
        
        # Фильтрация данных
        train = train[train.store_item_id.isin(self.store_item_ids)]
        
        return train

    def get_validation_dates(self, train: pd.DataFrame) -> pd.DatetimeIndex:
        """Получение диапазона дат для валидации."""
        
        train.date = pd.to_datetime(train.date)
        end_date = train['date'].max()
        start_date = end_date - timedelta(days=self.prediction_length - 1)
        return pd.date_range(start_date, end_date)

    def make_predictions(self, train: pd.DataFrame, date_range: pd.DatetimeIndex) -> pd.DataFrame:
        """Создание предсказаний для каждого store_item_id."""
        
        all_predicts = []

        for store_item_id in train.store_item_id.unique():
            context = torch.tensor(train[train.store_item_id == store_item_id]["cnt"].tolist())
            forecast = self.pipeline.predict(context, self.prediction_length)
            low, median, high = np.quantile(forecast[0].numpy(), [0.1, 0.5, 0.9], axis=0)

            # Формирование DataFrame с результатами
            data = {
                'date': date_range.tolist(),
                'low': low.tolist(),
                'median': median.tolist(),
                'high': high.tolist()
            }
            result = pd.DataFrame.from_dict(data)
            result['store_item_id'] = store_item_id
            
            all_predicts.append(result)

        return pd.concat(all_predicts)

    def run(self):
        """Основной метод для запуска предсказаний."""
        # store_item_ids = self.get_historical_data([]).store_item_id.unique()
        train = self.get_historical_data()
        date_range = self.get_validation_dates(train=train)

        valid = train[train.date.isin(date_range)].copy()
        train = train[~train.date.isin(date_range)]

        result = self.make_predictions(train=train, date_range=date_range)
        return result

# Пример использования
if __name__ == "__main__":
    data_path = "data"
    prediction_length = 28
    ids = [
    'STORE_2_085', 
    'STORE_2_043', 
    'STORE_2_054', 
    'STORE_2_325',
    'STORE_2_090', 
    'Имитация айди без истории'
    ]
    predictor = SalesPredictor(data_path=data_path, prediction_length=prediction_length, store_item_ids=ids)
    predictions = predictor.run()
    print(predictions)

In [285]:
# Для системы

import json
import pandas as pd
import numpy as np
import torch
from datetime import timedelta
from tqdm.notebook import tqdm
from chronos import ChronosPipeline

class SalesPredictor:
    def __init__(self, data_path: str, prediction_length: int, store_item_ids: list):
        self.store_item_ids = store_item_ids
        self.data_path = data_path
        self.prediction_length = prediction_length
        self.pipeline = ChronosPipeline.from_pretrained(
            "amazon/chronos-t5-tiny",
            device_map="mps", # на маке mps, но надо будет поменять на "cpu"
            torch_dtype=torch.bfloat16,
        )

    def get_historical_data(self) -> pd.DataFrame:
        """Загрузка и предварительная обработка исторических данных."""
        
        #### TO DO - заменить подрузгу данных
        
        train = pd.read_csv(f"{self.data_path}/shop_sales.csv")
        dates_df = pd.read_csv(f"{self.data_path}/shop_sales_dates.csv")
        
        # Объединение и предобработка данных 
        train = train.merge(dates_df[['date', 'date_id', 'year', 'wm_yr_wk']], on='date_id')
        train['store_item_id'] = train.item_id
        train.item_id = train.item_id.apply(lambda x: x.split('_')[-1])
        
        # Фильтрация данных
        train = train[train.store_item_id.isin(self.store_item_ids)]
        
        return train


    def make_predictions(self, train: pd.DataFrame) -> pd.DataFrame:
        """Создание предсказаний для каждого store_item_id."""
        
        all_predicts = []
        
        train.date = pd.to_datetime(train.date)
        start_date = train['date'].max() + timedelta(days=1)
        end_date = start_date + timedelta(days=self.prediction_length - 1)
        predict_range = pd.date_range(start_date, end_date)
        
        for store_item_id in train.store_item_id.unique():
            store_item_id_df = train[train.store_item_id == store_item_id].copy()
            
            store_id = store_item_id_df.store_id.unique()[0]
            item_id = store_item_id_df.item_id.unique()[0]
            context = torch.tensor(store_item_id_df["cnt"].tolist())
            forecast = self.pipeline.predict(context, self.prediction_length)
            low, median, high = np.quantile(forecast[0].numpy(), [0.1, 0.5, 0.9], axis=0)

            # Формирование DataFrame с результатами
            data = {
                'date': predict_range.tolist(),
                'low': low.tolist(),
                'median': median.tolist(),
                'high': high.tolist()
            }
            result = pd.DataFrame.from_dict(data)
            result['store_item_id'] = store_item_id
            result['store_id'] = store_id
            result['item_id'] = item_id
            
            all_predicts.append(result)

        return pd.concat(all_predicts)

    def run(self):
        """Основной метод для запуска предсказаний."""
        
        train = self.get_historical_data()
        result = self.make_predictions(train=train)
        
        return result

# Пример использования
if __name__ == "__main__":
    data_path = "data"
    prediction_length = 28
    ids = [
    'STORE_2_085', 
    'STORE_2_043', 
    'STORE_2_054', 
    'STORE_2_325',
    'STORE_2_090', 
    'Имитация айди без истории'
    ]
    predictor = SalesPredictor(data_path=data_path, prediction_length=prediction_length, store_item_ids=ids)
    predictions = predictor.run()

         date        low      median        high store_item_id store_id  \
0  2016-01-22   2.894978    5.986529   14.028133   STORE_2_085  STORE_2   
1  2016-01-23   2.993264    6.500298   11.329729   STORE_2_085  STORE_2   
2  2016-01-24   3.779555    6.991729   15.069076   STORE_2_085  STORE_2   
3  2016-01-25   2.894978    7.014067   11.092948   STORE_2_085  STORE_2   
4  2016-01-26   2.010401    5.003665   13.983459   STORE_2_085  STORE_2   
..        ...        ...         ...         ...           ...      ...   
23 2016-02-14  58.313071  117.106934  135.170935   STORE_2_090  STORE_2   
24 2016-02-15  65.593620  105.430580  135.926465   STORE_2_090  STORE_2   
25 2016-02-16  64.906776   81.391041  127.753021   STORE_2_090  STORE_2   
26 2016-02-17  63.807826   74.866020   93.067390   STORE_2_090  STORE_2   
27 2016-02-18  63.739142   71.088371   86.611051   STORE_2_090  STORE_2   

   item_id  
0      085  
1      085  
2      085  
3      085  
4      085  
..     ...  
23     0

In [286]:
predictions

Unnamed: 0,date,low,median,high,store_item_id,store_id,item_id
0,2016-01-22,2.894978,5.986529,14.028133,STORE_2_085,STORE_2,085
1,2016-01-23,2.993264,6.500298,11.329729,STORE_2_085,STORE_2,085
2,2016-01-24,3.779555,6.991729,15.069076,STORE_2_085,STORE_2,085
3,2016-01-25,2.894978,7.014067,11.092948,STORE_2_085,STORE_2,085
4,2016-01-26,2.010401,5.003665,13.983459,STORE_2_085,STORE_2,085
...,...,...,...,...,...,...,...
23,2016-02-14,58.313071,117.106934,135.170935,STORE_2_090,STORE_2,090
24,2016-02-15,65.593620,105.430580,135.926465,STORE_2_090,STORE_2,090
25,2016-02-16,64.906776,81.391041,127.753021,STORE_2_090,STORE_2,090
26,2016-02-17,63.807826,74.866020,93.067390,STORE_2_090,STORE_2,090


In [278]:
start_date

Timestamp('2016-01-22 00:00:00')

In [277]:
pd.date_range(start_date, end_date)

DatetimeIndex(['2016-01-22'], dtype='datetime64[ns]', freq='D')

In [None]:

start_date = end_date - timedelta(days=self.prediction_length - 1)
return pd.date_range(start_date, end_date)

# Прогнощирование по неделям

In [255]:
dates_df

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,date_id,CASHBACK_STORE_1,CASHBACK_STORE_2,CASHBACK_STORE_3
0,2011-01-29,11101,Saturday,1,1,2011,,,,,1,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,,,,,2,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,,,,,3,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,,,,,4,0,1,1
4,2011-02-02,11101,Wednesday,5,2,2011,,,,,5,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1814,2016-01-17,11551,Sunday,2,1,2016,,,,,1815,0,0,0
1815,2016-01-18,11551,Monday,3,1,2016,MartinLutherKingDay,National,,,1816,0,0,0
1816,2016-01-19,11551,Tuesday,4,1,2016,,,,,1817,0,0,0
1817,2016-01-20,11551,Wednesday,5,1,2016,,,,,1818,0,0,0


In [256]:
dates_df.wm_yr_wk.value_counts()

wm_yr_wk
11101    7
11407    7
11409    7
11410    7
11411    7
        ..
11240    7
11241    7
11242    7
11243    7
11551    6
Name: count, Length: 260, dtype: int64

In [254]:
train

Unnamed: 0,item_id,store_id,date_id,sales,date,year,wm_yr_wk,store_item_id
0,085,STORE_2,1,3,2011-01-29,2011,11101,STORE_2_085
1,085,STORE_2,2,8,2011-01-30,2011,11101,STORE_2_085
2,085,STORE_2,3,0,2011-01-31,2011,11101,STORE_2_085
3,085,STORE_2,4,3,2011-02-01,2011,11101,STORE_2_085
4,085,STORE_2,5,0,2011-02-02,2011,11101,STORE_2_085
...,...,...,...,...,...,...,...,...
81850,727,STORE_1,1815,2,2016-01-17,2016,11551,STORE_1_727
81851,727,STORE_1,1816,3,2016-01-18,2016,11551,STORE_1_727
81852,727,STORE_1,1817,1,2016-01-19,2016,11551,STORE_1_727
81853,727,STORE_1,1818,4,2016-01-20,2016,11551,STORE_1_727


In [264]:
week = train.groupby(['wm_yr_wk', 'store_item_id'], as_index=False).agg({"sales": "sum", "item_id": "count", "date": "max"})
week

Unnamed: 0,wm_yr_wk,store_item_id,sales,item_id,date
0,11101,STORE_1_064,3,7,2011-02-04
1,11101,STORE_1_065,3,7,2011-02-04
2,11101,STORE_1_090,864,7,2011-02-04
3,11101,STORE_1_252,133,7,2011-02-04
4,11101,STORE_1_325,0,7,2011-02-04
...,...,...,...,...,...
11695,11551,STORE_3_555,257,6,2016-01-21
11696,11551,STORE_3_586,474,6,2016-01-21
11697,11551,STORE_3_587,201,6,2016-01-21
11698,11551,STORE_3_714,72,6,2016-01-21


In [261]:
week1 = week[week.item_id==7]

In [263]:
week.loc[week.item_id!=7, "sales"] = np.nan

In [None]:
prediction_length = 4

In [271]:
train['week'] = train['date'].dt.weekofyear

AttributeError: 'DatetimeProperties' object has no attribute 'weekofyear'

In [270]:
train

Unnamed: 0,item_id,store_id,date_id,sales,date,year,wm_yr_wk,store_item_id,week
0,085,STORE_2,1,3,2011-01-29,2011,11101,STORE_2_085,5
1,085,STORE_2,2,8,2011-01-30,2011,11101,STORE_2_085,6
2,085,STORE_2,3,0,2011-01-31,2011,11101,STORE_2_085,0
3,085,STORE_2,4,3,2011-02-01,2011,11101,STORE_2_085,1
4,085,STORE_2,5,0,2011-02-02,2011,11101,STORE_2_085,2
...,...,...,...,...,...,...,...,...,...
81850,727,STORE_1,1815,2,2016-01-17,2016,11551,STORE_1_727,6
81851,727,STORE_1,1816,3,2016-01-18,2016,11551,STORE_1_727,0
81852,727,STORE_1,1817,1,2016-01-19,2016,11551,STORE_1_727,1
81853,727,STORE_1,1818,4,2016-01-20,2016,11551,STORE_1_727,2


In [265]:
train[['date', 'wm_yr_wk']].sort_values(by='date').wm_yr_wk.

Unnamed: 0,date,wm_yr_wk
0,2011-01-29,11101
78217,2011-01-29,11101
3638,2011-01-29,11101
56389,2011-01-29,11101
25466,2011-01-29,11101
...,...,...
27284,2016-01-21,11551
30922,2016-01-21,11551
21827,2016-01-21,11551
7275,2016-01-21,11551


In [None]:
valid_dates = sorted(train.date.unique())

In [None]:




valid = train[train.date.isin(date_range)].copy()
train = train[~train.date.isin(date_range)]

result = make_predictions(prediction_length=prediction_length, train=train, date_range=date_range)


In [None]:

# store_item_ids = train.store_item_id.unique()
# data_path = "data"
# prediction_length = 30

# train = get_historical_data(item_store_ids=store_item_ids, data_path=data_path)
# date_range = get_validation_dates(train=train, prediction_length=prediction_length, prediction_type=None)

# valid = train[train.date.isin(date_range)].copy()
# train = train[~train.date.isin(date_range)]

# result = make_predictions(prediction_length=prediction_length, train=train, date_range=date_range)
