Предсказание длинны очереди

У нас есть два типа данных:
Индивидуальные - для каждой секунды
Агреггированные - для каждого 30 минутного отрезка

Сгенерируем первый вариант датасета

Время работы магазина - с 7 часов до 24

Данных сгенерируем одну неделю, по условию, возьмем первую неделю 2018 года.



In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

import datetime

In [2]:
# Constants

open_hour = 7
close_hour = 24
open_hours_range = range(open_hour,close_hour)

opening_hours = close_hour-open_hour

chunks_in_hour = 2

day_chunks = opening_hours*chunks_in_hour

In [3]:
# Time parameters

start_date = datetime.date(2018, 1, 1)
end_date = datetime.date(2018, 1, 7)


mu = 0.007


golden_hours = [8, 17, 18, 19, 22]

golden_hours_mu_bonus = 0.003



# Performance parameters
#chunk_clients_mean = 15
#chunk_clients_sd = chunk_clients_mean / 5

individual_serving_time_const = 120
#individual_serving_time_mean = 60
#individual_serving_time_sd = 30

In [4]:
from datetime import time
import datetime
list(open_hours_range)
def day_chunks(open_hours_range = open_hours_range):
    day_chunks = []
    for hour in open_hours_range:
        day_chunks.append(datetime.time(hour, 0))
        day_chunks.append(datetime.time(hour, 30))
    return day_chunks

In [5]:
dates = pd.date_range(start_date, end_date)
pd.Series(dates)

0   2018-01-01
1   2018-01-02
2   2018-01-03
3   2018-01-04
4   2018-01-05
5   2018-01-06
6   2018-01-07
dtype: datetime64[ns]

In [6]:
def plot_order_df(df, plot = False):
    # x = poisson.rvs(mu, loc, size=seconds, random_state=random_state)
    x = df["arrive"]
    if plot == "cdf":
        fig, ax = plt.subplots(1, 1)
        ax.plot(x, poisson.pmf(x, mu), 'bo', ms=8, label='poisson pmf')
        ax.vlines(x, 0, poisson.cdf(x, mu), colors='b', lw=5, alpha=0.5) 
    if plot == "hist":
        plt.hist(x)
    #return x, sum(x)

In [7]:
def generate_order_df(mu = 0.007, loc = 0, seconds = 60*30, random_state = None, dropna = False):
    from scipy.stats import poisson
    x = poisson.rvs(mu, loc, size=seconds, random_state=random_state)

    chunk_df = pd.DataFrame({"arrive": x, "queued": np.NaN, "number_in_queue": np.NaN, "waiting_in_queue": np.NaN, "serving_time": np.NaN})
    queued = 0
    queued_time_to_wait = 0
    import collections
    order = collections.deque()
    for i, row in chunk_df.iterrows():
        if row["arrive"]:
            chunk_df.at[i,'waiting_in_queue'] = queued_time_to_wait
            chunk_df.at[i,"serving_time"] = individual_serving_time_const + queued_time_to_wait

            queued_time_to_wait += row["arrive"]*individual_serving_time_const
            order.append([i,row])
            if len(order) > 1:
                queued = 1
            else:
                queued = 0
            chunk_df.at[i,'queued'] = queued        

        
        chunk_df.at[i,'number_in_queue'] = len(order)

            
        if queued_time_to_wait > 0:
            queued_time_to_wait += -1
        if queued_time_to_wait == 0:
            order.clear()
    if dropna:
        chunk_df.dropna(thresh=2, inplace=True)
    return chunk_df
#df = pd.DataFrame({'B': [0, 1, 0, 1, 1]})    
#chunk_df["arrive"].rolling(120).sum()
#df.rolling(2, win_type='triang').sum()

In [8]:
def generate_chunk_parameters(chunk_time = None, mu = 0.007, loc = 0, seconds = 60*30, random_state=None):   
    gen = generate_order_df(mu = mu , loc = loc, random_state = random_state)
    clients_count = gen.query("arrive > 0").describe()["arrive"]["count"]
    queue_length_mean = gen.describe()["number_in_queue"]["mean"]
    queue_length_sd = gen.describe()["number_in_queue"]["std"]
    serving_time_mean = gen.query("arrive > 0").describe()["serving_time"]["mean"]
    serving_time_sd = gen.query("arrive > 0").describe()["serving_time"]["std"]
    df = pd.DataFrame()
    df = df.append({
            "time": chunk_time
            , "clients_count": clients_count
            , "queue_length_mean": queue_length_mean
            , "queue_length_sd": queue_length_sd
            , "serving_time_mean": serving_time_mean
            , "serving_time_sd": serving_time_sd
        }, ignore_index=True) 
    return df

In [9]:
# generate_order_df(mu = 0.003, loc = 0, seconds = 60*30, random_state=None)

generate_chunk_parameters()

Unnamed: 0,clients_count,queue_length_mean,queue_length_sd,serving_time_mean,serving_time_sd,time
0,12.0,1.642222,2.077379,193.666667,103.066028,


In [10]:
def generate_dataset(random_state="timestamp"):
    
    global mu
    
    from joblib import Parallel, delayed
    import multiprocessing  
    from tqdm import tqdm, tqdm_notebook

    dataset = pd.DataFrame()
    chunks = []
    for date in dates:
        for chunk_time in day_chunks():
            chunk_datetime = datetime.datetime.combine(date, chunk_time)
            if chunk_datetime.hour in golden_hours:
                # print("! Golden hour %s" % chunk_datetime.hour)
                chunk_mu = mu + golden_hours_mu_bonus
            else:
                chunk_mu = mu
            chunks.append({"datetime": chunk_datetime, "mu": chunk_mu})

    for chunk in tqdm_notebook(chunks):
        if random_state == "timestamp":
            random_state = np.random.RandomState(int(chunk.get("datetime").timestamp()))
        row = generate_chunk_parameters(chunk_time=chunk.get("datetime"), mu=chunk.get("mu"), random_state=random_state)
        dataset = dataset.append(row, ignore_index=True)
    return dataset

In [11]:
int(datetime.datetime(2018, 1, 1, 7, 0).timestamp())

1514779200

In [12]:
import os.path
filename =  "data_processed.csv"
def get_or_create(filename, generate_dataset_function, recreate=True):
    if filename.endswith('.gz'):
        compression = 'gzip'
    elif filename.endswith('.csv'):
        compression = None
    else:
        raise AttributeError("Check filename")
    
    if not os.path.exists(filename) or recreate:
        df = generate_dataset_function()
        df.to_csv(filename, compression=compression)
    else:
        df = pd.read_csv(filename, compression=compression)
        df.drop(columns=["Unnamed: 0"], inplace=True)
    return df

In [13]:
#mu = 0.007
#golden_hours_mu_bonus = 0.003

In [14]:
get_or_create(filename, generate_dataset, recreate=True)

HBox(children=(IntProgress(value=0, max=238), HTML(value='')))






Unnamed: 0,clients_count,queue_length_mean,queue_length_sd,serving_time_mean,serving_time_sd,time
0,5.0,0.438889,0.681396,160.400000,90.337146,2018-01-01 07:00:00
1,10.0,1.572778,1.589723,173.100000,61.826729,2018-01-01 07:30:00
2,16.0,5.877778,4.343832,299.000000,110.839824,2018-01-01 08:00:00
3,17.0,7.252222,5.080425,358.470588,138.756584,2018-01-01 08:30:00
4,6.0,0.627222,0.968333,138.500000,30.716445,2018-01-01 09:00:00
5,10.0,1.331111,1.413557,160.400000,64.806035,2018-01-01 09:30:00
6,13.0,7.928333,4.548776,377.769231,129.370369,2018-01-01 10:00:00
7,15.0,3.563333,3.370142,256.000000,117.438738,2018-01-01 10:30:00
8,13.0,1.913889,2.142460,201.615385,86.793374,2018-01-01 11:00:00
9,8.0,0.811667,0.968416,137.625000,33.542883,2018-01-01 11:30:00
