In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [35]:

raw_shelves_df = pd.read_csv('shelves.csv')
raw_shedules_df = pd.read_csv('shedules.csv')
raw_products_df = pd.read_csv('products.csv')


In [36]:
shedules_df = raw_shedules_df.rename(columns={
    'N': 'id',
    'name_TT': 'store',
    'НаименованиеГрафика': 'schedule_name',
    'Длительность (недель)': 'duration_weeks',
    'Стеллажи': 'shelves',
    'неделя 1': 'week_1',
    'неделя 2': 'week_2',
    'неделя 3': 'week_3',
    'неделя 4': 'week_4',
    'Кол-во дней': 'days_count',
    'ЧастотаПостеллажная': 'shelf_frequency',
    'ЧастотаГрафика': 'schedule_frequency'
}, inplace=False)

shedules_df = shedules_df[shedules_df['store'].str.contains("5018ДС_СПб_Полюстровский59", case=False, na=False)]

shedules_df = shedules_df.melt(
    id_vars=['id', 'store', 'schedule_name', 'duration_weeks', 'shelves', 'days_count', 'shelf_frequency', 'schedule_frequency'],
    value_vars=['week_1', 'week_2', 'week_3', 'week_4'],
    var_name='week_number',
    value_name='week_day'
)

shedules_df = shedules_df.dropna(subset=['week_day']).reset_index(drop=True)

day_of_week_map = {'пн': 0, 'вт': 1, 'ср': 2, 'чт': 3, 'пт': 4, 'сб': 5, 'вс': 6}

shedules_df['day_index'] = shedules_df['week_day'].map(day_of_week_map)
shedules_df['day'] = shedules_df.apply(
    lambda row: (int(row['week_number'][-1]) - 1) * 7 + row['day_index'], axis=1
)


shedules_df = shedules_df.drop(columns=['week_number', 'week_day', 'day_index', 'schedule_frequency', 'shelf_frequency', 'days_count', 'duration_weeks'])
shedules_df = shedules_df.sort_values('day').reset_index(drop=True)

shedules_df.head()

Unnamed: 0,id,store,schedule_name,shelves,day
0,5018,5018ДС_СПб_Полюстровский59,Индивидуальный график постеллажной. ДС 5018,A1;A10;A10;A10;A10;A10;A10;A10;A10;A11;A11;A11...,0
1,5018,5018ДС_СПб_Полюстровский59,Индивидуальный график постеллажной. ДС 5018,K1;K10;K11;K12;K13;K14;K15;K16;K17;K18;K19;K2;...,1
2,5018,5018ДС_СПб_Полюстровский59,Индивидуальный график постеллажной. ДС 5018,G13;O1;O2;O3;O4;O5;O6;O7;O8;Г1;Г10;Г11;Г12;Г2;...,2
3,5018,5018ДС_СПб_Полюстровский59,Индивидуальный график постеллажной. ДС 5018,Z1;Z10;Z12;Z13;Z14;Z15;Z16;Z17;Z18;Z2;Z3;Z4;Z5...,3
4,5018,5018ДС_СПб_Полюстровский59,Индивидуальный график постеллажной. ДС 5018,Z18;Z19;Z20;Z21;Z22;Z23;Z24;Z25;Z26;Z27;Z28;Z2...,4


In [37]:
products_df = raw_products_df.rename(columns={
    'id_tov': 'id_tov',
    'Наименование': 'name',
    'Категория': 'category',
    'Группа': 'group',
    'Кол-во чеков за месяц': 'monthly_hours',
    'Единица измерения': 'system',
    'Кол-во за месяц': 'per_month',
    'Среднее кол-во за месяц': 'avg_per_month',
    'Сумма продаж за месяц': 'sum_per_month',
}, inplace=False)

products_df = products_df[products_df['category'].str.contains("Овощи. Фрукты. Грибы. Зелень", case=False, na=False)]

products_df = products_df.reset_index(drop=True, inplace=False)

products_df.head()

Unnamed: 0,id_tov,name,category,group,monthly_hours,system,per_month,avg_per_month,sum_per_month
0,23 074,Манго Египет,Овощи. Фрукты. Грибы. Зелень,Манго (все виды),8352,кг,6972.48,0.83,2194135.41
1,731,Бананы,Овощи. Фрукты. Грибы. Зелень,Банан,6869,кг,6338.46,0.92,952972.7
2,669,Арбуз,Овощи. Фрукты. Грибы. Зелень,Бахчевые,1621,кг,11265.38,6.95,683772.7
3,64 336,Картофель молодой,Овощи. Фрукты. Грибы. Зелень,Картофель подгруппа,3508,кг,5922.67,1.69,389319.41
4,22 564,Томаты Махитос,Овощи. Фрукты. Грибы. Зелень,Томаты,1603,кг,1043.58,0.65,375737.35


In [38]:

shelves_df = raw_shelves_df
shelves_df['shelve'] = shelves_df['shelve'].apply(lambda x: ''.join(x.split('-')[:2]))

shelves_df.head()

Unnamed: 0,name,price,shelve
0,Апельсины,215,A7
1,Апельсины Навелин,289,A3
2,Арбуз,67,A7
3,Баклажаны грунтовые,138,Fr1
4,Бананы,158,A7


In [39]:

# The training data will look like this
# sku_id,	name, category,	avg_cart,	t_expiery, stocktaking_time,	dt,	trigger

columns = ['sku_id', 'name', 'category', 'avg_cart', 't_expiery', 'stocktaking_time', 'dt', 'trigger']
training_df = pd.DataFrame(columns=columns)

training_df.head()


Unnamed: 0,sku_id,name,category,avg_cart,t_expiery,stocktaking_time,dt,trigger


In [40]:
def determine_trigger(sku_id, name, day_index):
  shelf_df = shelves_df[shelves_df['name'] == name]['shelve']
  shelf_df = shelf_df.reset_index(drop=True)
  
  day_df = shedules_df[shedules_df['day'] == day_index]
  day_df = day_df.reset_index(drop=True)
  
  if (not shelf_df.any() or not day_df['shelves'].any()): 
    return False
  
  return shelf_df[0] in day_df['shelves'][0]

In [41]:
# Now, for every product, for every day of the week, get the fucking trigger and sku_id
training_data = []
  
for _, row in products_df.iterrows():
  sku_id = row['id_tov']
  name = row['name']
  
  # Add 13 rows for each product with dt from 0 to 12
  for dt in range(14):
    training_row = {
        'sku_id': sku_id,
        'name': name,
        'category': row['category'],  # Arbitrary value from products_df
        'avg_cart': 1,  # Arbitrary value
        't_expiery': 100,  # Arbitrary value
        'stocktaking_time': 30,  # Arbitrary value
        'dt': dt,
        'trigger': determine_trigger(sku_id, name, dt)
    }
    training_data.append(training_row)

training_df = pd.DataFrame(training_data)

# training_df.head(20)

In [42]:
X = training_df.drop(columns=['trigger'])
y = training_df['trigger']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_data = X_train.copy()
train_data['trigger'] = y_train


In [43]:
task = Task('binary')

automl = TabularAutoML(task=task, timeout=1200)

oof_pred = automl.fit_predict(train_data, roles={'target': 'trigger'})

test_pred = automl.predict(X_test)

In [66]:
print(test_pred.data[:10])

[[0.01526909]
 [0.00941303]
 [0.01552769]
 [0.00911077]
 [0.00911019]
 [0.00911034]
 [0.00972324]
 [0.00910093]
 [0.01649824]
 [0.01526909]]


In [67]:
report = classification_report(y_test, test_pred.data.round())
print(report)

              precision    recall  f1-score   support

       False       0.99      1.00      0.99      1082
        True       1.00      0.63      0.77        38

    accuracy                           0.99      1120
   macro avg       0.99      0.82      0.88      1120
weighted avg       0.99      0.99      0.99      1120

