In [16]:
import pandas as pd

import plotly.graph_objects as go

# Shared Plotly layout settings; the figure cells unpack this mapping into
# fig.update_layout(...).
layout_params = dict(
    font_color='#000000',
    font_family='Avenir Next',
    font_size=11,
    margin=dict(l=0, r=0, t=80, b=0),
    paper_bgcolor='#FFFFFF',
    plot_bgcolor='#f5f5f5',
)

In [68]:
# Directory that holds the three input CSV files.
PATH = "data"

# Read sales, calendar and price tables in one pass; unpacking order matches
# the order of the paths below.
sales_df, dates_df, price_df = map(
    pd.read_csv,
    (
        f"{PATH}/shop_sales.csv",
        f"{PATH}/shop_sales_dates.csv",
        f"{PATH}/shop_sales_prices.csv",
    ),
)

In [69]:
# Preview the first three rows of each loaded table.
for frame in (sales_df, dates_df, price_df):
    display(frame.head(3))

Unnamed: 0,item_id,store_id,date_id,cnt
0,STORE_2_085,STORE_2,1,3
1,STORE_2_085,STORE_2,2,8
2,STORE_2_085,STORE_2,3,0


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,event_name_1,event_type_1,event_name_2,event_type_2,date_id,CASHBACK_STORE_1,CASHBACK_STORE_2,CASHBACK_STORE_3
0,2011-01-29,11101,Saturday,1,1,2011,,,,,1,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,,,,,2,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,,,,,3,0,0,0


Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,STORE_2,STORE_2_085,11101,1.0
1,STORE_2,STORE_2_043,11101,2.88
2,STORE_2,STORE_2_054,11101,2.28


In [59]:
# Missing values per column in each table.
for frame in (sales_df, dates_df, price_df):
    display(frame.isna().sum())

item_id     0
store_id    0
date_id     0
cnt         0
dtype: int64

date                   0
wm_yr_wk               0
weekday                0
wday                   0
month                  0
year                   0
event_name_1        1673
event_type_1        1673
event_name_2        1815
event_type_2        1815
date_id                0
CASHBACK_STORE_1       0
CASHBACK_STORE_2       0
CASHBACK_STORE_3       0
dtype: int64

store_id      0
item_id       0
wm_yr_wk      0
sell_price    0
dtype: int64

In [70]:
# Count of fully duplicated rows in each table (sales, dates, prices).
tuple(frame.duplicated().sum() for frame in (sales_df, dates_df, price_df))

(np.int64(0), np.int64(0), np.int64(0))

In [72]:
# Gaps in dates: first attach the calendar date and week id to every sales row.
# Previously merged columns are dropped before merging so the cell can be
# re-run without producing date_x / date_y suffixed columns.
sales_df = sales_df.drop(columns=['date', 'wm_yr_wk'], errors='ignore').merge(
    dates_df[['date', 'date_id', 'wm_yr_wk']],
    on='date_id',
    # Fail loudly if one date_id maps to several calendar rows; that would
    # silently multiply sales rows.
    validate='many_to_one',
)

# Keep the full "<store>_<item>" identifier exactly once. On a re-run item_id
# is already shortened, so it must not overwrite the stored full identifier.
if 'store_item_id' not in sales_df.columns:
    sales_df['store_item_id'] = sales_df['item_id']

# item_id becomes the part after the last underscore (STORE_2_085 -> 085).
# It is derived from store_item_id, so repeating the step gives the same result.
sales_df['item_id'] = sales_df['store_item_id'].str.split('_').str[-1]

In [73]:
# Number of distinct (non-missing) dates in the data.
len(sales_df['date'].dropna().unique())

1819

In [74]:
# Distinct-date count per store/item series, then how many different counts
# occur; a result of 1 means every series covers the same number of dates.
sales_df.groupby("store_item_id")["date"].nunique().nunique()

1

Дубликатов в датах на товар нет

In [75]:
# Row count per store/item series, then how many different lengths occur;
# a result of 1 means all series have the same length.
sales_df.groupby("store_item_id").size().nunique()

1

У всех товаров одинаковая длина ряда, пропусков в датах нет

In [76]:
# Number of distinct short item codes across all stores.
sales_df['item_id'].nunique()

31

In [77]:
# Number of distinct items carried by each store: keep one row per
# (store, item) pair, then count rows per store.
sales_df.drop_duplicates(subset=['store_id', 'item_id'])['store_id'].value_counts()

store_id
STORE_2    15
STORE_3    15
STORE_1    15
Name: count, dtype: int64

In [78]:
# Show the sales frame after the date merge; pandas renders a truncated
# head/tail view rather than every row.
sales_df

Unnamed: 0,item_id,store_id,date_id,cnt,date,wm_yr_wk,store_item_id
0,085,STORE_2,1,3,2011-01-29,11101,STORE_2_085
1,085,STORE_2,2,8,2011-01-30,11101,STORE_2_085
2,085,STORE_2,3,0,2011-01-31,11101,STORE_2_085
3,085,STORE_2,4,3,2011-02-01,11101,STORE_2_085
4,085,STORE_2,5,0,2011-02-02,11101,STORE_2_085
...,...,...,...,...,...,...,...
81850,727,STORE_1,1815,2,2016-01-17,11551,STORE_1_727
81851,727,STORE_1,1816,3,2016-01-18,11551,STORE_1_727
81852,727,STORE_1,1817,1,2016-01-19,11551,STORE_1_727
81853,727,STORE_1,1818,4,2016-01-20,11551,STORE_1_727


In [79]:
# Units sold per store and date, drawn as one line chart per store.
plot = sales_df.groupby(["store_id", "date"], as_index=False).cnt.sum()
colors = ['red', 'blue', 'orange']
for i, store_id in enumerate(plot.store_id.unique()):
    plot_df = plot[plot.store_id == store_id]
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=plot_df['date'],
            y=plot_df['cnt'],
            mode='lines',
            name=store_id,
            # Cycle through the palette so a store beyond the third one reuses
            # a colour instead of raising IndexError.
            marker_color=colors[i % len(colors)],
        )
    )
    fig.update_layout(**layout_params, title=f"Суммарные продажи {store_id}").show()

In [86]:
# Total units sold per item, one bar chart per store, bars ordered by volume.
colors = ['red', 'blue', 'orange']
# Group sales_df directly rather than reading store ids from `plot`, which is
# defined in a different cell; groupby yields the stores in sorted order.
for i, (store_id, store_sales) in enumerate(sales_df.groupby("store_id")):
    plot_df = (
        store_sales.groupby("item_id", as_index=False)["cnt"]
        .sum()
        .sort_values(by='cnt', ascending=False)
    )
    fig = go.Figure()
    # Cycle through the palette so more than three stores do not raise IndexError.
    fig.add_trace(go.Bar(x=plot_df['item_id'], y=plot_df['cnt'], marker_color=colors[i % len(colors)]))
    fig.update_layout(
        **layout_params,
        title=f"Суммарные продажи по товарам - {store_id}",
        width=800,
        height=400,
        # Item ids are digit-only strings; Plotly's axis type auto-detection can
        # treat them as numbers and place bars by numeric value, discarding the
        # sort above. A category axis keeps the bars in the sorted order.
        xaxis_type='category',
    ).show()

In [87]:
# Mean units sold per date for each item, one bar chart per store, bars ordered
# by that mean.
colors = ['red', 'blue', 'orange']
# Group sales_df directly rather than reading store ids from `plot`, which is
# defined in a different cell; groupby yields the stores in sorted order.
for i, (store_id, store_sales) in enumerate(sales_df.groupby("store_id")):
    plot_df = (
        store_sales.groupby("item_id", as_index=False)["cnt"]
        .mean()
        .sort_values(by='cnt', ascending=False)
    )
    fig = go.Figure()
    # Cycle through the palette so more than three stores do not raise IndexError.
    fig.add_trace(go.Bar(x=plot_df['item_id'], y=plot_df['cnt'], marker_color=colors[i % len(colors)]))
    fig.update_layout(
        **layout_params,
        title=f"Среднедневные продажи по товарам - {store_id}",
        width=800,
        height=400,
        # Item ids are digit-only strings; Plotly's axis type auto-detection can
        # treat them as numbers and place bars by numeric value, discarding the
        # sort above. A category axis keeps the bars in the sorted order.
        xaxis_type='category',
    ).show()

In [97]:
# The price table stores the full "<store>_<item>" identifier under item_id;
# rename it so it lines up with sales_df.store_item_id.
price_df = price_df.rename(columns={"item_id": "store_item_id"})

# Attach the weekly sell price to each sales row. This is an inner join (the
# pandas default), so sales rows without a price for that week are dropped.
# Dropping an earlier sell_price first keeps the cell safe to re-run (no
# sell_price_x / sell_price_y columns).
sales_df = sales_df.drop(columns='sell_price', errors='ignore').merge(
    price_df[['store_item_id', 'wm_yr_wk', 'sell_price']],
    on=['store_item_id', 'wm_yr_wk'],
    # Fail loudly if an item has several prices in one week; that would
    # silently duplicate sales rows.
    validate='many_to_one',
)

In [99]:
# Units sold times unit price for every row, i.e. sales revenue; the column
# keeps the name 'profit' because later cells read it under that name.
sales_df['profit'] = sales_df['cnt'].mul(sales_df['sell_price'])

In [100]:
# Sum of the 'profit' column (cnt * sell_price) per store and date, drawn as one
# line chart per store.
plot = sales_df.groupby(["store_id", "date"], as_index=False).profit.sum()
colors = ['red', 'blue', 'orange']
for i, store_id in enumerate(plot.store_id.unique()):
    plot_df = plot[plot.store_id == store_id]
    fig = go.Figure()
    fig.add_trace(
        go.Scatter(
            x=plot_df['date'],
            y=plot_df['profit'],
            mode='lines',
            name=store_id,
            # Cycle through the palette so a store beyond the third one reuses
            # a colour instead of raising IndexError.
            marker_color=colors[i % len(colors)],
        )
    )
    fig.update_layout(**layout_params, title=f"Выручка по товарам {store_id}").show()