Welcome to CS598 MP1! In this notebook we demonstrate online aggregation (OLA) with Plotly plots on the [Predict Future Sales](https://www.kaggle.com/competitions/competitive-data-science-predict-future-sales) dataset.
Note: you **do not** have to modify the code in this notebook. Once you have finished implementing all the required OLA classes in `ola.py`, run the cells to verify the results — the Plotly plots should update dynamically as the data is processed.

In [1]:
from ola import *
from utils import *

import numpy as np
import plotly.graph_objects as go
import pandas as pd
import random
import time

In [2]:
# Load the full training table into memory. The column schema is described at
# https://www.kaggle.com/competitions/competitive-data-science-predict-future-sales
# "sales_train.csv" is resolved relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="sales_train.csv")

In [3]:
# Prepare the input for OLA: sample 10% of the rows of `df` and cut the sample
# into slices of 20000 rows each. The cells below feed these slices to the OLA
# classes one at a time.
df_list = sample_split_df(
    df,
    sample_percentage=0.1,
    slice_size=20_000,
)

## Performing OLA for computing mean (this is already implemented as an example, see `ola.py`)

In [4]:
# Create the placeholder bar chart for the running mean and display it.
# The widget starts with a single "wait for data" bar until slices are processed.
avg_widget = generate_plot(
    "Average item price of transactions",  # plot title
    "",                                    # x-axis title (left empty)
    "Average price",                       # y-axis title
)
# A bare expression on the last line makes Jupyter render the widget.
avg_widget

FigureWidget({
    'data': [{'type': 'bar',
              'uid': 'c7657817-e4e1-45fd-961c-aee4e881848c',
              'width': 0.3,
              'x': [wait for data],
              'y': [0]}],
    'layout': {'font': {'color': 'Black', 'family': 'Courier New, monospace', 'size': 18},
               'template': '...',
               'title': {'text': 'Average item price of transactions'},
               'xaxis': {'title': {'text': ''}},
               'yaxis': {'title': {'text': 'Average price'}}}
})

In [5]:
# Feed the sampled slices to AvgOla one at a time, aggregating over the
# "item_price" column. The plot in `avg_widget` should update while this runs.
avg_ola = AvgOla(
    avg_widget,
    "item_price",
)
for df_slice in df_list:
    # The pause exists only so the incremental redraws can be observed; it (and
    # the matching sleeps in the later cells) can be removed without harm.
    time.sleep(0.5)
    avg_ola.process_slice(df_slice)

(889.7659557442009,)
(891.3224223413121,)
(892.2543256028523,)
(891.720798749568,)
(890.0007017824632,)
(891.7142393748634,)
(890.297514856013,)
(893.3638952532349,)
(891.9511188113408,)
(892.0400896624155,)
(891.4171952923839,)
(891.0865293522445,)
(892.2408063156804,)
(891.6084119362192,)
(893.4487293129439,)


## Performing OLA for computing filtered mean

In [6]:
# Create the placeholder bar chart for the filtered mean and display it.
filter_avg_widget = generate_plot(
    "Average price per transaction of item ID 22154",  # plot title
    "",                                                # x-axis title (left empty)
    "Average price",                                   # y-axis title
)
# A bare expression on the last line makes Jupyter render the widget.
filter_avg_widget

FigureWidget({
    'data': [{'type': 'bar',
              'uid': 'c573d945-ac86-4e15-8b83-921e780e46eb',
              'width': 0.3,
              'x': [wait for data],
              'y': [0]}],
    'layout': {'font': {'color': 'Black', 'family': 'Courier New, monospace', 'size': 18},
               'template': '...',
               'title': {'text': 'Average price per transaction of item ID 22154'},
               'xaxis': {'title': {'text': ''}},
               'yaxis': {'title': {'text': 'Average price'}}}
})

In [7]:
# Feed the sampled slices to FilterAvgOla one at a time; the plot in
# `filter_avg_widget` should update while this runs.
# Argument roles below are inferred from the plot title (average price of item
# ID 22154) -- see ola.py for the authoritative signature.
filter_avg_ola = FilterAvgOla(
    filter_avg_widget,
    "item_id",     # column the filter is applied to
    22154,         # value that column is compared against
    "item_price",  # column being averaged
)
for df_slice in df_list:
    # Pause only so the incremental redraws are observable.
    time.sleep(0.5)
    filter_avg_ola.process_slice(df_slice)

(299.0,)
(299.0,)
(299.0,)
(299.0,)
(649.0,)
(649.0,)
(765.6666666666666,)
(765.6666666666666,)
(765.6666666666666,)
(649.0,)
(649.0,)
(649.0,)
(579.0,)
(579.0,)
(579.0,)


## Performing OLA for computing grouped means

In [8]:
# Create the placeholder bar chart for the grouped means and display it.
group_by_avg_widget = generate_plot(
    "Average items per transaction by date block",  # plot title
    "Date Block ID",                                # x-axis title
    "Items per transaction",                        # y-axis title
)
# A bare expression on the last line makes Jupyter render the widget.
group_by_avg_widget

FigureWidget({
    'data': [{'type': 'bar',
              'uid': '50d9fb70-d03c-471c-b643-4ca757840d07',
              'width': 0.3,
              'x': [wait for data],
              'y': [0]}],
    'layout': {'font': {'color': 'Black', 'family': 'Courier New, monospace', 'size': 18},
               'template': '...',
               'title': {'text': 'Average items per transaction by date block'},
               'xaxis': {'title': {'text': 'Date Block ID'}},
               'yaxis': {'title': {'text': 'Items per transaction'}}}
})

In [9]:
# Feed the sampled slices to GroupByAvgOla one at a time; the plot in
# `group_by_avg_widget` should update while this runs.
# Argument roles below are inferred from the plot title (items per transaction
# by date block) -- see ola.py for the authoritative signature.
group_by_avg_ola = GroupByAvgOla(
    group_by_avg_widget,
    "date_block_num",  # grouping column
    "item_cnt_day",    # column averaged within each group
)
for df_slice in df_list:
    # Pause only so the incremental redraws are observable.
    time.sleep(0.5)
    group_by_avg_ola.process_slice(df_slice)

(1.1450381679389312, 1.2181303116147308, 1.1662621359223302, 1.1272141706924315, 1.0947030497592296, 1.1909620991253644, 1.1229868228404098, 1.1964549483013294, 1.3482810164424515, 1.3817891373801916, 1.2798833819241981, 1.2786709539121115, 1.1362229102167183, 1.3037766830870279, 1.238390092879257, 1.1856060606060606, 1.29182156133829, 1.0907504363001745, 1.0941619585687383, 1.1996951219512195, 1.2966601178781925, 1.2354892205638475, 1.2638146167557933, 1.2866666666666666, 1.2104430379746836, 1.1577909270216962, 1.2537313432835822, 1.7209944751381216, 1.1959287531806615, 1.1805929919137466, 1.1782729805013927, 1.1365979381443299, 1.3522727272727273, 1.207492795389049)
(1.1444229529335912, 1.199579831932773, 1.1802184466019416, 1.1219709208400646, 1.1072279586973788, 1.3288288288288288, 1.1412884333821376, 1.185878962536023, 1.322804314329738, 1.4066985645933014, 1.2828427853553481, 1.261716692996314, 1.1232472324723248, 1.2822647793505413, 1.2359723289777094, 1.1677662582469368, 1.3088

(1.1350857977364002, 1.1886172577060585, 1.2047000086080744, 1.1252623439743732, 1.170773248552288, 1.2608695652173914, 1.1563682219419924, 1.214335699290213, 1.3746610261416639, 1.3637498581318805, 1.345764173313214, 1.2642841545964043, 1.1582983193277312, 1.2295857988165682, 1.230900409276944, 1.249833533093621, 1.2376950780312126, 1.1822033898305084, 1.1509357200976404, 1.1845441248171624, 1.3647816750178956, 1.3158103813559323, 1.3552308447937131, 1.2910513291290813, 1.1986221391872958, 1.1772760378732703, 1.1753188964698902, 1.3858905875851906, 1.3356603773584905, 1.1801923076923078, 1.1448499904452514, 1.1551057957681692, 1.4162070428510818, 1.3578760363205684)
(1.132590917000174, 1.188094139363175, 1.2053836022390516, 1.125910290237467, 1.172473678497775, 1.259207365892714, 1.1543852623147777, 1.216658733936221, 1.3684809602649006, 1.3622608883605318, 1.341747372759118, 1.2644771559505623, 1.158084191580842, 1.2305004508566275, 1.244972972972973, 1.2462401223553403, 1.2436548223

## Performing OLA for computing grouped sums

In [10]:
# Create the placeholder bar chart for the grouped sums and display it.
group_by_sum_widget = generate_plot(
    "Total items sold per shop",  # plot title
    "Shop ID",                    # x-axis title
    "Items sold",                 # y-axis title
)
# A bare expression on the last line makes Jupyter render the widget.
group_by_sum_widget

FigureWidget({
    'data': [{'type': 'bar',
              'uid': 'fef7a7bb-f425-469e-8a62-21d0db1a08dc',
              'width': 0.3,
              'x': [wait for data],
              'y': [0]}],
    'layout': {'font': {'color': 'Black', 'family': 'Courier New, monospace', 'size': 18},
               'template': '...',
               'title': {'text': 'Total items sold per shop'},
               'xaxis': {'title': {'text': 'Shop ID'}},
               'yaxis': {'title': {'text': 'Items sold'}}}
})

In [11]:
# Feed the sampled slices to GroupBySumOla one at a time; the plot in
# `group_by_sum_widget` should update while this runs.
# Argument roles below are inferred from the plot title (items sold per shop)
# -- see ola.py for the authoritative signature.
group_by_sum_ola = GroupBySumOla(
    group_by_sum_widget,
    len(df),         # row count of the full (unsampled) dataframe
    "shop_id",       # grouping column
    "item_cnt_day",  # column summed within each group
)
for df_slice in df_list:
    # Pause only so the incremental redraws are observable.
    time.sleep(0.5)
    group_by_sum_ola.process_slice(df_slice)

(79.0, 38.0, 216.0, 184.0, 272.0, 308.0, 642.0, 462.0, 23.0, 102.0, 157.0, 1.0, 312.0, 129.0, 295.0, 452.0, 452.0, 160.0, 411.0, 481.0, 19.0, 411.0, 556.0, 52.0, 418.0, 1773.0, 431.0, 985.0, 1196.0, 363.0, 420.0, 2075.0, 52.0, 41.0, 54.0, 497.0, 1.0, 314.0, 375.0, 104.0, 47.0, 333.0, 968.0, 355.0, 301.0, 239.0, 544.0, 454.0, 149.0, 113.0, 513.0, 326.0, 307.0, 423.0, 1196.0, 464.0, 523.0, 955.0, 608.0, 348.0)
(162.0, 70.0, 379.0, 382.0, 566.0, 572.0, 1380.0, 905.0, 45.0, 286.0, 314.0, 2.0, 865.0, 269.0, 585.0, 914.0, 829.0, 342.0, 922.0, 1014.0, 55.0, 874.0, 967.0, 108.0, 902.0, 3356.0, 857.0, 1946.0, 2507.0, 768.0, 829.0, 4122.0, 115.0, 74.0, 100.0, 1008.0, 2.0, 733.0, 731.0, 195.0, 81.0, 650.0, 1996.0, 643.0, 639.0, 652.0, 1074.0, 898.0, 300.0, 236.0, 993.0, 667.0, 599.0, 885.0, 2389.0, 962.0, 1017.0, 1858.0, 1116.0, 672.0)
(264.0, 116.0, 544.0, 608.0, 859.0, 867.0, 1968.0, 1330.0, 60.0, 434.0, 506.0, 11.0, 1344.0, 373.0, 880.0, 1449.0, 1307.0, 533.0, 1368.0, 1548.0, 118.0, 1356.0, 14

## Performing OLA for computing grouped counts

In [12]:
# Create the placeholder bar chart for the grouped counts and display it.
group_by_count_widget = generate_plot(
    "Total transactions per shop",  # plot title
    "Shop ID",                      # x-axis title
    "Total transactions",           # y-axis title
)
# A bare expression on the last line makes Jupyter render the widget.
group_by_count_widget

FigureWidget({
    'data': [{'type': 'bar',
              'uid': '47b369fb-5eb1-4fff-9174-e6d212f42ea6',
              'width': 0.3,
              'x': [wait for data],
              'y': [0]}],
    'layout': {'font': {'color': 'Black', 'family': 'Courier New, monospace', 'size': 18},
               'template': '...',
               'title': {'text': 'Total transactions per shop'},
               'xaxis': {'title': {'text': 'Shop ID'}},
               'yaxis': {'title': {'text': 'Total transactions'}}}
})

## Performing OLA for computing filtered distinct counts

In [13]:
# Grouped counts: feed the sampled slices to GroupByCountOla one at a time; the
# "Total transactions per shop" plot in `group_by_count_widget` should update
# while this runs.
# NOTE(review): the section heading directly above this cell reads "filtered
# distinct counts", but this cell still belongs to the grouped-count demo.
# Argument roles below are inferred from the plot title -- see ola.py for the
# authoritative signature.
group_by_count_ola = GroupByCountOla(
    group_by_count_widget,
    len(df),         # row count of the full (unsampled) dataframe
    "shop_id",       # grouping column
    "item_cnt_day",  # column passed as the counted field
)
for df_slice in df_list:
    # Pause only so the incremental redraws are observable.
    time.sleep(0.5)
    group_by_count_ola.process_slice(df_slice)

In [14]:
# Filtered distinct counts start here: create the placeholder bar chart for the
# number of distinct items sold in one shop and display it.
filter_distinct_widget = generate_plot(
    "Number of distinct items sold in shop ID 10",  # plot title
    "",                                             # x-axis title (left empty)
    "Number of distinct items",                     # y-axis title
)
# A bare expression on the last line makes Jupyter render the widget.
filter_distinct_widget

FigureWidget({
    'data': [{'type': 'bar',
              'uid': 'bb49a62c-a98a-4b20-9bba-e1596512c48a',
              'width': 0.3,
              'x': [wait for data],
              'y': [0]}],
    'layout': {'font': {'color': 'Black', 'family': 'Courier New, monospace', 'size': 18},
               'template': '...',
               'title': {'text': 'Number of distinct items sold in shop ID 10'},
               'xaxis': {'title': {'text': ''}},
               'yaxis': {'title': {'text': 'Number of distinct items'}}}
})

In [15]:
# Feed the sampled slices to FilterDistinctOla one at a time; the plot in
# `filter_distinct_widget` should update while this runs.
# Argument roles below are inferred from the plot title (distinct items sold in
# shop ID 10) -- see ola.py for the authoritative signature.
filter_distinct_ola = FilterDistinctOla(
    filter_distinct_widget,
    "shop_id",  # column the filter is applied to
    10,         # value that column is compared against
    "item_id",  # column whose distinct values are counted
)
for df_slice in df_list:
    # Pause only so the incremental redraws are observable.
    time.sleep(0.5)
    filter_distinct_ola.process_slice(df_slice)