Welcome to CS598 MP1! We will be demonstrating online aggregation (OLA) with Plotly plots on the [Predict Future Sales](https://www.kaggle.com/competitions/competitive-data-science-predict-future-sales) dataset.
Note: you **do not** have to modify the code in this notebook. Once you have finished implementing all the required OLA classes in `ola.py`, you can run the cells to verify the results - the Plotly plots should update dynamically as the data is processed.

In [1]:
from ola import *
from utils import *

import numpy as np
import plotly.graph_objects as go
import pandas as pd
import random
import time

In [2]:
# Read the dataframe. See https://www.kaggle.com/competitions/competitive-data-science-predict-future-sales for the schema.
# The path is relative, so "sales_train.csv" must be in the notebook's working directory.
df = pd.read_csv("sales_train.csv")

In [3]:
# Split the dataframe for OLA. We will be processing 10% of the rows in the dataframe, one 20000-row slice at a time.
# The resulting df_list is iterated once by each OLA demo in the cells below.
df_list = sample_split_df(df, sample_percentage=0.1, slice_size=20000)

Performing OLA for computing mean (this is already implemented as an example, see ola.py):-----------------------------------

In [4]:
# Build the figure widget for the mean demo; the three arguments show up as the plot title, x-axis title and y-axis title.
avg_widget = generate_plot("Average item price of transactions", "", "Average price")
# Leaving the widget as the cell's last expression displays it in the notebook.
avg_widget

FigureWidget({
    'data': [{'type': 'bar',
              'uid': 'e8972d48-c788-4bea-8262-7a52eef9dd31',
              'width': 0.3,
              'x': [wait for data],
              'y': [0]}],
    'layout': {'font': {'color': 'Black', 'family': 'Courier New, monospace', 'size': 18},
               'template': '...',
               'title': {'text': 'Average item price of transactions'},
               'xaxis': {'title': {'text': ''}},
               'yaxis': {'title': {'text': 'Average price'}}}
})

In [5]:
# Incrementally process the slices. The plot should update during the processing.
# AvgOla receives the widget to draw into and the column name "item_price" (presumably the column being averaged - see ola.py).
avg_ola = AvgOla(avg_widget, "item_price")
for df_slice in df_list:
    # The sleep statement (and others below) is for observing the incremental update process. Feel free to remove it.
    time.sleep(0.5)
    avg_ola.process_slice(df_slice)

Performing OLA for computing filtered mean:-----------------------------------

In [6]:
# Build the figure widget for the filtered-mean demo (plot title, x-axis title, y-axis title).
filter_avg_widget = generate_plot("Average price per transaction of item ID 22154", "", "Average price")
# Leaving the widget as the cell's last expression displays it in the notebook.
filter_avg_widget

FigureWidget({
    'data': [{'type': 'bar',
              'uid': '39540f42-0c83-4265-861e-1f443bd5c94e',
              'width': 0.3,
              'x': [wait for data],
              'y': [0]}],
    'layout': {'font': {'color': 'Black', 'family': 'Courier New, monospace', 'size': 18},
               'template': '...',
               'title': {'text': 'Average price per transaction of item ID 22154'},
               'xaxis': {'title': {'text': ''}},
               'yaxis': {'title': {'text': 'Average price'}}}
})

In [7]:
# Incrementally process the slices. The plot should update during the processing.
# Arguments: widget, filter column, filter value, and the column to average
# (presumably rows with item_id == 22154 are kept - confirm against FilterAvgOla in ola.py).
filter_avg_ola = FilterAvgOla(filter_avg_widget, "item_id", 22154, "item_price")
for df_slice in df_list:
    # The pause only makes the incremental update visible; it can be removed.
    time.sleep(0.5)
    filter_avg_ola.process_slice(df_slice)

Performing OLA for computing grouped means:-----------------------------------

In [8]:
# Build the figure widget for the grouped-mean demo (plot title, x-axis title, y-axis title).
group_by_avg_widget = generate_plot("Average items per transaction by date block", "Date Block ID", "Items per transaction")
# Leaving the widget as the cell's last expression displays it in the notebook.
group_by_avg_widget

FigureWidget({
    'data': [{'type': 'bar',
              'uid': '1f746233-88ef-4149-914d-3247f0c278a9',
              'width': 0.3,
              'x': [wait for data],
              'y': [0]}],
    'layout': {'font': {'color': 'Black', 'family': 'Courier New, monospace', 'size': 18},
               'template': '...',
               'title': {'text': 'Average items per transaction by date block'},
               'xaxis': {'title': {'text': 'Date Block ID'}},
               'yaxis': {'title': {'text': 'Items per transaction'}}}
})

In [9]:
# Incrementally process the slices. The plot should update during the processing.
# Arguments: widget, then "date_block_num" and "item_cnt_day"
# (presumably the group-by column and the averaged column, in that order - confirm in ola.py).
group_by_avg_ola = GroupByAvgOla(group_by_avg_widget, "date_block_num", "item_cnt_day")
for df_slice in df_list:
    # The pause only makes the incremental update visible; it can be removed.
    time.sleep(0.5)
    group_by_avg_ola.process_slice(df_slice)

Performing OLA for computing grouped sums:-----------------------------------

In [10]:
# Build the figure widget for the grouped-sum demo (plot title, x-axis title, y-axis title).
group_by_sum_widget = generate_plot("Total items sold per shop", "Shop ID", "Items sold")
# Leaving the widget as the cell's last expression displays it in the notebook.
group_by_sum_widget

FigureWidget({
    'data': [{'type': 'bar',
              'uid': '01e3fee0-6140-4857-a0cb-c57a0459c071',
              'width': 0.3,
              'x': [wait for data],
              'y': [0]}],
    'layout': {'font': {'color': 'Black', 'family': 'Courier New, monospace', 'size': 18},
               'template': '...',
               'title': {'text': 'Total items sold per shop'},
               'xaxis': {'title': {'text': 'Shop ID'}},
               'yaxis': {'title': {'text': 'Items sold'}}}
})

In [11]:
# Incrementally process the slices. The plot should update during the processing.
# len(df) passes the row count of the full (unsampled) dataframe
# (presumably used to scale the running sample sum to a whole-table estimate - confirm in ola.py).
group_by_sum_ola = GroupBySumOla(group_by_sum_widget, len(df), "shop_id", "item_cnt_day")
for df_slice in df_list:
    # The pause only makes the incremental update visible; it can be removed.
    time.sleep(0.5)
    group_by_sum_ola.process_slice(df_slice)

Performing OLA for computing grouped counts:-----------------------------------

In [12]:
# Build the figure widget for the grouped-count demo (plot title, x-axis title, y-axis title).
group_by_count_widget = generate_plot("Total transactions per shop", "Shop ID", "Total transactions")
# Leaving the widget as the cell's last expression displays it in the notebook.
group_by_count_widget

FigureWidget({
    'data': [{'type': 'bar',
              'uid': '47b369fb-5eb1-4fff-9174-e6d212f42ea6',
              'width': 0.3,
              'x': [wait for data],
              'y': [0]}],
    'layout': {'font': {'color': 'Black', 'family': 'Courier New, monospace', 'size': 18},
               'template': '...',
               'title': {'text': 'Total transactions per shop'},
               'xaxis': {'title': {'text': 'Shop ID'}},
               'yaxis': {'title': {'text': 'Total transactions'}}}
})

Performing OLA for computing filtered distinct counts:-----------------------------------

In [13]:
# Incrementally process the slices. The plot should update during the processing.
# NOTE: this cell drives group_by_count_widget, i.e. it is the processing step of the grouped-count demo.
# len(df) passes the row count of the full (unsampled) dataframe
# (presumably used to scale the sample counts to whole-table estimates - confirm in ola.py).
group_by_count_ola = GroupByCountOla(group_by_count_widget, len(df), "shop_id", "item_cnt_day")
for df_slice in df_list:
    # The pause only makes the incremental update visible; it can be removed.
    time.sleep(0.5)
    group_by_count_ola.process_slice(df_slice)

In [14]:
# Build the figure widget for the filtered distinct-count demo (plot title, x-axis title, y-axis title).
filter_distinct_widget = generate_plot("Number of distinct items sold in shop ID 10", "", "Number of distinct items")
# Leaving the widget as the cell's last expression displays it in the notebook.
filter_distinct_widget

FigureWidget({
    'data': [{'type': 'bar',
              'uid': 'bb49a62c-a98a-4b20-9bba-e1596512c48a',
              'width': 0.3,
              'x': [wait for data],
              'y': [0]}],
    'layout': {'font': {'color': 'Black', 'family': 'Courier New, monospace', 'size': 18},
               'template': '...',
               'title': {'text': 'Number of distinct items sold in shop ID 10'},
               'xaxis': {'title': {'text': ''}},
               'yaxis': {'title': {'text': 'Number of distinct items'}}}
})

In [15]:
# Incrementally process the slices. The plot should update during the processing.
# Arguments: widget, filter column, filter value, and the column whose distinct values are counted
# (presumably rows with shop_id == 10 are kept - confirm against FilterDistinctOla in ola.py).
filter_distinct_ola = FilterDistinctOla(filter_distinct_widget, "shop_id", 10, "item_id")
for df_slice in df_list:
    # The pause only makes the incremental update visible; it can be removed.
    time.sleep(0.5)
    filter_distinct_ola.process_slice(df_slice)