## End-to-end organic consumer insights with Voila and JupySQL

In [1]:
%%bash 
pip install voila jupysql pandas bqplot ipywidgets duckdb-engine ipyvuetify --quiet

In [2]:
from bqplot import pyplot as plt
import ipywidgets as widgets
from bqplot import (
    Axis, LinearScale, OrdinalScale,
    Figure, Bars, Scatter
)
import numpy as np
from bqplot import ColorScale, ColorAxis
import ipyvuetify as v
import ipywidgets as widgets
from ipywidgets import interact, interact_manual, IntSlider, Dropdown


In [3]:
# Load the JupySQL extension (provides the %sql / %%sql magics) and open a
# DuckDB connection. No database file is named in the URL, so this is
# presumably an in-memory database -- TODO confirm.
%load_ext sql
%sql duckdb://

In [5]:
%%sql
-- Preview three categorical columns of the dataset; the CSV file is queried directly by its path.
SELECT Geographic_Region, Loyalty_Status, Television_Region FROM organics.csv LIMIT 5

*  duckdb://
Done.


Geographic_Region,Loyalty_Status,Television_Region
Midlands,Gold,Wales & West
Midlands,Gold,Wales & West
Midlands,Silver,Wales & West
Midlands,Tin,Midlands
Midlands,Tin,Midlands


In [6]:
# Plotting helpers used by the interactive query functions in Parts I-III
def bar_chart(df, x_col, y_col, title):
    """
    Build a bqplot bar chart of ``y_col`` per ``x_col`` category, with the
    bars ordered from the largest to the smallest ``y_col`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to plot.
    x_col : str
        Name of the column holding the categories (x axis).
    y_col : str
        Name of the column holding the bar heights (y axis).
    title : str
        Title of the plot.

    Returns
    -------
    bqplot.Figure
        The figure containing the bars and both labelled axes.
    """
    # One scale per visual channel; the same objects are shared by the
    # marks and the axes so that they stay in sync.
    scales = {
        'x': OrdinalScale(),
        'y': LinearScale(),
        'color': ColorScale(scheme='viridis'),
    }
    axes = [
        Axis(scale=scales['x'], label=x_col),
        Axis(scale=scales['y'], orientation='vertical', label=y_col),
    ]

    # Pull the raw arrays out of the frame
    categories = df[x_col].values
    heights = df[y_col].values

    # Ascending argsort, reversed, gives the row order for descending heights;
    # the same order is applied to the categories so pairs stay aligned.
    descending = np.argsort(heights)[::-1]

    bars = Bars(
        x=categories[descending],
        y=heights[descending],
        scales=scales,
    )

    return Figure(marks=[bars], axes=axes, title=title)

def scatter_plot(df, x, y, title):
    """
    Build a bqplot scatter plot of column ``y`` against column ``x``.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to plot.
    x : str
        Name of the column plotted on the x axis.
    y : str
        Name of the column plotted on the y axis.
    title : str
        Title of the plot.

    Returns
    -------
    bqplot.Figure
        The figure containing the scatter marks and both labelled axes.
    """
    # Both axes are continuous; the scales are shared by the marks and axes.
    scales = {'x': LinearScale(), 'y': LinearScale()}

    points = Scatter(x=df[x], y=df[y], scales=scales, default_size=64)

    axes = [
        Axis(scale=scales['x'], label=x),
        Axis(scale=scales['y'], label=y, orientation='vertical'),
    ]

    return Figure(marks=[points], axes=axes, title=title)


def set_fig_layout(fig, width, height, min_height):
    """
    Apply size constraints to a figure's layout and return the same figure.

    Parameters
    ----------
    fig : bqplot.Figure
        The figure whose ``layout`` is updated in place.
    width : str
        Width of the plot as a CSS value (callers pass e.g. ``'auto'``).
    height : str
        Height of the plot as a CSS value (callers pass e.g. ``'400px'``).
    min_height : str
        Minimum height of the plot as a CSS value (callers pass e.g. ``'300px'``).

    Returns
    -------
    bqplot.Figure
        The ``fig`` object that was passed in.
    """
    fig.layout.width = width
    fig.layout.height = height
    # Bug fix: this value used to be written to ``layout.min_width``, so the
    # requested minimum height was never applied (and an unintended minimum
    # width was).
    fig.layout.min_height = min_height
    return fig

### Part I: Purchases by region

In [7]:
# Function to execute the SQL query and return a DataFrame
def get_organics_by_threshold(threshold):
    global organics_by_threshold_df
    query = f"""
    SELECT Television_Region, COUNT(*) as NUM_Purchases
    FROM organics.csv
    WHERE Organics_Purchase_Indicator = 1
    GROUP BY Television_Region
    HAVING COUNT(*) >= {threshold}
    """
    print("Performing query")
    # Use JupySQL magic %sql to execute the query
    result = %sql {{query}}
    # Convert the result to a pandas DataFrame
    organics_by_threshold_df = result.DataFrame()

    # Create the bar chart
    fig_bar_geo = bar_chart(organics_by_threshold_df,  'Television_Region', 'NUM_Purchases', 'Number of organic purchases made by region')
    fig_bar_geo = set_fig_layout(fig_bar_geo, 'auto', '400px', '300px')
    display(organics_by_threshold_df)
    display(fig_bar_geo)

# Slider selecting the minimum purchase count a region must reach
threshold = widgets.IntSlider(
    value=0,
    min=0,
    max=1000,
    step=100,
    description='Threshold:',
    disabled=False,
)

# interact_manual renders the slider plus a button, so the query only runs
# when the button is clicked; the trailing ';' hides the cell's return value.
interact_manual(get_organics_by_threshold, threshold=threshold);

interactive(children=(IntSlider(value=0, description='Threshold:', max=1000, step=100), Button(description='Ru…

### Part II: Consumer trends by gender and age group

In [8]:
def get_age_group_purchase(age_groups):
    global age_group_purchase_df
    query = f"""
    SELECT Gender, Age_Group, AVG(Total_Spend) as Average_Total_Spend
    FROM (
        SELECT Gender, 
            CASE 
                WHEN Age < 30 THEN 'Under 30'
                WHEN Age BETWEEN 30 AND 50 THEN '30-50'
                WHEN Age BETWEEN 51 AND 70 THEN '51-70'
                ELSE 'Over 70'
            END as Age_Group,
            Total_Spend
        FROM organics.csv
        WHERE Organics_Purchase_Indicator = 1
    ) as subquery
    WHERE Age_Group IN {age_groups}
    GROUP BY Gender, Age_Group

    """
    # Use JupySQL magic %sql to execute the query
    print("Performing query")
    result = %sql {{query}}
    # Convert the result to a pandas DataFrame
    age_group_purchase_df = result.DataFrame()

    # Create the bar chart
    fig_bar_gender = bar_chart(age_group_purchase_df, 'Gender', 'Average_Total_Spend', 'Average total expenses of Purchases by age group and gender')
    fig_bar_gender = set_fig_layout(fig_bar_gender, 'auto', '400px', '300px')
    display(age_group_purchase_df)
    display(fig_bar_gender)

# Multi-select list of the age-group labels produced by the query,
# with 'Under 30' preselected
age_groups = widgets.SelectMultiple(
    description='Age Groups:',
    options=['Under 30', '30-50', '51-70', 'Over 70'],
    value=['Under 30'],
    disabled=False,
)

# interact_manual renders the selector plus a button, so the query only runs
# when the button is clicked; the trailing ';' hides the cell's return value.
interact_manual(get_age_group_purchase, age_groups=age_groups);

interactive(children=(SelectMultiple(description='Age Groups:', index=(0,), options=('Under 30', '30-50', '51-…

### Part III: Relationship between average expenses and number of purchases by gender

In [9]:
def get_purchased_by_affluence(affluence_min, selected_gender):
    query = f"""
    SELECT Affluence_Grade, COUNT(*) as Num_Purchased, AVG(Total_Spend) as AVG_Expenses
    FROM organics.csv
    WHERE Organics_Purchase_Indicator = 1 AND Affluence_Grade >= {affluence_min} AND Gender = '{selected_gender}'
    GROUP BY Affluence_Grade
    ORDER BY Affluence_Grade
    """
    print("Performing query")
    # Use JupySQL magic %sql to execute the query
    result = %sql {{query}}
    # Convert the result to a pandas DataFrame
    aff_g_df =  result.DataFrame()

    # Create the scatter plot
    fig = scatter_plot(aff_g_df, 'AVG_Expenses', 'Num_Purchased', 'Number of Purchases vs Total Expenses by Affluence Grade and Gender')
    fig = set_fig_layout(fig, 'auto', '400px', '300px')

    display(fig)

# Slider for the lowest affluence grade to include (1 to 15)
affluence_min = IntSlider(
    value=1,
    min=1,
    max=15,
    step=1,
    description='Min Aff:',
    disabled=False,
)

# Dropdown for the gender value to filter on
gender_dropdown = Dropdown(
    description='Gender:',
    options=['M', 'F', 'U'],
    value='M',
    disabled=False,
)

# interact_manual renders both controls plus a button, so the query only runs
# when the button is clicked; the trailing ';' hides the cell's return value.
interact_manual(
    get_purchased_by_affluence,
    affluence_min=affluence_min,
    selected_gender=gender_dropdown,
);


interactive(children=(IntSlider(value=1, description='Min Aff:', max=15, min=1), Dropdown(description='Gender:…