In [1]:
import matplotlib.colors as mcolors
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [2]:
# Read the beer-recipe CSV; it is decoded as latin-1 instead of pandas' default UTF-8.
csv_path = './data/kaggle-beer-recipes/recipeData.csv'
recipes = pd.read_csv(csv_path, encoding='latin1')

# Display the column labels of the loaded frame.
recipes.columns

Index(['BeerID', 'Name', 'URL', 'Style', 'StyleID', 'Size(L)', 'OG', 'FG',
       'ABV', 'IBU', 'Color', 'BoilSize', 'BoilTime', 'BoilGravity',
       'Efficiency', 'MashThickness', 'SugarScale', 'BrewMethod', 'PitchRate',
       'PrimaryTemp', 'PrimingMethod', 'PrimingAmount', 'UserId'],
      dtype='object')

In [3]:
# Thresholds used by the clean-up below.
MIN_RECIPES_PER_STYLE = 250          # a style is kept only with MORE than this many recipes
IBU_QUANTILE_RANGE = (0.01, 0.97)    # keep IBU strictly between these per-style quantiles
IBU_CAP = 150                        # IBU values still above this are clipped down to it
ABV_UPPER_QUANTILE = 0.98            # keep ABV strictly below this per-style quantile

# unique Style
styles = recipes['Style'].unique()

# count of recipes per style
style_counts = recipes['Style'].value_counts().reset_index()
style_counts.columns = ['Style', 'Count']
# keep styles with more than MIN_RECIPES_PER_STYLE recipes
style_counts = style_counts[style_counts['Count'] > MIN_RECIPES_PER_STYLE]

print(f"Number of styles with more than {MIN_RECIPES_PER_STYLE} recipes: {len(style_counts)}")

recipes = recipes[recipes['Style'].isin(style_counts['Style'])]

# Per style, keep only recipes whose IBU lies strictly between the lower and
# upper quantile of that style. The quantiles are broadcast back onto the rows
# with transform() instead of groupby().apply(), which emits a deprecation
# warning when the applied function receives the grouping column.
# Rows with a missing IBU compare False and are therefore dropped.
ibu_by_style = recipes.groupby('Style')['IBU']
ibu_low = ibu_by_style.transform('quantile', q=IBU_QUANTILE_RANGE[0])
ibu_high = ibu_by_style.transform('quantile', q=IBU_QUANTILE_RANGE[1])
recipes = recipes.loc[(recipes['IBU'] > ibu_low) & (recipes['IBU'] < ibu_high)].copy()

# set any ibu values greater than IBU_CAP to IBU_CAP
recipes['IBU'] = recipes['IBU'].clip(upper=IBU_CAP)

# Per style, keep only recipes whose ABV is strictly below the upper quantile
# of that style (rows with a missing ABV are dropped as well).
abv_high = recipes.groupby('Style')['ABV'].transform('quantile', q=ABV_UPPER_QUANTILE)
recipes = recipes.loc[recipes['ABV'] < abv_high]

# The previous groupby().apply() implementation returned the rows grouped by
# style (in sorted style order, original order within a style); a stable sort
# keeps that row order.
recipes = recipes.sort_values('Style', kind='stable')

recipes.head()

Number of styles with more than 250 recipes: 69


  recipes = recipes.groupby('Style', group_keys=False).apply(
  recipes = recipes.groupby('Style', group_keys=False).apply(


Unnamed: 0,BeerID,Name,URL,Style,StyleID,Size(L),OG,FG,ABV,IBU,...,BoilGravity,Efficiency,MashThickness,SugarScale,BrewMethod,PitchRate,PrimaryTemp,PrimingMethod,PrimingAmount,UserId
59,60,Red Trolley Clone,/homebrew/recipe/view/15348/red-trolley-clone,American Amber Ale,4,20.82,1.06,1.016,5.71,17.56,...,1.045,75.0,,Specific Gravity,All Grain,1.25,20.0,corn sugar,5 oz,365.0
103,104,White House Honey Ale (Official Recipe),/homebrew/recipe/view/17371/white-house-honey-...,American Amber Ale,4,20.82,1.06,1.017,5.63,27.73,...,,30.0,,Specific Gravity,extract,,,Dextrose,3/4 Cup,
149,150,Jeremy Corbeeryn (Hoppy Red Ale),/homebrew/recipe/view/521119/jeremy-corbeeryn-...,American Amber Ale,4,28.0,1.055,1.012,5.82,44.62,...,1.055,75.0,,Specific Gravity,BIAB,0.75,18.0,,,52625.0
152,153,Bakke Brygg American Red Ale 25 L,/homebrew/recipe/view/90044/bakke-brygg-americ...,American Amber Ale,4,25.0,1.065,1.015,6.61,65.02,...,1.056,74.0,,Specific Gravity,All Grain,1.0,18.0,Sukkerlake,6 5 g sukker/l,18325.0
263,264,American Red Ale,/homebrew/recipe/view/24623/american-red-ale,American Amber Ale,4,22.71,1.066,1.016,6.63,49.61,...,1.047,72.0,,Specific Gravity,All Grain,,,,,455.0


In [4]:
categories = ['IPA', 'Stout', 'Porter', 'Lager', 'Wheat', 'Saison', 'Other', 'Ale', 'Pilsener', 'Brown Ale', 'Barleywine']

# Exact style names that are assigned a category directly, before any keyword matching.
style_to_category = {
    "Munich Dunkel": "Lager",
    'Saison': 'Ale',
    'Kölsch': 'Ale',
    'Märzen': 'Lager',
    'Oktoberfest/Märzen': 'Lager',
    'Belgian Tripel': 'Ale',
    'Belgian Dubbel': 'Ale',
}

# Substrings that mark a style as a wheat beer.
_WHEAT_MARKERS = ('weiss', 'hefe', 'witbier')

# Category keywords in the order they are tried: longest first, so that a
# specific keyword such as 'Brown Ale' is not shadowed by the generic 'Ale'
# (ties keep their order from `categories`). The order is captured here, at
# definition time, so rebinding `categories` later does not change the
# behaviour of categorize_style().
_CATEGORY_KEYWORDS = tuple(sorted(categories, key=len, reverse=True))


def categorize_style(style):
    """Map a beer style name to one of the broad categories.

    Resolution order:
      1. exact match in ``style_to_category``;
      2. any style containing 'pils' -> 'Pilsener';
      3. any style containing 'weiss', 'hefe' or 'witbier' -> 'Wheat';
      4. the first category keyword (longest keywords first) contained in the
         style name, compared case-insensitively;
      5. otherwise 'Other'.

    Parameters
    ----------
    style : str
        Style name. Non-string values (e.g. NaN) are categorised as 'Other'.

    Returns
    -------
    str
        The category label.
    """
    if not isinstance(style, str):
        return 'Other'

    if style in style_to_category:
        return style_to_category[style]

    lowered = style.lower()

    # 'pils' also covers 'pilsner' and 'pilsener'
    if 'pils' in lowered:
        return 'Pilsener'

    if any(marker in lowered for marker in _WHEAT_MARKERS):
        return 'Wheat'

    for category in _CATEGORY_KEYWORDS:
        if category.lower() in lowered:
            return category
    return 'Other'
# Derive the category of every recipe from its style name.
recipes['Category'] = recipes['Style'].map(categorize_style)

In [9]:
border_col = '#c2b472'
grid_col = "#e8e2c5"

# Style medians at or above these values are drawn in the darkest colour of their scale.
IBU_COLOR_MAX = 100
ABV_COLOR_MAX = 12

# get the median IBU for each category
category_medians = recipes.groupby('Category')['IBU'].median().sort_values(ascending=False)
# order categories by median IBU, highest first
# (note: this rebinds the notebook-level name `categories`)
categories = category_medians.index.tolist()

# One subplot row per category; each row's height is proportional to the
# number of distinct styles in that category.
category_style_counts = (
    recipes.groupby('Category')['Style'].nunique().reindex(categories).tolist()
)
total = sum(category_style_counts)
row_heights = [count / total for count in category_style_counts]

# IBU colour scale: pale yellow -> brownish yellow, quantised into 120 levels.
cmap = mcolors.LinearSegmentedColormap.from_list(
    "ibu_yellow_brown",
    ["#fac852", "#543b00"],
    N=120
)

# ABV colour scale: darker blue -> darker red, quantised into 120 levels.
cmap_abv = mcolors.LinearSegmentedColormap.from_list(
    "abv_blue_red",
    ["#6497b1", "#b22222"],
    N=120
)


def _median_colors(medians, colormap, cap):
    """Return one hex colour per median, scaled by ``cap`` and saturating at it."""
    return [mcolors.to_hex(colormap(min(value / cap, 1.0))) for value in medians]


def _style_box(values, style, color):
    """Build the horizontal box trace of ``values`` for one style."""
    return go.Box(
        x=values,
        y=[style] * len(values),
        name="",  # empty name instead of plotly's auto-generated trace name
        orientation='h',
        boxpoints='outliers',
        marker=dict(size=3, color=color),
        showlegend=False,
    )


# Left column (narrow): ABV. Right column (wide): IBU.
fig = make_subplots(
    rows=len(categories),
    cols=2,
    shared_xaxes=True,
    vertical_spacing=0.02,
    horizontal_spacing=0.02,
    row_heights=row_heights,
    column_widths=[0.2, 0.8]
)

# Styling shared by every axis / every x-axis of the grid.
axis_frame = dict(showline=True, linewidth=1, linecolor=border_col, mirror=True)
axis_ticks = dict(ticks='outside', tickwidth=1, tickcolor=border_col)

for i, category in enumerate(categories, start=1):
    category_data = recipes[recipes['Category'] == category]
    by_style = category_data.groupby('Style')

    # Styles of this category ordered by ascending median IBU.
    style_medians = by_style['IBU'].median().sort_values(ascending=True)
    ordered_styles = style_medians.index.tolist()
    # Median ABV of the same styles, in the same order.
    abv_medians = by_style['ABV'].median().reindex(ordered_styles)

    colors = _median_colors(style_medians.values, cmap, IBU_COLOR_MAX)
    colors_abv = _median_colors(abv_medians.values, cmap_abv, ABV_COLOR_MAX)

    for style, color, color_abv in zip(ordered_styles, colors, colors_abv):
        style_data = by_style.get_group(style)
        fig.add_trace(_style_box(style_data['IBU'], style, color), row=i, col=2)
        fig.add_trace(_style_box(style_data['ABV'], style, color_abv), row=i, col=1)

    # Pin the same style order on both columns so the ABV and IBU boxes of a
    # style always share a row; only the left column shows the style labels.
    fig.update_yaxes(
        row=i, col=2,
        showticklabels=False,
        categoryorder='array', categoryarray=ordered_styles,
        **axis_frame
    )
    fig.update_yaxes(
        row=i, col=1,
        showticklabels=True,
        categoryorder='array', categoryarray=ordered_styles,
        **axis_frame
    )

    fig.update_xaxes(row=i, col=1, **axis_frame, **axis_ticks)
    fig.update_xaxes(
        row=i, col=2,
        # only the bottom row shows the IBU tick labels
        showticklabels=(i == len(categories)),
        **axis_frame,
        **axis_ticks
    )

fig.update_layout(
    title='ABV (left) and IBU (right) by Beer Style, Grouped by Category (Colored by Style Median)',
    height=1500,
    showlegend=False,
    paper_bgcolor='#fcfbf2',  # very light yellow
    plot_bgcolor='#faf6e3',   # very light yellow
)

fig.update_yaxes(gridcolor=grid_col)
fig.update_xaxes(gridcolor=grid_col, zerolinecolor=grid_col)

# increase x axis tick font size
fig.update_xaxes(tickfont=dict(size=20))

# NOTE(review): `format`/`width`/`height` look like options of plotly's static
# image renderers; with a different active renderer they may have no effect.
# If a PNG is intended, `renderer='png'` may be what is wanted -- confirm.
fig.show(format='png', width=800, height=1500)

In [15]:
# Calculate median ABV per style and sort
median_abv = recipes.groupby('Style')['IBU'].median().sort_values(ascending=False)
ordered_styles = median_abv.index.tolist()

fig = px.box(
    recipes,
    y='Style',
    x='IBU',
    # facet_col='Category',
    # facet_col_wrap=1,
    # color='Category',
    title='IBU by Style',
    # points='all',
    orientation='h',
    category_orders={'Style': ordered_styles},
)
fig.update_traces(
    marker=dict(
        size=3,
    ),
    whiskerwidth=0
    
)
fig.update_layout(
    yaxis_title='Style',
    xaxis_title='IBU (bitterness)',
    yaxis_tickangle=0,
    # xaxis_tickformat='%',
    width=800,
    height=1200,
)
fig.show(format='png')

In [31]:
# Rank the styles by their median ABV, strongest first.
median_abv = (
    recipes
    .groupby('Style')['ABV']
    .median()
    .sort_values(ascending=False)
)
ordered_styles = median_abv.index.tolist()

# Horizontal box plot of ABV, one box per style, following the ranking above.
fig = px.box(
    data_frame=recipes,
    x='ABV',
    y='Style',
    orientation='h',
    title='ABV by Style',
    category_orders=dict(Style=ordered_styles),
)

# Axis titles, horizontal y tick labels, and a fixed figure size.
fig.update_layout(
    width=800,
    height=1200,
    xaxis_title='ABV',
    yaxis_title='Style',
    yaxis_tickangle=0,
)
fig.show(format='png')



