In [186]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
from scipy import stats
from scipy.stats import gaussian_kde


In [241]:
base_path = Path("../dataset/processed")
recipes = pd.read_csv(base_path / "recipes_normalized.csv")
malts = pd.read_csv(base_path / "malts_normalized.csv")
hops = pd.read_csv(base_path / "hops_normalized.csv")
water = pd.read_csv(base_path / "water_normalized.csv")
yeast = pd.read_csv(base_path / "yeast_normalized.csv")

In [134]:
df = recipes[recipes['style'] == 'IMPERIAL STOUT']
df.head()

Unnamed: 0,recipe_id,title,style,year,url,original_category,final_category,medal,batch_size_gal,og,...,aroma_oz_gal,dry_hop_oz_gal,Ca_ppm,Mg_ppm,Na_ppm,Cl_ppm,SO4_ppm,HCO3_ppm,SO4_Cl_ratio,medal_category
19,19,Black Is Beautiful Imperial Stout (Countdown B...,IMPERIAL STOUT,2021.0,https://www.homebrewersassociation.org/homebre...,"['Extract', 'All-Grain', 'Ale']",Beer,No Medal,3.0,1.088,...,0.0,0.0,,,,,,,,No Medal
98,98,Richard’s Brownie Batter Imperial Stout,IMPERIAL STOUT,2019.0,https://www.homebrewersassociation.org/homebre...,"['Extract', 'All-Grain', 'Ale']",Beer,NHC GOLD,3.0,1.093,...,0.0,0.0,53.0,5.0,61.0,45.0,37.0,217.0,0.822222,Gold
140,140,Great Feats of Strength Imperial Stout,IMPERIAL STOUT,2017.0,https://www.homebrewersassociation.org/homebre...,"['Extract', 'Ale']",Beer,No Medal,10.0,1.121,...,0.0,0.0,42.0,14.0,13.0,18.0,10.0,129.0,0.555556,No Medal
155,155,Bell’s Brewery Expedition Stout Clone,IMPERIAL STOUT,2002.0,https://www.homebrewersassociation.org/homebre...,"['Extract', 'Ale', 'Pro-Clone']",Beer,CLONE,5.0,1.11,...,0.05,0.0,,,,,,,,Clone
160,160,New Holland Dragon’s Milk Bourbon Barrel-Aged ...,IMPERIAL STOUT,2018.0,https://www.homebrewersassociation.org/homebre...,"['Extract', 'All-Grain', 'Ale', 'Pro-Clone']",Beer,CLONE,5.5,1.096,...,0.0,0.0,,,,,,,,Clone


In [97]:
def plot_bell_curve(data, title, BJCP_range_x0=None, BJCP_range_x1=None):

    # Remove NaN values
    data = data[~np.isnan(data)]

    # Calculate stats
    mean = np.mean(data)
    std_dev = np.std(data)
    median = np.median(data)
    skewness = stats.skew(data)

    # Fit skew normal distribution to your data
    params = stats.skewnorm.fit(data)
    a, loc, scale = params  # a = skewness parameter

    # Generate skewed bell curve
    x = np.linspace(min(data) - 0.5*std_dev, max(data) + 0.5*std_dev, 1000)
    y = stats.skewnorm.pdf(x, a, loc, scale)
    y_max = max(y)
    
    # Create figure
    fig = go.Figure()
    
    # Add mean line
    fig.add_shape(
        type="line",
        x0=mean, x1=mean,
        y0=0, y1=y_max*1.08,
        line=dict(color="red", width=2, dash="solid"),
        layer="below"
    )

    # Add median line
    fig.add_shape(
        type="line",
        x0=median, x1=median,
        y0=0, y1=y_max*1.08,  # 50% of max height
        line=dict(color="green", width=2, dash="dash"),
        layer="below"
    )

    # Add std dev lines
    fig.add_shape(
        type="line",
        x0=mean - std_dev, x1=mean - std_dev,
        y0=0, y1=y_max*1.08,  # 50% of max height
        line=dict(color="orange", width=1.5, dash="dot"),
        layer="below"
    )
    fig.add_shape(
        type="line",
        x0=mean + std_dev, x1=mean + std_dev,
        y0=0, y1=y_max*1.08,  # 50% of max height
        line=dict(color="orange", width=1.5, dash="dot"),
        layer="below"
    )

    # Add bell curve (on top)
    fig.add_trace(go.Scatter(
        x=x, y=y,
        mode='lines',
        line=dict(color='#4a7cb5', width=2.5),
        fill='tozeroy',
        fillcolor='rgba(74, 124, 181, 0.1)'
    ))

    # Add data points (on x-axis)
    fig.add_trace(go.Scatter(
        x=data,
        y=[0] * len(data),
        mode='markers',
        marker=dict(color='darkblue', size=10, symbol='circle'),
        name='Data Points'
    ))

    # Add annotations
    fig.add_annotation(x=mean, y=y_max*1.15, text=f"Mean<br>{mean:.3f}", 
                    showarrow=False, font=dict(color="red", size=10))

    fig.add_annotation(x=median, y=y_max*1.15, text=f"Median<br>{median:.3f}", 
                    showarrow=False, font=dict(color="green", size=10))

    fig.add_annotation(x=mean-std_dev, y=y_max*1.15, text=f"-1σ<br>{mean-std_dev:.3f}", 
                    showarrow=False, font=dict(color="orange", size=10))

    fig.add_annotation(x=mean+std_dev, y=y_max*1.15, text=f"+1σ<br>{mean+std_dev:.3f}", 
                    showarrow=False, font=dict(color="orange", size=10))


    if BJCP_range_x0 and BJCP_range_x1:
        # Add shaded BJCP range
        fig.add_vrect(
            x0=BJCP_range_x0, x1=BJCP_range_x1,
            fillcolor= 'whitesmoke', # "rgba(100, 150, 255, 0.3)",
            line_width=0,
            layer="below"
        )

        # BJCP range annotation
        fig.add_annotation(x=(BJCP_range_x0 + BJCP_range_x1)/2, y=y_max*1.35, text="BJCP<br>RANGE", 
                        showarrow=False, font=dict(size=12))

    fig.update_layout(
        title=title,
        xaxis=dict(
            # tickmode='linear',
            # tick0=1.032,
            # dtick=0.002,
            tickangle=-90,
            tickformat='.3f',
            showgrid=False,
            # gridcolor='lightgray'
        ),
        yaxis=dict(
            showticklabels=False,
            showgrid=False,
            # gridcolor='lightgray'
        ),
        template='plotly_white',
        showlegend=False
    )

    fig.show()

In [98]:
plot_bell_curve(df['og'], 'Original Gravity', 1.090, 1.11)

In [313]:
def plot_scatter(data, x_col, y_col, title):
    data = data.dropna(subset=[x_col, y_col])

    pearson_r, pearson_p = stats.pearsonr(data[x_col], data[y_col])

    fig = px.scatter(
        data,
        x=x_col,
        y=y_col,
        trendline="ols",
        title=title, 
        labels={
            x_col: "Year",
            y_col: "Original Gravity (OG)"
        }
    )

    # Points
    fig.update_traces(
        selector=dict(mode="markers"),
        marker=dict(
            color="#4a7cb5",
            size=8,
            opacity=0.8
        )
    )

    # Trendline
    fig.update_traces(
        selector=dict(mode="lines"),
        line=dict(
            color="darkgray",
            dash="dash",
            width=3
        )
    )

    # Annotation
    fig.add_annotation(
        x=0.05,
        y=0.95,
        xref="paper",
        yref="paper",
        text=(
            f"Pearson r = {pearson_r:.3f}<br>"
            f"p-value = {pearson_p:.4f}"
        ),
        showarrow=False,
        font=dict(color="black", size=13),
        align="left"
    )

    # Axes + layout
    # fig.update_xaxes(tickformat="d", showgrid=False)
    # fig.update_yaxes(showgrid=False)
    fig.update_layout(template="plotly_white")

    fig.show()

    print(f"Pearson's r: {pearson_r:.4f}")
    print(f"P-value: {pearson_p:.4f}")
    print(f"R-squared: {pearson_r**2:.4f}")
    print(f"Trend: {'Increasing' if pearson_r > 0 else 'Decreasing'} over time")


In [132]:
plot_scatter(df, 'year', 'og')

Pearson's r: -0.2026
P-value: 0.2918
R-squared: 0.0411
Trend: Decreasing over time


In [178]:
def plot_donut(data, names_col, values_col, title=None):
    fig = px.pie(
        data,
        names=names_col,
        values=values_col,
        hole=0.5
    )

    fig.update_traces(
        textinfo="percent",
        textposition="inside",
        hovertemplate="%{label}: %{value:.2f}<extra></extra>",
        marker=dict(line=dict(color="white", width=2))
    )

    fig.update_layout(
        title=title,
        template="plotly_white",
        legend_title_text=names_col
    )
    fig.show()

create a mapping handbook 
pale malt, base malt
vienna malt, crytal malt

In [200]:
df_malt = malts.merge(
df[['recipe_id']],
    on='recipe_id',
    how='inner'
)

In [201]:
# one recipe can have multiple base malt, special malt => so we have to sum it first
df_sum_malt_type = df_malt.groupby(['recipe_id', 'malt_type'], as_index=False)['pct_of_grist'].sum()
df_sum_malt_type.head()

Unnamed: 0,recipe_id,malt_type,pct_of_grist
0,19,base,95.238095
1,19,crystal,4.761905
2,98,adjunct,6.261343
3,98,base,54.446461
4,98,crystal,26.76951


In [None]:
avg_malt_pct = df_sum_malt_type.groupby('malt_type', as_index=False)['pct_of_grist'].mean()
avg_malt_pct['pct'] = (
    avg_malt_pct['pct_of_grist']
    / avg_malt_pct['pct_of_grist'].sum()
    * 100
)
avg_malt_pct

Unnamed: 0,malt_type,pct_of_grist,pct
0,adjunct,5.690118,5.018076
1,base,66.963088,59.054289
2,crystal,7.488093,6.603698
3,roast,10.832662,9.55325
4,specialty,18.202022,16.052239
5,wheat,4.216437,3.718447


In [181]:
plot_donut(data=avg_malt_pct, names_col='malt_type', values_col='pct_of_grist', title='Malt Types')

In [419]:
def plot_malt_kde(
    df,
    value_col="pct_of_grist",
    type_col="malt_type",
    min_points=10
):
    fig = go.Figure()

    x_grid = np.linspace(0, 100, 1000)

    for malt, grp in df.groupby(type_col):
        values = grp[value_col].values

        if len(values) < min_points:
            continue

        kde = gaussian_kde(values)
        y = kde(x_grid)

        # normalize so peak = 100%
        y = y / y.max() * 100

        fig.add_trace(
            go.Scatter(
                x=x_grid,
                y=y,
                mode="lines",
                name=f"%{malt}",
                hovertemplate=(
                    f"{malt}<br>"
                    "% of grist: %{x:.1f}<br>"
                    "% of recipes: %{y:.1f}"
                    "<extra></extra>"
                )
            )
        )

    fig.update_layout(
        title="Distribution of Malt Types if Used",
        xaxis_title="% of Grist",
        yaxis_title="% of Recipes that Used This Malt",
        template="plotly_white",
        legend_title_text="Malt Type",
        hovermode="x unified"
    )

    fig.update_xaxes(range=[0, 100])
    fig.update_yaxes(range=[0, 100])

    fig.show()


# KDE & Trend
- Base Malt & Sub
- Crytal
- Roast
- Adjunct

In [None]:
plot_malt_kde(df_sum_malt_type, value_col='pct_of_grist', type_col='malt_type', title="Distribution of Malt Types if Used")

In [235]:
# brands = ['weyermann', 'simpsons', 'briess', 'dingemans', 'bestmalz', 'rahr']

# clean_names = malts['malt_name'].str.lower()

# for b in brands:
#     clean_names = clean_names.str.replace(b, '', regex=False)

# clean_names = clean_names.str.strip()
# clean_names

In [318]:

per_recipe = (
    hops
    .groupby(['recipe_id', 'style_group', 'hop_type', 'hop_name_normalized'], as_index=False)['oz_per_gal']
    .sum()
)

per_recipe["pct"] = (
    per_recipe
    .groupby(['recipe_id', 'style_group', 'hop_type'])['oz_per_gal']
    .transform(lambda x: x / x.sum() * 100)
)

avg_hops = (
    per_recipe
    .groupby(['recipe_id', 'style_group', 'hop_name_normalized'], as_index=False)["pct"]
    .mean()
)

In [319]:
avg_hops

Unnamed: 0,recipe_id,style_group,hop_name_normalized,pct
0,0,STRONG_ALE,Cascade,31.182796
1,0,STRONG_ALE,Centennial,31.182796
2,0,STRONG_ALE,Chinook,37.634409
3,1,BLONDE_PALE,Summit,100.000000
4,1,BLONDE_PALE,Willamette,100.000000
...,...,...,...,...
2811,1392,IPA,Columbus whole,41.666667
2812,1392,IPA,Simcoe/Amarillo/Cascade whole,33.333333
2813,1392,IPA,Simcoe/Cascade whole,33.333333
2814,1393,UNKNOWN,Hallertauer,100.000000


In [268]:
avg_hops_ipa = avg_hops[avg_hops['style_group'] == 'IPA'].copy()
avg_hops_ipa_bitter = avg_hops_ipa[avg_hops_ipa['hop_type'] == 'bittering'].copy()
plot_donut(data=avg_hops_ipa_bitter, names_col='hop_name_normalized', values_col='pct', title='Bittering Hops Used')

In [270]:
def plot_hop_rate_kde(
    df,
    rate_col="oz_per_gal",
    stage_col="hop_type",
    stages=("flavour", "aroma"),
    bandwidth=None
):
    fig = go.Figure()

    x_grid = np.linspace(
        df[rate_col].min(),
        df[rate_col].max(),
        500
    )

    for stage in stages:
        values = df.loc[
            df[stage_col] == stage,
            rate_col
        ].dropna().values

        if len(values) < 10:
            continue

        kde = gaussian_kde(values, bw_method=bandwidth)
        y = kde(x_grid)

        # normalize to percent of recipes
        y = y / y.max() * 100

        fig.add_trace(
            go.Scatter(
                x=x_grid,
                y=y,
                mode="lines",
                name=stage.capitalize(),
                hovertemplate=(
                    f"{stage.capitalize()}<br>"
                    "Hop rate: %{x:.3f} oz/gal<br>"
                    "Recipes: %{y:.1f}%"
                    "<extra></extra>"
                )
            )
        )

    fig.update_layout(
        title="Hop Rate of Additions",
        xaxis_title="Hop Rate (oz / gallon)",
        yaxis_title="Percent of Recipes Using This Addition",
        template="plotly_white",
        hovermode="x unified"
    )

    fig.update_yaxes(range=[0, 100])
    fig.show()


In [None]:
hops_ipa = hops[hops['style'] == 'IPA'].copy()
plot_hop_rate_kde(
    hops_ipa,
    stages=('bittering', 'flavour', 'aroma')
)

In [275]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from scipy.stats import pearsonr
from statsmodels.nonparametric.smoothers_lowess import lowess

In [329]:

used = (
    hops.groupby(["year", "recipe_id", "hop_type"])
    .size()
    .reset_index(name="n")
)

total_recipes = (
    hops.groupby("year")["recipe_id"]
    .nunique()
    .rename("total")
)

stage_counts = (
    used.groupby(["year", "hop_type"])["recipe_id"]
    .nunique()
    .rename("count")
    .reset_index()
)

stage_counts = stage_counts.merge(total_recipes, on="year")

stage_counts["pct"] = (
    stage_counts["count"] / stage_counts["total"] * 100
)

In [330]:
yearly_df = stage_counts.copy()
yearly_df.head()

Unnamed: 0,year,hop_type,count,total,pct
0,1974.0,aroma,1,1,100.0
1,1974.0,bittering,1,1,100.0
2,1977.0,aroma,1,1,100.0
3,1977.0,bittering,1,1,100.0
4,1977.0,flavour,1,1,100.0


In [331]:
import numpy as np
import plotly.graph_objects as go
from scipy.stats import pearsonr


In [332]:
def plot_late_hop_usage(yearly_df):
    fig = go.Figure()

    colors = {
        "flavour": "#4a7cb5",
        "aroma": "#d07c2c"
    }

    for stage in ["flavour", "aroma"]:
        d = yearly_df[yearly_df["hop_type"] == stage].sort_values("year")

        x = d["year"].values
        y = d["pct"].values

        # Scatter points
        fig.add_trace(
            go.Scatter(
                x=x,
                y=y,
                mode="markers",
                name=f"%{stage.capitalize()}",
                marker=dict(size=9, color=colors[stage])
            )
        )

        # Linear trend
        m, b = np.polyfit(x, y, 1)
        x_fit = np.array([x.min(), x.max()])
        y_fit = m * x_fit + b

        fig.add_trace(
            go.Scatter(
                x=x_fit,
                y=y_fit,
                mode="lines",
                name=f"Linear (%{stage.capitalize()})",
                line=dict(color=colors[stage], dash="dot", width=2)
            )
        )

        # Pearson r
        r, _ = pearsonr(x, y)

        fig.add_annotation(
            x=x_fit.mean(),
            y=y_fit.mean(),
            text=f"Pearson r = {r:.2f}",
            showarrow=False,
            font=dict(size=12, color="black")
        )

    fig.update_layout(
        title="Usage of Late Hops",
        xaxis_title="Year",
        yaxis_title="% of Recipes that use",
        template="plotly_white",
        hovermode="x unified"
    )

    fig.update_yaxes(range=[0, 100])
    fig.show()


In [None]:
plot_late_hop_usage(yearly_df[yearly_df["hop_type"].isin(["flavour", "aroma"])])

In [334]:
def loess_ci(x, y, frac=0.4, n_boot=500, grid_size=200):
    x = np.asarray(x)
    y = np.asarray(y)

    x_grid = np.linspace(x.min(), x.max(), grid_size)

    # LOESS on original data
    loess_fit = lowess(y, x, frac=frac, return_sorted=True)
    y_loess = np.interp(x_grid, loess_fit[:, 0], loess_fit[:, 1])

    # Bootstrap
    boot_curves = []

    rng = np.random.default_rng(42)
    for _ in range(n_boot):
        idx = rng.integers(0, len(x), len(x))
        xb = x[idx]
        yb = y[idx]

        fit = lowess(yb, xb, frac=frac, return_sorted=True)
        yb_interp = np.interp(x_grid, fit[:, 0], fit[:, 1])
        boot_curves.append(yb_interp)

    boot_curves = np.vstack(boot_curves)

    lower = np.percentile(boot_curves, 2.5, axis=0)
    upper = np.percentile(boot_curves, 97.5, axis=0)

    return x_grid, y_loess, lower, upper


def plot_late_hop_usage_with_loess_ci(yearly_df):
    fig = go.Figure()

    colors = {
        "flavour": "#4a7cb5",
        "aroma": "#d07c2c"
    }

    for stage in ["flavour", "aroma"]:
        d = yearly_df[yearly_df["hop_type"] == stage].sort_values("year")

        x = d["year"].values
        y = d["pct"].values

        # --------------------
        # Scatter
        # --------------------
        fig.add_trace(
            go.Scatter(
                x=x,
                y=y,
                mode="markers",
                name=f"%{stage.capitalize()}",
                marker=dict(size=9, color=colors[stage])
            )
        )

        # --------------------
        # Linear trend + Pearson
        # --------------------
        m, b = np.polyfit(x, y, 1)
        x_fit = np.array([x.min(), x.max()])
        y_fit = m * x_fit + b

        fig.add_trace(
            go.Scatter(
                x=x_fit,
                y=y_fit,
                mode="lines",
                name=f"Linear (%{stage.capitalize()})",
                line=dict(color=colors[stage], dash="dot", width=2)
            )
        )

        r, _ = pearsonr(x, y)

        fig.add_annotation(
            x=x_fit.mean(),
            y=y_fit.mean(),
            text=f"Pearson r = {r:.2f}",
            showarrow=False,
            font=dict(size=12, color="black")
        )

        # --------------------
        # LOESS + CI
        # --------------------
        xg, y_loess, lo, hi = loess_ci(x, y, frac=0.4)

        # CI band
        fig.add_trace(
            go.Scatter(
                x=np.concatenate([xg, xg[::-1]]),
                y=np.concatenate([hi, lo[::-1]]),
                fill="toself",
                fillcolor=colors[stage].replace(")", ",0.15)").replace("rgb", "rgba")
                if "rgb" in colors[stage]
                else "rgba(0,0,0,0.12)",
                line=dict(width=0),
                hoverinfo="skip",
                showlegend=False
            )
        )

        # LOESS line
        fig.add_trace(
            go.Scatter(
                x=xg,
                y=y_loess,
                mode="lines",
                name=f"LOESS (%{stage.capitalize()})",
                line=dict(color=colors[stage], width=3)
            )
        )

    fig.update_layout(
        title="Usage of Late Hops",
        xaxis_title="Year",
        yaxis_title="% of Recipes that use",
        template="plotly_white",
        hovermode="x unified"
    )

    fig.update_yaxes(range=[0, 100])
    fig.show()


In [None]:
plot_late_hop_usage_with_loess_ci(yearly_df[yearly_df["hop_type"].isin(["flavour", "aroma"])])

In [None]:
def yearly_hop_rate(df):
    aroma = df[df["hop_type"] == "aroma"] #bitter, flavour

    # total aroma hops per recipe per year
    per_recipe = (
        aroma
        .groupby(["year", "recipe_id"], as_index=False)["oz_per_gal"]
        .sum()
    )

    # average across recipes
    yearly = (
        per_recipe
        .groupby("year", as_index=False)["oz_per_gal"]
        .mean()
    )

    return yearly


In [343]:
yearly_aroma = yearly_hop_rate(hops)
yearly_aroma.head()

Unnamed: 0,year,oz_per_gal
0,1974.0,0.25
1,1977.0,1.166667
2,1992.0,0.4
3,1995.0,0.320833
4,1997.0,0.166667


In [342]:
plot_scatter(yearly_aroma, 'year', 'oz_per_gal', title='Aroma Hop Rate (oz / gallon)')

Pearson's r: 0.1686
P-value: 0.3564
R-squared: 0.0284
Trend: Increasing over time


In [358]:
ipa_recipe = recipes[recipes['style'] == 'IPA'].copy()
mash_counts = ipa_recipe["mash_type"].value_counts().reset_index()

plot_donut(
    data=mash_counts,
    names_col="mash_type",
    values_col="count",
    title="Mash Type Distribution"
)


In [363]:
plot_bell_curve(ipa_recipe['boil_time_min'], 'test')

In [364]:
plot_scatter(ipa_recipe, 'year', 'boil_time_min', title='Boil Duration')    

Pearson's r: -0.5342
P-value: 0.0087
R-squared: 0.2853
Trend: Decreasing over time


In [None]:
import json

ipa_recipe["yeast_parsed"] = ipa_recipe["yeast_json"].apply(
    lambda x: json.loads(x) if pd.notna(x) else {}
)
def yeast_label(y):
    brand = y.get("brand")
    code = y.get("product_code")

    if brand and code:
        return f"{brand} {code}"
    else:
        return "Other"
    
ipa_recipe["yeast_brand"] = [y.get("brand") for y in ipa_recipe["yeast_parsed"]]
ipa_recipe["yeast_code"] = [y.get("product_code") for y in ipa_recipe["yeast_parsed"]]

In [402]:
yeast_counts = ipa_recipe["yeast_label"].value_counts().reset_index()

In [403]:
plot_donut(
    data=yeast_counts,
    names_col="yeast_label",
    values_col="count",
    title="Yeast Usage (IPA Recipes)"
)


In [404]:
recipes.columns

Index(['recipe_id', 'title', 'style', 'year', 'url', 'original_category',
       'final_category', 'medal', 'batch_size_gal', 'og', 'fg', 'abv_pct',
       'ibu', 'srm', 'efficiency_pct', 'malts_json', 'hops_json', 'yeast_json',
       'adjuncts_json', 'num_malts', 'num_hops', 'num_adjuncts', 'mash_type',
       'mash_steps_json', 'num_mash_steps', 'boil_time_min',
       'fermentation_stages_json', 'num_fermentation_stages',
       'water_description', 'water_Ca_ppm', 'water_Mg_ppm', 'water_Na_ppm',
       'water_Cl_ppm', 'water_SO4_ppm', 'water_HCO3_ppm',
       'water_salt_additions_json', 'water_volume_gal', 'extract_version',
       'style_group', 'parent_style', 'grist_composition', 'base_malt_pct',
       'crystal_pct', 'roast_pct', 'adjunct_pct', 'hop_schedule',
       'bittering_oz_gal', 'flavor_oz_gal', 'aroma_oz_gal', 'dry_hop_oz_gal',
       'Ca_ppm', 'Mg_ppm', 'Na_ppm', 'Cl_ppm', 'SO4_ppm', 'HCO3_ppm',
       'SO4_Cl_ratio', 'medal_category'],
      dtype='object')

In [410]:
ions = [
    "Ca_ppm", "Mg_ppm", "Na_ppm",
    "Cl_ppm", "SO4_ppm", "HCO3_ppm"
]
water_long = (ipa_recipe[ions]
              .dropna(how="all")
              .melt(var_name="ion", value_name="ppm")
              .dropna(subset=["ppm"]) 
)
water_long

Unnamed: 0,ion,ppm
0,Ca_ppm,100.0
1,Ca_ppm,100.0
2,Ca_ppm,274.5
3,Ca_ppm,165.0
5,Mg_ppm,18.0
7,Mg_ppm,12.0
9,Na_ppm,16.0
10,Na_ppm,195.0
11,Na_ppm,17.0
12,Cl_ppm,125.0


In [418]:
ipa_recipe[ions].dropna(how="all")

Unnamed: 0,Ca_ppm,Mg_ppm,Na_ppm,Cl_ppm,SO4_ppm,HCO3_ppm
105,100.0,,,125.0,150.0,23.0
428,100.0,18.0,16.0,150.0,50.0,
782,274.5,,195.0,315.0,643.5,
939,165.0,12.0,17.0,300.0,55.0,


In [417]:
fig = px.box(water_long, y="ppm", x="ion", color='ion', points="all",
          hover_data=water_long.columns)
fig.show()

In [431]:
import numpy as np
import pandas as pd

mash_steps = pd.read_csv('/Users/famepatcharapol/Desktop/Learning/craft_beer_analysis/dataset/processed/mash_steps_normalized.csv')

mash_steps_ipa = mash_steps[mash_steps['recipe_id'].isin(ipa_recipe['recipe_id'])]

# normalize step names
mash_steps_ipa["step_name_clean"] = (
    mash_steps_ipa["step_name"]
    .str.lower()
    .str.strip()
    .replace({
        "mash": "mash",
        "": "unknown",
        np.nan: "unknown",
        "decoction rests": "decoction"
    })
)

# keep only rows with temperature
mash_steps_ipa = mash_steps_ipa.dropna(subset=["temp_F"])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [433]:
mash_steps_ipa

Unnamed: 0,recipe_id,recipe_title,style,style_group,year,step_number,step_name,temp_F,time_min,step_name_clean
50,55,Tuxedo Speedo Black IPA,IPA,IPA,2013.0,1,mash,152.0,60,mash
82,80,Wry Smile Rye IPA,IPA,IPA,2019.0,1,Mash at 153\u00b0F (67\u00b0C),153.0,60,mash at 153\u00b0f (67\u00b0c)
115,104,Three Floyds Brewing Zombie Dust IPA Clone,IPA,IPA,2019.0,1,Mash,155.0,60,mash
116,105,Cigar City Jai Alai IPA Classic Clone,IPA,IPA,2019.0,1,Mash,150.0,60,mash
152,130,A2 Brew Tay,IPA,IPA,2019.0,1,Mash Rest,152.0,45,mash rest
222,177,Stew’s Brew Imperial Black IPA,IPA,IPA,2018.0,1,Mash,150.0,60,mash
234,184,TukTukTea IPA,IPA,IPA,2018.0,1,mash,153.0,60,mash
238,189,American-Style Cascadian India Dark Double Bla...,IPA,IPA,2018.0,1,primary mash,150.0,60,primary mash
239,189,American-Style Cascadian India Dark Double Bla...,IPA,IPA,2018.0,2,mash out,168.0,10,mash out
299,230,Smoked Habanero IPA,IPA,IPA,2017.0,1,Mash,152.0,60,mash


In [441]:
import plotly.express as px

fig = px.violin(
    mash_steps_ipa,
    x="step_name_clean",
    y="temp_F",
    box=True,          # median + IQR
    points="outliers"  # or "all"
)

fig.update_layout(
    title="Mash Step Temperature Distribution",
    xaxis_title="Mash Step",
    yaxis_title="Temperature (°F)",
    template="plotly_white",
    showlegend=False
)

fig.show()


In [443]:
import pandas as pd
import numpy as np

ferm_steps = pd.read_csv('/Users/famepatcharapol/Desktop/Learning/craft_beer_analysis/dataset/processed/fermentation_stages_normalized.csv')

df = ferm_steps.copy()

df["stage_clean"] = (
    df["stage"]
    .str.lower()
    .str.strip()
    .replace({
        "primary": "primary",
        "secondary": "secondary",
        "dry hop": "dry hop",
        "dry hopping": "dry hop",
        "diacetyl rest": "diacetyl rest",
        "cold crash": "cold crash",
        "lagering": "lagering"
    })
)

# midpoint temperature
df["temp_F"] = df[["start_temp_F", "end_temp_F"]].mean(axis=1)

df = df.dropna(subset=["temp_F"])


In [444]:
import plotly.express as px

fig = px.violin(
    df,
    x="stage_clean",
    y="temp_F",
    box=True,           # show median & IQR
    points="outliers"   # or "all"
)

fig.update_layout(
    title="Fermentation Temperature Distribution by Stage",
    xaxis_title="Fermentation Stage",
    yaxis_title="Temperature (°F)",
    template="plotly_white",
    showlegend=False
)

fig.show()
