# Import and Functions

In [None]:
from dataclasses import dataclass
import typing
from pathlib import Path

import pandas as pd
import numpy as np

import sklearn as sk
from sklearn.manifold import TSNE
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from matplotlib import pyplot as plt
import seaborn as sns

import bokeh, bokeh.io

bokeh.io.output_notebook()


In [None]:
def feature_summary(data):
    col_names = list(data.columns)
    dtypes = [data[col].dtype.name for col in data]
    types = [type(data[col].iloc[0]).__name__ for col in data]
    nb_unique_val = [
        len(data[col].unique())
        if isinstance(data[col].iloc[0], typing.Hashable)
        else np.nan
        for col in data
    ]
    ratio_null = [data[col].isnull().sum() / data[col].shape[0] * 100 for col in data]

    table = pd.DataFrame(
        {
            "columns": col_names,
            "DType": dtypes,
            "Type": types,
            "Nb unique values": nb_unique_val,
            "% null": ratio_null,
        }
    ).set_index("columns")
    table = table.style.background_gradient(
        axis=None, vmin=0, vmax=np.max(ratio_null), cmap="Reds", subset=["% null"]
    )
    return table


def nutriscore_grade(score):
    """Map numeric nutrition scores to letter grades "A".."E".

    The inclusive upper bounds used are -1 (A), 4 (B), 12 (C) and 17 (D);
    every score above 17 is graded "E". Missing scores (NaN) stay NaN.

    NOTE(review): these cut-offs are kept from the original implementation;
    confirm that they match the Nutri-Score thresholds intended for this data.

    Parameters
    ----------
    score : array-like of float
        Nutrition scores (list, ndarray or pandas Series), possibly with NaN.

    Returns
    -------
    np.ndarray
        Object array with the same shape as ``score`` holding the grade
        letters, or NaN where the score is missing.
    """
    values = np.asarray(score, dtype=float)
    upper_bounds = np.array([-1, 4, 12, 17])
    letters = np.array(list("ABCDE"), dtype=object)

    grade = np.full(values.shape, np.nan, dtype=object)
    has_score = ~np.isnan(values)
    # side="left" returns the first bound >= score, so a score equal to a
    # bound keeps the better grade; scores above every bound get index 4 ("E")
    # instead of running past the end of the letter table.
    grade[has_score] = letters[
        np.searchsorted(upper_bounds, values[has_score], side="left")
    ]
    return grade


def flatten(arr):
    """Lazily yield the leaves of arbitrarily nested lists, tuples and ndarrays.

    Items are produced depth-first, in their original order. Anything that is
    not a list, tuple or ``np.ndarray`` (strings included) is yielded as-is.
    """
    nested_types = (list, tuple, np.ndarray)
    # Stack of iterators: the top one is the container currently being walked.
    pending = [iter(arr)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, nested_types):
                # Descend into the nested container; the parent iterator keeps
                # its position and is resumed once the child is exhausted.
                pending.append(iter(item))
                break
            yield item
        else:
            pending.pop()


# Configuration des figures

In [None]:
import matplotlib

# Start from matplotlib's default rcParams before applying the overrides.
matplotlib.rcdefaults()

# Large bold axis labels and extra-large bold titles for every figure.
axes_style = {
    "labelsize": "large",
    "labelweight": "bold",
    "titlesize": "xx-large",
    "titleweight": "bold",
}
matplotlib.rc("axes", **axes_style)


# Load Data

In [None]:
@dataclass
class GlobalState:
    """Container for the state shared by the cells of this notebook."""

    # Product table loaded from disk; later cells add derived columns to it.
    data: pd.DataFrame


# NOTE: unpickling can execute arbitrary code; only load a file you produced yourself.
g = GlobalState(data=pd.read_pickle("clean_data.pkl"))


# Fixed seed so the preview shows the same rows on every run.
g.data.sample(10, random_state=0).T


# Nutriscore

In [None]:
# One colour per grade, ordered A -> E, followed by grey for missing grades.
grade_to_color = {
    "A": "#0a8e45",
    "B": "#7ac547",
    "C": "#ffc734",
    "D": "#ff7d24",
    "E": "#ff421a",
    "NaN": "#7b7c80",
}
nutriscore_colors = list(grade_to_color.values())
palette_nutriscore = sns.color_palette(nutriscore_colors)
palette_nutriscore


In [None]:
# Show the category labels of the grade column.
g.data["nutrition_grade_fr"].cat.categories


In [None]:
# Variant of the grade column where missing grades become an explicit "NaN"
# category, so that they show up as their own group in plots.
grade_with_missing_label = g.data["nutrition_grade_fr"].cat.add_categories("NaN")
g.data["nutrition_grade_fr_"] = grade_with_missing_label.fillna("NaN")


In [None]:
# Integer category codes of the grade column. pandas encodes a missing
# category as -1, so rows without a grade get the code -1 here.
g.data["nutrition_grade_fr__"] = g.data["nutrition_grade_fr"].cat.codes


Regardons la proportion de produits selon la catégorie du Nutri-Score.

In [None]:
# Donut chart of the number of products per grade, each wedge labelled with
# the grade and its share of the graded products.
grade_counts = g.data["nutrition_grade_fr"].value_counts().sort_index()
grade_shares = g.data["nutrition_grade_fr"].value_counts(normalize=True).sort_index()
wedge_labels = [f"{grade}\n{share:.1%}" for grade, share in grade_shares.items()]
label_style = {
    "color": "w",
    "fontsize": 11,
    "fontweight": "bold",
    "horizontalalignment": "center",
}

grade_counts.plot.pie(
    colors=palette_nutriscore,
    labeldistance=0.8,
    labels=wedge_labels,
    textprops=label_style,
    wedgeprops={"width": 0.4},
)
_ = plt.ylabel("")


Les catégories sont relativement équilibrées : il n'y a pas de risque de sous-représentation d'une classe si l'on veut faire de la prédiction avec un algorithme de machine learning.

In [None]:
# Same donut chart as for the raw grade, but on the column where missing
# grades are an explicit "NaN" category.
grade_counts = g.data["nutrition_grade_fr_"].value_counts().sort_index()
grade_shares = g.data["nutrition_grade_fr_"].value_counts(normalize=True).sort_index()
wedge_labels = [f"{grade}\n{share:.1%}" for grade, share in grade_shares.items()]
label_style = {
    "color": "w",
    "fontsize": 11,
    "fontweight": "bold",
    "horizontalalignment": "center",
}

grade_counts.plot.pie(
    colors=palette_nutriscore,
    labeldistance=0.8,
    labels=wedge_labels,
    textprops=label_style,
    wedgeprops={"width": 0.4},
)
_ = plt.ylabel("")


In [None]:
# NOTE(review): this cell repeats the previous donut chart verbatim; consider
# deleting one of the two.
g.data["nutrition_grade_fr_"].value_counts().sort_index().plot.pie(
    colors=palette_nutriscore,
    labeldistance=0.8,
    labels=[
        f"{k}\n{v:.1%}"
        for k, v in g.data["nutrition_grade_fr_"]
        .value_counts(normalize=True)
        .sort_index()
        .items()
    ],
    textprops=dict(
        color="w", fontsize=11, fontweight="bold", horizontalalignment="center"
    ),
    wedgeprops=dict(width=0.4),
)
# Assign the result so the cell does not echo the Text object repr.
_ = plt.ylabel("")


In [None]:
# Stacked histogram of the nutrition score, coloured by grade. The bin edges
# sit on half-integers so every integer score falls in its own bin.
score_bins = np.linspace(-15.5, 35.5, 52)
stacked_hist_options = dict(
    stat="count",
    multiple="stack",
    kde=False,
    element="bars",
    legend=True,
)
_ = sns.histplot(
    data=g.data,
    x="nutrition-score-fr_100g",
    hue="nutrition_grade_fr",
    palette=palette_nutriscore[:-1],  # drop the grey reserved for missing grades
    bins=score_bins,
    **stacked_hist_options,
)


In [None]:
# Same histogram restricted to products whose PNNS group 1 is known and is
# neither "beverages" nor "unknown".
pnns_group = g.data["pnns_groups_1"]
mask = pnns_group.notna() & ~pnns_group.isin(["beverages", "unknown"])

score_bins = np.linspace(-15.5, 35.5, 52)
_ = sns.histplot(
    data=g.data[mask],
    x="nutrition-score-fr_100g",
    hue="nutrition_grade_fr",
    palette=palette_nutriscore[:-1],
    bins=score_bins,
    stat="count",
    multiple="stack",
    element="bars",
    kde=False,
    legend=True,
)


# Nutriments vs Nutriscore

In [None]:
# Names of the per-100 g nutrient columns analysed in this section.
nutriment_names = (
    "energy",
    "fat",
    "saturated-fat",
    "carbohydrates",
    "sugars",
    "fiber",
    "proteins",
    "sodium",
)
nutriments = [f"{name}_100g" for name in nutriment_names]

# One histogram per nutrient, with unit-wide bins centred on 0..100.
unit_bins = np.linspace(-0.5, 100.5, 102)
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(25, 10))
for ax, feature in zip(flatten(axes), nutriments):
    sns.histplot(
        data=g.data,
        x=feature,
        bins=unit_bins,
        stat="count",
        multiple="stack",
        element="bars",
        kde=False,
        legend=True,
        ax=ax,
    )


In [None]:
# For every nutrient: overall histogram (left axis) overlaid with one density
# curve per grade (right axis).
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(30, 16))
for feature, ax in zip(nutriments, flatten(axes)):
    is_energy = "energy" in feature
    # Energy gets fixed limits; the other nutrients span their observed range.
    max_x = 4000 if is_energy else g.data[feature].max()
    bins = (
        np.linspace(-0.5, 4000.5, 82) if is_energy else np.linspace(-0.5, max_x, 101)
    )
    max_hist_y = 2500 if is_energy else 8000

    ax_hist = sns.histplot(data=g.data, x=feature, color="lightblue", bins=bins, ax=ax)
    ax_kde = sns.kdeplot(
        data=g.data,
        x=feature,
        hue="nutrition_grade_fr_",
        palette=palette_nutriscore,
        linewidth=3,
        legend=True,
        ax=ax.twinx(),
    )
    ax.set_xlim(0, max_x)
    ax_hist.set_ylim(0, max_hist_y)
    ax_kde.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))
fig.subplots_adjust(wspace=0.35, hspace=0.35)


In [None]:
# Energy only: histogram on the left axis, per-grade densities on a twin axis.
energy_bins = np.linspace(-0.5, 4000.5, 82)
ax_hist = sns.histplot(
    data=g.data,
    x="energy_100g",
    color="lightblue",
    bins=energy_bins,
)
density_axis = ax_hist.axes.twinx()
ax_kde = sns.kdeplot(
    data=g.data,
    x="energy_100g",
    hue="nutrition_grade_fr_",
    palette=palette_nutriscore,
    linewidth=3,
    legend=True,
    ax=density_axis,
)
ax_hist.set_xlim(-0.5, 4000.5)
ax_hist.set_ylim(0, 2500)
ax_kde.set_ylim(0, 3e-4)
ax_kde.ticklabel_format(axis="y", style="sci", scilimits=(0, 0))


In [None]:
# Numeric columns, leaving out every column whose name contains one of these
# fragments.
excluded_fragments = ("nutri", "emb", "created", "tsne")
features_numeric = [
    f
    for f in g.data.select_dtypes(np.number)
    if not any(fragment in f for fragment in excluded_fragments)
]

# One violin plot per numeric feature, split by grade (missing grade included).
fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(25, 18))
for ax, feature in zip(flatten(axes), features_numeric):
    sns.violinplot(
        data=g.data,
        x=feature,
        y="nutrition_grade_fr_",
        palette=palette_nutriscore,
        scale="count",
        cut=0,
        linewidth=0.5,
        ax=ax,
    )


In [None]:
# Nutrition score against sugar content, one small translucent dot per product.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(
    data=g.data,
    x="sugars_100g",
    y="nutrition-score-fr_100g",
    hue="nutrition_grade_fr_",
    palette=palette_nutriscore,
    s=3,
    alpha=0.2,
    ax=ax,
)
ax.set_xlim(0, 80)
ax.set_ylim(-20, 40)


# Categories vs Nutri-Score

In [None]:
from matplotlib.patches import Patch

product_categories = list(g.data["pnns_groups_1"].cat.categories)

# Three pies per row. Keep at least the original three rows, and add rows when
# there are more than nine categories so add_subplot never gets an index
# outside the grid.
n_cols = 3
n_rows = max(3, -(-len(product_categories) // n_cols))
fig = plt.figure(figsize=(14, 14))

for i, cat in enumerate(product_categories):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    mask = g.data["pnns_groups_1"] == cat
    # Grade distribution of the products in this category.
    g.data.loc[mask, "nutrition_grade_fr"].value_counts().sort_index().plot.pie(
        colors=palette_nutriscore,
        autopct="%1.1f%%",
        pctdistance=0.7,
        labels=None,
        textprops=dict(color="w", fontsize=12, fontweight="bold"),
        ax=ax,
    )
    ax.set_title(cat)
    ax.set_ylabel("")

# Single shared legend: one coloured patch per grade.
lines = [Patch(facecolor=c, edgecolor=None) for c in palette_nutriscore.as_hex()]
labels = g.data["nutrition_grade_fr"].cat.categories
fig.legend(lines, labels, loc="center left", bbox_to_anchor=(1, 0.5), prop={"size": 20})
plt.tight_layout()


In [None]:
from matplotlib.patches import Patch

product_categories = list(g.data["pnns_groups_1"].cat.categories)

# Three pies per row. Keep at least the original three rows, and add rows when
# there are more than nine categories so add_subplot never gets an index
# outside the grid.
n_cols = 3
n_rows = max(3, -(-len(product_categories) // n_cols))
fig = plt.figure(figsize=(14, 14))

for i, cat in enumerate(product_categories):
    ax = fig.add_subplot(n_rows, n_cols, i + 1)
    mask = g.data["pnns_groups_1"] == cat
    # Grade distribution of this category, missing grades counted as "NaN".
    g.data.loc[mask, "nutrition_grade_fr_"].value_counts().sort_index().plot.pie(
        colors=palette_nutriscore,
        autopct="%1.1f%%",
        pctdistance=0.7,
        labels=None,
        textprops=dict(color="w", fontsize=12, fontweight="bold"),
        ax=ax,
    )
    ax.set_title(cat)
    ax.set_ylabel("")

# Single shared legend: one coloured patch per grade (plus the "NaN" group).
lines = [Patch(facecolor=c, edgecolor=None) for c in palette_nutriscore.as_hex()]
labels = g.data["nutrition_grade_fr_"].cat.categories
fig.legend(lines, labels, loc="center left", bbox_to_anchor=(1, 0.5), prop={"size": 20})
plt.tight_layout()


In [None]:
# Share of each grade inside every PNNS group 2 category, as stacked bars.
category = "pnns_groups_2"
grade_column = "nutrition_grade_fr"
fig = plt.figure(figsize=(12, 10))

pair_counts = g.data.value_counts([category, grade_column]).reset_index(
    name="Frequency"
)
df = pair_counts.pivot(index=category, columns=[grade_column])["Frequency"]
# Normalise every row so that the grades of one category sum to 1.
df = df.div(df.sum(axis=1), axis=0)

df.plot.barh(stacked=True, color=palette_nutriscore, width=0.8, ax=plt.gca())
plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
plt.xlabel("Ratio de nombre produits par catégories")
plt.xlim(0, 1)


In [None]:
# Same stacked bars, with missing grades counted as their own "NaN" group.
category = "pnns_groups_2"
grade_column = "nutrition_grade_fr_"
fig = plt.figure(figsize=(12, 10))

pair_counts = g.data.value_counts([category, grade_column]).reset_index(
    name="Frequency"
)
df = pair_counts.pivot(index=category, columns=[grade_column])["Frequency"]
# Normalise every row so that the grades of one category sum to 1.
df = df.div(df.sum(axis=1), axis=0)

df.plot.barh(stacked=True, color=palette_nutriscore, width=0.8, ax=plt.gca())
plt.legend(loc="center left", bbox_to_anchor=(1, 0.5))
plt.xlabel("Ratio de nombre produits par catégories")
plt.xlim(0, 1)


## Anova

In [None]:
# Import the submodule explicitly: the cells below call `scipy.stats.*`, and a
# bare `import scipy` is not guaranteed to load it on every SciPy version.
import scipy.stats


In [None]:
# Skeleton of the one-way ANOVA table. Each cell holds the symbol of the
# quantity computed later (c = number of groups, n = number of observations).
anova_placeholders = [
    # F is the between-group mean square divided by the within-group one.
    ["Between Groups", "c - 1", "SSM", "MSM", "MSM/MSE", "p", "Falpha;c-1,n-c"],
    ["Within Groups", "n - c", "SSE", "MSE", "", "", ""],
    ["Total", "n - 1", "SST", "", "", "", ""],
]
anova_table = pd.DataFrame(
    anova_placeholders,
    columns=["Source of Variation", "dof", "SS", "MS", "F", "P-value", "F crit"],
).set_index("Source of Variation")
anova_table


In [None]:
feature_qual = "pnns_groups_1"
feature_quant = "nutrition-score-fr_100g"
df = g.data[[feature_quant, feature_qual]].dropna()

# The table starts out holding string placeholders; make every column generic
# so that numeric results can be written into the same cells.
anova_table = anova_table.astype(object)

# Per-group statistics, computed once (only groups that actually have rows).
grouped_scores = df.groupby(feature_qual, observed=True)[feature_quant]
group_sizes = grouped_scores.count()
n_groups = df[feature_qual].nunique()
n_obs = df.shape[0]

# Sums of squares: between groups (SSM), within groups (SSE) and total (SST).
x_bar = df[feature_quant].mean()
SSM = (group_sizes * (grouped_scores.mean() - x_bar) ** 2).sum()
SSE = ((group_sizes - 1) * grouped_scores.var()).sum()
SST = SSM + SSE

# Write with .loc[row, column]: chained `table[col][row] = value` assigns to a
# temporary object and is silently lost under pandas Copy-on-Write.
anova_table.loc["Between Groups", "SS"] = SSM
anova_table.loc["Within Groups", "SS"] = SSE
anova_table.loc["Total", "SS"] = SST

# Degrees of freedom
anova_table.loc["Between Groups", "dof"] = n_groups - 1
anova_table.loc["Within Groups", "dof"] = n_obs - n_groups
anova_table.loc["Total", "dof"] = n_obs - 1

# Mean squares
anova_table["MS"] = anova_table["SS"] / anova_table["dof"]

# F statistic
F = anova_table.loc["Between Groups", "MS"] / anova_table.loc["Within Groups", "MS"]
anova_table.loc["Between Groups", "F"] = F

# p-value: the survival function is used instead of `1 - cdf`, which rounds to
# exactly 0 for large F.
dof_between = anova_table.loc["Between Groups", "dof"]
dof_within = anova_table.loc["Within Groups", "dof"]
anova_table.loc["Between Groups", "P-value"] = scipy.stats.f.sf(
    F, dof_between, dof_within
)

# Critical F value
alpha = 0.05
# possible types "right-tailed, left-tailed, two-tailed"
# NOTE(review): the ANOVA F-test is normally right-tailed; confirm that halving
# alpha ("two-tailed") is really intended here.
tail_hypothesis_type = "two-tailed"
if tail_hypothesis_type == "two-tailed":
    alpha /= 2
anova_table.loc["Between Groups", "F crit"] = scipy.stats.f.ppf(
    1 - alpha, dof_between, dof_within
)

# Final ANOVA table
anova_table


In [None]:
# Cross-check the hand-computed ANOVA with SciPy. Only categories that have at
# least one row are passed on: an empty sample would make the result NaN.
arr = [
    scores.to_numpy()
    for _, scores in df.groupby(feature_qual, observed=True)[feature_quant]
]
scipy.stats.f_oneway(*arr)


Vérifions si le Nutri-Score suit une loi normale au sein de chaque groupe, ce qui est une des conditions pour que le test ANOVA soit interprétable.

In [None]:
# Density of the nutrition score for every PNNS group 1 category.
fig, ax = plt.subplots(figsize=(10, 8))
sns.kdeplot(
    data=g.data,
    x="nutrition-score-fr_100g",
    hue="pnns_groups_1",
    ax=ax,
)
# Put the legend outside the axes and make its title bold.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1))
legend_title = ax.get_legend().get_title()
plt.setp(legend_title, fontweight="bold")


In [None]:
# Density of the nutrition score for every PNNS group 2 category.
fig, ax = plt.subplots(figsize=(10, 8))
sns.kdeplot(
    data=g.data,
    x="nutrition-score-fr_100g",
    hue="pnns_groups_2",
    ax=ax,
)
# Two-column legend outside the axes, with a bold title.
sns.move_legend(ax, "upper left", bbox_to_anchor=(1, 1), ncols=2)
legend_title = ax.get_legend().get_title()
plt.setp(legend_title, fontweight="bold")


Nous voyons que, selon les catégories, le score nutritionnel ne suit pas une loi normale. L'utilisation du test ANOVA n'est donc pas pertinente.

## Kruskal Wallis test

Le test H de Kruskal-Wallis teste l'hypothèse nulle selon laquelle la médiane de la population de tous les groupes est égale. Il s'agit d'une version non paramétrique de l'ANOVA. Le test fonctionne sur 2 échantillons indépendants ou plus, qui peuvent avoir des tailles différentes.

Pour les groupes PNNS 1 :

In [None]:
# Kruskal-Wallis H-test on the nutrition score, one sample per PNNS group 1
# category (rows without a score are dropped first).
scored_products = g.data.dropna(subset="nutrition-score-fr_100g")
score_samples = [
    scores.to_numpy()
    for _, scores in scored_products.groupby("pnns_groups_1")[
        "nutrition-score-fr_100g"
    ]
]
res = scipy.stats.kruskal(*score_samples)
res


Pour les groupes PNNS 2 :

In [None]:
# Kruskal-Wallis H-test on the nutrition score, one sample per PNNS group 2
# category (rows without a score are dropped first).
scored_products = g.data.dropna(subset="nutrition-score-fr_100g")
score_samples = [
    scores.to_numpy()
    for _, scores in scored_products.groupby("pnns_groups_2")[
        "nutrition-score-fr_100g"
    ]
]
scipy.stats.kruskal(*score_samples)


L'hypothèse nulle, selon laquelle les populations de produits de chaque catégorie PNNS partagent la même valeur médiane de score nutritionnel, est rejetée avec un niveau de confiance supérieur à 99 % (p-value < 0.01). Cela est constaté à la fois pour les catégories PNNS 1 et PNNS 2.

Cela confirme que les catégories PNNS sont utiles pour la prédiction du Nutri-Score, puisque, selon sa catégorie, un produit aura plus ou moins de chances d'avoir un Nutri-Score élevé.

# Matrice de corrélation

Regardons la matrice de corrélation pour voir si les variables d'intérêt sélectionnées sont corrélées avec le Nutri-Score. Si ce n'est pas le cas, elles apportent a priori peu d'information linéaire pour la prédiction du Nutri-Score. Gardons toutefois à l'esprit qu'un coefficient de corrélation de 0 n'indique pas forcément une absence de dépendance : il mesure uniquement la relation linéaire entre deux variables.

In [None]:
# Numeric columns, without the embedding / t-SNE coordinates and the creation
# timestamp.
columns_numeric = [
    col
    for col in g.data
    if pd.api.types.is_numeric_dtype(g.data[col])
    and "emb" not in col
    and "tsne" not in col
    and col != "created_t"
]

# The grade codes use -1 for "no grade". Turn those into NaN so that products
# without a grade are left out of the correlation instead of being treated as
# a grade below the best one.
corr_input = g.data[columns_numeric].copy()
if "nutrition_grade_fr__" in corr_input:
    grade_codes = corr_input["nutrition_grade_fr__"]
    corr_input["nutrition_grade_fr__"] = grade_codes.where(grade_codes >= 0)
mat_corr = corr_input.corr()

# Options shared by both heatmaps: values annotated as percentages, grey grid.
heatmap_options = dict(
    annot=True,
    fmt=".0%",
    annot_kws={"fontsize": 9},
    linewidths=2,
    linecolor="gray",
)

# Signed correlations on a diverging colour scale.
plt.figure(figsize=(14, 12))
_ = sns.heatmap(
    mat_corr, cmap="RdBu_r", vmin=-1, vmax=1, square=True, **heatmap_options
)
# Keep all four spines visible around the matrix.
sns.despine(top=False, right=False, left=False, bottom=False)

# Absolute correlations, to read the strength regardless of the sign.
plt.figure(figsize=(14, 12))
_ = sns.heatmap(np.abs(mat_corr), cmap="Reds", vmin=0, vmax=1, **heatmap_options)
sns.despine(top=False, right=False, left=False, bottom=False)


Nous remarquons que la plupart des variables ont un coefficient de corrélation supérieur à 10 % (en valeur absolue) avec le Nutri-Score (nutrition_grade_fr), ce qui valide notre choix des variables à utiliser pour la prédiction du Nutri-Score. Les variables qui montrent le plus de corrélation avec le Nutri-Score sont les quantités d'énergie, de graisses et de sucres : elles auront un rôle clé dans la prédiction du Nutri-Score.

# Analyse PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer


In [None]:
# Standardise every feature, then run a PCA that keeps all components.
pipeline = Pipeline([("scale", StandardScaler()), ("pca", PCA())])


In [None]:
# PCA input: the numeric columns minus the two target columns, complete rows
# only. `columns_numeric` is filtered rather than mutated, so this cell can be
# re-run without raising "list.remove(x): x not in list".
pca_excluded = {"nutrition-score-fr_100g", "nutrition_grade_fr__"}
df = g.data[[col for col in columns_numeric if col not in pca_excluded]].dropna()
df


In [None]:
# Project the rows of `df` onto the principal components, named F1, F2, ...
component_names = [f"F{rank}" for rank in range(1, df.shape[1] + 1)]
projected = pipeline.fit_transform(df)
df_pca = pd.DataFrame(projected, columns=component_names, index=df.index)
df_pca


In [None]:
pca = pipeline["pca"]

# Scree plot: share of variance per component (bars) and cumulated (red line).
# Ranks start at 1 so that tick k corresponds to component Fk.
x = np.arange(1, len(pca.explained_variance_ratio_) + 1)
y = pca.explained_variance_ratio_ * 100

plt.bar(x, y)
plt.plot(x, y.cumsum(), c="red", marker="o")
plt.xlabel("Rang de l'axe d'inertie")
plt.ylabel("Pourcentage d'inertie")
plt.title("Éboulis des valeurs propres")
plt.show(block=False)


In [None]:
def plot_pca_cercle_correlation(x_id, y_id):
    """Draw the correlation circle of two principal components.

    Each input variable is drawn as an arrow whose coordinates are its
    correlations with the two chosen components. Reads the module-level
    fitted ``pca`` and the frame ``df`` it was fitted on (for variable names).

    The correlations are the eigenvector coefficients scaled by the standard
    deviation of their component; this holds because the pipeline standardises
    the features before the PCA.

    Parameters
    ----------
    x_id, y_id : int
        Zero-based indices of the components shown on the x and y axes.
    """
    # Row k holds the correlations of every variable with component k.
    loadings = pca.components_ * np.sqrt(pca.explained_variance_)[:, np.newaxis]
    n_features = loadings.shape[1]

    _, ax = plt.subplots(figsize=(10, 9))
    # Ask for as many colours as variables so that none is dropped by zip().
    for i, color in zip(range(n_features), sns.color_palette("tab20", n_features)):
        ax.arrow(
            0,
            0,  # every arrow starts at the origin
            loadings[x_id, i],
            loadings[y_id, i],
            head_width=0.035,
            head_length=0.025,
            width=0.01,
            length_includes_head=True,  # the tip sits exactly at the correlation
            color=color,
            label=df.columns[i],
        )

    # Horizontal and vertical reference lines
    ax.plot([-1, 1], [0, 0], color="grey", ls="--")
    ax.plot([0, 0], [-1, 1], color="grey", ls="--")

    # Axis names, with the share of variance explained by each component
    ax.set_xlabel(f"F{x_id + 1} ({100 * pca.explained_variance_ratio_[x_id]:.1f}%)")
    ax.set_ylabel(f"F{y_id + 1} ({100 * pca.explained_variance_ratio_[y_id]:.1f}%)")
    ax.set_title("Cercle des corrélations (F{} et F{})".format(x_id + 1, y_id + 1))

    # Unit circle: a correlation cannot lie outside of it
    an = np.linspace(0, 2 * np.pi, 100)
    ax.plot(np.cos(an), np.sin(an))
    ax.axis("equal")
    ax.legend(loc="center left", bbox_to_anchor=(1, 0.5))
    plt.show(block=False)


plot_pca_cercle_correlation(0, 1)
plot_pca_cercle_correlation(2, 3)


In [None]:
import bokeh
import bokeh.io
import bokeh.models
import bokeh.plotting

bokeh.io.output_notebook()

p = bokeh.plotting.figure(
    tools="pan,wheel_zoom,reset,hover",
    tooltips=[
        ("Name", "@product_name"),
        ("Grade", "@nutrition_grade_fr"),
        ("Ingredients", "@ingredients_"),
    ],
)

# Build the plotted table once, keeping only the columns used by the glyphs
# and tooltips. The left join keeps the row order of g.data, so the per-grade
# mask below lines up with the rows; products dropped before the PCA simply
# have no F3/F4 coordinates.
pca_points = (
    g.data[["product_name", "nutrition_grade_fr"]]
    .join(df_pca[["F3", "F4"]])
    .assign(ingredients_=g.data["ingredients"].str.join("\n"))
)

# One renderer per grade so that the legend can toggle each grade.
for group, color in zip(
    g.data["nutrition_grade_fr"].cat.categories, palette_nutriscore.as_hex()
):
    mask = pca_points["nutrition_grade_fr"] == group
    p.scatter(
        x="F3",
        y="F4",
        source=bokeh.models.ColumnDataSource(pca_points[mask]),
        line_width=0,
        alpha=0.1,
        color=color,
        legend_label=group,
    )
p.legend.click_policy = "hide"
bokeh.plotting.show(p)


# T-SNE

In [None]:
# Numeric inputs of the t-SNE: no creation timestamp, no Nutri-Score columns,
# and not the t-SNE coordinates a previous run may have added. The two
# coordinate columns are filtered independently, so one being absent does not
# stop the other from being excluded.
tsne_outputs = ("tsne_x", "tsne_y")
numerical_features = [
    f
    for f in g.data.select_dtypes(include=np.number)
    if "created" not in f and "nutri" not in f and f not in tsne_outputs
]

numerical_features


In [None]:
from sklearn.preprocessing import StandardScaler

# One-hot encode the PNNS groups, pass the numeric features through,
# standardise everything, then embed in 2-D with t-SNE.
pipeline = Pipeline(
    [
        (
            "pnns_encoding",
            ColumnTransformer(
                [
                    (
                        "pnns",
                        OneHotEncoder(sparse_output=False),
                        ["pnns_groups_1", "pnns_groups_2"],
                    ),
                    ("numeric", "passthrough", numerical_features),
                ]
            ),
        ),
        ("scaler", StandardScaler()),
        # Fixed seed: t-SNE is stochastic, so without it every recomputation
        # gives a different embedding.
        ("tsne", TSNE(perplexity=100, init="pca", random_state=42)),
    ]
)

# The embedding is slow to compute, so it is cached on disk.
force_computation = False
data_tsne_file = Path("tsne_reduction_coors.npy")

coords = None
if data_tsne_file.exists() and not force_computation:
    coords = np.load(data_tsne_file)
    if coords.shape[0] != len(g.data):
        # Cache written for a different version of the data: recompute it.
        coords = None
if coords is None:
    coords = pipeline.fit_transform(X=g.data)
    np.save(data_tsne_file, coords)


In [None]:
# Store the two embedding coordinates as columns of the product table.
for axis_index, column in enumerate(("tsne_x", "tsne_y")):
    g.data[column] = coords[:, axis_index]


In [None]:
from bokeh.plotting import figure, show
from bokeh.models import Legend, ColumnDataSource, HoverTool
from itertools import cycle


def style_color_marker(n, color_list, marker_list):
    """Return ``n`` (color, marker) pairs for plotting ``n`` groups.

    All colors are used with the first marker, then with the second one, and
    so on; the sequence starts over once every combination has been used.

    Raises
    ------
    ValueError
        If ``n`` is positive while no (color, marker) combination exists.
    """
    all_styles = [(color, marker) for marker in marker_list for color in color_list]
    if n > 0 and not all_styles:
        raise ValueError("color_list and marker_list must not be empty")
    return [all_styles[i % len(all_styles)] for i in range(n)]


def plot_tsne_with_bokeh(
    data,
    tsne_var,
    group_cat,
    tooltip_var=("product_name", "nutrition_grade_fr"),
    colors=None,
):
    """Interactive bokeh scatter plot of a 2-D embedding, one glyph set per group.

    Parameters
    ----------
    data : pd.DataFrame or DataFrame-like
        Must contain the columns ``f"{tsne_var}_x"``, ``f"{tsne_var}_y"`` and
        ``group_cat``.
    tsne_var : str
        Prefix of the two coordinate columns.
    group_cat : str
        Column used to split the points into groups; every group gets its own
        color/marker and its own legend entry, which can be clicked to hide it.
    tooltip_var : sequence of str
        Columns shown in the hover tooltip.
    colors : sequence of str, optional
        Colors cycled over the groups; defaults to seaborn's "tab10" palette.

    Returns
    -------
    bokeh figure
        The figure, not yet shown.
    """
    data = pd.DataFrame(data)
    tooltip_var = list(tooltip_var)
    x_col, y_col = f"{tsne_var}_x", f"{tsne_var}_y"

    cat_column = data[group_cat].astype("category")
    categories = cat_column.cat.categories

    # Send only the columns the plot needs to the browser instead of the whole
    # frame (dict.fromkeys removes duplicates while keeping the order).
    source_columns = list(
        dict.fromkeys(c for c in [x_col, y_col, *tooltip_var] if c in data.columns)
    )

    p = figure(
        height=800,
        width=900,
        tools="pan, wheel_zoom,box_zoom, reset, save",
        active_drag="pan",
        active_scroll="wheel_zoom",
    )
    try:
        p.add_layout(Legend(ncols=2, nrows=19), "right")  # bokeh 3.X
    except Exception:
        # Best-effort fallback for bokeh 2.X, whose Legend has no ncols/nrows.
        p.add_layout(Legend())

    if colors is None:
        colors = sns.color_palette("tab10").as_hex()
    markers = [
        "circle",
        "triangle",
        "square",
        "plus",
        "star",
        "square_pin",
        "inverted_triangle",
        "diamond",
        "hex",
    ]

    renderer_list = []
    styles = style_color_marker(len(categories), colors, markers)
    for group, (color, marker) in zip(categories, styles):
        # cat_column shares the index of `data`, so the mask is already aligned.
        mask = cat_column == group
        group_source = ColumnDataSource(data.loc[mask, source_columns])

        glyph = p.scatter(
            source=group_source,
            x=x_col,
            y=y_col,
            legend_label=str(group),
            fill_alpha=0.4,
            size=6,
            line_width=0,
            color=color,
            marker=marker,
        )
        renderer_list.append(glyph)

    hover_tool = HoverTool(
        renderers=renderer_list,
        tooltips=[(var, f"@{var}") for var in tooltip_var],
    )
    p.add_tools(hover_tool)
    p.legend.click_policy = "hide"

    p.xaxis.axis_label = "Embedding X"
    p.xaxis.axis_label_text_font_style = "bold"
    p.yaxis.axis_label = "Embedding Y"
    p.yaxis.axis_label_text_font_style = "bold"

    return p


In [None]:
# t-SNE embedding coloured by grade (missing grades included as "NaN"),
# using the Nutri-Score palette.
plot = plot_tsne_with_bokeh(
    g.data, "tsne", group_cat="nutrition_grade_fr_", colors=palette_nutriscore.as_hex()
)
show(plot)


# Simple Classification

In [None]:
# Numeric inputs of the classifiers: no creation timestamp, no Nutri-Score
# columns, no embedding columns and no t-SNE coordinates. The two coordinate
# columns are filtered independently, so one being absent does not stop the
# other from being excluded.
tsne_outputs = ("tsne_x", "tsne_y")
numerical_features = [
    f
    for f in g.data.select_dtypes(include=np.number)
    if "created" not in f
    and "nutri" not in f
    and "emb" not in f
    and f not in tsne_outputs
]

numerical_features


In [None]:
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Baseline: 1-nearest-neighbour classifier on the one-hot encoded PNNS groups
# and the standardised numeric features.
pipeline = Pipeline(
    [
        (
            "pnns_encoding",
            ColumnTransformer(
                [
                    (
                        "pnns",
                        # A category seen only in the test split is encoded as
                        # all zeros instead of raising at predict time.
                        OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
                        ["pnns_groups_1", "pnns_groups_2"],
                    ),
                    ("numeric", "passthrough", numerical_features),
                ]
            ),
        ),
        ("scaler", StandardScaler()),
        ("classifier", KNeighborsClassifier(n_neighbors=1)),
    ]
)

# Only products with a known grade can be used; the target is the grade code.
labelled = g.data.dropna(subset="nutrition_grade_fr")
X = labelled[numerical_features + ["pnns_groups_1", "pnns_groups_2"]]
y = labelled["nutrition_grade_fr__"]
# NOTE(review): test_size=0.7 trains on only 30 % of the products; confirm
# that this is intended rather than train_size=0.7.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.7, random_state=42
)


In [None]:
# Fit the encoder, the scaler and the classifier on the training split.
pipeline.fit(X_train, y_train)


In [None]:
# Predicted grade codes for the held-out products.
y_pred = pipeline.predict(X=X_test)


In [None]:
# (y_pred - y_test).plot.barh(bins=100)


In [None]:
# Accuracy: fraction of test products whose predicted grade code is correct.
is_correct = y_pred == y_test
is_correct.mean()


In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay


# Grade codes 0..4 are displayed as A..E. Passing the codes explicitly keeps
# rows/columns and tick labels aligned even if a grade is absent from the
# test split.
grade_labels = list("ABCDE")
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(grade_labels))))
cm_display = ConfusionMatrixDisplay(cm, display_labels=grade_labels).plot()


In [None]:
from xgboost import XGBClassifier


# Same preprocessing as the nearest-neighbour baseline, with a gradient-boosted
# tree classifier as predictor.
pipeline = Pipeline(
    [
        (
            "pnns_encoding",
            ColumnTransformer(
                [
                    (
                        "pnns",
                        # A category seen only in the test split is encoded as
                        # all zeros instead of raising at predict time.
                        OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
                        ["pnns_groups_1", "pnns_groups_2"],
                    ),
                    ("numeric", "passthrough", numerical_features),
                ]
            ),
        ),
        ("scaler", StandardScaler()),
        ("predictor", XGBClassifier()),
    ]
)

# Only products with a known grade can be used; the target is the grade code.
labelled = g.data.dropna(subset="nutrition_grade_fr")
X = labelled[numerical_features + ["pnns_groups_1", "pnns_groups_2"]]
y = labelled["nutrition_grade_fr__"]
# Same seed as the nearest-neighbour split, so both models are evaluated on
# the same test products and their scores are comparable.
# NOTE(review): test_size=0.7 trains on only 30 % of the products; confirm
# that this is intended rather than train_size=0.7.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.7, random_state=42
)

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X=X_test)


In [None]:
# Grade codes 0..4 are displayed as A..E. Passing the codes explicitly keeps
# rows/columns and tick labels aligned even if a grade is absent from the
# test split.
grade_labels = list("ABCDE")
cm = confusion_matrix(y_test, y_pred, labels=list(range(len(grade_labels))))

cm_display = ConfusionMatrixDisplay(cm, display_labels=grade_labels).plot()


In [None]:
# Accuracy: fraction of test products whose predicted grade code is correct.
is_correct = y_pred == y_test
is_correct.mean()
