In [None]:
from datasets import load_dataset
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, make_scorer
import numpy as np

In [None]:
# Load the "nutrition-feats-only" configuration of the Open Food Facts dataset.
ds = load_dataset(
    path="HC-85/open-food-facts",
    name="nutrition-feats-only",
)

In [None]:
# Turn the "train" split into a DataFrame and preview five seeded random rows.
df = pd.DataFrame(data=ds["train"])
df.sample(n=5, random_state=42)

In [None]:
# Print a summary of the full frame's columns and dtypes.
df.info()

Drop `glycemic-index_100g` because it is full of NaNs, and continue with a random sample of 1,000 rows:

In [None]:
# Remove the glycemic-index column, then keep a seeded random subset of 1000 rows.
df_dropped = (
    df.drop(columns=["glycemic-index_100g"])
    .sample(n=1000, random_state=42)
)
df_dropped

In [None]:
# Descriptive statistics of the full frame `df` (not the 1000-row sample `df_dropped`).
df.describe()

In [None]:
# Inspect the row with the largest `energy_100g`.
# `idxmax` returns an index *label*, which is what `.loc` expects; `argmax`
# returns a *position* and only matches while the frame keeps its default
# 0..n-1 index.
df.loc[df["energy_100g"].idxmax()].values

Negative values?

# EDA

We have lots of NaNs, and that is definitely a problem.

## Text features

In [None]:
# Work on the sampled frame and split its columns by dtype.
data = df_dropped
text_columns = data.select_dtypes(include=["object"]).columns
numeric_columns = data.select_dtypes(include=["number"]).columns

# Replace missing (None) text entries with an empty string.
# Every row is imputed, not only the rows that contain a missing value:
# indexing with the `.index` of an unfiltered boolean mask selects all rows,
# so the text columns are selected directly. The later cells vectorize the
# full columns of the result.
s = SimpleImputer(
    missing_values=None, strategy="constant", fill_value=""
).fit_transform(data[text_columns])
s

In [None]:
# Wrap the imputed array in a DataFrame so the text columns get their names back.
s_pd = pd.DataFrame(data=s, columns=text_columns)
s_pd

In [None]:
# Bag-of-words token counts for the imputed product names.
CountVectorizer().fit_transform(raw_documents=s_pd["product_name"])

In [None]:
# Bag-of-words token counts for the imputed quantity strings.
CountVectorizer().fit_transform(raw_documents=s_pd["quantity"])

In [None]:
# Rows that still contain a missing value after imputation.
s_pd.loc[s_pd.isna().any(axis="columns")]

## Numeric features

In [None]:
# Show only the numeric columns of the sampled frame.
data.loc[:, numeric_columns]

In [None]:
# Fill missing numeric values with a KNN imputer that uses 3 neighbours.
knn = KNNImputer(n_neighbors=3, missing_values=np.nan).fit_transform(
    X=data[numeric_columns]
)
knn

# Making pipeline

In [None]:

def _join_text_columns(data):
    """Collapse a 2-D block of text cells into one document per row.

    ``CountVectorizer`` consumes a 1-D sequence of strings, so the values of
    all text columns in a row are converted to ``str`` and joined with a
    space. Missing cells (``None``/``NaN``) contribute an empty string.
    """
    rows = np.asarray(data, dtype=object)
    if rows.ndim == 1:
        # A single column may arrive as 1-D; treat it as one cell per row.
        rows = rows.reshape(-1, 1)
    return np.array(
        [" ".join("" if pd.isna(cell) else str(cell) for cell in row) for row in rows],
        dtype=object,
    )


def _silhouette_scorer(estimator, X, y=None):
    """Score a fitted preprocessing + clustering pipeline by silhouette.

    Called by ``GridSearchCV`` as ``scorer(estimator, X)`` on each validation
    fold. The score is computed independently for every call: no running
    maximum is kept, because that would leak one candidate's score into the
    next and would not be shared across parallel workers anyway.
    """
    features = estimator.named_steps["preprocessor"].transform(X)
    clusterer = estimator.named_steps["clustering"]
    if hasattr(clusterer, "predict"):
        # Use the clusters learned on the training fold instead of refitting
        # on the validation fold.
        labels = clusterer.predict(features)
    else:
        # Clusterers without `predict` can only label data by refitting on it.
        labels = clusterer.fit_predict(features)

    # silhouette_score needs 2 <= n_labels <= n_samples - 1; report the worst
    # possible score for degenerate clusterings instead of raising.
    n_labels = np.unique(labels).size
    if n_labels < 2 or n_labels >= features.shape[0]:
        return -1.0
    return float(silhouette_score(features, labels))


def pipeline(data: pd.DataFrame, model, param_grid, verbose=100):
    """Grid-search a preprocessing + clustering pipeline scored by silhouette.

    Numeric columns are KNN-imputed and standardized. Object (text) columns
    are imputed with an empty string, joined into one document per row,
    bag-of-words encoded and scaled without centering (the encoding is
    sparse). Both branches are combined in a ``ColumnTransformer`` that feeds
    ``model``.

    Parameters
    ----------
    data : pd.DataFrame
        Input frame; columns are routed by dtype (``number`` vs ``object``).
    model : estimator
        Clustering estimator used as the final ``"clustering"`` step.
    param_grid : dict or list of dict
        Grid for ``GridSearchCV``; keys use the step names ``preprocessor``
        (with sub-steps ``num``/``text``) and ``clustering``.
    verbose : int, default=100
        Verbosity forwarded to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_params, best_score)`` as reported by the fitted grid search.
    """
    text_columns = data.select_dtypes(include=["object"]).columns
    numeric_columns = data.select_dtypes(include=["number"]).columns

    numerical_transformer = Pipeline(
        steps=[
            ("imputer", KNNImputer(missing_values=np.nan, n_neighbors=2)),  # Impute missing values
            ("scaler", StandardScaler()),  # Standardize features
        ]
    )

    text_transformer = Pipeline(
        steps=[
            (
                "simputer",
                SimpleImputer(missing_values=None, strategy="constant", fill_value=""),
            ),
            # CountVectorizer needs 1-D documents, not a 2-D array of cells.
            ("function", FunctionTransformer(_join_text_columns)),
            ("vectorizer", CountVectorizer()),  # Encode text features
            # Centering would densify the sparse counts; StandardScaler rejects it.
            ("scaler", StandardScaler(with_mean=False)),
        ]
    )

    # Route each column group through its own transformer.
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numerical_transformer, numeric_columns),
            ("text", text_transformer, text_columns),
        ]
    )

    pipe = Pipeline(
        steps=[("preprocessor", preprocessor), ("clustering", model)], verbose=True
    )

    grid_search = GridSearchCV(
        pipe, param_grid, cv=5, scoring=_silhouette_scorer, n_jobs=-1, verbose=verbose
    )
    grid_search.fit(data)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    return best_params, best_score


# Hyper-parameter grid; keys follow the "<step>__<parameter>" naming of the pipeline steps.
param_grid = dict(
    preprocessor__num__imputer__n_neighbors=[1, 3],  # neighbours used by KNNImputer
    clustering__n_clusters=[4],  # number of KMeans clusters
    clustering__init=["k-means++"],  # centroid initialisation
    clustering__max_iter=[500, 1000],  # iteration cap for convergence
)
pipeline(
    data=df_dropped[numeric_columns],
    model=KMeans(random_state=42),
    param_grid=param_grid,
)