In [1]:
import os
from pathlib import Path

import pandas as pd
import numpy as np

# Configuration

In [10]:
# Location of the training split (parquet file).
# Defaults to the original development path; set the OFF_TRAIN_PARQUET_PATH
# environment variable to run the notebook on another machine without
# editing this cell.
INPUT_DATA_PATH = os.environ.get(
    'OFF_TRAIN_PARQUET_PATH',
    '/Users/rgareev/data/openfoodfacts/wrk/20220831-dev/train.parquet',
)

# Script
## Read data

In [11]:
# Load the training split from the parquet file at INPUT_DATA_PATH and print
# its column / dtype / non-null-count summary.
df = pd.read_parquet(INPUT_DATA_PATH)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 540540 entries, 817678014419 to 769363975508
Data columns (total 3 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   product_name      536882 non-null  object
 1   nova_group        540540 non-null  int8  
 2   ingredients_list  540540 non-null  object
dtypes: int8(1), object(2)
memory usage: 12.9+ MB


In [12]:
# Spot-check the Python type of one randomly drawn `ingredients_list` value.
# The sample is unseeded, so a different row may be inspected on every run.
type(df.ingredients_list.sample(1).iloc[0])

numpy.ndarray

## Convert to the input format accepted by preprocessors / feature generators

In [13]:
# Columns of the source frame that are handed to the model as features.
FEATURE_COLS = ['ingredients_list']


def to_model_input(df: pd.DataFrame) -> list:
    """Convert a data frame into a list of per-row feature records.

    Only the columns named in ``FEATURE_COLS`` are kept.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that contains every column listed in ``FEATURE_COLS``.

    Returns
    -------
    list
        One ``{column_name: value}`` dict per row of ``df``, in row order.
    """
    feature_frame = df.loc[:, FEATURE_COLS]
    return feature_frame.to_dict(orient='records')


def to_model_labels(df: pd.DataFrame) -> np.ndarray:
    """Extract the target labels from a data frame.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``nova_group`` column.

    Returns
    -------
    np.ndarray
        The ``nova_group`` column converted with ``np.asarray``, one entry
        per row of ``df``.
    """
    labels = df.nova_group
    return np.asarray(labels)

In [14]:
# Model inputs: one feature-record dict per row of `df` (built by `to_model_input`).
X_all = to_model_input(df)
len(X_all)

540540

In [15]:
# Model targets: the `nova_group` labels of `df` as a NumPy array (built by `to_model_labels`).
y_all = to_model_labels(df)
y_all.shape

(540540,)

## Define featurizers

In [18]:
# from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction import DictVectorizer

# Encoder for the record dicts produced by `to_model_input`.
# NOTE(review): the `ingredients_list` values are arrays of strings; DictVectorizer
# presumably emits one "ingredients_list=<ingredient>" feature per string —
# confirm against the installed scikit-learn version.
ingredient_encoder = DictVectorizer()

## Train and tune model

In [None]:
# TODO: set up joblib.parallel_backend for ray once it is used

In [19]:
# baseline
# baseline
from sklearn.model_selection import GridSearchCV, StratifiedShuffleSplit
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline

# NOTE(review): binarize=None tells BernoulliNB to treat the incoming features
# as already binary. Confirm the encoder really emits only 0/1 values (e.g.
# when an ingredient is repeated within one list).
nb_clf = BernoulliNB(binarize=None)

# Encode the record dicts first, then classify the encoded features.
model = Pipeline(steps=[
    ('encoder', ingredient_encoder),
    ('clf', nb_clf),
])

# A single stratified shuffle split holding out 13% of the rows, with a fixed seed.
train_valid_splitter = StratifiedShuffleSplit(
    n_splits=1,
    test_size=0.13,
    random_state=42,
)

# The grid has one candidate, so the "search" is a single fit/score on the
# split above; further values can be added to `param_grid` later.
gs_cv = GridSearchCV(
    estimator=model,
    param_grid={'clf__fit_prior': [True]},
    cv=train_valid_splitter,
    verbose=2,
)

In [20]:
# Run the grid search on all records and labels; the train/validation split
# comes from the `cv` splitter given to `gs_cv`.
gs_cv.fit(X=X_all, y=y_all)

Fitting 1 folds for each of 1 candidates, totalling 1 fits
[CV] END ................................clf__fit_prior=True; total time=   6.4s


In [23]:
# Show the `scoring` setting of the search object; no `scoring` argument was
# passed when `gs_cv` was constructed.
gs_cv.scoring

In [21]:
# Tabulate the search results (fit/score timings and validation scores per candidate).
pd.DataFrame(gs_cv.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__fit_prior,params,split0_test_score,mean_test_score,std_test_score,rank_test_score
0,5.610288,0.0,0.777379,0.0,True,{'clf__fit_prior': True},0.880278,0.880278,0.0,1


In [28]:
# compare to most frequent baseline
# Print each class label, its row count and its share of all labels. The
# largest share is the accuracy of always predicting the most frequent class.
for i, v in zip(*np.unique(y_all, return_counts=True)):
    # Divide by the scalar number of labels. Dividing by the `shape` tuple
    # broadcasts and yields a one-element array (printed as `[0.1085...]`).
    print(i, v, v / len(y_all))

1 58662 [0.10852481]
2 10391 [0.01922337]
3 108866 [0.2014023]
4 362621 [0.67084952]
