In [None]:
import os

import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction import FeatureHasher
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (balanced_accuracy_score,
                             multilabel_confusion_matrix)
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.svm import LinearSVC
from sklearn.utils.class_weight import (compute_class_weight,
                                        compute_sample_weight)

import xgboost as xgb


# Data preparation

- Use `dtype` for performance
- `ingredients` is a string of ingredients delimited with `|`, replace with `.`. Fill empty cells with `None` as the lack of ingredients is significant
- `cooking_type` is a string of cooking type categories delimited with `|`; split it into a list of categories. Before splitting, fill empty cells with `None`, as the lack of a cooking type is significant
- Concatenate the non-null text columns `description`, `regulated_product_name` and `ingredients` into a new column `text`
- Some products have duplicated `description`. To remove them, we set `pvid` as the index and sort it in an ascending order, then drop rows with duplicated `description` but keeping the one with the last `pvid` (i.e. the most recent product)
- Drop rows with any empty cell. `ingredients` and `cooking_type` empty cells are now `None` so will not be dropped
- Drop `description`, `regulated_product_name`, `ingredients` as these are now concatenated into `text`

In [None]:
# Columns to read from the category sheet and the dtype to parse each one as.
# Declaring dtypes up front avoids type inference on load; the same mapping
# also drives `usecols` so the two can never drift apart.
column_dtypes = {
    'lProductVersionID': 'uint64',
    'sDescription': str,
    'sCategoryLevel1': 'category',
    'sCategoryLevel2': 'category',
    'regulated_product_name': str,
    'ingredients': str,
    'storage_env': 'category',
    'pack_type': 'category',
    'cooking_type': str,
    'PHE_category_jan': 'category',
}

df = pd.read_excel(
    os.path.join('data', '200901_PHE_category_sheet.xlsx'),
    usecols=list(column_dtypes),
    dtype=column_dtypes,
).rename(
    columns={
        'lProductVersionID': 'pvid',
        'sDescription': 'description',
        'sCategoryLevel1': 'category_level_1',
        'sCategoryLevel2': 'category_level_2',
        'PHE_category_jan': 'label',
    }
).assign(
    # `regex=False` forces a literal pipe on every pandas version; the
    # default for `regex` differs between pandas releases and, read as a
    # regular expression, `|` is an alternation rather than a character.
    ingredients=lambda d: d['ingredients'].str.replace(
        '|', '.', regex=False).fillna('None'),
    # Missing cooking types become the literal category 'None' before the
    # string is split into a list of categories.
    # NOTE(review): the separator is '| ' (pipe followed by a space) —
    # confirm this matches the delimiter actually used in the sheet.
    cooking_type=lambda d: d['cooking_type'].fillna(
        'None').str.rsplit('| '),
    # `assign` evaluates keyword arguments in order, so `ingredients` here
    # is already the filled version computed above.
    text=lambda d: d[
        ['description', 'regulated_product_name', 'ingredients']
    ].apply(
        lambda row: '. '.join(row[row.notna()]),
        axis=1,
    ),
).set_index(
    'pvid',
).sort_index(
    ascending=True,
).drop_duplicates(
    # After the ascending sort, `keep='last'` retains the row with the
    # highest `pvid` for each duplicated description.
    subset='description',
    keep='last',
).dropna(
    how='any',
).drop(
    # These three columns now live on in `text`.
    ['description', 'regulated_product_name', 'ingredients'],
    axis=1,
)

df.info()


<class 'pandas.core.frame.DataFrame'>
UInt64Index: 49623 entries, 6345061 to 8250896
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   category_level_1  49623 non-null  category
 1   category_level_2  49623 non-null  category
 2   storage_env       49623 non-null  category
 3   pack_type         49623 non-null  category
 4   cooking_type      49623 non-null  object  
 5   label             49623 non-null  category
 6   text              49623 non-null  object  
dtypes: category(5), object(2)
memory usage: 1.4+ MB


# Split data

- Use `label` as target labels
    - Number of classes is $63$
    - **Classes are very imbalanced**
        - Estimate class weights as $n_{samples} / (n_{classes} \times \text{bincount}(y))$, where $\text{bincount}(y)$ is the per-class sample count given by `np.bincount(y)` (not used, but kept in case it is needed)
        - Calculate sample weights (not used but in case needed)
    - Stratify labels when splitting so their distribution in train/test data is similar
- Encode target labels with values between $0-62$
- Split $70/30$ for training/testing
    - training shape: $(34736, 6)$
    - testing shape: $(14887, 6)$
- Create empty dict to store all classifiers. Once populated, an item will look like:
`'classifier name' : {
    'pipeline': ...,
    'params': ...,
    'best_score': ...,
    'best_params': ...,
    'best_estimator': ...,
    'best_estimator_params': ...,
    'testing_accuracy': ...,
    'testing_conf_matrix': ...,
}`

In [None]:
# Target labels and the distinct classes observed in them.
y = df['label']
label_classes = y.unique()
num_class = len(label_classes)

# Balanced class / sample weights. Not consumed by the classifiers below,
# but kept available in case they are needed.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=label_classes,
    y=y,
)

sample_weights = compute_sample_weight(
    class_weight='balanced',
    y=y,
)

X = df.drop('label', axis=1)

# Encode the label names as consecutive integers.
le = LabelEncoder()
y = le.fit_transform(y)

# The encoder is persisted here, before the training section creates the
# output directory, so make sure `models/` exists first; otherwise the dump
# fails on a fresh checkout.
os.makedirs('models', exist_ok=True)

joblib.dump(
    le,
    os.path.join('models', 'LabelEncoder.pkl'),
)

# 70/30 split, stratified so the class distribution is similar in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    shuffle=True,
    stratify=y,
)

# Registry of every classifier: name -> dict of pipeline, params and results.
classifiers = dict()

## Transformers

### Create 4 transformation `Pipeline`:

1. A pipeline for text features: `text`. This will have 1 step:
    - `TfidfVectorizer` which is equivalent to `CountVectorizer` followed by `TfidfTransformer`

2. A pipeline for categorical features with 1 label each: `category_level_1`, `category_level_2`, `storage_env`. This will have 1 step:
    - `OneHotEncoder`

3. A pipeline for categorical features with multiple labels each: `cooking_type`. This will have 1 step:
    - `CountVectorizer`

4. A pipeline for categorical features to be hashed: `pack_type`. This will have 1 step:
    - `FeatureHasher`. We used the hashing trick on `pack_type` as it has 45 categories and using `OneHotEncoder` would result in 45 sparse features
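    - A power of 2 is generally recommended for `n_features`; the value used here (`10`) was instead found by increasing `n_features` in steps of 2 until the collision check below reported no duplicated hashed features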
    - Collisions are likely to cancel out rather than accumulate error when `alternate_sign=True`. However, MultinomialNB estimators expect non-negative inputs so will disable `alternate_sign`

- `remainder=drop` will be used to drop any extra features that might be added to the dataframe later as a safety guard. When adding new features, either pass them through a pipeline, or change to `remainder=passthrough`

In [None]:
# Free text: TF-IDF weighted uni-grams and bi-grams.
text_transformer = Pipeline([
    (
        'tfidf',
        TfidfVectorizer(
            lowercase=True,
            ngram_range=(1, 2),
            norm='l2',
            use_idf=True,
        ),
    ),
])

# Single-label categorical columns: one-hot encoded; categories unseen
# during fitting are ignored at transform time.
# NOTE(review): newer scikit-learn releases rename `sparse` to
# `sparse_output` — confirm against the installed version.
cat_transformer = Pipeline([
    (
        'onehot',
        OneHotEncoder(
            categories='auto',
            sparse=False,
            handle_unknown='ignore',
        ),
    ),
])

# Multi-label categorical column: each sample is a list of categories, and
# `set` is used as the analyzer so every distinct category is one token.
multi_cat_transformer = Pipeline([
    ('binarizer', CountVectorizer(analyzer=set)),
])

# High-cardinality categorical column: hashing trick with non-negative
# output (`alternate_sign=False`).
# NOTE(review): with `input_type='string'` each sample is expected to be an
# iterable of strings; a bare string is iterated character by character —
# confirm that this is the intended encoding of `pack_type`.
hash_transformer = Pipeline([
    (
        'hasher',
        FeatureHasher(
            n_features=10,
            input_type='string',
            alternate_sign=False,
        ),
    ),
])

# Route each column (group) to its transformer; any column not listed is
# dropped.
single_label_columns = ['category_level_1', 'category_level_2', 'storage_env']

preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, 'text'),
        ('cat', cat_transformer, single_label_columns),
        ('multi_cat', multi_cat_transformer, 'cooking_type'),
        ('hash', hash_transformer, 'pack_type'),
    ],
    remainder='drop',
)

### Check for Collisions

- Check for possible collisions in the hashing features for `pack_type`
- Increase `n_features` by 2 until no duplicated hashing features exist

In [None]:
# Hash every `pack_type` value with the same hasher the pipeline uses.
hasher = hash_transformer.named_steps['hasher']
hashed_features = hasher.fit_transform(df['pack_type']).toarray()

# Put each pack type next to its hashed feature vector (positional join, so
# the index is reset first).
df_hashed_features = df[['pack_type']].reset_index(drop=True).join(
    pd.DataFrame(hashed_features)
)

# One hashed vector per pack type; count the pack types whose vector is
# shared with at least one other pack type.
first_hash_per_pack_type = df_hashed_features.groupby(['pack_type']).first()
first_hash_per_pack_type.duplicated(keep=False).sum()


0

# Classifiers

## Multinomial Naïve Bayes

- Normally requires bag-of-words (integer count) features
- tf-idf vectors are also known to work well in practice
- `alpha`: additive (Laplace/Lidstone) smoothing parameter (0 for no smoothing)
- Balanced accuracy:
    - Training: $86\%$
    - Testing: $87\%$

In [None]:
# Grid of smoothing strengths to search over.
params = {'classifier__alpha': [0.001, 0.01, 0.1, 0.3, 0.5, 1]}

classifier = MultinomialNB(alpha=1.0)

# Shared preprocessing followed by the Naive Bayes estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

classifiers['MultinomialNB'] = dict(pipeline=pipeline, params=params)

## Logistic Regression

- `multinomial` logistic regression yields more accurate results and is faster to train on larger scale dataset than `ovr` logistic regression
- `saga` solver works with `multinomial` and is faster for large datasets
- `penalty=l1` trims the weights of not informative features to zero which is good if the goal is to extract the strongly discriminative vocabulary of each class. However, to get the best predictive accuracy, it is better to use `penalty=l2` instead
- Balanced accuracy:
    - Training: $90\%$
    - Testing: $90\%$

In [None]:
# Grid of inverse regularisation strengths to search over.
params = {'classifier__C': [1, 10, 100]}

# Multinomial, L2-penalised logistic regression fitted with the `saga`
# solver and balanced class weights.
classifier = LogisticRegression(
    penalty='l2',
    solver='saga',
    multi_class='multinomial',
    class_weight='balanced',
    max_iter=100,
    random_state=42,
)

# Shared preprocessing followed by the logistic regression estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

classifiers['LogisticRegression'] = dict(pipeline=pipeline, params=params)


## Linear Support Vector Machine

- `LinearSVC` is similar to `SVC` with parameter `kernel=linear`, but scales better to a large number of samples
- Select `dual=True` when `n_samples` < `n_features`. The TfidfVectorizer creates a large sparse matrix as we are using uni-grams and bi-grams (more than 200K sparse features)
- Select `multi_class=ovr` to train one-vs-rest classifiers
- `class_weight=balanced`: automatically adjust weights inversely proportional to class frequencies
- `C`: regularization parameter. Higher values result in less regularization (narrower margin with fewer violations)
- Advantages
    - Good for linear and non-linear classification
    - Well suited for classification of complex but small to medium datasets
- Disadvantages
    - Speed and datasets size
    - Needs scaling to be centred around zero
    - Sensitive to outliers
- Balanced accuracy:
    - Training: $89\%$
    - Testing: $90\%$

In [None]:
# Search over the regularisation strength and over the primal / dual
# formulation (`dual` is deliberately not fixed on the estimator itself).
params = {
    'classifier__C': [1, 10, 100],
    'classifier__dual': [True, False],
}

# One-vs-rest linear SVM with squared hinge loss, L2 penalty and balanced
# class weights.
classifier = LinearSVC(
    loss='squared_hinge',
    penalty='l2',
    multi_class='ovr',
    class_weight='balanced',
    max_iter=1000,
    random_state=42,
)

# Shared preprocessing followed by the linear SVM estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

classifiers['LinearSVC'] = dict(pipeline=pipeline, params=params)

## Random Forest Ensemble

- Each tree is trained on a random subset of the training set with replacement (i.e. bootstrap aggregating or bagging)
- Each tree is trained on a random subset of features, the number of features to use is defined by `max_features`
- Advantages
    - No need for scaling
    - No need for dimensionality reduction (unless rotation is needed)
- Disadvantages
    - Sensitive to small variations in the training data
    - Over-fitting
    - Difficult to interpret
- Balanced accuracy:
    - Training: $79\%$
    - Testing: $78\%$

In [None]:
# Search over split criterion, tree depth and forest size.
params = {
    'classifier__criterion': ['gini', 'entropy'],
    'classifier__max_depth': [4, 5, 6],
    'classifier__n_estimators': [100, 200, 400],
}

# Bagged trees with out-of-bag scoring, sqrt(n_features) candidate features
# per split and balanced class weights.
classifier = RandomForestClassifier(
    max_features='sqrt',
    bootstrap=True,
    oob_score=True,
    class_weight='balanced',
    random_state=42,
)

# Shared preprocessing followed by the random forest estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

classifiers['RandomForestClassifier'] = dict(
    pipeline=pipeline,
    params=params,
)

## Stochastic Extreme Gradient Boosting (XGBoost) Ensemble

- Boosting methods train predictors sequentially, each trying to correct its predecessor. Gradient Boosting tries to fit the new predictor to the residual error made by the previous one. Extreme Boosting is more regularized to control over-fitting, which gives it better performance
- XGBoost uses trees as the base booster by default (`booster=gbtree`), which has a sklearn API (other boosters don't)
- `learning_rate`: scales the contribution of each tree (a.k.a. shrinkage). A lower `learning_rate` requires more trees to fit the training data
- `subsample`: ratio of training data to be randomly sampled to train each tree (i.e. Stochastic). Typically, set `subsample >= 0.5` for good results when `sampling_method=uniform` which is the default
- `colsample_bytree`: ratio of columns to be randomly sampled prior to train each tree
- Balanced accuracy:
    - Training: $83\%$
    - Testing: $85\%$

In [None]:
# Search over shrinkage, row / column subsampling, tree depth and the
# number of boosting rounds.
params = {
    'classifier__learning_rate': [0.1, 0.3],
    'classifier__subsample': [0.7, 0.9],
    'classifier__colsample_bytree': [0.7, 0.9],
    'classifier__max_depth': [3, 4],
    'classifier__n_estimators': [100, 200, 400],
}

# Tree-based booster with a multi-class softmax objective over `num_class`
# classes and uniform row sampling.
classifier = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=num_class,
    booster='gbtree',
    sampling_method='uniform',
    random_state=42,
)

# Shared preprocessing followed by the XGBoost estimator.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

classifiers['XGBClassifier'] = dict(pipeline=pipeline, params=params)

# Training

## Grid search cross validation

- Use `balanced_accuracy` for scoring to deal with imbalanced classes
- Loop over each classifier in the classifiers dictionary
    - Train using training data
    - Pickle trained model

In [None]:
# Create the output directory if needed; `exist_ok=True` replaces the
# separate check-then-create steps and is a no-op when it already exists.
os.makedirs('models', exist_ok=True)

for name, entry in classifiers.items():

    print(f'\nRunning grid search with cross validation for {name}...')

    # 5-fold grid search over the entry's parameter grid, scored with
    # balanced accuracy and run on all available cores.
    gs = GridSearchCV(
        estimator=entry['pipeline'],
        param_grid=entry['params'],
        scoring='balanced_accuracy',
        cv=5,
        n_jobs=-1,
        verbose=2,
    )

    gs.fit(X_train, y_train)

    # Persist the whole fitted search object under the classifier's name so
    # the validation step can reload it.
    joblib.dump(
        gs,
        os.path.join('models', f'{name}.pkl'),
    )

# Validation

- Loop over each classifier in the classifiers dictionary
    - Load model
        - Store the grid search `best_score_`
        - Store the grid search `best_params_`
        - Store the grid search `best_estimator_`
        - Store the grid search `best_estimator_` params
    - Evaluate using testing data
        - Store the `balanced_accuracy_score` (average recall obtained on each class to deal with imbalance classes)
        - Store class-wise `multilabel_confusion_matrix`

In [None]:
for name, entry in classifiers.items():

    # Reload the fitted grid search saved under this classifier's name.
    model_path = os.path.join('models', f'{name}.pkl')
    gs = joblib.load(model_path)

    # Record the cross-validation outcome alongside the pipeline and grid.
    best_estimator = gs.best_estimator_
    entry.update(
        best_score=gs.best_score_,
        best_params=gs.best_params_,
        best_estimator=best_estimator,
        best_estimator_params=(
            best_estimator.named_steps['classifier'].get_params()
        ),
    )

    print(f'Running evaluation on test data for {name}...')
    y_pred = gs.predict(X_test)

    # Held-out metrics: balanced accuracy plus one confusion matrix per class.
    entry['testing_accuracy'] = balanced_accuracy_score(y_test, y_pred)
    entry['testing_conf_matrix'] = multilabel_confusion_matrix(
        y_test,
        y_pred,
        samplewise=False,
    )

## Voting classifier

- Use a `hard` voting ensemble of all the classifiers' `best_estimator_`
    - Classifiers should be passed as a list of `(str, estimator)`
    - Train using training data
        - Store the score on the training data as `best_score`
- Evaluate using testing data
    - Store the `balanced_accuracy_score` (average recall obtained on each class to deal with imbalance classes)
    - Store class-wise `multilabel_confusion_matrix`
- Pickle the fitted voting classifier
- Balanced accuracy:
    - Training: $97\%$
    - Testing: $90\%$

In [None]:
# (name, estimator) pairs of every tuned pipeline, as VotingClassifier expects.
best_estimators = [
    (name, entry['best_estimator'])
    for name, entry in classifiers.items()
]

# Majority-vote ensemble over the tuned pipelines.
vc = VotingClassifier(
    estimators=best_estimators,
    voting='hard',
)

print('Fitting VotingClassifier using all best estimators...')
vc.fit(X_train, y_train)

y_pred = vc.predict(X_test)

classifiers['VotingClassifier'] = {

    # `vc.score` would report plain accuracy; use balanced accuracy so this
    # entry is measured with the same metric as every other score stored in
    # `classifiers`. Note that this is computed on the training data, not
    # by cross-validation.
    'best_score': balanced_accuracy_score(
        y_train,
        vc.predict(X_train),
    ),

    'testing_accuracy': balanced_accuracy_score(
        y_test,
        y_pred,
    ),

    'testing_conf_matrix': multilabel_confusion_matrix(
        y_test,
        y_pred,
        samplewise=False,
    )
}

# Persist the fitted ensemble; the prediction section reloads this file.
joblib.dump(
    vc,
    os.path.join('models', 'VotingClassifier.pkl'),
)

### Save results as flat file

- Create a DataFrame of results from all classifiers and save it as a CSV file

In [None]:
# One row per classifier, one column per stored key; written next to the
# pickled models.
results = pd.DataFrame.from_dict(classifiers, orient='index')
results.to_csv(os.path.join('models', 'models.csv'))

# Predict

### Read example JSON file

- Parse out all the features as expected by trained models
    - `category_level_1`: string category
    - `category_level_2`: string category
    - `regulated_product_name`: string
    - `ingredients`: list of strings. Join with '. '
    - `text`: concatenated from `description`, `regulated_product_name`, and `ingredients` with '. '
    - `storage_env`: string category
    - `pack_type`: string category
    - `cooking_type`: a list of categories that only exists if there are cooking types. If it does exist, return the list, otherwise, return a list of 'None'

In [None]:
# Read the UTF-16 encoded sample file, a single JSON document holding a
# list of records.
json_path = os.path.join('data', 'trial-json-products.json')
products = pd.read_json(
    json_path,
    orient='records',
    encoding='utf-16',
    lines=False,
)

# Index by product version id, in ascending order.
df = products.set_index('pvid').sort_index(ascending=True)

In [None]:
def _first_attributes(languages):
    """Return the attributes of the first grouping set of the first language."""
    return languages[0]['groupingSets'][0]['attributes']


# Category levels 1 and 2 are the first and second `categories` entries.
df['category_level_1'] = df['categories'].apply(
    lambda cats: cats[0]['description']
)
df['category_level_2'] = df['categories'].apply(
    lambda cats: cats[1]['description']
)

# Text features taken from the product attributes.
df['regulated_product_name'] = df['languages'].apply(
    lambda langs: _first_attributes(langs)['regulatedProductName']
)
df['ingredients'] = df['languages'].apply(
    lambda langs: '. '.join(_first_attributes(langs)['ingredients'])
)

# Concatenate the non-null text columns into the single `text` feature.
text_columns = ['description', 'regulated_product_name', 'ingredients']
df['text'] = df[text_columns].apply(
    lambda row: '. '.join(row[row.notna()]),
    axis=1,
)

# Single-label categories: the lookup value of the first listed entry.
df['storage_env'] = df['languages'].apply(
    lambda langs: _first_attributes(langs)['storageType'][0]['lookupValue']
)
df['pack_type'] = df['languages'].apply(
    lambda langs: _first_attributes(langs)['packType'][0]['lookupValue']
)


def parse_cooking_guidelines(c):
    """Return the cooking type names of a product as a list of strings.

    Args:
        c: The product's `languages` value: a sequence whose first element
            holds `groupingSets`, whose first element holds `attributes`.

    Returns:
        The `nameValue` of every entry under `cookingGuidelines`, or
        `['None']` when a looked-up key is missing or the list is empty.
    """
    try:
        guidelines = c[0]['groupingSets'][0]['attributes']['cookingGuidelines']
        names = [item['nameValue'] for item in guidelines]
    except KeyError:
        return ['None']

    # An empty list means "no cooking type" just like a missing key does, so
    # map it to the same 'None' placeholder the training data uses instead
    # of producing a sample with no category at all.
    return names or ['None']


# Multi-label category: list of cooking type names per product.
df['cooking_type'] = df['languages'].apply(parse_cooking_guidelines)

# Keep only the feature columns, in this order.
feature_columns = [
    'category_level_1',
    'category_level_2',
    'storage_env',
    'pack_type',
    'cooking_type',
    'text',
]
df = df[feature_columns]

#### - Load label encoder to get label names

In [None]:
# Reload the fitted label encoder so predictions can be mapped back to
# label names.
label_encoder_path = os.path.join('models', 'LabelEncoder.pkl')
le = joblib.load(label_encoder_path)

#### - Load VotingClassifier
- Make predictions
- Transform labels back to original encoding

In [None]:
# Reload the fitted voting ensemble.
voting_classifier_path = os.path.join('models', 'VotingClassifier.pkl')
vc = joblib.load(voting_classifier_path)

# Predict encoded labels for every product, then decode them back to the
# original label names.
encoded_predictions = vc.predict(df)
df['predict'] = le.inverse_transform(encoded_predictions)