# Style Classifier
Let's predict style from ingredients.
This notebook depends on: 
* `recipe_vecs.h5`, which is generated by `recipe2vec.py`
* `all_recipes.h5`, which is generated by `converter.py`

In [9]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# Import data

In [2]:
# Read the recipe vectors from the HDF5 store, opened read-only.
with pd.HDFStore('recipe_vecs.h5', mode='r') as vec_store:
    recipe_vecs = vec_store['vecs']

In [3]:
# Read the recipe catalog, opened read-only, and keep only the rows whose
# labels appear in recipe_vecs.index so both frames cover the same recipes.
with pd.HDFStore('all_recipes.h5', mode='r') as catalog_store:
    recipe_catalog = catalog_store['core'].loc[recipe_vecs.index]

# Filter data: pick a small number of styles

In [4]:
# Ten most frequent styles (value_counts sorts by descending frequency).
recipe_catalog['style_name'].value_counts().iloc[:10]

american ipa                  30461
american pale ale             24671
specialty beer                11383
imperial ipa                   7781
american amber ale             5799
saison                         5531
american wheat or rye beer     4740
american brown ale             4234
robust porter                  3892
blonde ale                     3700
Name: style_name, dtype: int64

In [218]:
# Restrict the catalog and its vectors to a small set of styles. The
# one-hot-encoding and second scaling sections further down read
# `recipe_catalog_select` / `recipe_vecs_select`, so these definitions must
# run for the notebook to survive Restart Kernel -> Run All.
selected_styles = ['american ipa', 'robust porter']

# Boolean mask over the catalog rows. recipe_catalog was reindexed to
# recipe_vecs.index when it was loaded, so the same mask is used for both.
selected_recipes = recipe_catalog.style_name.isin(selected_styles)

# Copy so that later changes to the subsets never write back into the
# full frames.
recipe_catalog_select = recipe_catalog[selected_recipes].copy()
recipe_vecs_select = recipe_vecs[selected_recipes].copy()

# Easy target variable: is_ipa
* 1 if the style is American IPA
* 0 if not

In [6]:
# Binary target column: 1 where the style is exactly 'american ipa', else 0.
recipe_catalog['is_ipa'] = recipe_catalog['style_name'].eq('american ipa').astype(int)

In [8]:
# Target array for the binary classifier: the 0/1 is_ipa flags.
Y = recipe_catalog['is_ipa'].values

In [10]:
# Scaler for the recipe vectors, using RobustScaler's default settings.
scaler = RobustScaler()

In [11]:
# Learn the scaling statistics from every recipe vector.
# NOTE(review): this fit happens before the train/test split further down,
# so rows that end up in the test set also contribute to the statistics.
# Consider fitting on the training split only.
scaler.fit(recipe_vecs.values)

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)

In [12]:
# Feature matrix: the recipe vectors after applying the fitted scaler.
X = scaler.transform(recipe_vecs.values)

In [13]:
# Show the scaled feature matrix as a quick sanity check.
X

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 3.00000000e+01],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.35217745e-01, 0.00000000e+00, 6.65373064e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [24]:
# IPython help lookup left over from development; it only prints the
# docstring and can be removed from the finished notebook.
train_test_split?

[0;31mSignature:[0m [0mtrain_test_split[0m[0;34m([0m[0;34m*[0m[0marrays[0m[0;34m,[0m [0;34m**[0m[0moptions[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Split arrays or matrices into random train and test subsets

Quick utility that wraps input validation and
``next(ShuffleSplit().split(X, y))`` and application to input data
into a single call for splitting (and optionally subsampling) data in a
oneliner.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
*arrays : sequence of indexables with same length / shape[0]
    Allowed inputs are lists, numpy arrays, scipy-sparse
    matrices or pandas dataframes.

test_size : float, int or None, optional (default=None)
    If float, should be between 0.0 and 1.0 and represent the proportion
    of the dataset to include in the test split. If int, represents the
    absolute number of test samples. If None, the value is set to the
    complement of the train size. If ``train_size`` is

In [14]:
# Hold out a test split for the is_ipa classifier. Fixing random_state makes
# the shuffle, and therefore the score reported afterwards, reproducible
# across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=42)

In [19]:
# Logistic-regression classifier for the is_ipa target.
# NOTE(review): the recorded run of the fit cell still reported that the
# iteration limit was reached with max_iter=1000, so the stored model may not
# be fully converged.
lr = LogisticRegression(max_iter=1000)

In [20]:
# Fit the classifier on the training split only.
lr.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [23]:
# Score the fitted classifier on the held-out split.
lr.score(x_test, y_test)

0.8213395457192778

# One-hot encode categorical labels: style categories

In [222]:
# Encoder that maps style-name strings to integer codes.
le = LabelEncoder()

In [223]:
# Integer-encode the selected recipes' style names and add a trailing axis,
# giving one column with one row per recipe.
encoded_values = le.fit_transform(recipe_catalog_select['style_name'].values)[:, np.newaxis]
encoded_values

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [224]:
# One-hot encoder for the integer style codes.
ohe = OneHotEncoder()

In [225]:
# One-hot encode the integer style codes and display the result.
ohe_labels = ohe.fit_transform(encoded_values)
ohe_labels

<34353x2 sparse matrix of type '<class 'numpy.float64'>'
	with 34353 stored elements in Compressed Sparse Row format>

# Scale inputs, fit to labels

In [206]:
# Fresh scaler for the style-filtered vectors; this rebinds the `scaler`
# name used in the is_ipa section.
scaler = RobustScaler()

In [207]:
# Learn the scaling statistics from the style-filtered vectors.
# NOTE(review): as in the is_ipa section, the fit runs before the train/test
# split, so future test rows contribute to the statistics.
scaler.fit(recipe_vecs_select.values)

RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
             with_scaling=True)

In [208]:
# Scaled features for the selected styles; this overwrites the `X` built in
# the is_ipa section.
X = scaler.transform(recipe_vecs_select.values)
X

array([[0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.70238095e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 1.16415889e-01, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [6.64408860e-01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [2.87742032e+01, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 3.00000000e+01],
       [1.77557540e+00, 0.00000000e+00, 6.65373064e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [246]:
# Dense copy of the sparse one-hot label matrix; this overwrites the is_ipa
# target previously bound to `Y`.
Y = ohe_labels.toarray()

In [238]:
# Hold out a test split of the style-filtered data. Fixing random_state
# makes the shuffle, and therefore the metrics reported afterwards,
# reproducible across kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=42)

In [239]:
# Ordinary least-squares model; this rebinds `lr` from the is_ipa section.
# NOTE(review): the targets here are one-hot class indicators, and the
# recorded R^2 on the held-out split is strongly negative. A classifier is
# probably a better fit for this task; confirm the intent.
lr = LinearRegression()

In [240]:
# Fit the regressor on the training split, with the one-hot labels as targets.
lr.fit(x_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [241]:
# Predictions for the held-out rows, used by the metric cells that follow.
y_pred = lr.predict(x_test)

# Evaluate the model

In [242]:
# R^2 of the held-out predictions against the one-hot targets.
r2_score(y_test, y_pred)

-207.1271140806192

In [243]:
# Mean squared error of the same held-out predictions.
mean_squared_error(y_test, y_pred)

20.5494817161023