# Introduction

This notebook predicts the `beer_style` using a neural network on the PyTorch
framework. It is a modification of the 2_pytorch.ipynb notebook. The low
frequency `brewery_name`s will be merged into a new category called 'other'.

## Summary
Reducing the number of unique `brewery_name` values did not lead to a model with
better performance. The [classification report](#Classification-report) shows
that the test accuracy is 0.27, a decrease from the previous attempt (0.3).



In [None]:
# Notebook-wide configuration: prefix used when naming saved artefacts,
# and the name of the column the model predicts.
artefact_prefix, target = '3_pytorch', 'beer_style'

In [None]:
# Reload imported modules automatically before each cell executes, so edits
# to the project's `src` package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [48]:
from dotenv import find_dotenv
from datetime import datetime
import pandas as pd
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchviz import make_dot
from torch.utils.data import Dataset, DataLoader
from category_encoders.binary import BinaryEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from joblib import dump, load

from src.data.sets import merge_categories
from src.data.sets import save_sets
from src.data.sets import load_sets
from src.data.sets import split_sets_random
from src.data.sets import test_class_exclusion
from src.models.performance import convert_cr_to_dataframe
from src.models.pytorch import PytorchClassification
from src.models.pytorch import get_device
from src.models.pytorch import train_classification
from src.models.pytorch import test_classification
from src.models.pytorch import PytorchDataset
from src.models.pipes import create_preprocessing_pipe
from src.visualization.visualize import plot_confusion_matrix

# Set up directories

In [None]:
# The project root is the folder containing the .env file located by find_dotenv().
project_dir = Path(find_dotenv()).parent

# Top-level output folders.
models_dir = project_dir.joinpath('models')
reports_dir = project_dir.joinpath('reports')

# Data folders, one per processing stage.
data_dir = project_dir.joinpath('data')
raw_data_dir, interim_data_dir, processed_data_dir = (
    data_dir.joinpath(stage) for stage in ('raw', 'interim', 'processed')
)

# Load processed data

In [None]:
# Read the subset stored in the processed-data folder and preview it.
df = pd.read_parquet(processed_data_dir.joinpath('subset.parquet'))
df

# Merge categories

In [None]:
# Collapse low-frequency brewery names (see the introduction) and inspect the
# resulting category sizes.
# NOTE(review): the exact meaning of `threshold` is defined in
# src.data.sets.merge_categories -- confirm there.
s = merge_categories(df['brewery_name'], threshold=500)
s.value_counts()

In [None]:
# Overwrite the `brewery_name` column of `df` in place with the merged
# categories held in `s`.
df.loc[:, 'brewery_name'] = s

In [None]:
# Relative frequency of each brewery category after the merge.
df.brewery_name.value_counts(normalize=True)

# Split data

In [None]:
# Split the frame into train / validation / test features and targets.
# `to_numpy=False` is requested because later cells call pandas methods
# (e.g. `value_counts`) on the returned targets.
# NOTE(review): no random seed is passed here -- check whether
# split_sets_random fixes one internally, otherwise the split is not reproducible.
(X_train, y_train,
 X_val, y_val,
 X_test, y_test) = split_sets_random(
    df,
    target_col=target,
    test_ratio=0.2,
    to_numpy=False,
)

In [None]:
# Class proportions in the validation targets.
y_val.value_counts(normalize=True)

In [None]:
# Class proportions in the test targets, for comparison with validation.
y_test.value_counts(normalize=True)

In [None]:
# Preview the training features.
X_train

Check for excluded classes.

In [None]:
# Compare the classes present in the three target sets.
# NOTE(review): presumably reports classes missing from one of the splits --
# confirm against src.data.sets.test_class_exclusion.
test_class_exclusion(y_train, y_test, y_val)

# Preprocess data

1. The `brewery_name` is a feature with a very high cardinality, ~5700. One hot encoding is not feasible as it will introduce 5700 very sparse columns. Another option is to use binary encoding, which would result in 14 new columns.
1. Standard scaling is used to ensure that the binary columns ([0, 1]) and the review columns ([1, 5]) are on the same scale.

In [None]:
# Preprocessing: binary-encode `brewery_name`, then standard-scale every
# resulting column.
pipe = Pipeline(
    steps=[
        ('bin_encoder', BinaryEncoder(cols=['brewery_name'])),
        ('scaler', StandardScaler()),
    ]
)

In [None]:
# Fit the preprocessing on the training features only, then apply the fitted
# pipeline unchanged to the validation and test features.
X_train_trans = pipe.fit_transform(X_train)
X_val_trans, X_test_trans = (
    pipe.transform(features) for features in (X_val, X_test)
)

In [None]:
# (rows, columns) of the transformed training matrix.
X_train_trans.shape

In [None]:
# Width of the transformed training matrix; used as the model's input size.
n_features = X_train_trans.shape[-1]
n_features

In [None]:
# Number of distinct target labels seen in training; used as the model's
# output size.
n_classes = y_train.nunique(dropna=True)
n_classes

## Encoding

PyTorch accepts only numerical labels.

In [None]:
# Encode the string labels as integers.
# The encoder is fitted ONCE, on the training labels; validation and test
# labels are only transformed. Re-fitting on another split would rebuild the
# class -> integer mapping from that split alone, so the codes could stop
# matching the ones the model was trained on (and `le.inverse_transform`
# would then decode predictions incorrectly).
# LabelEncoder expects 1-D input, so the Series are passed directly.
le = LabelEncoder()
y_train_trans = le.fit_transform(y_train)
y_val_trans = le.transform(y_val)
y_test_trans = le.transform(y_test)

In [None]:
# Inspect the integer-encoded test labels.
y_test_trans

## Convert to Pytorch tensors

In [None]:
# Select the compute device via the project's get_device() helper and show it.
device = get_device()
device

In [None]:
# Wrap each (features, encoded labels) pair in the project's PytorchDataset.
train_dataset, val_dataset, test_dataset = (
    PytorchDataset(X=features, y=labels)
    for features, labels in (
        (X_train_trans, y_train_trans),
        (X_val_trans, y_val_trans),
        (X_test_trans, y_test_trans),
    )
)

# Classification model

In [None]:
# Seed PyTorch's global RNG before building the network so that weight
# initialisation is repeatable from run to run.
torch.manual_seed(42)

# Classifier sized to the preprocessed feature width and the number of
# training classes.
model = PytorchClassification(n_features=n_features, n_classes=n_classes)

In [None]:
# Move the model to the selected device; as the last expression of the cell,
# the call's return value is displayed.
model.to(device)

In [None]:
# Multi-class loss; takes the integer class indices produced by the label encoder.
criterion = nn.CrossEntropyLoss()

In [None]:
# Adam over all model parameters with an initial learning rate of 1e-3.
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

# Train the model

In [None]:
# Training hyper-parameters.
N_EPOCHS, BATCH_SIZE = 20, 512

# Multiply the learning rate by 0.9 on every scheduler step.
# NOTE(review): how often `scheduler.step()` is called is decided inside
# train_classification -- if it steps per batch rather than per epoch the
# learning rate collapses almost immediately; confirm.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.9)

In [47]:
# Train for N_EPOCHS, evaluating on the validation set after every epoch and
# timing the whole run.
start_time = datetime.now()
print(f'Started: {start_time}')

for epoch in range(N_EPOCHS):
    # One pass over the training data.
    train_loss, train_acc = train_classification(
        train_dataset,
        model=model,
        criterion=criterion,
        optimizer=optimizer,
        batch_size=BATCH_SIZE,
        device=device,
        scheduler=scheduler,
    )

    # Score the current weights on the held-out validation data.
    valid_loss, valid_acc = test_classification(
        val_dataset,
        model=model,
        criterion=criterion,
        batch_size=BATCH_SIZE,
        device=device,
    )

    print(f'Epoch: {epoch}')
    print(f'\t(train)\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

end_time = datetime.now()
runtime = end_time - start_time
print(f'Ended: {end_time}')
print(f'Runtime: {runtime}')

Started: 2021-03-12 20:35:51.281585
Epoch: 0
	(train)	Loss: 0.0054	|	Acc: 24.9%
	(valid)	Loss: 0.0051	|	Acc: 26.8%
Epoch: 1
	(train)	Loss: 0.0054	|	Acc: 24.8%
	(valid)	Loss: 0.0051	|	Acc: 26.8%
Epoch: 2
	(train)	Loss: 0.0054	|	Acc: 24.9%
	(valid)	Loss: 0.0051	|	Acc: 26.9%
Epoch: 3
	(train)	Loss: 0.0054	|	Acc: 24.9%
	(valid)	Loss: 0.0051	|	Acc: 26.9%
Epoch: 4
	(train)	Loss: 0.0054	|	Acc: 24.9%
	(valid)	Loss: 0.0051	|	Acc: 26.9%
Epoch: 5
	(train)	Loss: 0.0054	|	Acc: 24.9%
	(valid)	Loss: 0.0051	|	Acc: 26.9%
Epoch: 6
	(train)	Loss: 0.0054	|	Acc: 24.8%
	(valid)	Loss: 0.0051	|	Acc: 26.9%
Epoch: 7
	(train)	Loss: 0.0054	|	Acc: 25.0%
	(valid)	Loss: 0.0051	|	Acc: 26.9%
Epoch: 8
	(train)	Loss: 0.0054	|	Acc: 25.0%
	(valid)	Loss: 0.0051	|	Acc: 26.9%
Epoch: 9
	(train)	Loss: 0.0054	|	Acc: 25.0%
	(valid)	Loss: 0.0051	|	Acc: 27.0%
Epoch: 10
	(train)	Loss: 0.0054	|	Acc: 25.0%
	(valid)	Loss: 0.0051	|	Acc: 27.0%
Epoch: 11
	(train)	Loss: 0.0054	|	Acc: 25.0%
	(valid)	Loss: 0.0051	|	Acc: 27.0%
Epoch: 12
	(tr

# Prediction

In [45]:
# Predict the class index for every test row.
# Inference is run in evaluation mode (so layers such as dropout or batch-norm,
# if the model has any, behave deterministically) and without gradient
# tracking (no autograd graph is built, which saves memory on the full test
# set). The model's previous train/eval mode is restored afterwards so that
# re-running the training cell is unaffected.
was_training = model.training
model.eval()
with torch.no_grad():
    preds = model(test_dataset.X_tensor.to(device)).argmax(1)
model.train(was_training)
preds

tensor([25, 18,  9,  ..., 65, 47, 25], device='cuda:0')

# Evaluation

## Classification report

In [46]:
# Decode the predicted class indices back to label names and build the
# per-class report.
# The predictions are copied to host memory and converted to a NumPy array
# before being handed to scikit-learn. `zero_division=0` keeps the reported
# value (0.0) for classes that were never predicted, but without emitting an
# undefined-metric warning for each of them.
y_pred = le.inverse_transform(preds.cpu().numpy())
report = classification_report(y_test, y_pred, zero_division=0)
print(report)

  _warn_prf(average, modifier, msg_start, len(result))


                                     precision    recall  f1-score   support

                            Altbier       0.33      0.33      0.33      1521
             American Adjunct Lager       0.27      0.75      0.40      6085
           American Amber / Red Ale       0.19      0.20      0.19      9288
         American Amber / Red Lager       0.30      0.30      0.30      1887
                American Barleywine       0.23      0.02      0.04      5390
                 American Black Ale       0.41      0.05      0.08      2394
                American Blonde Ale       0.20      0.01      0.02      2594
                 American Brown Ale       0.25      0.07      0.11      5066
            American Dark Wheat Ale       0.00      0.00      0.00       296
     American Double / Imperial IPA       0.27      0.32      0.29     17159
 American Double / Imperial Pilsner       0.16      0.01      0.01      1109
   American Double / Imperial Stout       0.34      0.47      0.40     1018

# Save objects for production

## Save model

In [54]:
# Persist the whole model object under the models folder.
# NOTE(review): `torch.save(model, ...)` pickles the full module, so loading it
# requires the same class definition to be importable; saving
# `model.state_dict()` is the more portable option if the loader can be changed.
path = models_dir.joinpath(f'{artefact_prefix}_model')
torch.save(model, path.with_suffix('.torch'))

## Create pipe object

This is for transforming the input prior to prediction.

In [55]:
# Build the production preprocessing pipe from all available features
# (train + validation + test) and persist it next to the model.
# NOTE(review): the model above was trained on data transformed by `pipe`,
# which was fitted on X_train only. If create_preprocessing_pipe fits an
# encoder/scaler on `X`, the saved pipe may not reproduce the training-time
# transformation exactly -- confirm against src.models.pipes.
X = pd.concat(objs=[X_train, X_val, X_test])
prod_pipe = create_preprocessing_pipe(X)

path = models_dir.joinpath(f'{artefact_prefix}_pipe')
dump(prod_pipe, path.with_suffix('.sav'))

['D:\\git\\assignment_2\\models\\3_pytorch_pipe.sav']

## Save `LabelEncoder`

This is required to get back the name of the `beer_style` from the predicted class index.

In [56]:
# Persist the fitted label encoder so predicted class indices can be mapped
# back to label names at prediction time.
path = models_dir.joinpath(f'{artefact_prefix}_label_encoder')
dump(le, path.with_suffix('.sav'))

['D:\\git\\assignment_2\\models\\3_pytorch_label_encoder.sav']