# Introduction

This notebook predicts the `beer_style` using a neural network on the PyTorch
framework. It is a modification of the 5_pytorch.ipynb notebook. After 20
epochs, there seems to be still some room for improvement.

The same model is trained again for 60 more epochs.

## Summary
The increase of neurons has **not** improved the model performance. The
[classification report](#Classification-report) shows that the validation
accuracy increased to as high as 31.2%, and the test accuracy remains at 32%.

In [32]:
artefact_prefix = '8_pytorch'
target = 'beer_style'

In [33]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
from dotenv import find_dotenv
from datetime import datetime
import pandas as pd
from pathlib import Path
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from category_encoders.binary import BinaryEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from joblib import dump, load

from src.data.sets import merge_categories
from src.data.sets import save_sets
from src.data.sets import load_sets
from src.data.sets import split_sets_random
from src.data.sets import test_class_exclusion
from src.models.performance import convert_cr_to_dataframe
from src.models.pytorch import PytorchClassification_8
from src.models.pytorch import get_device
from src.models.pytorch import train_classification
from src.models.pytorch import test_classification
from src.models.pytorch import PytorchDataset
from src.models.pipes import create_preprocessing_pipe
from src.visualization.visualize import plot_confusion_matrix

# Set up directories

In [35]:
project_dir = Path(find_dotenv()).parent
data_dir = project_dir / 'data'
raw_data_dir = data_dir / 'raw'
interim_data_dir = data_dir / 'interim'
processed_data_dir = data_dir / 'processed'
reports_dir = project_dir / 'reports'
models_dir = project_dir / 'models'

# Load data

In [36]:
X_train, X_test, X_val, y_train, y_test, y_val = load_sets()

# Preprocess data

1. The `brewery_name` is a feature with a very high cardinality, ~5700. One hot
 encoding is not feasible as it will introduce 5700 very sparse columns.
 Another option is to use binary encoding, which would result in 14 new columns.
1. Standard scaling is used to ensure that the binary columns ([0, 1])and the
review columns ([1, 5]) are on the same scale.

In [37]:
pipe = Pipeline([
    ('bin_encoder', BinaryEncoder(cols=['brewery_name'])),
    ('scaler', StandardScaler())
])

In [38]:
X_train_trans = pipe.fit_transform(X_train)
X_val_trans = pipe.transform(X_val)
X_test_trans = pipe.transform(X_test)

In [39]:
X_train_trans.shape

(951968, 18)

In [40]:
n_features = X_train_trans.shape[1]
n_features

18

In [41]:
n_classes = y_train.nunique()
n_classes

104

## Encoding

PyTorch accepts only numerical labels.

In [42]:
le = LabelEncoder()
y_train_trans = le.fit_transform(y_train.to_frame())
y_val_trans = le.fit_transform(y_val.to_frame())
y_test_trans = le.transform(y_test.to_frame())

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [43]:
y_test_trans

array([98, 89,  2, ..., 37, 94, 98])

## Convert to Pytorch tensors

In [44]:
device = get_device()
device

device(type='cuda', index=0)

In [45]:
train_dataset = PytorchDataset(X=X_train_trans, y=y_train_trans)
val_dataset = PytorchDataset(X=X_val_trans, y=y_val_trans)
test_dataset = PytorchDataset(X=X_test_trans, y=y_test_trans)

# Classification model

In [46]:
model = PytorchClassification_8(n_features=n_features, n_classes=n_classes)
model.to(device)

PytorchClassification_8(
  (layer_1): Linear(in_features=18, out_features=4096, bias=True)
  (batchnorm1): BatchNorm1d(4096, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_2): Linear(in_features=4096, out_features=1024, bias=True)
  (batchnorm2): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_3): Linear(in_features=1024, out_features=256, bias=True)
  (batchnorm3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_4): Linear(in_features=256, out_features=128, bias=True)
  (batchnorm4): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_5): Linear(in_features=128, out_features=64, bias=True)
  (batchnorm5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_out): Linear(in_features=64, out_features=104, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
)

In [21]:
criterion = nn.CrossEntropyLoss()

In [22]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Train the model

In [25]:
N_EPOCHS = 60
BATCH_SIZE = 4096
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

In [24]:
start_time = datetime.now()
print(f'Started: {start_time}')
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion, 
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device,
                                                 scheduler=scheduler)
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion, 
                                                batch_size=BATCH_SIZE, 
                                                device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

end_time = datetime.now()
runtime = end_time - start_time
print(f'Ended: {end_time}')
print(f'Runtime: {runtime}')

Started: 2021-03-14 14:30:13.766633
Epoch: 0
	(train)	Loss: 0.0009	|	Acc: 16.6%
	(valid)	Loss: 0.0008	|	Acc: 23.4%
Epoch: 1
	(train)	Loss: 0.0007	|	Acc: 23.6%
	(valid)	Loss: 0.0007	|	Acc: 26.3%
Epoch: 2
	(train)	Loss: 0.0007	|	Acc: 25.2%
	(valid)	Loss: 0.0007	|	Acc: 27.4%
Epoch: 3
	(train)	Loss: 0.0007	|	Acc: 25.9%
	(valid)	Loss: 0.0007	|	Acc: 27.9%
Epoch: 4
	(train)	Loss: 0.0007	|	Acc: 26.3%
	(valid)	Loss: 0.0006	|	Acc: 28.3%
Epoch: 5
	(train)	Loss: 0.0007	|	Acc: 26.6%
	(valid)	Loss: 0.0006	|	Acc: 28.5%
Epoch: 6
	(train)	Loss: 0.0007	|	Acc: 26.9%
	(valid)	Loss: 0.0006	|	Acc: 28.8%
Epoch: 7
	(train)	Loss: 0.0007	|	Acc: 27.1%
	(valid)	Loss: 0.0006	|	Acc: 28.9%
Epoch: 8
	(train)	Loss: 0.0006	|	Acc: 27.2%
	(valid)	Loss: 0.0006	|	Acc: 29.0%
Epoch: 9
	(train)	Loss: 0.0006	|	Acc: 27.4%
	(valid)	Loss: 0.0006	|	Acc: 29.2%
Epoch: 10
	(train)	Loss: 0.0006	|	Acc: 27.5%
	(valid)	Loss: 0.0006	|	Acc: 29.2%
Epoch: 11
	(train)	Loss: 0.0006	|	Acc: 27.5%
	(valid)	Loss: 0.0006	|	Acc: 29.3%
Epoch: 12
	(tr

In [None]:
N_EPOCHS = 20
BATCH_SIZE = 4096
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)

In [26]:
start_time = datetime.now()
print(f'Started: {start_time}')
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train_classification(train_dataset,
                                                 model=model,
                                                 criterion=criterion,
                                                 optimizer=optimizer,
                                                 batch_size=BATCH_SIZE,
                                                 device=device,
                                                 scheduler=scheduler)
    valid_loss, valid_acc = test_classification(val_dataset,
                                                model=model,
                                                criterion=criterion,
                                                batch_size=BATCH_SIZE,
                                                device=device)

    print(f'Epoch: {epoch}')
    print(f'\t(train)\tLoss: {train_loss:.4f}\t|\tAcc: {train_acc * 100:.1f}%')
    print(f'\t(valid)\tLoss: {valid_loss:.4f}\t|\tAcc: {valid_acc * 100:.1f}%')

end_time = datetime.now()
runtime = end_time - start_time
print(f'Ended: {end_time}')
print(f'Runtime: {runtime}')

Started: 2021-03-14 14:45:36.016408
Epoch: 0
	(train)	Loss: 0.0006	|	Acc: 28.4%
	(valid)	Loss: 0.0006	|	Acc: 30.0%
Epoch: 1
	(train)	Loss: 0.0006	|	Acc: 28.4%
	(valid)	Loss: 0.0006	|	Acc: 30.0%
Epoch: 2
	(train)	Loss: 0.0006	|	Acc: 28.5%
	(valid)	Loss: 0.0006	|	Acc: 30.0%
Epoch: 3
	(train)	Loss: 0.0006	|	Acc: 28.4%
	(valid)	Loss: 0.0006	|	Acc: 30.0%
Epoch: 4
	(train)	Loss: 0.0006	|	Acc: 28.4%
	(valid)	Loss: 0.0006	|	Acc: 30.0%
Epoch: 5
	(train)	Loss: 0.0006	|	Acc: 28.5%
	(valid)	Loss: 0.0006	|	Acc: 30.0%
Epoch: 6
	(train)	Loss: 0.0006	|	Acc: 28.4%
	(valid)	Loss: 0.0006	|	Acc: 30.0%
Epoch: 7
	(train)	Loss: 0.0006	|	Acc: 28.4%
	(valid)	Loss: 0.0006	|	Acc: 30.0%
Epoch: 8
	(train)	Loss: 0.0006	|	Acc: 28.4%
	(valid)	Loss: 0.0006	|	Acc: 30.0%
Epoch: 9
	(train)	Loss: 0.0006	|	Acc: 28.4%
	(valid)	Loss: 0.0006	|	Acc: 30.0%
Epoch: 10
	(train)	Loss: 0.0006	|	Acc: 28.5%
	(valid)	Loss: 0.0006	|	Acc: 30.0%
Epoch: 11
	(train)	Loss: 0.0006	|	Acc: 28.5%
	(valid)	Loss: 0.0006	|	Acc: 30.0%
Epoch: 12
	(tr

# Prediction

In [27]:
# Use the CPU version if the GPU runs out of memory.
# preds = model(test_dataset.X_tensor.to(device)).argmax(1)
model.to('cpu')
preds = model(test_dataset.X_tensor).argmax(1)
preds
model.to(device)

PytorchClassification_8(
  (layer_1): Linear(in_features=18, out_features=256, bias=True)
  (batchnorm1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_2): Linear(in_features=256, out_features=128, bias=True)
  (batchnorm2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layer_out): Linear(in_features=128, out_features=104, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.2, inplace=False)
)

# Evaluation

## Classification report

In [28]:
report = classification_report(y_test, le.inverse_transform(preds.cpu()))
print(report)

  _warn_prf(average, modifier, msg_start, len(result))


                                     precision    recall  f1-score   support

                            Altbier       0.34      0.36      0.35      1521
             American Adjunct Lager       0.55      0.73      0.63      6085
           American Amber / Red Ale       0.19      0.22      0.20      9288
         American Amber / Red Lager       0.32      0.33      0.32      1887
                American Barleywine       0.22      0.03      0.06      5390
                 American Black Ale       0.36      0.06      0.10      2394
                American Blonde Ale       0.20      0.04      0.06      2594
                 American Brown Ale       0.25      0.11      0.15      5066
            American Dark Wheat Ale       0.00      0.00      0.00       296
     American Double / Imperial IPA       0.26      0.35      0.30     17159
 American Double / Imperial Pilsner       0.15      0.03      0.05      1109
   American Double / Imperial Stout       0.35      0.47      0.40     1018

# Save objects for production

## Save model

In [29]:
path = models_dir / f'{artefact_prefix}_model'
torch.save(model, path.with_suffix('.torch'))

## Create pipe object

This is for transforming the input prior to prediction.

In [30]:
X = pd.concat([X_train, X_val, X_test])
prod_pipe = create_preprocessing_pipe(X)

path = models_dir / f'{artefact_prefix}_pipe'
dump(prod_pipe, path.with_suffix('.sav'))

['D:\\git\\assignment_2\\models\\9_pytorch_pipe.sav']

## Save `LabelEncoder`

This is required to get back the name of the name of the `beer_style`.

In [31]:
path = models_dir / f'{artefact_prefix}_label_encoder'
dump(le, path.with_suffix('.sav'))

['D:\\git\\assignment_2\\models\\9_pytorch_label_encoder.sav']