In [None]:
# Show the working directory: every data and model path below is relative to it.
# Plain Python instead of the bare `pwd` automagic, which depends on IPython's
# automagic setting and is shadowed by any variable named `pwd`.
import os
os.getcwd()

In [None]:
# List the working directory to check that the expected files are present.
# Plain Python instead of the bare `ls` alias, which depends on IPython's
# automagic setting and is shadowed by any variable named `ls`.
import os
sorted(os.listdir())

In [None]:
import gc
import pandas as pd
import numpy as np
from glob import glob
from transformers import CamembertTokenizer
import torch
from torch import nn
%matplotlib inline
import json
from tqdm import tqdm
from sklearn import metrics

from model import BertPunc
from data import load_file, preprocess_data, create_data_loader

### Model path

In [None]:
# List the entries under models/ so a run directory can be picked for `path` below.
glob('models/*')

In [None]:
# Directory of the run to evaluate; the weights are read from path + 'model' further down.
# The trailing slash matters because the file name is appended by plain string concatenation.
path = 'models/20190418_211742/'

### Prepare

In [None]:
# Load the text the model is scored on.
# NOTE(review): the file name suggests a subset of the *training* data - confirm this is
# the intended evaluation set, since scores measured on training data would be optimistic.
data_test = load_file('train_clean_subset.txt')
# Disabled ASR test set; the "Test ASR" cells at the bottom need it (and the matching
# preprocess / data-loader lines) re-enabled before they can produce results.
#data_test_asr = load_file('/home/stanfous/datasets/punctuation_model/donald_trump_original_test.txt')

In [None]:
#with open(path+'hyperparameters.json', 'r') as f:
#    hyperparameters = json.load(f)
#hyperparameters

In [None]:
# Tokenizer for the pretrained 'camembert-base' checkpoint.
# NOTE(review): confirm that CamembertTokenizer actually honours do_lower_case.
tokenizer = CamembertTokenizer.from_pretrained('camembert-base', do_lower_case=True)

# Punctuation label -> class id, numbered in listing order.
# 'O' (id 0) is the class left out of the per-class metrics computed later in this
# notebook - presumably "no punctuation after this token".
punctuation_enc = {
    label: class_id
    for class_id, label in enumerate((
        'O',
        ',COMMA',
        '.PERIOD',
        '?QUESTIONMARK',
        ':COLON',
        '!EXCLAMATIONMARK',
        ';SEMICOLON',
    ))
}

# Two-class variant used as an alternative: {'O': 0, 'PERIOD': 1}

# Segment length handed to preprocess_data and BertPunc. Hard-coded here because loading
# hyperparameters.json (hyperparameters['segment_size']) is currently disabled.
segment_size = 32

In [None]:
# Turn the loaded text into model inputs (X_test) and punctuation labels (y_test).
# segment_size is passed through to the preprocessing - see data.preprocess_data for the
# exact layout of the produced segments (not visible from this notebook).
X_test, y_test = preprocess_data(data_test, tokenizer, punctuation_enc, segment_size)
# ASR counterpart, disabled together with data_test_asr.
#X_test_asr, y_test_asr = preprocess_data(data_test_asr, tokenizer, punctuation_enc, segment_size)

In [None]:
# Sanity check: dimensions of the preprocessed inputs.
X_test.shape

In [None]:
# Sanity check: number of labels (expected to line up with X_test - verify against the shape above).
len(y_test)

In [None]:
# One output class per entry of the punctuation encoding.
output_size = len(punctuation_enc)
# Dropout rate handed to BertPunc; it has no effect at inference once the model is in
# eval mode, but the constructor requires it.
dropout = 0.3
# Build the bare model on the CPU (no nn.DataParallel / .cuda() wrapper) so the notebook
# also runs on machines without a GPU; the prediction loop below feeds CPU tensors.
bert_punc = BertPunc(segment_size, output_size, dropout)

### Model progress

In [None]:
# Training progress log, a ';'-separated CSV.
# NOTE(review): it is read from the working directory, not from `path` - confirm that
# this is the log belonging to the checkpoint evaluated below.
progress = pd.read_csv('progress.csv', sep=';')
progress

In [None]:
# Plot the 'training loss' and 'loss' columns of the progress log ('loss' is presumably
# the validation loss - confirm). The trailing ';' suppresses the Axes repr.
progress[['training loss', 'loss']].plot();

In [None]:
# Plot accuracy and the per-class F1 columns of the progress log.
# NOTE(review): these column names must match the header of progress.csv - verify.
progress[['accuracy', 'f1_space', 'f1_comma', 'f1_period', 'f1_question']].plot();
# Variant for a run logged with the two-label encoding:
# progress[['accuracy', 'f1_O', 'f1_PERIOD']].plot();

### Model evaluation

In [None]:
# Deserialize onto the CPU: a checkpoint written on a GPU machine otherwise fails to load
# on a host without CUDA, and the model in this notebook lives on the CPU.
# torch.load unpickles the file, so only point `path` at checkpoints you trust.
state_dict = torch.load(path+'model', map_location='cpu')

# A checkpoint saved from an nn.DataParallel wrapper prefixes every key with 'module.'.
# Strip the prefix when loading into a bare (unwrapped) model so the keys match.
if (not isinstance(bert_punc, nn.DataParallel)
        and state_dict
        and all(key.startswith('module.') for key in state_dict)):
    state_dict = {key[len('module.'):]: value for key, value in state_dict.items()}

bert_punc.load_state_dict(state_dict)
# Switch to inference mode (disables dropout); the returned module is shown as cell output.
bert_punc.eval()

In [None]:
# Batch the test set for inference.
# The third positional argument (False) is presumably the shuffle flag - confirm in
# data.create_data_loader.
batch_size = 16
data_loader_test = create_data_loader(X_test, y_test, False, batch_size)
# ASR counterpart, disabled together with X_test_asr / y_test_asr.
#data_loader_test_asr = create_data_loader(X_test_asr, y_test_asr, False, batch_size)

In [None]:
def predictions(data_loader, model=None):
    """Run a model over ``data_loader`` and collect predicted and true class ids.

    Args:
        data_loader: Sized iterable yielding ``(inputs, labels)`` tensor batches.
        model: Module to run. Defaults to the notebook-level ``bert_punc``. The model
            is not switched to eval mode here; call ``.eval()`` on it beforehand.

    Returns:
        ``(y_pred, y_true)``: two flat lists with one entry per label - the argmax over
        dimension 1 of the model output, and the matching label from the loader.
    """
    if model is None:
        model = bert_punc
    # Feed each batch on whatever device the model lives on instead of assuming CPU.
    device = next(model.parameters()).device
    y_pred = []
    y_true = []
    # Inference only: a single no_grad context around the whole loop.
    with torch.no_grad():
        for inputs, labels in tqdm(data_loader, total=len(data_loader)):
            output = model(inputs.to(device))
            # Bring results back to the CPU before converting to numpy.
            y_pred.extend(output.argmax(dim=1).cpu().numpy().flatten())
            y_true.extend(labels.cpu().numpy().flatten())
    return y_pred, y_true

In [None]:
def evaluation(y_pred, y_test, enc=None):
    """Build a precision / recall / F1 table for every punctuation class.

    The class with id 0 is excluded from both the per-class columns and the macro
    average. The scored labels and the column names are derived from the encoding, so
    the function works for any label set rather than only the seven-class one.

    Args:
        y_pred: Predicted class ids.
        y_test: True class ids.
        enc: Mapping of label name -> class id. Defaults to the notebook-level
            ``punctuation_enc``.

    Returns:
        DataFrame indexed by 'Precision', 'Recall', 'F1' with one column per scored
        label (named after its key in ``enc``) plus an 'OVERALL' column holding the
        macro average over those labels.
    """
    if enc is None:
        enc = punctuation_enc
    # Keep name and id paired so the columns cannot drift out of step with the labels.
    scored = [(name, class_id) for name, class_id in enc.items() if class_id != 0]
    names = [name for name, _ in scored]
    labels = [class_id for _, class_id in scored]

    precision, recall, f1, _ = metrics.precision_recall_fscore_support(
        y_test, y_pred, average=None, labels=labels)
    overall = metrics.precision_recall_fscore_support(
        y_test, y_pred, average='macro', labels=labels)
    result = pd.DataFrame(
        np.array([precision, recall, f1]),
        columns=names,
        index=['Precision', 'Recall', 'F1']
    )
    # The macro result is (precision, recall, f1, support); support is dropped.
    result['OVERALL'] = overall[:3]
    return result

In [None]:
# def evaluation(y_pred, y_test):
#     precision, recall, f1, _ = metrics.precision_recall_fscore_support(
#         y_test, y_pred, average=None, labels=[1])
#     overall = metrics.precision_recall_fscore_support(
#         y_test, y_pred, average='macro', labels=[1])
#     result = pd.DataFrame(
#         np.array([precision, recall, f1]), 
#         columns=list(punctuation_enc.keys())[1:], 
#         index=['Precision', 'Recall', 'F1']
#     )
#     result['OVERALL'] = overall[:3]
#     return result

#### Test

In [None]:
# Force a garbage-collection pass before running inference; the number shown as cell
# output is gc.collect()'s return value.
gc.collect()

In [None]:
# Run inference over the whole test loader (a tqdm progress bar is shown while it runs).
y_pred_test, y_true_test = predictions(data_loader_test)

In [None]:
# Precision / recall / F1 table for the test set; displayed as the cell output.
eval_test = evaluation(y_pred_test, y_true_test)
eval_test

In [None]:
# https://www.isca-speech.org/archive/Interspeech_2016/pdfs/1517.PDF
ref_test = pd.DataFrame({
    'COMMA':    [0.655, 0.471, 0.548],
    'PERIOD':   [0.733, 0.725, 0.729],
    'QUESTION': [0.707, 0.630, 0.667],
    'OVERALL':  [0.700, 0.597, 0.644]
}, index=['Precision', 'Recall', 'F1'])
ref_test

In [None]:
# One bar chart per reference column, comparing the reference scores with BertPunc's.
#
# The reference table names its columns 'COMMA', 'PERIOD', 'QUESTION', whereas
# evaluation() names its columns after the punctuation_enc keys (',COMMA', '.PERIOD',
# '?QUESTIONMARK'); indexing eval_test with the reference name raises KeyError, so the
# names are translated first.
# NOTE(review): eval_test's OVERALL is a macro average over every class evaluation()
# scores, while the reference lists only three classes - the OVERALL bars are presumably
# not a like-for-like comparison.
ref_to_eval_col = {
    'COMMA': ',COMMA',
    'PERIOD': '.PERIOD',
    'QUESTION': '?QUESTIONMARK',
    'OVERALL': 'OVERALL',
}
for col in ref_test.columns:
    # Prefer the translated name, fall back to the reference name as-is.
    eval_col = next(
        (c for c in (ref_to_eval_col.get(col), col) if c in eval_test.columns), None)
    if eval_col is None:
        print('No BertPunc scores for {!r}; skipping its plot.'.format(col))
        continue
    pd.DataFrame({'Reference': ref_test[col], 'BertPunc': eval_test[eval_col]}).plot.bar(
        title=col, figsize=(12, 4))

#### Test ASR

In [None]:
# Evaluate on the ASR test set - only when its data loader exists. The lines that build
# data_test_asr / X_test_asr / data_loader_test_asr are commented out earlier in the
# notebook, so without this guard a full run stops here with a NameError.
eval_test_asr = None
if 'data_loader_test_asr' in globals():
    y_pred_test_asr, y_true_test_asr = predictions(data_loader_test_asr)
    eval_test_asr = evaluation(y_pred_test_asr, y_true_test_asr)
else:
    print('Skipping ASR evaluation: data_loader_test_asr is not defined '
          '(re-enable the ASR data cells to run it).')
# Shows the score table, or nothing when the evaluation was skipped.
eval_test_asr

In [None]:
# https://www.isca-speech.org/archive/Interspeech_2016/pdfs/1517.PDF
ref_test_asr = pd.DataFrame({
    'COMMA':    [0.596, 0.429, 0.499],
    'PERIOD':   [0.707, 0.720, 0.714],
    'QUESTION': [0.607, 0.486, 0.540],
    'OVERALL':  [0.660, 0.573, 0.614]
}, index=['Precision', 'Recall', 'F1'])
ref_test_asr

In [None]:
# One bar chart per reference column, comparing the ASR reference scores with BertPunc's.
#
# The reference table names its columns 'COMMA', 'PERIOD', 'QUESTION', whereas
# evaluation() names its columns after the punctuation_enc keys (',COMMA', '.PERIOD',
# '?QUESTIONMARK'); indexing with the reference name raises KeyError, so the names are
# translated first.
ref_to_eval_col = {
    'COMMA': ',COMMA',
    'PERIOD': '.PERIOD',
    'QUESTION': '?QUESTIONMARK',
    'OVERALL': 'OVERALL',
}
# The ASR scores only exist when the ASR evaluation actually ran.
asr_scores = globals().get('eval_test_asr')
if asr_scores is None:
    print('Skipping ASR plots: eval_test_asr is not available.')
else:
    for col in ref_test_asr.columns:
        # Prefer the translated name, fall back to the reference name as-is.
        eval_col = next(
            (c for c in (ref_to_eval_col.get(col), col) if c in asr_scores.columns), None)
        if eval_col is None:
            print('No BertPunc scores for {!r}; skipping its plot.'.format(col))
            continue
        pd.DataFrame({'Reference': ref_test_asr[col], 'BertPunc': asr_scores[eval_col]}).plot.bar(
            title=col, figsize=(12, 4))