## II. PREDICTION

##### 1. Import packages
`pip install transformers torch datasets pandas`

In [None]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel, pipeline
import torch
import random
from datasets import load_dataset

# settings.py
from settings import dataset_name, model_name, finetuned_model_name, finetuned_models_folder

##### 2. Load dataset "emotion" - use test subset

In [None]:
# Load the dataset named in settings.py and keep a handle on its test split.
dataset = load_dataset(dataset_name)
test_split = dataset['test']

# Build the list of label names so that labels[i] is the name of label id i.
# The prediction cell indexes this list with predicted label ids, so it must
# be ordered by id -- not by the order in which the names first appear in the
# test split.
id_to_label_text = dict(zip(test_split['label'], test_split['label_text']))
# An id that never occurs in the test split keeps its slot (shown as the id
# itself) so that later positions stay aligned with their ids.
# NOTE(review): assumes label ids are non-negative integers -- confirm.
labels = [
    str(id_to_label_text.get(label_id, label_id))
    for label_id in range(max(id_to_label_text, default=-1) + 1)
]
print('Labels of "emotion" test dataset:', labels)
print(f'Size of "emotion" test datset: {len(dataset.data["test"])}')

In [None]:
# Print a single test example (text, label id, label name).
print('"""  Example: """')
index = 42
# Fetch the row once; indexing the split by an integer returns a dict of its
# fields, which avoids pulling three whole columns just to read one element each.
example = dataset['test'][index]
print(f"TEXT: '{example['text']}'")
print(f"LABEL: {example['label']}")
print(f"LABEL_TEXT: {example['label_text']}")

##### 3. Load finetuned model from file

In [None]:
# Path of the fine-tuned checkpoint, relative to the current working directory
# (folder and model name both come from settings.py).
finetuned_model_path = f'./{finetuned_models_folder}/{finetuned_model_name}'

# Load the sequence-classification model from that path, using local files only.
finetuned_model = AutoModelForSequenceClassification.from_pretrained(
    finetuned_model_path,
    local_files_only=True,
)

##### 4. Tokenize text (test subset)

In [None]:
# Tokenize random sample sentences from test set
# Read the text column once instead of once per sampled index.
test_texts = dataset['test']['text']
# Never ask for more samples than the split holds (random.sample would raise).
num_samples = min(100, len(test_texts))
sample_index = random.sample(range(len(test_texts)), num_samples)
samples_text = [test_texts[i] for i in sample_index]

# Tokenizer of the base model named in settings.py; pad/truncate so the batch
# can be returned as PyTorch tensors ("pt").
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokens = tokenizer(samples_text, padding=True, truncation=True, return_tensors="pt")

##### 5. Make predictions for test subset

In [None]:
# Inference only: no backpropagation happens here, so run the forward pass
# with gradient tracking disabled.
with torch.no_grad():
    model_output = finetuned_model(**tokens)

In [None]:
# Predicted label ids: index of the largest logit along the last dimension.
pred_labels = model_output.logits.argmax(dim=-1).tolist()
# Look up the name of every predicted id in `labels`.
pred_labels_text = [labels[label_id] for label_id in pred_labels]

In [None]:
# REAL Values and Labels
# Read each column once instead of once per sampled index.
true_label_ids = dataset['test']['label']
true_label_texts = dataset['test']['label_text']

# NOTE: this rebinds `labels`. Up to here it held the list of label names that
# the prediction cell indexes by predicted id; from here on it holds the true
# label ids of the sampled rows. Re-run the dataset cell before re-running the
# prediction cell.
labels = [true_label_ids[i] for i in sample_index]
labels_text = [true_label_texts[i] for i in sample_index]

In [None]:
# Collect sample text, true labels and predicted labels side by side,
# one row per sampled sentence.
df = pd.DataFrame(
    {
        'text': samples_text,
        'real_labels': labels,
        'real_labels_text': labels_text,
        'pred_labels': pred_labels,
        'pred_labels_text': pred_labels_text,
    }
)

##### 6. Show correct/good and incorrect/bad predictions in a DataFrame
###### (Please note that the results below may differ on every run of this notebook, because the samples above are chosen at random.)

In [None]:
# Rows whose predicted label id equals the true label id
df_good_preds = df.query('real_labels == pred_labels')
num_good_preds = df_good_preds.shape[0]
print(f'Number of correct/good predictions: {num_good_preds}')
df_good_preds

In [None]:
# Rows whose predicted label id differs from the true label id
df_bad_preds = df.query('real_labels != pred_labels')
num_bad_preds = df_bad_preds.shape[0]
print(f'Number of incorrect/bad predictions: {num_bad_preds}')
df_bad_preds

In [None]:
# Share of sampled sentences whose label was predicted correctly, in percent.
num_correct = len(df_good_preds.index)
num_evaluated = len(df_bad_preds.index) + num_correct
accuracy_percent = float(num_correct * 100 / num_evaluated)
print(f'Percentage of correctly predicted sample labels: {accuracy_percent} %')