## II. PREDICTION

##### 1. Import packages
`pip install transformers torch datasets pandas`

In [28]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel, pipeline
import torch
import random
from datasets import load_dataset

# settings.py
from settings import dataset_name, model_name, finetuned_model_name, finetuned_models_folder

##### 2. Load dataset "emotion" - use test subset

In [29]:
dataset = load_dataset(dataset_name)
labels = pd.Series(list(dataset.data['test'].columns[2])).unique().astype(str).tolist()
print('Labels of "emotion" test dataset:', labels)
print(f'Size of "emotion" test datset: {len(dataset.data["test"])}')

Using custom data configuration SetFit--emotion-115c273ec307f160
Reusing dataset json (C:\Users\Portfolio-Resolution\.cache\huggingface\datasets\SetFit___json\SetFit--emotion-115c273ec307f160\0.0.0\a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253)


  0%|          | 0/3 [00:00<?, ?it/s]

Labels of "emotion" test dataset: ['sadness', 'joy', 'fear', 'anger', 'love', 'surprise']
Size of "emotion" test datset: 2000


In [30]:
print('"""  Example: """')
index = 42
print(f"TEXT: '{dataset['test']['text'][index]}'")
print(f"LABEL: {dataset['test']['label'][index]}")
print(f"LABEL_TEXT: {dataset['test']['label_text'][index]}")

"""  Example: """
TEXT: 'i feel reassured that i am dealing with my diet in the right way and that all is good'
LABEL: 1
LABEL_TEXT: joy


##### 3. Load finetuned model from file

In [31]:
finetuned_model = AutoModelForSequenceClassification.from_pretrained(pretrained_model_name_or_path=f'./{finetuned_models_folder}/{finetuned_model_name}',local_files_only=True)

##### 4. Tokenize text (test subset)

In [32]:
# Tokenize random sample sentences from test set
num_samples = 100
sample_index = random.sample(range(len(dataset.data["test"])), num_samples)
samples_text = [dataset['test']['text'][x] for x in sample_index]
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokens = tokenizer(samples_text, padding=True, truncation=True, return_tensors="pt")

##### 5. Make predictions for test subset

In [33]:
# IMPORTANT: DO NOT TRAIN FOR PREDICTION - FREEZE MODEL - NO BACKPROPAGATION:
with torch.no_grad():
    # Make prediction:
    model_output = finetuned_model(**tokens)

In [34]:
# PREDICTION Values and Labels
pred_labels = torch.argmax(model_output.logits, axis=-1).numpy().tolist()
pred_labels_text = [labels[x] for x in pred_labels]

In [35]:
# REAL Values and Labels
labels = [dataset['test']['label'][x] for x in sample_index]
labels_text = [dataset['test']['label_text'][x] for x in sample_index]

In [36]:
# Put everything into a pandas DataFrame
df = pd.DataFrame(data={'text': samples_text, 'real_labels': labels,'real_labels_text': labels_text,'pred_labels':pred_labels, 'pred_labels_text':pred_labels_text})

##### 6. Show correct/good and incorrect/bad predictions in a DataFrame
###### (Please note that this might change for every run of this notebook depending on the randomly chosen samples above)

In [37]:
# Good predictions
df_good_preds = df.query('real_labels == pred_labels')
print(f'Number of correct/good predictions: {len(df_good_preds.index)}')
df_good_preds

Number of correct/good predictions: 96


Unnamed: 0,text,real_labels,real_labels_text,pred_labels,pred_labels_text
0,i could feel my mother s sympathetic dread as ...,2,love,2,fear
1,i feel like an emotional cutter,0,sadness,0,sadness
2,i have to admit that i feel the teensiest bit ...,3,anger,3,anger
3,i forgive myself that i have accepted and allo...,0,sadness,0,sadness
4,i think itd be easier if i had parents that ar...,3,anger,3,anger
...,...,...,...,...,...
95,i am most certainly an acquired taste but late...,3,anger,3,anger
96,i feel it is really valuable to contemplate on...,1,joy,1,joy
97,i feel like a may have mislead the very gracio...,1,joy,1,joy
98,i left with my bouquet of red and yellow tulip...,1,joy,1,joy


In [38]:
# Bad predictions
df_bad_preds = df.query('real_labels != pred_labels')
print(f'Number of incorrect/bad predictions: {len(df_bad_preds.index)}')
df_bad_preds

Number of incorrect/bad predictions: 4


Unnamed: 0,text,real_labels,real_labels_text,pred_labels,pred_labels_text
32,ive been soo excited for him to feel and it wa...,1,joy,5,surprise
45,i feel i can only hope im not alone in these t...,0,sadness,1,joy
53,i feel blessed that i have people in my life w...,2,love,1,joy
85,i did feel slightly weird in that costume,4,fear,5,surprise


In [39]:
print(f'Percentage of correctly predicted sample labels: {float(len(df_good_preds.index)*100/(len(df_bad_preds.index) + len(df_good_preds.index)))} %')

Percentage of correctly predicted sample labels: 96.0 %
