In [103]:
from transformers import BertForSequenceClassification
from scipy.special import softmax
import pandas as pd
from IPython.display import Image
from transformers import BertTokenizer
from scipy.special import softmax
from IPython.display import Image
from datasets import load_dataset
import torch
import numpy as np
from openprompt import PromptDataLoader, PromptForClassification
from openprompt.data_utils import InputExample
from openprompt.plms import load_plm
from openprompt.prompts import ManualTemplate, ManualVerbalizer, ManualTemplate

In [104]:
def forward(model, encoding, n):
    """Run a classification model and return its ``n`` highest-scoring classes.

    Args:
        model: Callable invoked as ``model(**encoding)``; the returned object
            must expose a ``logits`` tensor.
        encoding: Mapping of keyword arguments forwarded to ``model``.
        n: Number of leading entries to keep.

    Returns:
        A list of at most ``n`` ``(class_index, percentage)`` tuples ordered
        by descending percentage. Only the first row of the logits is used.
    """
    first_row_logits = model(**encoding).logits.detach().numpy()[0]
    # Softmax turns the logits into probabilities; scale them to percent.
    percentages = softmax(first_row_logits) * 100
    ranked = sorted(enumerate(percentages), key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

def predict(model, inference_text, tokenizer, n):
    """Classify one text with a prompt-based model and return its top ``n`` classes.

    The text is wrapped in an ``InputExample`` and fed through a
    ``PromptDataLoader`` holding that single example. The loader is built
    from the module-level ``promptTemplate`` and ``WrapperClass``, which are
    looked up when the function is called.

    Args:
        model: Callable invoked with each batch produced by the loader; it
            must return a logits tensor.
        inference_text: Text to classify.
        tokenizer: Tokenizer handed to the ``PromptDataLoader``.
        n: Number of leading entries to keep.

    Returns:
        A list of at most ``n`` ``(class_index, percentage)`` tuples ordered
        by descending percentage, computed from the first row of the logits
        of the last batch.
    """
    example = InputExample(text_a=inference_text)
    loader = PromptDataLoader(
        dataset=[example],
        template=promptTemplate,
        tokenizer=tokenizer,
        tokenizer_wrapper_class=WrapperClass,
        max_seq_length=250,
        decoder_max_length=3,
        batch_size=1,
        shuffle=False,
        teacher_forcing=False,
        predict_eos_token=False,
        truncate_method="head",
    )

    # The dataset holds a single example, so this loop yields one batch.
    for batch in loader:
        logits = model(batch)

    # Going through a plain Python list makes softmax run on float64 values.
    first_row = logits.detach().numpy().tolist()[0]
    percentages = softmax(first_row) * 100
    ranked = sorted(enumerate(percentages), key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

def create_input_text_list(input_text):
    """Wrap ``input_text`` into display lines made of whole words.

    The text is split on whitespace and regrouped so that it spans roughly
    three lines, with every line holding at least ten words (the last line
    may be shorter). Runs of whitespace collapse to single spaces.

    Compared with the previous implementation this version
    * keeps the word that triggers a line break (it used to be dropped),
    * always returns a flat list of strings (short inputs used to yield
      ``[[], text]``, which broke string concatenation in callers),
    * decides on wrapping by word count only instead of mixing character
      and word counts, and
    * no longer prints the computed line width as a debugging side effect.

    Args:
        input_text: The text to wrap.

    Returns:
        A non-empty list of strings, one per display line. Empty or
        whitespace-only input yields ``[""]``.
    """
    min_words_per_line = 10
    target_line_count = 3

    words = input_text.split()
    if not words:
        # Keep one (empty) line so callers can always render a first row.
        return [""]

    # Aim for ``target_line_count`` lines, but never go below the minimum
    # width so that short texts stay on a single line.
    words_per_line = max(min_words_per_line, len(words) // target_line_count)
    return [
        " ".join(words[start:start + words_per_line])
        for start in range(0, len(words), words_per_line)
    ]

def create_pretty_string(model_names, input_text, model_labels, top_n):
    """Render one input text and the labels of every model as a text table.

    Layout: a ``######`` banner, a header row (``Input`` followed by one
    column per model name), a horizontal rule, the body rows and a closing
    ``######`` banner. Every column is padded to its widest cell and cells
    are separated by ``"  |  "``.

    Compared with the previous implementation this version
    * renders a blank cell instead of raising ``IndexError`` when a model
      supplies fewer than ``top_n`` labels,
    * emits extra rows when the wrapped input text has more than ``top_n``
      lines instead of silently cutting the text off,
    * sizes the horizontal rule to the header row (it used to include the
      banner line and newline in its length), and
    * computes the column widths once instead of once per row.

    Args:
        model_names: Column titles, one per model.
        input_text: Text shown in the first column; it is wrapped with
            ``create_input_text_list``.
        model_labels: One list of label strings per model, parallel to
            ``model_names``.
        top_n: Maximum number of labels shown per model.

    Returns:
        The table as a single string ending in a newline.
    """
    border = "  |  "
    input_title = "Input"
    banner = "######\n"

    input_lines = create_input_text_list(input_text)
    first_column_width = max(len(line) for line in input_lines + [input_title])
    # Each model column is as wide as its longest label or its title.
    column_widths = [
        max(len(cell) for cell in model_labels[idx] + [name])
        for idx, name in enumerate(model_names)
    ]

    header = input_title.ljust(first_column_width)
    for name, width in zip(model_names, column_widths):
        header += border + name.ljust(width)
    header += border

    parts = [banner, header + "\n", "\u2500" * len(header) + "\n"]

    # Enough rows for both the requested labels and every wrapped text line.
    row_count = max(top_n, len(input_lines))
    for row_idx in range(row_count):
        line = input_lines[row_idx] if row_idx < len(input_lines) else ""
        row = line.ljust(first_column_width) + border
        for labels, width in zip(model_labels, column_widths):
            has_label = row_idx < top_n and row_idx < len(labels)
            cell = labels[row_idx] if has_label else ""
            row += cell.ljust(width) + border
        parts.append(row + "\n")

    parts.append(banner)
    return "".join(parts)

def pretty_inference(model_list, model_names, input_list, tokenizer_list, top_n):
    """Run every model on every input text and render the results as text tables.

    Models whose name contains ``"prompting"`` are evaluated with ``predict``;
    all others are tokenized and evaluated with ``forward``. Predicted class
    indices are translated to names through the module-level ``mappings``
    DataFrame (columns ``"index"`` and ``"name"``).

    Args:
        model_list: Models to evaluate.
        model_names: Display names, parallel to ``model_list``.
        input_list: Texts to classify.
        tokenizer_list: Tokenizers, parallel to ``model_list``.
        top_n: Number of predictions reported per model.

    Returns:
        One table per input text (see ``create_pretty_string``), each followed
        by a blank line, concatenated into a single string.
    """
    # ``clear_output`` was called without ever being imported, so the function
    # raised NameError only after all inference had finished. Importing it
    # here keeps the fix local to this function.
    from IPython.display import clear_output

    rendered = ""
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for input_text in input_list:
            model_labels = [[] for _ in model_list]
            for idx, model in enumerate(model_list):
                tokenizer = tokenizer_list[idx]
                if "prompting" in model_names[idx]:
                    predictions = predict(model, input_text, tokenizer, top_n)
                else:
                    encoding = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
                    predictions = forward(model, encoding, top_n)
                for class_index, percentage in predictions:
                    # First mapping row whose "index" equals the predicted class.
                    pk_name = mappings.loc[mappings["index"] == class_index, "name"].values[0]
                    model_labels[idx].append(f"{pk_name}:{percentage:.2f}%")
            rendered += create_pretty_string(model_names, input_text, model_labels, top_n)
            rendered += "\n"
    # Drop whatever was written to the cell output while running inference.
    clear_output()
    return rendered

In [105]:
# Dependencies
# load_plm returns four objects; they are unpacked here and reused by the
# prompt template, verbalizer, model and data loader further down.
plm, prompt_tokenizer, model_config, WrapperClass = load_plm("gpt2", "gpt2")

# Mapping table; the notebook reads its "index" and "name" columns.
mappings = pd.read_csv('data/pokemon_mapping.csv')
# NOTE: despite its name this dict maps index -> name.
name_to_label_dict = mappings.set_index('index')["name"].to_dict()

pokemon_descriptions = load_dataset('data/dataset/', delimiter=';')
# NOTE: despite its name this is the array of distinct label values found in
# the train split, not a count.
NUM_CLASSES = np.unique(pokemon_descriptions['train']['labels'])

Using pad_token, but it is not set yet.
Using custom data configuration dataset-294e9b13f49dafc6
Found cached dataset csv (C:/Users/fst/.cache/huggingface/datasets/csv/dataset-294e9b13f49dafc6/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

In [106]:
# Prompt: the input text followed by a cloze phrase whose mask is predicted.
promptTemplate = ManualTemplate(
    text = '{"placeholder":"text_a"} the pokemon is {"mask"}',
    tokenizer = prompt_tokenizer,
)

# Verbalizer: built from the class labels and the index -> name dict.
promptVerbalizer = ManualVerbalizer(
    classes = NUM_CLASSES,
    label_words = name_to_label_dict,
    tokenizer = prompt_tokenizer,
)

promptLoadedModel = PromptForClassification(
    template = promptTemplate,
    plm = plm,
    verbalizer = promptVerbalizer,
    freeze_plm= True
)

# map_location="cpu" lets a checkpoint that was saved from a GPU load on a
# CPU-only machine; the inference helpers in this notebook call .numpy() on
# the outputs and therefore run on CPU anyway.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from
# a trusted source.
promptLoadedModel.load_state_dict(
    state_dict=torch.load("prompting/checkp_copy/gpt2_trained_model.cp", map_location="cpu")
)

<All keys matched successfully>

In [None]:
# Mapping table used by pretty_inference to turn class indices into names.
mappings = pd.read_csv('data/pokemon_mapping.csv')

# Models and tokenizers are parallel lists: position i of one belongs to
# position i of the other (and to the i-th display name passed below).
model = [
    BertForSequenceClassification.from_pretrained("saved-model-base/"),
    BertForSequenceClassification.from_pretrained("saved-model/"),
    promptLoadedModel,
]
tokenizer = [
    BertTokenizer.from_pretrained("saved-model-base/"),
    BertTokenizer.from_pretrained("saved-model/"),
    prompt_tokenizer,
]

# Free-text descriptions to classify.
input_text = [
    "Walking stone monster with a huge body.",
    "Walking stone monster with a huge body. It hates water.",
    "Walking stone monster with a huge body. It hates water. Favorit attack is earthshake",
    "Insect with sharp claws only found in the safari zone",
    "only wakes up to eat",
    "A rock pokemon which looks like a stone snake",
    "A stone like snake",
    "The pokemon has a small Flower on the head and likes to sing. During the night it is sleeping.",
    "Many believe that all other Pokémon are descendants of this one",
    "It was the result of various experiments of team rocket",
    "A snake dragon like pokemon with a long tail. It is an higher evolution and is really strong. One of the top five is using this pokemon",
    "It is yellow and it's cheeks have red circles. It has long ears and likes thunder. Ash is his best friend",
    "A psychic pokemon with spoons",
    "Red legendary dragon with fire",
]

# The "prompting" substring in the third name routes that model through
# predict() instead of forward() inside pretty_inference.
output = pretty_inference(
    model_list=model,
    model_names=["bert-base", "bert-large", "gpt2-prompting"],
    input_list=input_text,
    tokenizer_list=tokenizer,
    top_n=5,
)

In [None]:
# print() is used rather than a bare expression so that the newlines in the
# rendered tables are shown as line breaks instead of "\n" escapes.
print(output)