In [1]:
from datasets import load_dataset
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import gc
from torch.utils.data import DataLoader

In [2]:
torch.random.manual_seed(0)
model_id = "microsoft/Phi-3-mini-4k-instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True, 
)
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [56]:
dataset = load_dataset("allenai/openbookqa", split='validation')

In [57]:
dataset

Dataset({
    features: ['id', 'question_stem', 'choices', 'answerKey'],
    num_rows: 500
})

In [19]:
test = dataset.select(range(20))
test

Dataset({
    features: ['id', 'question_stem', 'choices', 'answerKey'],
    num_rows: 20
})

In [59]:
for item in dataset:
    # Use a default value or empty string if 'fact1' is missing
    print(item)


{'id': '8-376', 'question_stem': 'Frilled sharks and angler fish live far beneath the surface of the ocean, which is why they are known as', 'choices': {'text': ['Deep sea animals', 'fish', 'Long Sea Fish', 'Far Sea Animals'], 'label': ['A', 'B', 'C', 'D']}, 'answerKey': 'A'}
{'id': '7-57', 'question_stem': 'Gas can fill any container it is given, and liquid', 'choices': {'text': ['is standard weight and size', 'is the opposite of variable', 'only needs a few', 'uses what it needs'], 'label': ['A', 'B', 'C', 'D']}, 'answerKey': 'D'}
{'id': '7-1024', 'question_stem': 'When birds migrate south for the winter, they do it because', 'choices': {'text': ['they are genetically called to', 'their children ask for them to', 'it is important to their happiness', 'they decide to each year'], 'label': ['A', 'B', 'C', 'D']}, 'answerKey': 'A'}
{'id': '959', 'question_stem': 'If a person walks in the opposite direction of a compass arrow they are walking', 'choices': {'text': ['west', 'north', 'east'

In [20]:
max_length = 100
prefix = "Which of A, B, C, or D is the most possible word to follow the sentence? sentece:"
model_inputs = {
    'input_ids': [],
    'attention_mask': [],
    'labels': []
}
suffix = "Choose only one from A,B,C and D. answer:"
print(len(suffix))
def preprocess(data):
    inputs = []
    labels = []
    print(data)
    for i in range(len(data['question_stem'])):
        quiz = prefix + data['question_stem'][i][:-1]
        
        for j in range(4):
            quiz += data['choices'][i]['label'][j] + ":" + data['choices'][i]['text'][j]  + suffix
            
        inputs.append(quiz)
        labels.append(data['answerKey'][i])
    

    tokenized_inputs= tokenizer(inputs, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt")
    tokenized_labels = tokenizer(labels, return_tensors="pt")
    print('hi')
    model_inputs['input_ids'].append(tokenized_inputs['input_ids'])
    model_inputs['attention_mask'].append(tokenized_inputs['attention_mask'])
    model_inputs['labels'].append(tokenized_labels['input_ids'])
                             

41


In [21]:
tokenized = test.map(preprocess, batched=True, batch_size=2)

model_inputs['input_ids'] = torch.vstack(model_inputs['input_ids'])
model_inputs['attention_mask'] = torch.vstack(model_inputs['attention_mask'])
model_inputs['labels'] = torch.cat(model_inputs['labels'], dim=0)

Map:   0%|          | 0/20 [00:00<?, ? examples/s]

{'id': ['8-343', '1129'], 'question_stem': ['A person wants to start saving money so that they can afford a nice vacation at the end of the year. After looking over their budget and expenses, they decide the best way to save money is to', 'There is most likely going to be fog around:'], 'choices': [{'text': ['make more phone calls', 'quit eating lunch out', 'buy less with monopoly money', 'have lunch with friends'], 'label': ['A', 'B', 'C', 'D']}, {'text': ['a marsh', 'a tundra', 'the plains', 'a desert'], 'label': ['A', 'B', 'C', 'D']}], 'answerKey': ['B', 'A']}
hi
{'id': ['880', '7-999'], 'question_stem': ['Predators eat', 'Oak tree seeds are planted and a sidewalk is paved right next to that spot, until eventually, the tree is tall and the roots must extend past the sidewalk, which means'], 'choices': [{'text': ['lions', 'humans', 'bunnies', 'grass'], 'label': ['A', 'B', 'C', 'D']}, {'text': ['roots may be split', 'roots may begin to die', 'parts may break the concrete', 'roots may 

In [22]:
model_inputs = {k: v.to('cuda') for k, v in model_inputs.items()}

In [23]:
decoded_labels = [tokenizer.decode(label_ids, skip_special_tokens=True) for label_ids in model_inputs['labels']]
decoded_labels

['B',
 'A',
 'C',
 'C',
 'C',
 'C',
 'C',
 'B',
 'D',
 'B',
 'C',
 'B',
 'C',
 'A',
 'C',
 'D',
 'C',
 'C',
 'A',
 'B']

In [24]:
allowed_tokens = ['A', 'B', 'C', 'D']
allowed_token_ids = tokenizer.convert_tokens_to_ids(allowed_tokens)

In [53]:
with torch.no_grad():
    outputs = model(input_ids=model_inputs['input_ids'], attention_mask=model_inputs['attention_mask'])
print(outputs.logits.shape)
print(outputs.logits[:,-1,:].shape)
processed = torch.nn.functional.softmax(outputs.logits[:,-1,:], dim=1)
print(processed.shape)
generated = torch.argmax(processed,dim=1)
print(generated.shape)
decoded_outputs = [tokenizer.decode(answer, skip_special_tokens=True) for answer in generated]
print(decoded_outputs)

torch.Size([20, 100, 32064])
torch.Size([20, 32064])
torch.Size([20, 32064])
torch.Size([20])
['A', 'only', 'and', ',', 'from', 'ose', 'A', '.', '\n', 'one', 'one', ',', 'C', 'd', 'C', ',', 'answer', 'Cho', ',', ',']


In [28]:
with torch.no_grad():
    outputs = model(input_ids=model_inputs['input_ids'], attention_mask=model_inputs['attention_mask']
                            )


suffix = "answer:"

generated_output = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
position = [output.find(suffix) for output in generated_output]
print(position)
post = []
for i, pos in enumerate(position):
    if pos == -1:
        post.append(generated_output[i])
    else:
        post.append(generated_output[i][pos+7:])
print(post)

TypeError: argument 'ids': Can't extract `str` to `Vec`

In [None]:
class CustomedPipeline():
    def __init__(
            self,
            config,
            model_id = "microsoft/Phi-3-mini-4k-instruct"
            device = "cuda"
        ):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model =  CustomedPhi3ForCausalLM(self.tokenizer, self.config)
        
    
    def forward(self, model_inputs, max_length = 500):
        input_ids = model_inputs['input_ids']
        attention_mask = model_inputs['attention_mask']
        prompt_len = model_inputs['prompts']

        generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask,max_length=max_length)
        return {"generated_sequence": generated_sequence, "prompt_len" :prompt_len}

    def postprocess(self, model_outputs, clean_up_tokenization_spaces=True):
        generated_sequence = model_outputs["generated_sequence"]
        prompt_len = model_outputs["prompt_len"]
        
        result = []
        
        for i, text in enumerate(generated_sequence):
            eos_pos = (text == self.tokenizer.eos_token_id).nonzero(as_tuple=True)[0]
  
            if len(eos_pos) > 0:
                eos_after_prompt = next((pos.item() for pos in eos_pos if pos.item() > prompt_len), None)

                if eos_after_prompt is not None:
                    text = text[prompt_len:eos_after_prompt-1]
                else:
                    text = text[prompt_len:]
            else:
                text = text[prompt_len:]
                
            #decoded_text = self.tokenizer.decode(text, skip_special_tokens=True)
            decoded_text = self.tokenizer.decode(text)
            result.append([{'generated':decoded_text}])

        return result

In [None]:
gc.collect()