In [1]:
import transformers
from transformers import AutoTokenizer, AutoModelForMaskedLM
import datasets
from datasets import load_dataset

import sys
import src.evals.data as data_module

import torch

import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
tokenizer = "answerdotai/ModernBERT-large"

In [3]:
chat = data_module.create_preference_to_flan_style_dataset(
    task = "sarahpann/rwb_chat",
    split = "train",
    tokenizer_name = tokenizer,
    max_seq_length = 3600,
    prefix = "Which response is the most helpful, relevant, and correct? ",

    dataset_name="sarahpann/rwb_chat",
    dataset_subset="",
    task_column_names={"sarahpann/rwb_chat": ('chosen', 'rejected', 'og_dataset')}
)

chat_hard = data_module.create_preference_to_flan_style_dataset(
    task = "sarahpann/rwb_chat_hard",
    split = "train",
    tokenizer_name = tokenizer,
    max_seq_length = 3600,
    prefix = "Which response is the most helpful, relevant, and correct? ",

    dataset_name="sarahpann/rwb_chat_hard",
    dataset_subset="",
    task_column_names={"sarahpann/rwb_chat_hard": ('chosen', 'rejected', 'og_dataset')}
)

reasoning = data_module.create_preference_to_flan_style_dataset(
    task="sarahpann/rwb_reasoning",
    split='train',
    tokenizer_name=tokenizer,
    max_seq_length=3600,
    prefix="Determine which response is the best choice based on mathematical or programming accuracy. ",

    dataset_name="sarahpann/rwb_reasoning",
    dataset_subset="",
    task_column_names={"sarahpann/rwb_reasoning": ('chosen', 'rejected', 'og_dataset')}
)

safety = data_module.create_preference_to_flan_style_dataset(
    task="sarahpann/rwb_safety",
    split='train',
    tokenizer_name=tokenizer,
    max_seq_length=3600,
    prefix="Which response is the most helpful, relevant, and correct? ",

    dataset_name="sarahpann/rwb_safety",
    dataset_subset="",
    task_column_names={"sarahpann/rwb_safety": ('chosen', 'rejected', 'og_dataset')}
)

Downloading readme: 100%|██████████| 352/352 [00:00<00:00, 3.76kB/s]
Downloading data: 100%|██████████| 3.90M/3.90M [00:00<00:00, 12.0MB/s]
Generating train split: 100%|██████████| 2488/2488 [00:00<00:00, 132085.67 examples/s]
Map: 100%|██████████| 2488/2488 [00:00<00:00, 2926.47 examples/s]
Downloading readme: 100%|██████████| 348/348 [00:00<00:00, 3.26kB/s]
Downloading data: 100%|██████████| 403k/403k [00:00<00:00, 2.82MB/s]
Generating train split: 100%|██████████| 464/464 [00:00<00:00, 151865.55 examples/s]
Map: 100%|██████████| 464/464 [00:00<00:00, 5350.36 examples/s]
Downloading readme: 100%|██████████| 352/352 [00:00<00:00, 2.01kB/s]
Downloading data: 100%|██████████| 1.06M/1.06M [00:00<00:00, 6.37MB/s]
Generating train split: 100%|██████████| 1431/1431 [00:00<00:00, 239297.07 examples/s]
Map: 100%|██████████| 1431/1431 [00:00<00:00, 2725.76 examples/s]
Downloading readme: 100%|██████████| 350/350 [00:00<00:00, 3.58kB/s]
Downloading data: 100%|██████████| 665k/665k [00:00<00:00,

In [4]:
model = AutoModelForMaskedLM.from_pretrained("sarahpann/reasoning_model_large")

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.
Loading adapter weights from sarahpann/reasoning_model_large led to missing keys in the model: model.layers.0.attn.Wqkv.lora_A.default.weight, model.layers.0.attn.Wqkv.lora_B.default.weight, model.layers.0.attn.Wqkv.lora_magnitude_vector.default.weight, model.layers.0.attn.Wo.lora_A.default.weight, model.layers.0.attn.Wo.lora_B.default.weight, model.layers.0.attn.Wo.lora_magnitude_vector.default.weight, model.layers.0.mlp.Wi.lora_A.default.weight, model.layers.0.mlp.Wi.lora_B.default.weight, model.layers.0.mlp.Wi.lora_magnitude_vector.default.weight, model.layers.0.mlp.Wo.lora_A.default.weight, model.layers.0.mlp.Wo.lora_B.default.weight, model.layers.0.mlp.Wo.lora_magnitude_vector.default.weight, model.layers.1.attn.Wqkv.lora_A.default.weight, model.layers.1.attn.Wqkv.lora_B.default.weight, model.layers.1.attn.Wqkv.

In [5]:
chat.set_format(type='torch', columns=['input_ids', 'attention_mask'])
chat_hard.set_format(type='torch', columns=['input_ids', 'attention_mask'])
reasoning.set_format(type='torch', columns=['input_ids', 'attention_mask'])
safety.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [6]:
chat_loader = torch.utils.data.DataLoader(chat, batch_size=1, shuffle=False)
chat_hard_loader = torch.utils.data.DataLoader(chat_hard, batch_size=1, shuffle=False)
reasoning_loader = torch.utils.data.DataLoader(reasoning, batch_size=1, shuffle=False)
safety_loader = torch.utils.data.DataLoader(safety, batch_size=1, shuffle=False)

In [7]:
real_tokenizer = AutoTokenizer.from_pretrained(tokenizer)

In [8]:
import tqdm
from tqdm import tqdm

In [29]:
def eval_dataset(model, subject_dataloader):
    total = 0
    correct = 0

    model.to("cuda")
    model.eval()

    with torch.no_grad():
        for example in tqdm(subject_dataloader):
            input_ids = example['input_ids'].clone()
            input_ids[:, -2] = real_tokenizer.mask_token_id
            input_ids = input_ids.to("cuda")
            attention_mask = example['attention_mask'].to("cuda")


            output = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = output.logits
            pred = torch.argmax(logits, dim=-1)

            # print(output)
            print(real_tokenizer.decode(pred[0]))

            # print(real_tokenizer.decode(input_ids[0]))
            if pred[0, -2] == example['input_ids'][0, -2]:
                correct += 1
            total += 1

            
    print(f"Accuracy: {correct / total}")
    return correct / total

In [30]:
# chat_acc = eval_dataset(model, chat_loader)
# chat_hard_acc = eval_dataset(model, chat_hard_loader)
reasoning_acc = eval_dataset(model, reasoning_loader)
# safety_acc = eval_dataset(model, safety_loader)

  0%|          | 0/1431 [00:00<?, ?it/s]

  0%|          | 4/1431 [00:00<01:18, 18.08it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `has_close_elements(numbers: List[float], threshold: float) -> bool` to solve the following problem:
Check if in given list of numbers, are any two numbers closer to each other than
given threshold.
>>> has_close_elements([1.0, 2.0, 3.0], 0.5)
False
>>> has_close_elements([1.0, 2.8, 3.0, 4.0, 5.0, 2.0], 0.3)
True
    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx!= idx2:
                distance = abs(elem - elem2)
                if distance < threshold:
                    return True

    return False
# Choice 1: Write a Python function `has_close_elements(numbers: List[float], threshold: float) -> bool` to solve the following problem:
Check if in given list of numbers, are any two numbers closer to each other than
given threshold.
>>> has_close_elements([1.0, 2.0, 3.0], 0.5)
False
>>> has_close_elemen

  1%|          | 8/1431 [00:00<01:19, 17.91it/s]

[CLS]Determine which response is the best choice based on mathematical or programming accuracy.  Choice 0: Write a Python function `mean_absolute_deviation(numbers: List[float]) -> float` to solve the following problem:
For a given list of input numbers, calculate Mean Absolute Deviation
around the mean of this dataset.
Mean Absolute Deviation is the average absolute difference between each
element and a centerpoint (mean in this case):
MAD = average | x - x_mean |
>>> mean_absolute_deviation([1.0, 2.0, 3.0, 4.0])
1.0
    mean = sum(numbers) / len(numbers)
    return sum(abs(x - mean) for x in numbers) / len(numbers)
# Choice 1: Write a Python function `mean_absolute_deviation(numbers: List[float]) -> float` to solve the following problem:
For a given[SEP] of input numbers, calculate Mean Absolute Deviation
around the mean of this dataset.
Mean Absolute Deviation is the average absolute difference between each
element and a centerpoint (mean in this case):
MAD = average | x - x_mean |


  1%|          | 12/1431 [00:00<01:19, 17.87it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `sum_product(numbers: List[int]) -> Tuple[int, int]` to solve the following problem:
For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.
Empty sum should be equal to 0 and empty product should be equal to 1.
>>> sum_product([])
(0, 1)
>>> sum_product([1, 2, 3, 4])
(10, 24)
    sum_value = 0
    prod_value = 1

    for n in numbers:
        sum_value += n
        prod_value *= n
    return sum_value, prod_value
# Choice 1: Write a Python function `sum_product(numbers: List[int]) -> Tuple[int, int]` to solve the following problem:
For a given list of integers, return a tuple consisting of a sum and a product of all the integers in a list.
Empty sum should be equal to 0 and empty product should be equal to 1.
>>> sum_product([])
(0, 1)
>>> sum_product([1, 2, 3, 4])
(10, 24)
    sum_value = 0
    prod_value

  1%|          | 16/1431 [00:00<01:19, 17.81it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `longest(strings: List[str]) -> Optional[str]` to solve the following problem:
Out of list of strings, return the longest one. Return the first one in case of multiple
strings of the same length. Return None in case the input list is empty.
>>> longest([])
>>> longest(['a', 'b', 'c'])
'a'
>>> longest(['a', 'bb', 'ccc'])
'ccc'
    if not strings:
        return None

    maxlen = max(len(x) for x in strings)
    for s in strings:
        if len(s) == maxlen:
            return s
# Choice 1: Write a Python function `longest(strings: List[str]) -> Optional[str]` to solve the following problem:
Out of[SEP] of strings, return the longest one. Return the first one in case of multiple
strings of the same length. Return None in case the input list is empty.
>>> longest([])
>>> longest(['a', 'b', 'c'])
'a'
>>> longest(['a', 'bb', 'ccc'])
'ccc'
    if not strings:
     

  1%|▏         | 20/1431 [00:01<01:19, 17.65it/s]

[CLS]Determine which response is the best choice based on mathematical or programming accuracy.  Choice 0: Write a Python function `count_distinct_characters(string: str) -> int` to solve the following problem:
Given a string, find out how many distinct characters (regardless of case) does it consist of
>>> count_distinct_characters('xyzXYZ')
3
>>> count_distinct_characters('Jerry')
4
    return len(set(string.lower()))
# Choice 1: Write a Python function `count_distinct_characters(string: str) -> int` to solve the following problem:
Given a string, find out how many distinct characters (regardless of case) does it consist of
>>> count_distinct_characters('xyzXYZ')
3
>>> count_distinct_characters('Jerry')
4
    return len(set(string))
Answer:  3[SEP]
[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `parse_music(music_string: str) -> List[int]` to solve the following problem:
Input to this function is a strin

  2%|▏         | 24/1431 [00:01<01:18, 18.01it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `find_closest_elements(numbers: List[float]) -> Tuple[float, float]` to solve the following problem:
From a supplied list of numbers (of length at least two) select and return two that are the closest to each
other and return them in order (smaller number, larger number).
>>> find_closest_elements([1.0, 2.0, 3.0, 4.0, 5.0, 2.2])
(2.0, 2.2)
>>> find_closest_elements([1.0, 2.0, 3.0, 4.0, 5.0, 2.0])
(2.0, 2.0)
    closest_pair = None
    distance = None

    for idx, elem in enumerate(numbers):
        for idx2, elem2 in enumerate(numbers):
            if idx!= idx2:
                if distance is None:
                    distance = abs(elem - elem2)
                    closest_pair = tuple(sorted([elem, elem2]))
                else:
                    new_distance = abs(elem - elem2)
                    if new_distance < distance:
                        dist

  2%|▏         | 28/1431 [00:01<01:17, 18.08it/s]

[CLS]Determine which response is the best choice based on mathematical or programming accuracy.  Choice 0: Write a Python function `largest_divisor(n: int) -> int` to solve the following problem:
For a given number n, find the largest number that divides n evenly, smaller than n
>>> largest_divisor(15)
5
    for i in reversed(range(n)):
        if n % i == 0:
            return i
# Choice 1: Write a Python function `largest_divisor(n: int) -> int` to solve the following problem:
For a given number n, find the largest number that divides n evenly, smaller than n
>>> largest_divisor(15)
5
    for i in reversed(range(n)):
        if n - i == 0:
            return i
Answer:  5[SEP]
[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `factorize(n: int) -> List[int]` to solve the following problem:
Return list of prime factors of given integer in the order from smallest to largest.
Each of the factors should be liste

  2%|▏         | 32/1431 [00:01<01:18, 17.87it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `concatenate(strings: List[str]) -> str` to solve the following problem:
Concatenate list of strings into a single string
>>> concatenate([])
''
>>> concatenate(['a', 'b', 'c'])
'abc'
    return ''.join(strings)
# Choice 1: Write a Python function `concatenate(strings: List[str]) -> str` to solve the following problem:
Concatenate list of strings into a single string
>>> concatenate([])
''
>>> concatenate(['a', 'b', 'c'])
'abc'
    return''.join(strings)
Answer:  
[SEP]
[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `filter_by_prefix(strings: List[str], prefix: str) -> List[str]` to solve the following problem:
Filter an input list of strings only for ones that start with a given prefix.
>>> filter_by_prefix([], 'a')
[]
>>> filter_by_prefix(['abc', 'bcd', 'cde', 'array'], 'a')
['a

  3%|▎         | 36/1431 [00:02<01:19, 17.65it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `find_zero(xs: list)` to solve the following problem:
xs are coefficients of a polynomial.
find_zero find x such that poly(x) = 0.
find_zero returns only only zero point, even if there are many.
Moreover, Find_zero only takes list xs having even number of coefficients
and largest non zero coefficient as it guarantees
a solution.
>>> round(find_zero([1, 2]), 2) # f(x) = 1 + 2x
-0.5
>>> round(find_zero([-6, 11, -6, 1]), 2) # (x - 1) * (x - 2) * (x - 3) =-6 + 11x - 6x^2 + x^3
1.0
    begin, end = -1., 1.
    while poly(xs, begin) * poly(xs, end) > 0:
        begin *= 2.0
        end *= 2.0
    while end - begin > 1e-10:
        center = (begin + end) / 2.0
        if poly(xs, center) * poly(xs, begin) > 0:
            begin = center
        else:
            end = center
    return begin
[SEP] Choice 1: Write a Python function `find_zero(xs: list)` to solve the f

  3%|▎         | 40/1431 [00:02<01:19, 17.57it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `fizz_buzz(n: int)` to solve the following problem:
Return the number of times the digit 7 appears in integers less than n which are divisible by 11 or 13.
>>> fizz_buzz(50)
0
>>> fizz_buzz(78)
2
>>> fizz_buzz(79)
3
    ns = []
    for i in range(n):
        if i % 11 == 0 or i % 13 == 0:
            ns.append(i)
    s = ''.join(list(map(Str, ns)))
    ans = 0
    for c in s:
        ans += (c == '7')
    return ans
>>> Choice 1: Write a Python function `fizz_buzz(n: int)` to solve the following problem:
Return the number of times the digit 7 appears in integers less than n which are divisible by 11 or 13.
>>> fizz_buzz(50)
0
>>> fizz_buzz(78)
2
>>> fizz_buzz(79)
3
    ns = []
    for i in range(n):
        if i % 11 == 0 and i % 13 == 0:
            ns.append(i)
    s = ''.join(list(map(str, ns)))
    ans = 0
    for c in s:
        ans += (c == '7')
    retu

  3%|▎         | 44/1431 [00:02<01:18, 17.69it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `triples_sum_to_zero(l: list)` to solve the following problem:
triples_sum_to_zero takes a list of integers as an input.
it returns True if there are three distinct elements in the list that
sum to zero, and False otherwise.
>>> triples_sum_to_zero([1, 3, 5, 0])
False
>>> triples_sum_to_zero([1, 3, -2, 1])
True
>>> triples_sum_to_zero([1, 2, 3, 7])
False
>>> triples_sum_to_zero([2, 4, -5, 3, 9, 7])
True
>>> triples_sum_to_zero([1])
False
    for i in range(len(l)):
        for j in range(i + 1, len(l)):
            for k in range(j + 1, len(l)):
                if l[i] + l[j] + l[k] == 0:
                    return True
    return False
# Choice 1: Write a Python function `triples_sum_to_zero(l: list)` to solve the following problem:
triples_[SEP]_to_zero takes a list of integers as an input.
it returns True if there are three distinct elements in the list tha

  3%|▎         | 48/1431 [00:02<01:18, 17.73it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `change_base(x: int, base: int)` to solve the following problem:
Change numerical base of input number x to base.
return string representation after the conversion.
base numbers are less than 10.
>>> change_base(8, 3)
'22'
>>> change_base(8, 2)
'1000'
>>> change_base(7, 2)
'111'
    ret = ""
    while x > 0:
        ret = str(x % base) + ret
        x //= base
    return ret
>>> Choice 1: Write a Python function `change_base(x: int, base: int)` to solve the following problem:
Change numerical base of input number x to base.
return string representation after the conversion.
base numbers are less than 10.
>>> change_base(8, 3)
'22'
>>> change_base(8, 2)
'1000'
>>> change_base(7, 2)
'111'
    ret = ""
    while x > 0:
        ret = str(x % base) + ret
        x -= base
    return ret
Answer:  0[SEP]
[CLS]Determine which response is the best choice based on mathe

  4%|▎         | 52/1431 [00:02<01:17, 17.71it/s]

[CLS]Determine which response is the best choice based on mathematical or programming accuracy.  Choice 0: Write a Python function `is_palindrome(text: str)` to solve the following problem:
Checks if given string is a palindrome
>>> is_palindrome('')
True
>>> is_palindrome('aba')
True
>>> is_palindrome('aaaaa')
True
>>> is_palindrome('zbcd')
False
    for i in range(len(text)):
        if text[i]!= text[len(text) - 1 - i]:
            return False
    return True
>>> Choice 1: Write a Python function `is_palindrome(text: str)` to solve the following problem:
Checks if given string is a palindrome
>>> is_palindrome('')
True
>>> is_palindrome('aba')
True
>>> is_palindrome('aaaaa')
True
>>> is_palindrome('zbcd')
False
    for i in range(len(text)):
        if text[i]!= text[len(text) - i]:
            return False
    return True
Answer:  True[SEP]
[CLS]Determine which response is the best choice based on mathematical or programming accuracy.  Choice 0: Write a Python function `modp(n: in

  4%|▍         | 56/1431 [00:03<01:17, 17.67it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `below_threshold(l: list, t: int)` to solve the following problem:
Return True if all numbers in the list l are below threshold t.
>>> below_threshold([1, 2, 4, 10], 100)
True
>>> below_threshold([1, 20, 4, 10], 5)
False
    for e in l:
        if e >= t:
            return False
    return True
>>> Choice 1: Write a Python function `below_threshold(l: list, t: int)` to solve the following problem:
Return True if all numbers in the list l are below threshold t.
>>> below_threshold([1, 2, 4, 10], 100)
True
>>> below_threshold([1, 20, 4, 10], 5)
False
    for e in l:
        if e >= t:
            return True
    return False
Answer:  True[SEP]
[CLS]Determine which response is the best choice based on mathematical or programming accuracy.  Choice 0: Write a Python function `add(x: int, y: int)` to solve the following problem:
Add two numbers x and y
>>> add(2, 3

  4%|▍         | 60/1431 [00:03<01:17, 17.77it/s]

[CLS]Determine which response is the best choice based on mathematical or programming accuracy.  Choice 0: Write a Python function `correct_bracketing(brackets: str)` to solve the following problem:
brackets is a string of "<" and ">".
return True if every opening bracket has a corresponding closing bracket.
>>> correct_bracketing("<")
False
>>> correct_bracketing("<>")
True
>>> correct_bracketing("<<><>>")
True
>>> correct_bracketing("><<>")
False
    depth = 0
    for b in brackets:
        if b == "<":
            depth += 1
        else:
            depth -= 1
        if depth < 0:
            return False
    return depth == 0
>>> Choice 1: Write a Python function `correct_bracketing(brackets: str)` to solve the following problem:
brackets is a string of "<" and ">".
return True if every opening bracket has a corresponding closingbracket.
>>> correct_bracketing("<")
False
>>> correct_bracketing("<>")
True
>>> correct_bracketing("<<><>>")
True
>>> correct_bracketing("><<>")
False
 

  4%|▍         | 64/1431 [00:03<01:16, 17.89it/s]

[CLS]Determine which response is the best choice based on mathematical or programming accuracy.  Choice 0: Write a Python function `sum_to_n(n: int)` to solve the following problem:
sum_to_n is a function that sums numbers from 1 to n.
>>> sum_to_n(30)
465
>>> sum_to_n(100)
5050
>>> sum_to_n(5)
15
>>> sum_to_n(10)
55
>>> sum_to_n(1)
1
    return sum(range(n + 1))
# Choice 1: Write a Python function `sum_to_n(n: int)` to solve the following problem:
[SEP]_to_n is a function that sums numbers from 1 to n.
>>> sum_to_n(30)
465
>>> sum_to_n(100)
5050
>>> sum_to_n(5)
15
>>> sum_to_n(10)
55
>>> sum_to_n(1)
1
    return sum(range(n))
Answer:  1[SEP]
[CLS]Determine which response is the best choice based on mathematical or programming accuracy.  Choice 0: Write a Python function `correct_bracketing(brackets: str)` to solve the following problem:
brackets is a string of "(" and ")".
return True if every opening bracket has a corresponding closing bracket.
>>> correct_bracketing("(")
False
>>> c

  5%|▍         | 68/1431 [00:03<01:17, 17.61it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `vowels_count(s)` to solve the following problem:
Write a function vowels_count which takes a string representing
a word as input and returns the number of vowels in the string.
Vowels in this case are 'a', 'e', 'i', 'o', 'u'. Here, 'y' is also a
vowel, but only when it is at the end of the given word.
Example:
>>> vowels_count("abcde")
2
>>> vowels_count("ACEDY")
3
    vowels = "aeiouAEIOU"
    n_vowels = sum(c in vowels for c in s)
    if s[-1] == 'y' or s[-1] == 'Y':
        n_vowels += 1
    return n_vowels
# Choice 1: Write a Python function `vowels_count(s)` to solve the following problem:
Write a[SEP] vowels_count which takes a string representing
a word as input and returns the number of vowels in the string.
Vowels in this case are 'a', 'e', 'i', 'o', 'u'. Here, 'y' is also a
vowel, but only when it is at the end of the given word.
Example:
>>> vowels

  5%|▌         | 72/1431 [00:04<01:16, 17.70it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `pluck(arr)` to solve the following problem:
"Given an array representing a branch of a tree that has non-negative integer nodes
your task is to pluck one of the nodes and return it.
The plucked node should be the node with the smallest even value.
If multiple nodes with the same smallest even value are found return the node that has smallest index.
The plucked node should be returned in a list, [ smalest_value, its index ],
If there are no even values or the given array is empty, return [].
Example 1:
Input: [4,2,3]
Output: [2, 1]
Explanation: 2 has the smallest even value, and 2 has the smallest index.
Example 2:
Input: [1,2,3]
Output: [2, 1]
Explanation: 2 has the smallest even value, and 2 has the smallest index.
Example 3:
Input: []
Output: []
Example 4:
Input: [5, 0, 3, 0, 4, 2]
Output: [0, 1]
Explanation: 0 is the smallest value, but  there are two zero

  5%|▌         | 76/1431 [00:04<01:17, 17.40it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `will_it_fly(q,w)` to solve the following problem:
Write a function that returns True if the object q will fly, and False otherwise.
The object q will fly if it's balanced (it is a palindromic list) and the sum of its elements is less than or equal the maximum possible weight w.
Example:
will_it_fly([1, 2], 5) ➞ False
# 1+2 is less than the maximum possible weight, but It's unbalanced.
will_it_fly([3, 2, 3], 1) ➞ False
# it's balanced, but 3+2+3 is more thanthe maximum possible weight.
will_it_fly([3, 2, 3], 9) ➞ True
# 3+2+3 is less than the maximum possible weight, and it's balanced.
will_it_fly([3], 5) ➞ True
# 3 is less than the maximum possible weight, and it's balanced.
    if sum(q) > w:
        return False

    i, j = 0, len(q)-1
    while i<j:
        if q[i]!= q[j]:
            return False
        i+=1
        j-=1
    return True
# Choice 1: Write

  6%|▌         | 80/1431 [00:04<01:17, 17.39it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `is_simple_power(x, n)` to solve the following problem:
Your task is to write a function that returns true if a number x is a simple
power of n and false in other cases.
x is a simple power of n if n**int=x
For example:
is_simple_power(1, 4) => true
is_simple_power(2, 2) => true
is_simple_power(8, 2) => true
is_simple_power(3, 2) => false
is_simple_power(3, 1) => false
is_simple_power(5, 3) => false
    if (n == 1): 
        return (x == 1) 
    power = 1
    while (power < x): 
        power = power * n 
    return (power == x) 

 Choice 1: Write a Python function `is_simple_power(x, n)` to solve the following problem:
Your task is to write a function that returns[SEP] if a number x is a simple
power of n and false in other cases.
x is a simple power of n if n**int=x
For example:
is_simple_power(1, 4) => true
is_simple_power(2, 2) => true
is_simple_power(8, 2

  6%|▌         | 84/1431 [00:04<01:19, 16.94it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `is_happy(s)` to solve the following problem:
You are given a string s.
Your task is to check if the string is happy or not.
A string is happy if its length is at least 3 and every 3 consecutive letters are distinct
For example:
is_happy(a) => False
is_happy(aa) => False
is_happy(abcd) => True
is_happy(aabb) => False
is_happy(adb) => True
is_happy(xyy) => False
    if len(s) < 3:
      return False

    for i in range(len(s) - 2):
      
      if s[i] == s[i+1] or s[i+1] == s[i+2] or s[i] == s[i+2]:
        return False
    return True
# Choice 1: Write a Python function `is_happy(s)` to solve the following problem:
You are given a string s.
Your task is to check if the string is happy or not.
A string is happy if its length is at least 3 and every 3 consecutive letters are distinct
For example:
is_happy(a) => False
is_happy(aa) => False
is_happy(abcd) => True

  6%|▌         | 88/1431 [00:04<01:16, 17.48it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `solve(N)` to solve the following problem:
Given a positive integer N, return the total sum of its digits in binary.
Example
For N = 1000, the sum of digits will be 1 the output should be "1".
For N = 150, the sum of digits will be 6 the output should be "110".
For N = 147, the sum of digits will be 12 the output should be "1100".
Variables:
@N integer
Constraints: 0 ≤ N ≤ 10000.
Output:
a string of binary number
    return bin(sum(int(i) for i in str(N)))[2:]
Answer Choice 1: Write a Python function `solve(N)` to solve the following problem:
Given a positive integer N, return the total sum of its digits in binary.
Example
For N = 1000, the sum of digits will be 1 the output should be "1".
For N = 150, the sum of digits will be 6 the output should be "110".
For N = 147, the sum of digits will be 12 the output should be "1100".
Variables:
@N integer
Constraints

  6%|▋         | 92/1431 [00:05<01:15, 17.72it/s]

[CLS]Determine which response is the best choice based on mathematical or programming accuracy.  Choice 0: Write a Python function `sort_array(Array)` to solve the following problem:
Given an array of non-negative integers, return a copy of the given array after sorting,
you will sort the given array in ascending order if the sum( first index value, last index value) is odd,
or sort it in descending order if the sum( first index value, last index value) is even.
Note:
* don't change the given array.
Examples:
* sort_array([]) => []
* sort_array([5]) => [5]
* sort_array([2, 4, 3, 0, 1, 5]) => [0, 1, 2, 3, 4, 5]
* sort_array([2, 4, 3, 0, 1, 5, 6]) => [6, 5, 4, 3, 2, 1, 0]
    return [] if len(array) == 0 else sorted(array, reverse= (array[0]+array[-1]) % 2 == 0) 
 * Choice 1: Write a Python function `sort_array(array)` to solve the following problem:
Given an array of[SEP]-negative integers, return a[SEP] of the given array after sorting,
you will sort the given array in ascending order 

  7%|▋         | 96/1431 [00:05<01:18, 17.04it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `any_int(x, y, z)` to solve the following problem:
Create a function that takes 3 numbers.
Returns true if one of the numbers is equal to the sum of the other two, and all numbers are integers.
Returns false in any other cases.
Examples
any_int(5, 2, 7) ➞ True
any_int(3, 2, 2) ➞ False
any_int(3, -2, 1) ➞ True
any_int(3.6, -2.2, 2) ➞ False
    
    if isinstance(x,int) and isinstance(y,int) and isinstance(z,int):
        if (x+y==z) or (x+z==y) or (y+z==x):
            return True
        return False
    return False
# Choice 1: Write a Python function `any_int(x, y, z)` to solve the following problem:
Create a function that takes 3 numbers.
Returns true if one of the numbers is equal to the sum of the other two, and all numbers are integers.
Returns false in any other cases.
Examples
any_int(5, 2, 7) ➞ True
any_int(3, 2, 2) ➞ False
any_int(3, -2, 1) ➞ True
an

  7%|▋         | 100/1431 [00:05<01:16, 17.46it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `count_up_to(n)` to solve the following problem:
Implement a function that takes an non-negative integer and returns an array of the first n
integers that are prime numbers and less than n.
for example:
count_up_to(5) => [2,3]
count_up_to(11) => [2,3,5,7]
count_up_to(0) => []
count_up_to(20) => [2,3,5,7,11,13,17,19]
count_up_to(1) => []
count_up_to(18) => [2,3,5,7,11,13,17]
    primes = []
    for i in range(2, n):
        is_prime = True
        for j in range(2, i):
            if i % j == 0:
                is_prime = False
                break
        if is_prime:
            primes.append(i)
    return primes

  Choice 1: Write a Python function `count_up_to(n)` to solve the following problem:
Implement a[SEP] that takes an non-negative integer and returns an array of the first n
integers that are prime numbers and less than n.
for example:
count_up_to(5

  7%|▋         | 104/1431 [00:05<01:15, 17.61it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `make_a_pile(n)` to solve the following problem:
Given a positive integer n, you have to make a pile of n levels of stones.
The first level has n stones.
The number of stones in the next level is:
- the next odd number if n is odd.
- the next even number if n is even.
Return the number of stones in each level in a list, where element at index
i represents the number of stones in the level (i+1).
Examples:
>>> make_a_pile(3)
[3, 5, 7]
    return [n + 2*i for i in range(n)]
• Choice 1: Write a Python function `make_a_pile(n)` to solve the following problem:
Given a positive integer n,[SEP] has to make a pile of n levels of stones.
The first level has n stones.
The number of stones in the next level is:
- the next odd number if n is odd.
- the next even number if n is even.
Return the number of stones in each level in a list, where element at index
i represents t

  8%|▊         | 108/1431 [00:06<01:15, 17.62it/s]

[CLS]Determine which response is the best choice based on mathematical or programming accuracy.  Choice 0: Write a Python function `unique_digits(x)` to solve the following problem:
Given a list of positive integers x. return a sorted list of all
elements that hasn't any even digit.
Note: Returned list should be sorted in increasing order.
For example:
>>> unique_digits([15, 33, 1422, 1])
[1, 15, 33]
>>> unique_digits([152, 323, 1422, 10])
[]
    odd_digit_elements = []
    for i in x:
        if all (int(c) % 2 == 1 for c in str(i)):
            odd_digit_elements.append(i)
    return sorted(odd_digit_elements)
# Choice 1: Write a Python function `unique_digits(x)` to solve the following problem:
Given a list of positive integers x. return a sorted list of all
elements that hasn't any even digit.
Note: Returned list should be sorted in increasing order.
For example:
>>> unique_digits([15, 33, 1422, 1])
[1, 15, 33]
>>> unique_digits([152, 323, 1422, 10])
[]
    odd_digit_elements = []


  8%|▊         | 112/1431 [00:06<01:16, 17.15it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `count_nums(arr)` to solve the following problem:
Write a function count_nums which takes an array of integers and returns
the number of elements which has a sum of digits > 0.
If a number is negative, then its first signed digit will be negative:
e.g. -123 has signed digits -1, 2, and 3.
>>> count_nums([]) == 0
>>> count_nums([-1, 11, -11]) == 1
>>> count_nums([1, 1, 2]) == 3
    def digits_sum(n):
        neg = 1
        if n < 0: n, neg = -1 * n, -1 
        n = [int(i) for i in str(n)]
        n[0] = n[0] * neg
        return sum(n)
    return len(list(filter(lambda x: x > 0, [digits_sum(i) for i in arr])))
Problem Choice 1: Write a Python function `count_nums(arr)` to solve the following problem:
Write a function count_nums which takes an array of integers and returns
the number of elements which has a sum of digits > 0.
If a number is negative, then its 

  8%|▊         | 116/1431 [00:06<01:15, 17.41it/s]

[CLS]Determine which response is the best choice based on mathematical or programming logic.  Choice 0: Write a Python function `reverse_delete(s,c)` to solve the following problem:
Task
We are given two strings s and c, you have to deleted all the characters in s that are equal to any character in c
then check if the result string is palindrome.
A string is called palindrome if it reads the same backward as forward.
You should return a tuple containing the result string and True/False for the check.
Example
For s = "abcde", c = "ae",the result should be ('bcd',False)
For s = "abcdef", c = "b"  the result should be ('acdef',False)
For s = "abcdedcba", c = "ab",the result should be ('cdedc',True)
    s = ''.join([char for char in s if char not in c])
    return (s,s[::-1] == s)
Solution Choice 1: Write a Python function `reverse_delete(s,c)` to solve the following problem:
Task
We are given two strings s and c,[SEP] have to deleted all the characters in s that are equal to any character

  8%|▊         | 119/1431 [00:06<01:14, 17.57it/s]


[CLS]Determine which response is the best choice based on mathematical or programming accuracy.  Choice 0: Write a Python function `sort_array(arr)` to solve the following problem:
In this Kata, you have to sort an array of non-negative integers according to
number of ones in their binary representation in ascending order.
For similar number of ones, sort based on decimal value.
It must be implemented like this:
>>> sort_array([1, 5, 2, 3, 4]) == [1, 2, 3, 4, 5]
>>> sort_array([-2, -3, -4, -5, -6]) == [-6, -5, -4, -3, -2]
>>> sort_array([1, 0, 2, 3, 4]) [0, 1, 2, 3, 4]
    return sorted(sorted(arr), key=lambda x: bin(x)[2:].count('1'))
# Choice 1: Write a Python function `sort_array(arr)` to solve the following problem:
In this Kata,[SEP] has to sort an array of non-negative integers according to
number of ones in their binary representation in ascending order.
For similar number of ones, sort based on decimal value.
It must be implemented like this:
>>>sort_array([1, 5, 2, 3, 4]) == [

KeyboardInterrupt: 

In [67]:
final_results = {
    "chat": chat_acc,
    "chat_hard": chat_hard_acc,
    "reasoning": reasoning_acc,
    "safety": safety_acc
}

print(final_results)

{'chat': 0.7946141479099679, 'chat_hard': 0.2521551724137931, 'reasoning': 0.76659678546471, 'safety': 0.7918918918918919}


In [None]:
# for chat
0.8657556270096463 * (1 - 0.157) + 0.36853448275862066 * (0.157)

0.7876919073622352

In [None]:
# for reasoning
0.3305380852550664