In [21]:
# load all the codebert datasets
import pandas as pd
import numpy as np
import os
import json
import re
import random


In [2]:
# Read every JSON-lines file found in 'code_bert' into one list of records and
# build a single DataFrame from them.
dataset = []
# os.listdir() returns entries in arbitrary order; sorting fixes the row order
# of dataset_df, which the seeded sampling in get_q_and_a depends on.
for file in sorted(os.listdir('code_bert')):
    with open(os.path.join('code_bert', file), 'r') as f:
        # Stream the file line by line and skip blank lines, which
        # json.loads would reject (e.g. a trailing empty line).
        dataset += [json.loads(line) for line in f if line.strip()]

dataset_df = pd.DataFrame(dataset)
    

In [14]:
# sample one problem of the requested difficulty (introductory by default) together with all of its answers
def get_q_and_a(df, difficulty='introductory', seed=42, n_answers=4):
    """Sample one problem and return all of its rows.

    Problem ids are drawn at random from the (optionally difficulty-filtered)
    frame until one is found that has exactly ``n_answers`` rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'problem_id'`` column and, when ``difficulty`` is
        truthy, a ``'difficulty'`` column.
    difficulty : str or None
        Keep only rows whose ``'difficulty'`` equals this value. A falsy value
        disables the filter.
    seed : int
        Seed of the sampler; the same seed always draws the same ids.
    n_answers : int
        Number of rows the sampled problem must have (default 4).

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that share the sampled ``problem_id``.

    Raises
    ------
    ValueError
        If no problem in the filtered frame has exactly ``n_answers`` rows
        (the rejection loop below could otherwise never terminate).
    """
    # A private generator yields the same draw sequence as seeding the module
    # level functions, without clobbering the global random state.
    rng = random.Random(seed)

    diff_df = df
    if difficulty:
        diff_df = df.loc[df['difficulty'] == difficulty]

    # Computed once: the candidate ids do not change between draws. A plain
    # list also keeps rng.choice() independent of array truthiness rules.
    problem_ids = list(diff_df['problem_id'].unique())

    rows_per_problem = diff_df['problem_id'].value_counts()
    if not (rows_per_problem == n_answers).any():
        raise ValueError(
            f"no problem with exactly {n_answers} rows for difficulty={difficulty!r}"
        )

    # Rejection sampling over *all* ids (not only the valid ones) so that a
    # given seed keeps selecting the same problem as before.
    while True:
        sample_problem_id = rng.choice(problem_ids)
        q_and_a_df = diff_df.loc[diff_df['problem_id'] == sample_problem_id]
        if len(q_and_a_df) == n_answers:
            return q_and_a_df



In [4]:
def visualize_q_and_a(df):
    """Print a plain-text view of one problem and its answers.

    Output order: a summary table of the 'problem_id', 'label_name' and
    'difficulty' columns, the question text of the first row, then for every
    row its 'original_source' and 'label_name' followed by its 'code'.
    Sections are delimited by rule lines. Returns None.
    """
    rule = "====================================="

    summary = df.loc[:, ['problem_id', 'label_name', 'difficulty']]
    question_text = df['question'].values[0]
    print(rule, summary, rule, question_text, sep="\n")

    for _, answer in df.iterrows():
        origin = f"{answer['original_source']!s} {answer['label_name']!s}"
        print(rule, origin, rule, answer['code'], rule, sep="\n")

In [5]:
def _latex_answer_title(row):
    """Return the heading text printed above the listing of one answer row."""
    # Checked first: a human-written row can still carry a model's results
    # file name in 'original_source'.
    if row['label_name'] == 'human_written':
        return "Human-written"
    source = row['original_source']
    if "CodeLlama" in source:
        return "CodeLlama 70B Generated"
    if "Mixtral" in source:
        return "Mixtral 8x7B Generated"
    if "gemma" in source:
        return "Gemma 7B Generated"
    # Unknown generator: still emit a heading so the listing is not orphaned.
    return "Machine Generated"


def latex_q_and_a(df, minipage=False):
    r"""Print a question and its answers as LaTeX ``lstlisting`` blocks.

    The question is read from the first row of ``df``; the leading prompt
    "Provide me the Python3 codes for solving the question: " is removed when
    (and only when) the text starts with it. Every row then contributes a bold
    heading followed by a listing that contains the row's code.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows with the columns 'question', 'label_name', 'original_source'
        and 'code'.
    minipage : bool
        When true, all answers are wrapped in one ``figure`` environment and
        each answer in its own ``minipage`` of width ``0.49\textwidth``.

    Returns
    -------
    None
        The LaTeX source is written to stdout.

    Example of the printed output (abridged)::

        \textbf{Question:}
        \begin{lstlisting}[language=Python, style=question]
        You are given N positive integers a_1, a_2, ..., a_N.
        ...
        \end{lstlisting}
        \textbf{Human-written}
        \begin{lstlisting}[language=Python, style=python]
        n = int(input())
        ...
        \end{lstlisting}
        \textbf{CodeLlama 70B Generated}
        \begin{lstlisting}[language=Python, style=python]
        def max_f(a):
        ...
        \end{lstlisting}
    """
    prompt_prefix = "Provide me the Python3 codes for solving the question: "

    print("\\textbf{Question:}")
    print("\\begin{lstlisting}[language=Python, style=question]")
    question = df['question'].values[0]
    # Strip the prompt only if it is really there; slicing unconditionally
    # would cut off the beginning of a question stored without the prompt.
    if question.startswith(prompt_prefix):
        question = question[len(prompt_prefix):]
    print(question)
    print("\\end{lstlisting}")

    if minipage:
        print("\\begin{figure}[!h]")
    for _, row in df.iterrows():
        if minipage:
            print("     \\begin{minipage}{0.49\\textwidth}")
            print("\\centering")
        print("\\textbf{" + _latex_answer_title(row) + "}")
        print("\\begin{lstlisting}[language=Python, style=python]")
        print(row['code'])
        print("\\end{lstlisting}")
        if minipage:
            print("     \\end{minipage}")
    if minipage:
        print("\\end{figure}")

In [6]:
# Sample an introductory problem (default seed) and print it with its answers.
intro_q_and_a_df = get_q_and_a(df=dataset_df, difficulty='introductory')
visualize_q_and_a(df=intro_q_and_a_df)

     problem_id         label_name    difficulty
1064       4162      human_written  introductory
1560       4162  machine_generated  introductory
1947       4162  machine_generated  introductory
2156       4162  machine_generated  introductory
Provide me the Python3 codes for solving the question: You are given N positive integers a_1, a_2, ..., a_N.
For a non-negative integer m, let f(m) = (m\ mod\ a_1) + (m\ mod\ a_2) + ... + (m\ mod\ a_N).
Here, X\ mod\ Y denotes the remainder of the division of X by Y.
Find the maximum value of f.

-----Constraints-----
 - All values in input are integers.
 - 2 \leq N \leq 3000
 - 2 \leq a_i \leq 10^5

-----Input-----
Input is given from Standard Input in the following format:
N
a_1 a_2 ... a_N

-----Output-----
Print the maximum value of f.

-----Sample Input-----
3
3 4 6

-----Sample Output-----
10

f(11) = (11\ mod\ 3) + (11\ mod\ 4) + (11\ mod\ 6) = 10 is the maximum value of f.
results\gemma-7b-it-apps_introductory_207.jsonl human_written
n =

In [13]:
# Sample an introductory problem with seed 5 and keep only the rows whose
# source file name mentions Mixtral.
mixtral_intro = (
    get_q_and_a(dataset_df, difficulty='introductory', seed=5)
    .loc[lambda answers: answers['original_source'].str.contains('Mixtral')]
)
visualize_q_and_a(df=mixtral_intro)
    
        

     problem_id         label_name    difficulty
1949       4087  machine_generated  introductory
Provide me the Python3 codes for solving the question: Polycarp knows that if the sum of the digits of a number is divisible by $3$, then the number itself is divisible by $3$. He assumes that the numbers, the sum of the digits of which is divisible by $4$, are also somewhat interesting. Thus, he considers a positive integer $n$ interesting if its sum of digits is divisible by $4$.

Help Polycarp find the nearest larger or equal interesting number for the given number $a$. That is, find the interesting number $n$ such that $n \ge a$ and $n$ is minimal.


-----Input-----

The only line in the input contains an integer $a$ ($1 \le a \le 1000$).


-----Output-----

Print the nearest greater or equal interesting number for the given number $a$. In other words, print the interesting number $n$ such that $n \ge a$ and $n$ is minimal.


-----Examples-----
Input
432

Output
435

Input
99

Output
1

In [None]:
# Emit the LaTeX listings for the introductory sample.
latex_q_and_a(df=intro_q_and_a_df)

In [None]:
# Sample an interview-level problem (default seed) and print it with its answers.
interview_q_and_a_df = get_q_and_a(df=dataset_df, difficulty='interview')
visualize_q_and_a(df=interview_q_and_a_df)

In [None]:
# Emit the LaTeX listings for the interview sample.
latex_q_and_a(df=interview_q_and_a_df)

In [None]:
# Sample a competition-level problem (default seed) and print it with its answers.
competition_q_and_a_df = get_q_and_a(df=dataset_df, difficulty='competition')
visualize_q_and_a(df=competition_q_and_a_df)

In [None]:
# Emit the LaTeX listings for the competition sample.
latex_q_and_a(df=competition_q_and_a_df)

In [None]:
# Scratch cell: a bare string literal (displayed as the cell result) holding
# Python-like source text. The text carries Markdown-style artefacts: escaped
# underscores (`calculate\_f`, `max\_value`), `**name**` / `**main**` in place
# of the dunder names, and no indentation.
# NOTE(review): presumably a raw model response kept as an example of output
# that is not valid Python as-is — confirm, or delete the cell.
("import sys\n"
 "\n"
 "def calculate\\_f(a, m):\n"
 "n = len(a)\n"
 "result = 0\nfor i in range(n):\nresult += m % a[i]\nreturn result\n\ndef solve():\nn = int(sys.stdin.readline())\na = list(map(int, sys.stdin.readline().split()))\n\nmax\\_value = 0\nfor i in range(1, 10**5 + 1):\ncurrent\\_value = calculate\\_f(a, i)\nmax\\_value = max(max\\_value, current\\_value)\n\nprint(max\\_value)\n\nif **name** == '**main**':\nsolve()")

In [25]:
# Collect, from every results file whose name ends in '207.jsonl', the records
# whose raw JSON line contains the '# CANNOT PARSE' marker, remembering which
# file each record came from.
cannot_parse = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of cannot_parse_df reproducible between runs and machines.
for file in sorted(os.listdir('results')):
    if not file.endswith('207.jsonl'):
        continue
    with open(os.path.join('results', file), 'r') as f:
        # Stream the file; only lines containing the marker are parsed.
        for line in f:
            if '# CANNOT PARSE' not in line:
                continue
            record = json.loads(line)
            # add original file
            record['original_file'] = file
            cannot_parse.append(record)

# Selecting the columns in the constructor also works when nothing matched
# (an empty frame with these columns instead of a KeyError).
cannot_parse_df = pd.DataFrame(
    cannot_parse,
    columns=['problem_id', 'question', 'parsed_codes', 'original_file'],
)
display(cannot_parse_df)
        

Unnamed: 0,problem_id,question,parsed_codes,original_file
0,3692,Provide me the Python3 codes for solving the q...,# CANNOT PARSE\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,CodeLlama-70b-Instruct-hf-apps_competition_207...
1,3698,Provide me the Python3 codes for solving the q...,# CANNOT PARSE\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n...,CodeLlama-70b-Instruct-hf-apps_competition_207...
2,3700,Provide me the Python3 codes for solving the q...,"# CANNOT PARSE\n\n\nI apologize, but as a resp...",CodeLlama-70b-Instruct-hf-apps_competition_207...
3,3701,Provide me the Python3 codes for solving the q...,"# CANNOT PARSE\n\n\nI apologize, but I cannot ...",CodeLlama-70b-Instruct-hf-apps_competition_207...
4,3703,Provide me the Python3 codes for solving the q...,# CANNOT PARSE\n\n\nThis is a complex problem ...,CodeLlama-70b-Instruct-hf-apps_competition_207...
...,...,...,...,...
96,4146,Provide me the Python3 codes for solving the q...,# CANNOT PARSE\n\n\nThe provided code does not...,CodeLlama-70b-Instruct-hf-apps_introductory_20...
97,4177,Provide me the Python3 codes for solving the q...,# CANNOT PARSE\n\n\nI cannot fulfill your requ...,CodeLlama-70b-Instruct-hf-apps_introductory_20...
98,4191,Provide me the Python3 codes for solving the q...,# CANNOT PARSE\n\n\ndef main():\n # Initial...,CodeLlama-70b-Instruct-hf-apps_introductory_20...
99,4054,Provide me the Python3 codes for solving the q...,# CANNOT PARSE\n## Explanation\n\nThe text you...,gemma-7b-it-apps_introductory_207.jsonl


In [43]:
def visualize_cannot_parse(df):
    """Print the unparsable generations that contain more than the bare marker.

    Rows whose 'parsed_codes', once stripped of surrounding whitespace, equal
    '# CANNOT PARSE' are skipped. For every other row the 'problem_id' and
    'original_file' fields, the question and the 'parsed_codes' text are
    printed, delimited by rule lines. Returns None.
    """
    rule = "====================================="
    marker = '# CANNOT PARSE'

    for _, record in df.iterrows():
        generated = record['parsed_codes']
        if generated.strip() != marker:
            identity = record[['problem_id', 'original_file']]
            print(rule, identity, rule, record['question'], rule, generated, rule, sep="\n")
        
# Print every collected record that holds more than the bare marker.
visualize_cannot_parse(df=cannot_parse_df)

problem_id                                                    3700
original_file    CodeLlama-70b-Instruct-hf-apps_competition_207...
Name: 2, dtype: object
Provide me the Python3 codes for solving the question: Tanechka is shopping in the toy shop. There are exactly $n$ toys in the shop for sale, the cost of the $i$-th toy is $i$ burles. She wants to choose two toys in such a way that their total cost is $k$ burles. How many ways to do that does she have?

Each toy appears in the shop exactly once. Pairs $(a, b)$ and $(b, a)$ are considered equal. Pairs $(a, b)$, where $a=b$, are not allowed.


-----Input-----

The first line of the input contains two integers $n$, $k$ ($1 \le n, k \le 10^{14}$) — the number of toys and the expected total cost of the pair of toys.


-----Output-----

Print the number of ways to choose the pair of toys satisfying the condition above. Print 0, if Tanechka can choose no pair of toys in such a way that their total cost is $k$ burles.


-----Examples-----
