# Prelims


In [None]:
!pip install -U accelerate bitsandbytes datasets peft transformers


In [None]:
import torch
from transformers import (
    TrainingArguments,
    Trainer,
    ResNetModel,
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    CodeLlamaTokenizer
)
from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
import accelerate
import bitsandbytes
from transformers.modeling_outputs import SequenceClassifierOutput
import json
import time

from datasets import load_dataset

In [None]:
print(torch.cuda.is_available())

In [None]:
mbpp = load_dataset("mbpp")  # train, validation, and test
humaneval = load_dataset("openai_humaneval")  # test only

In [None]:
# Show every field of the first MBPP test record, then the size of the split.
first_test_record = mbpp["test"][0]
for field_name, field_value in first_test_record.items():
    print(field_name, field_value, '\n')

print(len(mbpp['test']))

In [None]:

# Print the dataset summary, then the reference solution and the prompt of
# the first HumanEval test task.
print(humaneval)
humaneval_test_split = humaneval['test']
for column_name in ('canonical_solution', 'prompt'):
    print(humaneval_test_split[column_name][0])


NameError: name 'humaneval' is not defined

# Simple Modelling

In [None]:
# System instruction inserted into the prompts built in the cells below: it
# asks for a behavior-preserving refactoring returned as a code block only.
system_instruction  = "Refactor the given Python program to a more readable, efficient, and maintainable one. You can assume that the given program is semantically correct. Do not change the external behavior of the program, and keep the syntactic and semantic correctness. Python programs should be in a code block. Do not explain anything in natural language."

In [None]:
# Load the CodeLlama instruct checkpoint with a 4-bit bitsandbytes
# quantization config (float16 compute dtype) together with its tokenizer.
model_id = "codellama/CodeLlama-7b-Instruct-hf"
quantization_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_compute_dtype=torch.float16
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# device_map="auto" leaves weight placement to the library
# (presumably the available GPU -- the later cells move inputs to "cuda").
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
)


In [None]:
# Smoke test: send the first HumanEval canonical solution through the model
# with the shared system instruction and print the decoded generation.
user = humaneval['test']['canonical_solution'][0]
prompt = f"<s>[INST] <<SYS>>\\n{system_instruction}\\n<</SYS>>\\n\\n{user}[/INST]"
inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")

# Sampling settings for this one-off generation.
generation_kwargs = dict(
    max_new_tokens=200,
    do_sample=True,
    top_p=0.9,
    temperature=0.1,
    pad_token_id=tokenizer.eos_token_id,
)
output = model.generate(inputs["input_ids"], **generation_kwargs)
output = output[0].to("cpu")
print(tokenizer.decode(output))

In [None]:
# Build [index, prompt] pairs for HumanEval. generate_output unpacks every
# item as `idx, p`, so a list of bare prompt strings would fail on unpacking.
humaneval_prompts = [
    [idx, task['prompt'] + task['canonical_solution']]
    for idx, task in enumerate(humaneval["test"])
]

humaneval_outputs = generate_output(humaneval_prompts)


In [None]:
import csv

def generate_output(prompts):
    """Generate one model completion per prompt.

    Args:
        prompts: Iterable of ``(index, prompt_text)`` pairs.

    Returns:
        A list with one dict per pair, holding the keys ``'index'``,
        ``'prompt'`` (the text as given) and ``'output'`` (the decoded
        sequence returned by ``model.generate``).

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``system_instruction``.
    """
    results = []
    for idx, user in prompts:
        # Progress indicator: indices are printed back to back on one line.
        print(idx, end = '')
        prompt = f"<s>[INST] <<SYS>>\\n{system_instruction}\\n<</SYS>>\\n\\n{user}[/INST]"
        encoded = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")
        generated = model.generate(
            encoded["input_ids"],
            max_new_tokens=200,
            do_sample=True,
            top_p=0.9,
            temperature=0.1,
            pad_token_id=tokenizer.eos_token_id
        )
        decoded = tokenizer.decode(generated[0].to("cpu"))
        results.append({'index': idx,  'prompt': user, 'output': decoded})
    return results

def parsed_csv(file_path, results):
    """Write generation records to a CSV file.

    Args:
        file_path: Destination path; an existing file is overwritten.
        results: Iterable of dicts with the keys ``'index'``, ``'prompt'``
            and ``'output'``.
    """
    column_names = ['index', 'prompt', 'output']
    with open(file_path, 'w', newline='') as handle:
        table = csv.DictWriter(handle, fieldnames=column_names)
        table.writeheader()
        table.writerows(results)


In [None]:
print("HUMANEVAL\n")
# Pair each HumanEval task's position with its prompt followed by the
# canonical solution, generate a refactoring for each, and save the results.
humaneval_prompts = [
    [idx, task['prompt'] + task['canonical_solution']]
    for idx, task in enumerate(humaneval["test"])
]

humaneval_outputs = generate_output(humaneval_prompts)
parsed_csv('/content/humaneval_parsed.csv', humaneval_outputs)



In [None]:
print("\n\nMBPP\n")
# Pair each MBPP test task's position with its description, a blank line and
# its reference code, generate a refactoring for each, and save the results.
mbpp_prompts = [
    [idx, task['text'] + '\n\n' + task['code']]
    for idx, task in enumerate(mbpp["test"])
]

mbpp_outputs = generate_output(mbpp_prompts)
parsed_csv('/content/mbpp_parsed.csv', mbpp_outputs)



MBPP

01234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663

# Parsing

In [None]:
import csv
import re
def extract_python_function(text):
    """Return every Python function definition found in *text*.

    A match starts at a ``def ...:`` header that is immediately followed by a
    newline and continues over all following lines that are either blank or
    indented. Any indentation width is accepted (spaces or tabs), and the
    last body line does not need a trailing newline, so functions written
    with 2-space indents or ending at the end of the text are captured whole.

    Args:
        text: Arbitrary text that may contain function definitions.

    Returns:
        A list of matched definitions as strings, in order of appearance
        (empty when nothing matches). Blank lines directly after a body are
        part of the match.
    """
    # - header: 'def', then the rest of the line up to ':' + newline
    # - body:   indented lines (terminated by a newline or the end of the
    #           text) interleaved with empty lines
    pattern = r"(def .+?:\n(?:[ \t]+.*(?:\n|\Z)|\n)*)"

    return re.findall(pattern, text)


def extract_after_substring(full_string, substring):
    """Return the part of *full_string* after the first *substring*.

    Returns an empty string when *substring* does not occur.
    """
    position = full_string.find(substring)
    if position == -1:
        return ""
    return full_string[position + len(substring):]

def extract_between_markers(full_string, start_marker, end_marker):
    """Return the stripped text between the first marker pair.

    The end marker is searched for only after the end of the first start
    marker. Returns an empty string when either marker is missing.
    """
    begin = full_string.find(start_marker)
    content_start = begin + len(start_marker)
    finish = full_string.find(end_marker, content_start)
    if begin == -1 or finish == -1:
        return ""
    return full_string[content_start:finish].strip()

def extract_python_code(full_string):
    """Extract the generated code from one decoded model output.

    Only the text after the first "[/INST]" marker is considered. If it holds
    a pair of ``` fences, the stripped text between the first pair is used;
    when the opening fence carries a language tag (e.g. ```python), that tag
    line is dropped so the result starts with the code itself. Without a
    fenced block, the regex-based extract_python_function is used instead.

    Args:
        full_string: Decoded model output, prompt included.

    Returns:
        The fenced code as a ``str`` when a non-empty fenced block exists;
        otherwise the ``list`` of matches from extract_python_function
        (possibly empty). NOTE: callers must handle both return types.
    """
    extracted_string = extract_after_substring(full_string, "[/INST]")
    out = extract_between_markers(extracted_string, "```", "```")
    if out == '':
        return extract_python_function(extracted_string)

    # A fence opened as ```python leaves "python" as the first line of the
    # extracted block; remove it so only code remains.
    first_line, _, remainder = out.partition("\n")
    if first_line.strip().lower() in ("python", "python3", "py"):
        out = remainder.strip()
    return out


In [None]:
example = '''<s> [INST] <<SYS>>\nRefactor the given Python program to a more readable, efficient, and maintainable one. You can assume that the given program is semantically correct. Do not change the external behavior of the program, and keep the syntactic and semantic correctness. Python programs should be in a code block. Do not explain anything in natural language.\n<</SYS>>\n\nWrite a function to find the perimeter of a square.

def square_perimeter(a):

  perimeter=4*a

  return perimeter[/INST]  Here is a more readable, efficient, and maintainable version of the given Python program:

def square_perimeter(side_length):
    return 4 * side_length

This version of the program is more readable because it uses meaningful variable names and follows the PEP 8 style guide for Python code formatting. It is also more efficient because it avoids the unnecessary use of the `*` operator and the `return` statement. Finally, it is more maintainable because it is easier to understand and modify the code.</s>'''
# Run the regex-based extractor on the sample generation and show its matches.
print(extract_python_function(example))


['def square_perimeter(a):\n\n', 'def square_perimeter(side_length):\n    return 4 * side_length\n\n']


In [None]:
import pandas as pd
csv_file_path = 'humaneval_parsed.csv'

# Collect the raw model generations from the 'output' column.
with open(csv_file_path, 'r') as file:
    humaneval_output_values = [row['output'] for row in csv.DictReader(file)]

# Pull the code snippet out of every generation.
humaneval_snippets = [extract_python_code(text) for text in humaneval_output_values]

# Reload the same CSV as a DataFrame, attach the snippets as a new
# 'parsed_output' column, and save the result under a separate file name.
parsed ='humaneval_parsed_output.csv'
df = pd.read_csv(csv_file_path)
df['parsed_output'] = humaneval_snippets
df.to_csv(parsed, index=False)

In [None]:
csv_file_path = 'mbpp_parsed.csv'

# Collect the raw model generations from the 'output' column.
with open(csv_file_path, 'r') as file:
    mbpp_output_values = [row['output'] for row in csv.DictReader(file)]

# Pull the code snippet out of every generation and show the result.
mbpp_snippets = [extract_python_code(text) for text in mbpp_output_values]
print(mbpp_snippets)

# Reload the same CSV as a DataFrame, attach the snippets as a new
# 'parsed_output' column, and save the result under a separate file name.
parsed ='mbpp_parsed_output.csv'
df = pd.read_csv(csv_file_path)
df['parsed_output'] = mbpp_snippets
df.to_csv(parsed, index=False)

[['def remove_Occ(s, ch):\n    for i in range(len(s)):\n        if s[i] == ch:\n            s = s[:i] + s[i+1:]\n            break\n    for i in range(len(s) - 1, -1, -1):\n        if s[i] == ch:\n            s = s[:i] + s[i+1:]\n            break\n    return s\n\n'], ['def sort_matrix(M):\n    return sorted(M, key=sum)\n\n'], 'from collections import Counter\n\ndef count_common(words):\n    word_counts = Counter(words)\n    top_four = word_counts.most_common(4)\n    return top_four', 'def find_volume(l, b, h):\n    return (l * b * h) / 2', "import re\n\ndef split_lowerstring(text):\n    return re.split(r'[a-z]', text)", 'import re\n\ndef text_lowercase_underscore(text):\n    pattern = r"^[a-z]+_[a-z]+$"\n    if re.search(pattern, text):\n        return "Found a match!"\n    else:\n        return "Not matched!"', ['def square_perimeter(side_length):\n    return 4 * side_length\n\n'], "def remove_dirty_chars(string, second_string):\n    clean_string = ''\n    for char in string:\n      

In [None]:
# Flag every generation from which no code could be extracted.
# extract_python_code yields '' only indirectly: a failed extraction ends in
# the regex fallback, which returns an empty list, so test for emptiness in
# general instead of comparing against '' alone.
for elem in mbpp_snippets:
  if not elem:
    print('1')

# Modelling (DEPRECATED)


In [None]:
# Deprecated setup: reloads the same checkpoint with an NF4 / bfloat16
# quantization config and extends the tokenizer with chat-style tokens.
modelpath = "codellama/CodeLlama-7b-Instruct-hf"
# model = AutoModelForCausalLM.from_pretrained(
#     base_model,
#     load_in_8bit=True,
#     torch_dtype=torch.float16,
#     device_map="auto",
# )
# tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-Python-hf")

# Load 4-bit quantized model
model = AutoModelForCausalLM.from_pretrained(
    modelpath,
    device_map="auto",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4",
    ),
    torch_dtype=torch.bfloat16,
)

# Load (slow) Tokenizer, fast tokenizer sometimes ignores added tokens
tokenizer = AutoTokenizer.from_pretrained(modelpath, use_fast=False)

# Add tokens <|im_start|> and <|im_end|>, latter is special eos token
tokenizer.pad_token = "</s>"
tokenizer.add_tokens(["<|im_start|>"])
tokenizer.add_special_tokens(dict(eos_token="<|im_end|>"))
# Resize the embedding matrix to the tokenizer's new length and point the
# model config at the new eos token id.
model.resize_token_embeddings(len(tokenizer))
model.config.eos_token_id = tokenizer.eos_token_id

In [None]:
# Add LoRA adapters to model
# Keep a reference to the model object as it was before the k-bit
# preparation and PEFT wrapping below.
base_model = model
model = prepare_model_for_kbit_training(model)
config = LoraConfig(
    r=64,
    lora_alpha=16,
    # Adapters are attached to the attention and MLP projection layers.
    target_modules = ['q_proj', 'k_proj', 'down_proj', 'v_proj', 'gate_proj', 'o_proj', 'up_proj'],
    lora_dropout=0.1,
    bias="none",
    # NOTE(review): presumably listed so the embeddings resized for the added
    # tokens are trained and saved in full -- confirm.
    modules_to_save = ["lm_head", "embed_tokens"],
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, config).to('cuda')
model.config.use_cache = False

# Prompting (DEPRECATED)

In [None]:
# Deprecated: time a single free-form generation with the current model.
model = model.to('cuda')

# https://huggingface.co/blog/codellama#conversational-instructions

# NOTE: the next two assignments are overwritten before use; only the
# fibonacci prompt below is actually sent to the model.
prompt = system_instruction + '\n\n\n' + mbpp["train"][0]['code']
prompt = system_instruction + '\n\n\n' + humaneval['test']['canonical_solution'][0]

prompt = "Write a python function to generate the fibonacci series."
# for key, value in mbpp["train"][0].items():
#   print(key,value, '\n')
print(prompt)
# PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

# input = tokenizer([prompt], return_tensors="pt").to("cuda")
print('\n\nGENERATING\n\n\n\n')
model_inputs = tokenizer([prompt], return_tensors="pt", padding=True).to("cuda")
# Measure the wall-clock time of the generate call only.
start = time.time()
generated_ids = model.generate(**model_inputs, max_new_tokens=1024)
print("time: ", time.time()- start)
output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
print(output[0].rstrip())

In [None]:
def append_to_json(file_path, new_data):
    """Append one record or a list of records to a JSON array file.

    Args:
        file_path: Path of the JSON file; a missing file is treated as an
            empty list.
        new_data: A single item to append, or a list whose items are all
            appended.

    The whole file is rewritten with 4-space indentation.
    """
    try:
        with open(file_path, 'r') as handle:
            records = json.load(handle)
    except FileNotFoundError:
        records = []

    # Normalise to a list so one extend covers both input shapes.
    additions = new_data if isinstance(new_data, list) else [new_data]
    records.extend(additions)

    with open(file_path, 'w') as handle:
        json.dump(records, handle, indent=4)

In [None]:
file_path = 'llama_mbpp.json'
batch_size = 10

# Refactor the first 100 MBPP train examples in 10 batches of 10 and append
# each batch's outputs to the JSON file.
for mult in range(10):
  # Contiguous, non-overlapping indices per batch. The earlier `num*mult`
  # indexing sent example 0 ten times in the first batch and revisited the
  # same examples across later batches.
  indices = range(mult * batch_size, (mult + 1) * batch_size)
  prompts = [
      system_instruction + '\n\nCode:\n\n' + mbpp["train"][idx]['code']
      for idx in indices
  ]

  model_inputs = tokenizer(prompts, return_tensors="pt", padding=True).to("cuda")
  generated_ids = model.generate(**model_inputs, max_new_tokens=512)
  outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
  print(outputs)
  # Record each output under the dataset index it was generated from.
  data = [{'mbpp_num': idx, 'output': out} for idx, out in zip(indices, outputs)]

  append_to_json(file_path, data)
