In [None]:
!pip install accelerate datasets

Collecting accelerate
  Downloading accelerate-0.29.3-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m297.6/297.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.19.0-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━

In [None]:
!apt-get install clang-format

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
clang-format is already the newest version (1:14.0-55~exp2).
0 upgraded, 0 newly installed, 0 to remove and 45 not upgraded.


In [None]:
import torch
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

model_name = "bigcode/starcoderbase-1b"

# Fall back to the CPU when no GPU is visible instead of failing in `.to("cuda")`.
# Downstream code moves its inputs to `model.device`, so nothing else has to change.
device = "cuda" if torch.cuda.is_available() else "cpu"

# NOTE(review): the config and tokenizer come from the base model while the
# weights come from the fine-tuned checkpoint; confirm that the checkpoint's own
# config (label names, number of labels) is not needed here.
config = AutoConfig.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The tokenizer has no dedicated padding token, so the end-of-sequence token is reused.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForSequenceClassification.from_pretrained("rohitc33/starcoder-1b-finetuned-ds-3", config=config)
model.to(device)
# The notebook only runs inference: make sure dropout is disabled.
model.eval()
# Keep the model's padding id in sync with the tokenizer's padding token.
model.config.pad_token_id = model.config.eos_token_id

In [None]:
import re
import torch

# Matches the C++ branch-prediction attributes `[[likely]]` and `[[unlikely]]`.
_BRANCH_HINT_RE = re.compile(r'\[\[(likely|unlikely)\]\]')


def mark_branch_hints(code, max_lines=100, max_chars=5000):
    '''
    Truncates a code snippet to its tail and strips any branch hints from it.

    Despite its name, this function does not add hints: it removes every
    `[[likely]]` / `[[unlikely]]` attribute already present in the snippet.

    @param code: source snippet; lines are separated by '\\n'
    @param max_lines: number of trailing lines to keep (must be >= 1)
    @param max_chars: maximum allowed length of the truncated snippet, measured
                      before the hints are removed
    @return: the truncated snippet without hints, or None when the truncated
             snippet is longer than max_chars
    @raise ValueError: if max_lines is smaller than 1
    '''
    if max_lines < 1:
        # lines[-0:] would silently keep *every* line, so reject it explicitly.
        raise ValueError("max_lines must be a positive integer")

    # Only the end of the snippet is kept: drop everything but the last max_lines lines.
    tail = "\n".join(code.split('\n')[-max_lines:])
    if len(tail) > max_chars:
        return None
    return _BRANCH_HINT_RE.sub("", tail)

def predict_label(code):
    '''
    Asks the classifier for a branch hint for one code example.

    The example is first passed through mark_branch_hints; if that rejects it
    (returns None) no prediction is made. Otherwise the two logits of the model
    are printed and a hint is returned only when they are at least 2 apart.

    NOTE(review): class 0 is mapped to "[[likely]]" here, whereas label_example
    maps class 1 to "[[likely]]" -- confirm which mapping the checkpoint was
    trained with.

    @param code: code example containing the `if` statement to classify
    @return: "[[likely]]", "[[unlikely]]", or None when the example was rejected
             or the logit margin is below 2
    '''
    snippet = mark_branch_hints(code)
    if snippet is None:
        return None

    encoded = tokenizer(
        snippet,
        padding="max_length",
        truncation=True,
        max_length=1024,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        logits = model(**encoded).logits[0]
        print(logits)

        # int() truncates toward zero, so this accepts exactly |difference| >= 2.
        margin = int(logits[1] - logits[0])
        if abs(margin) < 2:
            return None

        label = logits.argmax()
        print(label)
        return "[[likely]]" if int(label) == 0 else "[[unlikely]]"

In [None]:
import os
import sys
import random


def find_start_of_function(lines: list, i: int) -> int:
    '''
    Starting at an "if" statement, finds the start of the outermost function it is located in (no indent).

    The search never moves above the first line of the file: the returned index
    is clamped at 0 instead of wrapping around to the end of `lines`.

    @param lines: list of lines in the c file
    @param i: index of the line to start searching upward from (the caller
              passes the line of the "if" statement)
    @return: index of the line where the function context starts; this is either
             the blank line above the function header, line 0, or the line
             reached after the header-line limit below
    '''

    # 1) Walk upward until a line without leading whitespace is found. Blank
    #    lines still carry their newline, which lstrip() removes, so they count
    #    as "indented" and are skipped as well.
    while i > 0 and len(lines[i]) > len(lines[i].lstrip()):
        i -= 1
    # no indent (start of function, ideally) found

    # 2) Keep walking upward over the function header (return type, name, ...)
    #    until a blank line is reached. `count <= 5` permits at most 6 steps.
    count = 0
    while i > 0 and lines[i].strip() and count <= 5:
        i -= 1
        count += 1
    return i

def end_of_if(indent: int, lines: list, i: int):
    '''
    Finds the first line, at index i or later, that starts with "}" at the given indent.

    @param indent: number of leading whitespace characters the closing line must have
    @param lines: list of lines in the c file
    @param i: index at which the search starts
    @return: index of the matching line; when there is none, the index one past
             the last line (or i itself if i is already beyond that)
    '''
    matches = (
        idx
        for idx in range(i, len(lines))
        if lines[idx].lstrip().startswith("}")
        and len(lines[idx]) - len(lines[idx].lstrip()) == indent
    )
    # Without a match the scan ends where the original counter would have stopped.
    return next(matches, max(i, len(lines)))

def label_example(example: str, class_int: int) -> str:
    '''
    Replaces every "// SENTINEL" marker in an example with a branch hint.

    NOTE(review): class 1 is mapped to "[[likely]]" here, whereas predict_label
    maps class 0 to "[[likely]]" -- confirm which mapping is intended.

    @param example: code example containing the "// SENTINEL" marker
    @param class_int: 1 selects "[[likely]]"; any other value selects "[[unlikely]]"
    @return: the example as a single string with the marker replaced
    '''
    class_str = "[[likely]]" if class_int == 1 else "[[unlikely]]"
    return example.replace("// SENTINEL", class_str)

def _is_if_statement(code: str) -> bool:
    '''Returns True when `code` begins with the keyword "if" itself, not with an identifier such as "ifstream".'''
    if not code.startswith("if"):
        return False
    following = code[2:3]
    return not (following.isalnum() or following == "_")


def gather_mp_examples(file: str) -> None:
    '''
    Annotates the "if" statements of a pre-formatted source file with predicted branch hints.

    For `<name><ext>` the function reads `<name>_clanged<ext>` (which must
    already exist) and writes `<name>_out<ext>`. Every line that starts with an
    "if" statement is turned into an example (function context above it, the
    line itself tagged with "// SENTINEL", and the lines down to the end found
    by end_of_if) and passed to predict_label. When a hint is predicted it is
    appended to the end of that line; all other lines are copied unchanged.
    The number of labeled and unlabeled "if" lines is printed at the end.

    NOTE(review): the hint is appended after whatever ends the line (for example
    after an opening "{") -- confirm that this placement is what is wanted in
    the generated C++.

    @param file: path of the original source file, e.g. "image_ppm.cc"
    @raise FileNotFoundError: if the pre-formatted input file does not exist
    '''
    labeled_count = 0
    not_labeled_count = 0

    # Derive the sibling file names from the real extension rather than
    # assuming that the path ends in a three-character ".cc" suffix.
    root, ext = os.path.splitext(file)
    new_file = root + "_clanged" + ext
    out_file = root + "_out" + ext

    if not os.path.isfile(new_file):
        raise FileNotFoundError(
            f"{new_file} not found; create it first, e.g. `clang-format {file} > {new_file}`"
        )

    with open(new_file, "r") as src, open(out_file, "w") as dst:
        lines = list(src)
        for i, line in enumerate(lines):
            code = line.lstrip()
            if _is_if_statement(code):
                indent = len(line) - len(code)
                func_start_idx = find_start_of_function(lines, i)
                if_end_idx = end_of_if(indent, lines, i)

                # Example = context above the "if" + tagged "if" line + its body.
                sentinel_line = line.rstrip() + " // SENTINEL" + "\n"
                prefix_str = "".join(lines[func_start_idx:i])
                suffix_str = "".join(lines[i + 1:if_end_idx + 1])
                label = predict_label(prefix_str + sentinel_line + suffix_str)

                if label is not None:
                    # rstrip("\n") instead of [:-1]: the last line of a file may
                    # have no trailing newline, and must not lose a character.
                    dst.write(line.rstrip("\n") + f" {label}\n")
                    labeled_count += 1
                    continue
                not_labeled_count += 1
            dst.write(line)

    print(f"Labeled: {labeled_count}")
    print(f"Not labeled: {not_labeled_count}")

In [None]:
# Annotate image_ppm.cc: reads image_ppm_clanged.cc and writes image_ppm_out.cc.
# gather_mp_examples has no return statement, so x is None.
x = gather_mp_examples("image_ppm.cc")

tensor([ 1.4372, -1.9414], device='cuda:0')
tensor(0, device='cuda:0')
tensor([-0.6078,  0.1914], device='cuda:0')
tensor([ 0.4109, -0.7701], device='cuda:0')
tensor([ 0.0068, -0.0630], device='cuda:0')
tensor([ 0.6154, -1.0597], device='cuda:0')
tensor([ 1.6339, -1.8949], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 2.0778, -2.3455], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 2.0551, -2.2063], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 0.4518, -0.9442], device='cuda:0')
tensor([ 0.6501, -0.9252], device='cuda:0')
tensor([ 2.0644, -2.2442], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 1.7830, -1.9481], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 0.7379, -1.1159], device='cuda:0')
tensor([ 1.8947, -1.8375], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 0.6156, -1.1329], device='cuda:0')
tensor([ 0.8656, -1.4204], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 2.6013, -2.6694], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 

In [None]:
# Annotate seam_carver.cc: reads seam_carver_clanged.cc and writes seam_carver_out.cc.
# gather_mp_examples has no return statement, so x is None.
x = gather_mp_examples("seam_carver.cc")

tensor([ 1.5032, -1.6383], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 2.6846, -2.8316], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 1.5699, -1.8761], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 0.4379, -0.5547], device='cuda:0')
tensor([ 1.1977, -1.5543], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 0.8770, -1.2730], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 1.5732, -1.7012], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 1.7215, -1.9486], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 0.9812, -1.3294], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 0.6046, -0.9091], device='cuda:0')
tensor([ 1.7993, -1.6905], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 2.7180, -2.7590], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 2.4208, -2.6909], device='cuda:0')
tensor(0, device='cuda:0')
tensor([-0.4013,  0.2389], device='cuda:0')
tensor([ 0.8721, -1.1585], device='cuda:0')
tensor(0, device='cuda:0')
tensor([ 0.1230,