This notebook calculates BLEU and CodeBLEU scores across models. It also tests the functions generated in the model runs to check whether they solve their corresponding tasks.

In [207]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.5.1-py3-none-any.whl.metadata (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-3.1.1-py3-none-any.whl.metadata (8.6 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.5.1-py3-none-any.whl (104 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-3.1.1-py3-none-any.whl (19 kB)
Installing collected packages: portalocker, colorama, sacrebleu
Successfully installed colorama-0.4.6 portalocker-3.1.1 sacrebleu-2.5.1


In [268]:
! pip install transformers
!pip install tree_sitter==0.2.0
! git clone -q https://github.com/microsoft/CodeXGLUE.git

Collecting tree_sitter==0.2.0
  Downloading tree_sitter-0.2.0.tar.gz (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.4/110.4 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tree_sitter
  Building wheel for tree_sitter (setup.py) ... [?25l[?25hdone
  Created wheel for tree_sitter: filename=tree_sitter-0.2.0-cp311-cp311-linux_x86_64.whl size=389537 sha256=2ce649a280f7957f14580cf9bd47b4539171393bc7438063095e6de3b619b2f9
  Stored in directory: /root/.cache/pip/wheels/d9/6e/e2/b0126ad4f531cf09749b69518118f0ebf7bf3134ed91c71abb
Successfully built tree_sitter
Installing collected packages: tree_sitter
Successfully installed tree_sitter-0.2.0


In [578]:
import json

# Load every recorded run from the reformatted results file; the top-level
# object is indexed by "task <n>" keys elsewhere in this notebook.
# The encoding is given explicitly, like the other file handles in this
# notebook, so decoding does not depend on the platform's default locale.
with open('raw_prompts_and_outputs_reformatted.json', 'r', encoding='utf-8') as file:
    data_new = json.load(file)

In [188]:
# Model names; entries are passed to get_run_info and compared with each run's "model" field.
model_choices = ["gpt-4o-mini", "Codestral-2501", "gpt-4.1-mini"]
# Prompting strategy names; entries are compared with each run's "prompt strategy" field.
prompt_strategies = ["zero shot", "few shot", "prompt chaining", "self consistency", "chain of thought"]

In [187]:
def get_run_info(task_num, model, prompt_strat):
  """Look up a single recorded run.

  Scans the runs stored in ``data_new`` under the key ``"task <task_num>"`` and
  returns the first one whose ``"model"`` and ``"prompt strategy"`` fields equal
  the given values. Returns ``None`` when no run matches.
  """
  matching_runs = (
    entry
    for entry in data_new[f"task {task_num}"]
    if entry["model"] == model and entry["prompt strategy"] == prompt_strat
  )
  return next(matching_runs, None)

In [622]:
# Select one recorded run to inspect: task 21, third entry of model_choices, first prompt strategy.
run = get_run_info(21, model_choices[2], prompt_strategies[0])

In [623]:
# Show the "output" field stored for the selected run.
print(run["output"])

Let's analyze the provided script first, then identify issues, and finally refactor it.

---

## Original Script Analysis

```python
# utils .py - Script to parse and summarize numeric CSV files

import csv

def read_csv ( filepath ) :
  with open ( filepath , ’r’)as f :
    return [ row for row in csv . reader ( f ) ]

def summarize_column ( data , index ) :
  values = [ float ( row [ index ]) for row in data [1:]] # skip header
  total = sum( values )
  avg = total / len ( values )
  return total , avg

def main () :
  filepath = ’data .csv ’
  data = read_csv ( filepath )
  total , avg = summarize_column ( data , 1)
  print (" Total :", total )
  print (" Average :", avg )

if __name__ == ’__main__ ’:
  main ()
```

### Issues and potential improvements

1. **Syntax/Formatting Issues:**

   - The code uses non-standard whitespace: spaces inside parentheses, e.g. `read_csv ( filepath )` instead of `read_csv(filepath)`.
   - The script has inconsistent spacing and indentation style.
 

# Comparative Analysis Across Models

In [579]:
# Pairwise BLEU between consecutive runs of every task: each even-indexed run is
# passed as the hypothesis and the run right after it as the reference.
# calc_bleu is defined in a later cell of this notebook and must already be in
# the kernel namespace when this cell runs.
# NOTE(review): v[i+1] assumes every task stores an even number of runs and that
# both runs of a pair use the same prompt strategy / output layout — confirm.
for k,v in data_new.items():
  for i, run in enumerate(v):
    # Odd positions are only used as the partner (v[i+1]) of the run before them.
    if i%2 != 0:
      continue
    strat = run["prompt strategy"]

    # For prompt chaining the output is a mapping; each entry is scored against
    # the partner run's entry under the same key.
    if (strat == "prompt chaining"):
      for k2, v2 in run["output"].items():
        bleu = calc_bleu(v2, v[i+1]["output"][k2])
        print(f"({k}, {strat}, {k2}): {bleu}")
      continue

    # For self consistency only the two named attempts are compared, attempt by attempt.
    if strat == "self consistency":
      bleu2 = calc_bleu(run["output"]["output attempt 1"], v[i+1]["output"]["output attempt 1"])
      bleu3 = calc_bleu(run["output"]["output attempt 2"], v[i+1]["output"]["output attempt 2"])
      print(f"({k}, {strat}): {bleu2}, {bleu3}")
      continue

    # Every other strategy: the outputs are passed to calc_bleu directly.
    bleu = calc_bleu(run["output"], v[i+1]["output"])
    print(f"({k}, {strat}): {bleu}")

(task 1, zero shot): 24.08
(task 1, self consistency): 22.78, 42.34
(task 2, zero shot): 29.36
(task 2, prompt chaining, response 1): 31.58
(task 2, prompt chaining, response 2): 33.11
(task 3, zero shot): 6.27
(task 3, chain of thought): 21.08
(task 4, prompt chaining, response 1): 15.47
(task 4, prompt chaining, response 2): 41.59
(task 4, few shot): 28.24
(task 5, chain of thought): 40.58
(task 5, zero shot): 46.24
(task 6, zero shot): 100.00
(task 6, self consistency): 87.54, 79.67
(task 7, zero shot): 26.49
(task 7, self consistency): 26.67, 27.88
(task 8, zero shot): 6.29
(task 8, chain of thought): 22.60
(task 9, zero shot): 35.49
(task 9, chain of thought): 48.96
(task 10, zero shot): 21.31
(task 10, self consistency): 21.31, 93.06
(task 11, zero shot): 6.87
(task 11, self consistency): 2.18, 8.08
(task 12, zero shot): 76.37
(task 12, few shot): 76.37
(task 13, zero shot): 37.96
(task 13, chain of thought): 26.98
(task 14, zero shot): 100.00
(task 14, few shot): 17.78
(task 15,

In [244]:
import sacrebleu

def calc_bleu(hyp, ref):
  """Score one hypothesis string against one reference string with sacreBLEU.

  Args:
    hyp: Hypothesis text.
    ref: Reference text to compare against.

  Returns:
    The corpus-level BLEU score formatted as a string with two decimal places.
  """
  # corpus_bleu takes a list of hypotheses and a list of reference streams,
  # each stream being a list parallel to the hypotheses.
  result = sacrebleu.corpus_bleu([hyp], [[ref]])
  # Read the numeric attribute instead of slicing the printable summary, whose
  # layout is a display detail; the two-decimal string keeps the value callers
  # print unchanged.
  return f"{result.score:.2f}"

In [362]:
def remove_newlines(text):
    """Return ``text`` with every line-feed and carriage-return character deleted.

    Nothing is inserted in their place, so the end of one line is joined
    directly to the start of the next.
    """
    return ''.join(ch for ch in text if ch != '\n' and ch != '\r')

In [630]:
# Code snippets to compare with CodeBLEU: the first list's snippet is later
# written out as the hypothesis file and the second list's as the reference file.
model1_codes = []
model2_codes = []

In [631]:
# First snippet for the CodeBLEU comparison; it is collapsed to one line at the
# end of this cell.
# NOTE(review): remove_newlines inserts nothing where a newline was, so adjacent
# lines are fused (e.g. "import string" runs straight into "def load_file") and
# the stored text is no longer valid Python — confirm this is what the evaluator should see.
code1 = """import string

def load_file(filepath):
    with open(filepath, 'r') as f:
        return f.readlines()

def clean_line(line):
    # Remove punctuation and make lowercase
    translator = str.maketrans('', '', string.punctuation)
    return line.translate(translator).lower()

def count_words(lines):
    word_counts = {}
    for line in lines:
        clean = clean_line(line)
        for word in clean.split():
            word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts

def main():
    filepath = 'input.txt'
    lines = load_file(filepath)
    counts = count_words(lines)
    for word, count in sorted(counts.items()):
        print(f"{word}: {count}")

if __name__ == '__main__':
    main()
"""
code1=remove_newlines(code1)

In [632]:
# Second snippet for the CodeBLEU comparison; it is collapsed to one line at the
# end of this cell.
# NOTE(review): remove_newlines inserts nothing where a newline was, so adjacent
# lines are fused and the stored text is no longer valid Python — confirm this
# is what the evaluator should see.
code2 = """import string

def load_file(filepath):
    with open(filepath, 'r') as f:
        return f.readlines()

def clean_line(line):
    translator = str.maketrans('', '', string.punctuation)
    clean_line = line.translate(translator).lower()
    return clean_line

def count_words(lines):
    word_counts = {}
    for line in lines:
        clean = clean_line(line)
        for word in clean.split():
            word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts

def main():
    filepath = 'input.txt'
    lines = load_file(filepath)
    counts = count_words(lines)
    for word, count in sorted(counts.items()):
        print(f"{word}: {count}")

if __name__ == '__main__':
    main()
"""
code2=remove_newlines(code2)

In [633]:
# Store the first snippet; re-running this cell appends it again.
model1_codes.append(code1)

In [634]:
# Store the second snippet; re-running this cell appends it again.
model2_codes.append(code2)

In [635]:
# CodeBLEU for the first snippet pair: model1's snippet is written as the
# hypothesis file and model2's as the reference file, then the CodeXGLUE
# calc_code_bleu.py script is run on them with --params 0.25,0.25,0.25,0.25.
# The files are written relative to the working directory but read back from
# /content/ — assumes the kernel runs in /content (TODO confirm).
# NOTE(review): the recorded output reports dataflow_match 0.0 for two nearly
# identical snippets; presumably the newline-stripped Python no longer parses — confirm.
with open("predictions.txt", "w", encoding="utf-8") as f:
  f.write(model1_codes[0])
with open("targets.txt", "w", encoding="utf-8") as f:
  f.write(model2_codes[0])
metrics = !cd /content/CodeXGLUE/Code-Code/code-to-code-trans/evaluator/CodeBLEU/ && python calc_code_bleu.py --refs /content/targets.txt --hyp /content/predictions.txt --lang python --params 0.25,0.25,0.25,0.25
print(metrics)

['ngram match: 0.8366270549736919, weighted ngram match: 0.8822032599751687, syntax_match: 0.38333333333333336, dataflow_match: 0.0', 'CodeBLEU score:  0.5255409120705485']


# Evaluate Task 4

In [637]:
# Sample addresses fed to each is_valid_email candidate below.
ex1 = "x@example.com"  # one-character local part
ex2 = "Abc..123@example.com"  # two consecutive dots in the local part
ex3 = "user.name+tag+sorting@example.com"  # dots and plus signs in the local part
ex4 = "user@.com"  # domain begins with a dot
ex5 = "angh__hgj@domain.com"  # consecutive underscores in the local part

In [636]:
import re

def is_valid_email(email):
    """Check ``email`` against a length cap and an anchored address pattern.

    Returns ``False`` for anything longer than 254 characters; otherwise
    returns whether the pattern assembled below matches.
    """
    if len(email) > 254:
        return False

    address_re = re.compile(
        r'^(?!.*\.\.)'                              # reject two dots in a row anywhere
        r'[A-Za-z0-9!#$%&\'*+/=?^_`{|}~-]+'        # first run of local-part characters
        r'(?:\.[A-Za-z0-9!#$%&\'*+/=?^_`{|}~-]+)*' # further runs, each introduced by a dot
        r'@'
        r'(?:(?:[A-Za-z0-9](?:[A-Za-z0-9-]*[A-Za-z0-9])?\.)+'  # dot-terminated domain labels
        r'[A-Za-z]{2,})$'                           # final label: two or more letters
    )
    return address_re.match(email) is not None


In [638]:
# Run the validator defined above on the five sample addresses, one result per line.
for address in (ex1, ex2, ex3, ex4, ex5):
    print(is_valid_email(address))

True
False
True
False
True


In [639]:
import re

def is_valid_email(email):
    """Report whether ``email`` matches the anchored address pattern below."""
    # Local part: dot-separated runs of the allowed characters.
    # Domain: at least two dot-separated labels of 1-63 characters that start
    # and end with a letter or digit.
    pattern = r'^[a-zA-Z0-9!#$%&\'*+\-/=?^_`{|}~]+(\.[a-zA-Z0-9!#$%&\'*+\-/=?^_`{|}~]+)*@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)+$'
    return re.match(pattern, email) is not None


In [640]:
# Run the validator defined above on the five sample addresses, one result per line.
for address in (ex1, ex2, ex3, ex4, ex5):
    print(is_valid_email(address))

True
False
True
False
True


In [641]:
import re

def is_valid_email(email):
    """Report whether ``email`` matches the lookahead-guarded pattern below.

    The leading negative lookaheads reject, among other things, two dots in a
    row, a trailing dot, a second ``@``, and a hyphen or dot right next to the
    ``@`` as written in the pattern.
    """
    found = re.match(
        r'^(?!.*\.\.)(?!.*\.$)(?!.*@.*@)(?!.*@-)(?!.*-@)(?!.*@\.)(?!.*\.$)[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$',
        email,
    )
    return found is not None

In [642]:
# Run the validator defined above on the five sample addresses, one result per line.
for address in (ex1, ex2, ex3, ex4, ex5):
    print(is_valid_email(address))

True
False
True
False
True


In [643]:
import re

def is_valid_email(email):
    """Report whether ``email`` matches the simple ``local@domain.rest`` pattern below.

    The pattern places no restriction on consecutive dots in the local part.
    """
    return re.match(r"(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)", email) is not None

In [644]:
# Run the validator defined above on the five sample addresses, one result per line.
for address in (ex1, ex2, ex3, ex4, ex5):
    print(is_valid_email(address))

True
True
True
False
True


# Evaluate Task 8

In [413]:
# Sample CSV lines fed to each parse_csv_line candidate below.
# test1: a quoted field that contains commas.
test1 = 'value1,"value, with, commas",value3'
# test2: doubled quotes inside a quoted field, plus a quoted field with an embedded newline.
test2 = 'John Doe,"123, Main St",50,"He said, ""Hello!""","New\nLine"'
# test3: quoted first and last fields, each containing a comma.
test3 = '"Doe, John",25,"New York, NY"'

In [414]:
import csv
from io import StringIO

def parse_csv_line(line):
    """Split one CSV record into its fields using the standard ``csv`` reader.

    The text is wrapped in an in-memory file object so the reader can consume
    it; only the first record the reader yields is returned.
    """
    return next(csv.reader(StringIO(line)))

In [415]:
# Parse the three sample lines with the current parse_csv_line, one result per line.
for csv_line in (test1, test2, test3):
    print(parse_csv_line(csv_line))

['value1', 'value, with, commas', 'value3']
['John Doe', '123, Main St', '50', 'He said, "Hello!"', 'New\nLine']
['Doe, John', '25', 'New York, NY']


In [402]:
def parse_csv_line(line):
    """Split ``line`` on commas that fall outside double-quoted sections.

    A backslash is dropped and causes the next character to be kept verbatim.
    A double quote opens or closes a quoted section and is not kept itself,
    except that inside a quoted section a quote arriving right after a kept
    quote character is kept as well. The text collected after the last
    separator is always emitted as the final field.
    """
    columns = []
    current = []
    quoted = False
    take_literal = False

    for ch in line:
        if take_literal:
            # Character following a backslash: keep it as-is.
            current.append(ch)
            take_literal = False
            continue

        if ch == '"':
            if not quoted:
                quoted = True
            elif current and current[-1] == '"':
                current.append(ch)
            else:
                quoted = False
        elif ch == '\\':
            take_literal = True
        elif ch == ',' and not quoted:
            # Separator outside quotes: close the current field.
            columns.append(''.join(current))
            current = []
        else:
            current.append(ch)

    columns.append(''.join(current))
    return columns

In [416]:
# Parse the three sample lines with the current parse_csv_line, one result per line.
# NOTE(review): this cell's execution count (416) follows the csv-module
# definition (414), while the definition directly above ran earlier (402), so
# the recorded output likely comes from the csv-module version — re-run top to
# bottom to confirm.
for csv_line in (test1, test2, test3):
    print(parse_csv_line(csv_line))

['value1', 'value, with, commas', 'value3']
['John Doe', '123, Main St', '50', 'He said, "Hello!"', 'New\nLine']
['Doe, John', '25', 'New York, NY']


In [408]:
import re

def parse_csv_line(line):
    """Collect the fields that the regular expression below finds in ``line``.

    The line is stripped of surrounding whitespace and scanned with
    ``re.findall``. For each match the text between the quotes is kept as-is
    when it is non-empty; otherwise a non-empty unquoted capture is kept after
    stripping whitespace.

    NOTE(review): the pattern is applied without ``re.VERBOSE``, so the spaces
    on either side of ``|`` are literal: a quoted field only matches when a
    space follows it, and an unquoted run only when a space precedes it.
    """
    pattern = r'''("([^"]*)" | ([^,"]+))'''

    return [
        quoted_field or unquoted_field.strip()
        for _whole, quoted_field, unquoted_field in re.findall(pattern, line.strip())
        if quoted_field or unquoted_field
    ]


In [417]:
# Parse the three sample lines with the current parse_csv_line, one result per line.
# NOTE(review): this cell's execution count (417) follows the csv-module
# definition (414), while the definition directly above ran earlier (408), so
# the recorded output likely comes from the csv-module version — re-run top to
# bottom to confirm.
for csv_line in (test1, test2, test3):
    print(parse_csv_line(csv_line))

['value1', 'value, with, commas', 'value3']
['John Doe', '123, Main St', '50', 'He said, "Hello!"', 'New\nLine']
['Doe, John', '25', 'New York, NY']


In [406]:
def parse_csv_line(line):
    """Split ``line`` on commas outside double quotes and strip each field.

    Every double quote that does not follow a pending backslash toggles the
    quoted state and is dropped. A backslash inside quotes is dropped and marks
    the next character to be kept literally; outside quotes a backslash is an
    ordinary character. The text after the last separator is always emitted as
    the final field.
    """
    columns = []
    current = ''
    quoted = False
    pending_escape = False

    for ch in line:
        if ch == '"' and not pending_escape:
            quoted = not quoted
            continue
        if ch == ',' and not quoted:
            # Separator outside quotes: close the current field.
            columns.append(current.strip())
            current = ''
            continue
        if ch == '\\' and quoted:
            pending_escape = True
            continue
        # Anything else, including an escaped quote, is field content.
        current += ch
        pending_escape = False

    columns.append(current.strip())
    return columns

In [418]:
# Parse the three sample lines with the current parse_csv_line, one result per line.
# NOTE(review): this cell's execution count (418) follows the csv-module
# definition (414), while the definition directly above ran earlier (406), so
# the recorded output likely comes from the csv-module version — re-run top to
# bottom to confirm.
for csv_line in (test1, test2, test3):
    print(parse_csv_line(csv_line))

['value1', 'value, with, commas', 'value3']
['John Doe', '123, Main St', '50', 'He said, "Hello!"', 'New\nLine']
['Doe, John', '25', 'New York, NY']


# Evaluate Task 12

In [455]:
def factorial(n):
  """Return the product of the integers from 1 through ``n``.

  When ``n`` is below 1 there is nothing to multiply and 1 is returned.
  """
  product = 1
  # Multiply downwards from n to 2; a factor of 1 would not change the product.
  for factor in range(n, 1, -1):
    product *= factor
  return product

In [459]:
# Print factorial(0) through factorial(4) as a quick check of the definition above.
for i in range(5):
  print(factorial(i))

1
1
2
6
24


In [457]:
def factorial(n):
    """Return the product of the integers from 1 through ``n``.

    Zero is answered directly with 1; any other value below 1 also yields 1
    because the loop has nothing to multiply.
    """
    if n == 0:
        return 1
    product = 1
    # Start at 2: a factor of 1 would not change the product.
    for factor in range(2, n + 1):
        product *= factor
    return product