In [1]:
import json
import pandas as pd
import ast
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
import re

[nltk_data] Downloading package stopwords to /home/pratz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#LOGO
# Dataset locations, relative to the external/dependencies folder
train_logo_data = "logo_data/python/train_200_dataset.jsonl"
test_logo_data = "logo_data/python/test_dataset.jsonl"


def load_jsonl(path):
    """Read a JSON Lines file and return one parsed object per non-blank line.

    Args:
        path: Location of the .jsonl file.

    Returns:
        list: The decoded JSON values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]


# Load train and test dataset
train_data = load_jsonl(f"external/dependencies/{train_logo_data}")
test_data = load_jsonl(f"external/dependencies/{test_logo_data}")

In [3]:
# LOGO primitives as listed in the ReGAL paper
logo_primitives = [
    'forward',
    'left',
    'right',
    'penup',
    'pendown',
    'teleport',
    'heading',
    'isdown',
    'embed',
]

In [4]:
# Extract descriptions and programs from train_data and transform into pandas DataFrame
def extract_descriptions_and_programs(data):
    """Build a (Description, Program) table from conversation records.

    Each record is a mapping with an optional 'messages' list. Every message
    carries a 'from' field and a 'value' field. The value of the last
    'human' message becomes the description, the value of the last 'gpt'
    message becomes the program. Records where either one is missing or
    falsy are left out.

    Args:
        data: Iterable of record mappings.

    Returns:
        pandas.DataFrame with the columns 'Description' and 'Program'.
    """
    rows = []
    for record in data:
        human_text = gpt_text = None
        for msg in record.get('messages', []):
            sender = msg['from']
            if sender == 'human':
                human_text = msg['value']
            elif sender == 'gpt':
                gpt_text = msg['value']
        # Keep only complete pairs
        if not (human_text and gpt_text):
            continue
        rows.append([human_text, gpt_text])
    return pd.DataFrame(rows, columns=['Description', 'Program'])

# Extract descriptions and programs from the train and the test split
df_train = extract_descriptions_and_programs(train_data)
df_test = extract_descriptions_and_programs(test_data)

# Append the test data to the train data. ignore_index renumbers the rows
# 0..n-1, so the row positions collected below coincide with the row labels.
df_all = pd.concat([df_train, df_test], ignore_index=True)


def _repeat_positions(column):
    """For each entry of `column`, list every row position holding an equal value.

    The column is scanned once to group positions by value, instead of
    rescanning the whole column for every row.
    """
    positions_by_value = {}
    for position, value in enumerate(column):
        positions_by_value.setdefault(value, []).append(position)
    # Hand every row its own list so rows never share a mutable object.
    return column.map(lambda value: list(positions_by_value.get(value, ())))


#####################
# Create a list of row indices where the same description / program reappears
df_all['Description Repeats'] = _repeat_positions(df_all['Description'])
df_all['Program Repeats'] = _repeat_positions(df_all['Program'])
#####################

#####################
# Drop all duplicate rows. The explicit copy makes the result an independent
# frame, so the column assignments below never write into a view of the
# concatenated frame.
df_all = df_all.drop_duplicates(subset=['Description', 'Program']).copy()
#####################

#####################
# Check if the row indices collected for the description and program are the same
df_all['Same Index'] = [
    desc_rows == prog_rows
    for desc_rows, prog_rows in zip(df_all['Description Repeats'], df_all['Program Repeats'])
]

# Show the rows whose description and program do not repeat in lockstep
display(df_all[~df_all['Same Index'].astype(bool)])
#####################

#####################
# Add columns for the count of description repeats and program repeats
df_all['n-Times Desc. Repeated'] = df_all['Description Repeats'].map(len)
df_all['n-Times Prog. Repeated'] = df_all['Program Repeats'].map(len)
#####################

# Optional: drop all rows where 'n-Times Prog. Repeated' and 'n-Times Desc. Repeated' are both 1
#df_all = df_all[~((df_all['n-Times Prog. Repeated'] == 1) & (df_all['n-Times Desc. Repeated'] == 1))]

# Display the updated DataFrame
display(df_all)
print(df_all['n-Times Desc. Repeated'].value_counts())
print(df_all['n-Times Prog. Repeated'].value_counts())


Unnamed: 0,Description,Program,Description Repeats,Program Repeats,Same Index
43,5 sided snowflake with a short line and a smal...,"for j in range(5):\n embed(""""""penup()\nforw...",[43],"[43, 287]",False
287,5 sided snowflake with a short space and a sho...,"for j in range(5):\n embed(""""""penup()\nforw...",[287],"[43, 287]",False


Unnamed: 0,Description,Program,Description Repeats,Program Repeats,Same Index,n-Times Desc. Repeated,n-Times Prog. Repeated
0,4 concentric square s,"for i in range(5):\n embed(""""""for j in rang...","[0, 34, 80, 103, 180, 183]","[0, 34, 80, 103, 180, 183]",True,6,6
1,6 sided snowflake with a medium line and a med...,"for j in range(6):\n embed(""""""forward(8)\nl...",[1],[1],True,1,1
2,5 sided snowflake with a medium line and a sma...,"for j in range(5):\n embed(""""""forward(8)\nl...",[2],[2],True,1,1
3,6 short line s in a row,"for j in range(6):\n embed(""""""forward(2)\nl...","[3, 113, 118, 164]","[3, 113, 118, 164]",True,4,4
4,a small triangle connected by a big line to a ...,for i in range(3):\n forward(2)\n left(1...,[4],[4],True,1,1
...,...,...,...,...,...,...,...
306,8 sided snowflake with a medium circle and a s...,"for j in range(8):\n embed(""""""penup()\nforw...",[306],[306],True,1,1
307,5 sided snowflake with 2 small circle s as arms,"for j in range(5):\n embed(""""""penup()\nforw...",[307],[307],True,1,1
308,3 sided snowflake with a small square and a sm...,"for j in range(3):\n embed(""""""penup()\nforw...",[308],[308],True,1,1
309,5 sided snowflake with a small 5 gon and a sma...,"for j in range(5):\n embed(""""""penup()\nforw...",[309],[309],True,1,1


n-Times Desc. Repeated
1    237
2     16
3      6
4      3
6      2
Name: count, dtype: int64
n-Times Prog. Repeated
1    235
2     18
3      6
4      3
6      2
Name: count, dtype: int64


In [78]:
# Fixture programs for the semantic-length counter. The "Expect N" notes give
# the value the counter is expected to return for the program.

# Contains one embed block holding one for-loop. Expect 100.
# Taken from the third row of df_all, so it depends on the row order of that frame.
test_prog_0=df_all['Program'].iloc[2]

# Contains no embed block. Expect 12.
test_prog_1='for j in range(2):\n    forward(8*i)\n    left(51.42857142857143)\n    forward(EPS_DIST*1)\n    penup()\n    forward(EPS_DIST*1)\n    pendown()\n'

# Contains consecutive HALF_INF loops inside one embed block (originally
# described as "4 small full circles"). The expected value was never recorded.
# NOTE(review): the string below holds three HALF_INF loops, not four - confirm
# the intended shape and fill in the expected value.
test_prog_2="for j in range(5):\n    embed(\"\"\"penup()\nforward(2)\nleft(0.0)\n\npendown()\nfor i in range(HALF_INF):\n    forward(EPS_DIST*2)\n    left(EPS_ANGLE)\nfor i in range(HALF_INF):\n    forward(EPS_DIST*2)\n    left(EPS_ANGLE)\npenup()\nforward(2)\nleft(0.0)\n\npendown()\nfor i in range(HALF_INF):\n    forward(EPS_DIST*2)\n    left(EPS_ANGLE)\"\"\", locals())\n    forward(0)\n    left(72.0)\n"

# Contains two consecutive embed blocks inside one for-loop. Expect 200.
# The indentation inside this string is irregular on purpose or by accident;
# it is not valid Python and is only meant to be scanned as text.
test_prog_3='for j in range(5):\n    embed("""forward(8)\nleft(0.0)\nfor i in range(8):\n    forward(2)\n    left(45.0)""", locals())\n    forward(0)\n    left(72.0)\n embed("""forward(3)\nleft(0.0)\nfor i in range(8):\n    forward(2)\n    left(45.0)""", locals())\n forward(1)\n    left(12.0)\n'

# Contains two consecutive for-loops inside one embed block. Expect 740.
# NOTE(review): 740 equals 5*((2+8*(2+8*2))+2), i.e. the second loop is counted
# as nested inside the first - confirm that this is the intended semantics.
test_prog_4='for j in range(5):\n    embed("""forward(8)\nleft(0.0)\nfor i in range(8):\n    forward(2)\n    left(45.0)\n for i in range(8):\n    forward(2)\n    left(45.0)""", locals())\n forward(1)\n    left(12.0)\n'

# Show the program taken from the dataset
print(test_prog_0)

for j in range(5):
    embed("""forward(8)
left(0.0)
for i in range(8):
    forward(2)
    left(45.0)""", locals())
    forward(0)
    left(72.0)



In [116]:
import re

# Iteration count used for `range(HALF_INF)` loops, which the LOGO programs
# use to draw semicircles and circles.
_HALF_INF_ITERATIONS = 180

# Token patterns, compiled once. They are tried in this order at every position.
_FOR_LOOP_RE = re.compile(r"for\s+\w+\s+in\s+range\((\d+|HALF_INF)\):")
_EMBED_START_RE = re.compile(r'embed\("""')
_PRIMITIVE_RE = re.compile(r"(forward|left|right|penup|pendown|teleport|heading|isdown)\(([\w\.\*\+\-\(\)0-9]*)\)")
_EMBED_END_RE = re.compile(r'""", locals\(\)\)')

# Kinds of open scopes
_ROOT_SCOPE = "root"
_LOOP_SCOPE = "loop"
_EMBED_SCOPE = "embed"


class _Scope:
    """One open scope: the program root, a for-loop body or an embed block."""

    __slots__ = ("kind", "repeat", "total", "terms")

    def __init__(self, kind, repeat=1):
        self.kind = kind      # one of the *_SCOPE markers
        self.repeat = repeat  # how often the body counts (loop iterations, else 1)
        self.total = 0        # primitives counted in the body so far
        self.terms = []       # textual terms of the body, for the printed expression


def _close_scope(scopes):
    """Pop the innermost scope, fold it into its parent and return its kind."""
    closed = scopes.pop()
    body = "+".join(closed.terms) or "0"  # an empty body counts as zero
    if closed.kind == _LOOP_SCOPE:
        term = f"{closed.repeat}*({body})"
    else:
        term = f"({body})"
    parent = scopes[-1]
    parent.total += closed.repeat * closed.total
    parent.terms.append(term)
    return closed.kind


def calc_semantic_length(program, verbose=True):
    """Count the LOGO primitive calls a program executes, loops unrolled.

    The program text is scanned left to right for four kinds of tokens:
    for-loop headers over `range(<digits>)` or `range(HALF_INF)`, the start
    of an embed block, the end of an embed block, and calls to the primitives
    forward, left, right, penup, pendown, teleport, heading and isdown.
    Every primitive call counts 1 and a loop multiplies the count of its body
    by its iteration count (HALF_INF counts as 180 iterations).

    Indentation is not interpreted. A loop stays open until the enclosing
    embed block ends or the program ends, so everything that follows a loop
    header inside the same embed block (or at top level) is counted as part
    of that loop body. The end of an embed block closes that block together
    with every loop opened inside it.

    The count is computed directly instead of building an expression string
    and evaluating it, so inputs such as a single primitive after an embed
    block, two embed blocks in a row or an empty body yield a number rather
    than an evaluation error.

    Args:
        program: Source text of the LOGO program.
        verbose: If True (default), print the arithmetic expression that
            corresponds to the count, prefixed with "Output: ".

    Returns:
        int: The semantic length; 0 if the program contains no primitive.
    """
    scopes = [_Scope(_ROOT_SCOPE)]
    position = 0
    end = len(program)

    while position < end:
        # For-loop header: opens a scope whose body is multiplied
        found = _FOR_LOOP_RE.match(program, position)
        if found:
            loop_value = found.group(1)
            repeat = _HALF_INF_ITERATIONS if loop_value == "HALF_INF" else int(loop_value)
            scopes.append(_Scope(_LOOP_SCOPE, repeat))
            position = found.end()
            continue

        # Embed start: opens a scope that is counted once
        found = _EMBED_START_RE.match(program, position)
        if found:
            scopes.append(_Scope(_EMBED_SCOPE))
            position = found.end()
            continue

        # Primitive call: counts 1 in the innermost open scope
        found = _PRIMITIVE_RE.match(program, position)
        if found:
            scopes[-1].total += 1
            scopes[-1].terms.append("1")
            position = found.end()
            continue

        # Embed end: close the loops opened inside the block, then the block.
        # Without an open embed block every open loop is closed.
        found = _EMBED_END_RE.match(program, position)
        if found:
            while len(scopes) > 1:
                if _close_scope(scopes) == _EMBED_SCOPE:
                    break
            position = found.end()
            continue

        # No token starts here
        position += 1

    # Close any scopes still open at the end of the program
    while len(scopes) > 1:
        _close_scope(scopes)

    root = scopes[0]
    if verbose:
        print(f"Output: {'+'.join(root.terms)}")
    return root.total

# Example usage: test_prog_4 is annotated with an expected result of 740
pseudo_program = test_prog_4
print(pseudo_program)
parsed_output = calc_semantic_length(pseudo_program)
print(parsed_output)
# Check the documented expectation so that a regression fails loudly.
# Adjust the expected value when pointing pseudo_program at another fixture.
assert parsed_output == 740, f"expected 740, got {parsed_output!r}"


for j in range(5):
    embed("""forward(8)
left(0.0)
for i in range(8):
    forward(2)
    left(45.0)
 for i in range(8):
    forward(2)
    left(45.0)""", locals())
 forward(1)
    left(12.0)

Output: 5*((1+1+8*(1+1+8*(1+1)))+1+1)
740


In [45]:
def embed_main_program(program):
    """Split a program into its main part and its embed blocks.

    Args:
        program: Source text of the program.

    Returns:
        tuple: `(main_program, embed_matches)`. `main_program` is the text
        with every `embed(\"\"\"...\"\"\", locals())` call removed and the
        surrounding whitespace stripped. `embed_matches` is a list of
        `(embedded_code, start, end)` tuples, where start and end are the
        offsets of the whole embed call in the original text.
    """
    # DOTALL lets the embedded code span several lines
    embed_regex = re.compile(r'embed\("""(.*?)""", locals\(\)\)', re.DOTALL)

    embedded_blocks = []
    for found in embed_regex.finditer(program):
        embedded_blocks.append((found.group(1), found.start(), found.end()))

    remaining_text = embed_regex.sub("", program).strip()
    return remaining_text, embedded_blocks

def identify_loops(program):
    """Find every for-loop header over `range(...)` in a program.

    Both numeric bounds and the symbolic `HALF_INF` bound are recognised,
    matching the loop pattern used by calc_semantic_length.

    Args:
        program: Source text of the program.

    Returns:
        list: `(bound, start)` tuples in order of appearance. `bound` is the
        text inside `range(...)` - a digit string or 'HALF_INF' - and `start`
        is the offset of the `for` keyword in `program`.
    """
    for_loop_pattern = r"for\s+\w+\s+in\s+range\((\d+|HALF_INF)\):"
    return [(match.group(1), match.start()) for match in re.finditer(for_loop_pattern, program)]

def calculate_semantic_length(program):
    """Placeholder for a semantic-length calculation; not implemented.

    The body was never written: it used to return a name that is not defined
    anywhere, so every call failed with a NameError. The failure is now
    explicit.

    Args:
        program: Source text of the program (unused).

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError(
        "calculate_semantic_length is an unfinished stub; use calc_semantic_length instead"
    )

def parse_semantic_length(program):
    """Locate the loop headers and LOGO primitive calls of a program.

    Both searches run over the full program text, embedded code included.
    The former split into main program and embed blocks was computed but
    never used, so it is no longer performed here.

    Args:
        program: Source text of the program.

    Returns:
        tuple: `(program_loops, logo_prim_matches)`. `program_loops` is the
        result of identify_loops(program). `logo_prim_matches` is a list of
        `(primitive_name, start)` tuples, one per primitive call, in order of
        appearance.
    """
    # Identify loops
    program_loops = identify_loops(program)

    # Identify primitive calls by their name followed by an opening bracket
    logo_prim_pattern = r"(forward|left|right|penup|pendown|teleport|heading|isdown)\("
    logo_prim_matches = [(match.group(1), match.start()) for match in re.finditer(logo_prim_pattern, program)]

    return program_loops, logo_prim_matches

In [46]:
# Split test_prog_0 into its main program and its embed blocks
main_program, embed_matches = embed_main_program(test_prog_0)
print(main_program, embed_matches, sep="\n")

# Locate the loop headers and primitive calls of the same program
program_loops, logo_prim_matches = parse_semantic_length(test_prog_0)
print(program_loops, logo_prim_matches, sep="\n")

for j in range(5):
    
    forward(0)
    left(72.0)
[('forward(8)\nleft(0.0)\nfor i in range(8):\n    forward(2)\n    left(45.0)', 23, 115)]
[('5', 0), ('8', 53)]
[('forward', 32), ('left', 43), ('forward', 76), ('left', 91), ('forward', 120), ('left', 135)]
