# Stats about frequency of functions in the train and test programs
In this notebook I plan to gain an overview of the operations contained in the LOGO domain programs.
1) Which python operations are contained beside the primitive functions specified in the REGAL paper?
2) Does the embed(prog, vars) which I view as indicator of subprograms is contained in all programs? Is it contained more then once in some programs and if so how often?
3) What is the frequency of the other functions occuring in programs?

In [1]:
import json
import pandas as pd
import ast

Defining needed functions to answer question 1 to 3.

In [2]:
def is_primitive(name, primitives):
    """Check if a function name is a primitive."""
    return name in primitives

def visit_node(node, primitives, expressions=None, function=None):
    """Recursively visit AST nodes to extract relevant expressions or count the specified function."""
    count = 0
    if isinstance(node, ast.Call):
        # Handle function calls
        if isinstance(node.func, ast.Name):
            if expressions is not None and not is_primitive(node.func.id, primitives):
                expressions.add(node.func.id)
            if function is not None and node.func.id == function:
                count += 1

    elif isinstance(node, ast.For):
        # Extract 'for'-loop construct
        if expressions is not None:
            expressions.add("for-loop")
        if function and "for-loop" in function:
            count += 1

    # Recursively visit child nodes
    for child in ast.iter_child_nodes(node):
        count += visit_node(child, primitives, expressions, function)
    
    return count

def extract_non_logo_expressions(data, primitives):
    """
    Extracts non-pimitive expressions from programs identified by the 'gpt' key in the dataset.

    Args:
        data (list): List of dictionaries containing the human description of the results of the program as well as the program itself.
        primitives (list): List of primitives to exclude.

    Returns:
        list: List of unique non-primitive expressions.
    """
    expressions = set()

    # Iterate over each dictionary in the dataset
    for item in data:
        if 'program' in item:
            code = item['program']
            try:
                tree = ast.parse(code)
                visit_node(tree, primitives, expressions=expressions)
            except SyntaxError:
                continue

        else:
            for message in item.get('messages', []):
                if message['from'] == 'gpt':
                    code = message['value']
                    try:
                        tree = ast.parse(code)
                        visit_node(tree, primitives, expressions=expressions)
                    except SyntaxError:
                        continue

    return list(expressions)

def count_functions_frequency(data, functions):
    """
    Counts the frequency of programs containing a specific operation (a function or for-loop).

    Args:
        data (list): List of dictionaries containing the human description of the results of the program as well as the program itself.
        function (str): The function name to count in the dataset.

    Returns:
        dict: A dictionary with the count of programs containing the specified function.
    """
    frequency_count = {}

    # Iterate over each dictionary in the dataset
    for item in data:
        if 'program' in item:
            code = item['program']
            try:
                tree = ast.parse(code)
                count = visit_node(tree, primitives=[], function=functions)
                if count in frequency_count:
                    frequency_count[count] += 1
                else:
                    frequency_count[count] = 1
            except SyntaxError:
                continue

        for message in item.get('messages', []):
            if message['from'] == 'gpt':
                code = message['value']
                try:
                    tree = ast.parse(code)
                    count = visit_node(tree, primitives=[], function=functions)
                    if count in frequency_count:
                        frequency_count[count] += 1
                    else:
                        frequency_count[count] = 1
                except SyntaxError:
                    continue

    return frequency_count

## LOGO
### Question 1: Py-operations beside LOGO primitives

In [3]:
#LOGO
train_logo_data = "logo_data/python/train_200_dataset.jsonl"
test_logo_data = "logo_data/python/test_dataset.jsonl"

# Load train and test dataset
with open(f"external/dependencies/{train_logo_data}", 'r') as f:
    train_data = [json.loads(line) for line in f]

with open(f"external/dependencies/{test_logo_data}", 'r') as f:
    test_data = [json.loads(line) for line in f]

In [4]:
# List containing LOGO primitives from the ReGAL paper
logo_primitives = ['forward', 'left', 'right', 'penup', 'pendown', 'teleport', 'heading', 'isdown', 'embed']

In [5]:
train_result = extract_non_logo_expressions(train_data, logo_primitives)
test_result = extract_non_logo_expressions(test_data, logo_primitives)

print(train_result)
print(test_result)

['range', 'locals', 'for-loop']
['range', 'locals', 'for-loop']


## Question 2 and 3: Frequency of functions in programs

In [6]:
# Count the frequency of programs containing the 'embed' function
def display_frequency(data, functions_list):
    for function in functions_list:
        function_frequency = count_functions_frequency(data, function)
        df = pd.DataFrame(list(function_frequency.items()), columns=["Frequency of function", "Frequency of Programs"]).sort_values(by="Frequency of function")
        df.set_index("Frequency of function", inplace=True)
        print(f"Frequency Table for '{function}':")
        print(df)
        print("-" * 50)

print("Python-functions:")
print("Train data:")
display_frequency(train_data, train_result)
print("Test data:")
display_frequency(test_data, test_result)

print("\n")
print("\n")
print("Logo-primitives:")
print("Train data:")
display_frequency(train_data, logo_primitives)
print("Test data:")
display_frequency(test_data, logo_primitives)

Python-functions:
Train data:
Frequency Table for 'range':
                       Frequency of Programs
Frequency of function                       
1                                        159
2                                         19
3                                         22
--------------------------------------------------
Frequency Table for 'locals':
                       Frequency of Programs
Frequency of function                       
0                                         64
1                                        136
--------------------------------------------------
Frequency Table for 'for-loop':
                       Frequency of Programs
Frequency of function                       
1                                        159
2                                         19
3                                         22
--------------------------------------------------
Test data:
Frequency Table for 'range':
                       Frequency of Programs
Frequency o

### Observations:
in train and test:
left occurse multiple times right never

teleport, heading, isdown seems to never occure (is this an error?)

## TextCraft

In [7]:
#TextCraft
train_textcraft_data = "python_data/textcraft/gpt-4/train.jsonl"
test_textcraft_data = "python_data/textcraft/gpt-4/test_d2.jsonl"

# Load train and test dataset
with open(f"external/dependencies/{train_textcraft_data}", 'r') as f:
    train_data = [json.loads(line) for line in f]

with open(f"external/dependencies/{test_textcraft_data}", 'r') as f:
    test_data = [json.loads(line) for line in f]

In [8]:
# List containing TextCraft primitives from the ReGAL paper
primitives = ['get_object', 'craft_object', 'check_inventory']
train_result = extract_non_logo_expressions(train_data, primitives)
print(train_result)

[]


In [10]:
# Count the frequency of programs containing the 'embed' function
def display_frequency(data, functions_list):
    for function in functions_list:
        function_frequency = count_functions_frequency(data, function)
        df = pd.DataFrame(list(function_frequency.items()), columns=["Frequency of function", "Frequency of Programs"]).sort_values(by="Frequency of function")
        df.set_index("Frequency of function", inplace=True)
        print(f"Frequency Table for '{function}':")
        print(df)
        print("-" * 50)


print("Logo-primitives:")
print("Train data:")
display_frequency(train_data, primitives)
print("Test data:")
display_frequency(test_data, primitives)

Logo-primitives:
Train data:
Frequency Table for 'get_object':
                       Frequency of Programs
Frequency of function                       
1                                         50
2                                         69
3                                         48
4                                         15
5                                          2
6                                          5
--------------------------------------------------
Frequency Table for 'craft_object':
                       Frequency of Programs
Frequency of function                       
2                                         64
3                                         69
4                                         26
5                                         12
6                                          7
7                                          7
8                                          3
10                                         1
----------------------------------------