# Stats about frequency of functions in the train and test programs
In this notebook I plan to gain an overview of the operations contained in the LOGO domain programs.
1) Which Python operations do the programs contain besides the primitive functions specified in the ReGAL paper?
2) Is `embed(prog, vars)`, which I view as an indicator of subprograms, contained in all programs? Is it contained more than once in some programs, and if so, how often?
3) What is the frequency of the other functions occurring in the programs?

In [2]:
import json
import pandas as pd
import ast

Define the helper functions needed to answer questions 1 to 3.

In [3]:
def is_primitive(name, primitives):
    """Return True when ``name`` is a member of ``primitives``, False otherwise."""
    listed = name in primitives
    return listed

def visit_node(node, primitives, expressions=None, function=None):
    """
    Walk ``node`` and all of its descendants, collecting operations and/or counting one.

    Args:
        node (ast.AST): Root of the (sub)tree to inspect.
        primitives: Names that must not be added to ``expressions``.
        expressions (set, optional): When given, receives the name of every called
            function that is not a primitive, plus "for-loop" for every ``for`` statement.
        function (str, optional): Operation to count; a function name, or "for-loop".

    Returns:
        int: Number of occurrences of ``function`` in the tree (0 when ``function`` is None).
    """
    collecting = expressions is not None
    matches = 0

    # ast.walk yields the root and every descendant exactly once
    for current in ast.walk(node):
        if isinstance(current, ast.For):
            if collecting:
                expressions.add("for-loop")
            if function and "for-loop" in function:
                matches += 1
            continue

        if not isinstance(current, ast.Call):
            continue

        callee = current.func
        # Only plain-name calls such as foo(...) are considered
        if not isinstance(callee, ast.Name):
            continue

        if collecting and not is_primitive(callee.id, primitives):
            expressions.add(callee.id)
        if function is not None and callee.id == function:
            matches += 1

    return matches

def extract_non_logo_expressions(data, primitives):
    """
    Collect the non-primitive operations used by the programs of a dataset.

    For each item, the code is taken from its 'program' key when present; otherwise
    from the 'value' of every entry in 'messages' whose 'from' field is 'gpt'.
    Code that cannot be parsed as Python is skipped.

    Args:
        data (list): List of dictionaries, each holding a program either under 'program'
            or inside its 'messages'.
        primitives (list): Function names to exclude from the result.

    Returns:
        list: Sorted list of unique operation names: every called function that is not a
            primitive, plus "for-loop" if any program contains a ``for`` statement.
            Sorting keeps the order stable between kernel runs (set order is not).
    """
    expressions = set()

    for item in data:
        # Gather the program source(s) of this item, whichever shape it has
        if 'program' in item:
            sources = [item['program']]
        else:
            sources = [
                message['value']
                for message in item.get('messages', [])
                if message.get('from') == 'gpt'
            ]

        for code in sources:
            try:
                tree = ast.parse(code)
            except (SyntaxError, ValueError):
                # Not valid Python (ValueError: e.g. null bytes on older Python versions)
                continue
            visit_node(tree, primitives, expressions=expressions)

    return sorted(expressions)

def count_functions_frequency(data, functions):
    """
    Build a histogram of how often one operation occurs per program.

    For each item, the code is taken from its 'program' key when present; otherwise
    from the 'value' of every entry in 'messages' whose 'from' field is 'gpt'
    (the same selection rule as in extract_non_logo_expressions, so an item that
    carries both is tallied once, not twice). Code that cannot be parsed is skipped.

    Args:
        data (list): List of dictionaries, each holding a program either under 'program'
            or inside its 'messages'.
        functions (str): Name of the function to count, or "for-loop" to count for-loops.

    Returns:
        dict: Maps an occurrence count to the number of programs in which the operation
            occurs exactly that many times (key 0 = programs that do not contain it).
    """
    frequency_count = {}

    for item in data:
        # Gather the program source(s) of this item, whichever shape it has
        if 'program' in item:
            sources = [item['program']]
        else:
            sources = [
                message['value']
                for message in item.get('messages', [])
                if message.get('from') == 'gpt'
            ]

        for code in sources:
            try:
                tree = ast.parse(code)
            except (SyntaxError, ValueError):
                # Not valid Python (ValueError: e.g. null bytes on older Python versions)
                continue
            count = visit_node(tree, primitives=[], function=functions)
            frequency_count[count] = frequency_count.get(count, 0) + 1

    return frequency_count

## LOGO
### Question 1: Python operations besides the LOGO primitives

In [4]:
# LOGO dataset files, relative to external/dependencies/
train_logo_data = "logo_data/python/train_200_dataset.jsonl"
test_logo_data = "logo_data/python/test_dataset.jsonl"

# Load train and test dataset (JSON Lines: one JSON object per non-empty line).
# The encoding is explicit so the result does not depend on the platform default;
# blank lines (e.g. a trailing newline at the end of the file) are skipped.
with open(f"external/dependencies/{train_logo_data}", 'r', encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f if line.strip()]

with open(f"external/dependencies/{test_logo_data}", 'r', encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]

In [5]:
# LOGO primitives as listed in the ReGAL paper
logo_primitives = [
    'forward',
    'left',
    'right',
    'penup',
    'pendown',
    'teleport',
    'heading',
    'isdown',
    'embed',
]

In [6]:
# Non-primitive operations found in each LOGO split (train first, then test)
train_result, test_result = (
    extract_non_logo_expressions(split, logo_primitives)
    for split in (train_data, test_data)
)

for found in (train_result, test_result):
    print(found)

['locals', 'range', 'for-loop']
['locals', 'range', 'for-loop']


### Question 2 and 3: Frequency of functions in programs

In [7]:
def display_frequency(data, functions_list):
    """
    Print one frequency table per operation in ``functions_list``.

    Each table is indexed by the number of occurrences of the operation within a
    program and lists how many programs of ``data`` have exactly that many occurrences
    (as returned by count_functions_frequency).
    """
    occurrences_col = "Frequency of function"
    programs_col = "Frequency of Programs"

    for function in functions_list:
        histogram = count_functions_frequency(data, function)
        table = (
            pd.DataFrame(list(histogram.items()), columns=[occurrences_col, programs_col])
            .sort_values(by=occurrences_col)
            .set_index(occurrences_col)
        )
        print(f"Frequency Table for '{function}':")
        print(table)
        print("-" * 50)

# (section heading, operations to tabulate on train, operations to tabulate on test)
frequency_sections = [
    ("Python-functions:", train_result, test_result),
    ("Logo-primitives:", logo_primitives, logo_primitives),
]

for section_no, (heading, train_ops, test_ops) in enumerate(frequency_sections):
    if section_no > 0:
        # Blank lines between the two sections
        print("\n")
        print("\n")
    print(heading)
    print("Train data:")
    display_frequency(train_data, train_ops)
    print("Test data:")
    display_frequency(test_data, test_ops)

Python-functions:
Train data:
Frequency Table for 'locals':
                       Frequency of Programs
Frequency of function                       
0                                         64
1                                        136
--------------------------------------------------
Frequency Table for 'range':
                       Frequency of Programs
Frequency of function                       
1                                        159
2                                         19
3                                         22
--------------------------------------------------
Frequency Table for 'for-loop':
                       Frequency of Programs
Frequency of function                       
1                                        159
2                                         19
3                                         22
--------------------------------------------------
Test data:
Frequency Table for 'locals':
                       Frequency of Programs
Frequency 

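### Observations:
In train and test:
`left` occurs multiple times, `right` never.

`teleport`, `heading` and `isdown` never seem to occur (is this an error?). Possible cause, to be verified: only the parsed top-level code is inspected, so calls inside the program string passed to `embed(...)` are not counted.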

### Detect duplicate entries in the train and test data

In [8]:
def iterate_over_value(data, from_value):
    """
    Collect the 'value' of every message sent by ``from_value``.

    Args:
        data (list): List of dictionaries; only their 'messages' entries are read
            (items without 'messages' contribute nothing).
        from_value (str): Sender to select, i.e. 'human' (descriptions) or 'gpt' (programs).

    Returns:
        list: The selected message values, in dataset order.
    """
    return [
        message['value']
        for item in data
        for message in item.get('messages', [])
        if message['from'] == from_value
    ]

In [9]:
# Combined view of both splits, used for the cross-split duplicate check
all_data = [*train_data, *test_data]

# (descriptions heading, programs heading, entries to inspect) per report section
report_sections = [
    ("Train Data unique descriptions:", "Train Data unique programs:", train_data),
    ("Test Data unique descriptions:", "Test Data unique programs:", test_data),
    ("Data unique descriptions:", "Data unique programs:", all_data),
]

for section_no, (desc_heading, prog_heading, entries) in enumerate(report_sections):
    if section_no > 0:
        # Blank lines between sections
        print("\n")
        print("\n")

    # Total number of descriptions, then number of distinct ones
    print(desc_heading)
    human_desc = iterate_over_value(entries, 'human')
    print(len(human_desc))
    print(len(set(human_desc)))

    # Total number of programs, then number of distinct ones
    print("\n")
    print(prog_heading)
    programs = iterate_over_value(entries, 'gpt')
    print(len(programs))
    print(len(set(programs)))

Train Data unique descriptions:
200
153


Train Data unique programs:
200
153




Test Data unique descriptions:
111
111


Test Data unique programs:
111
111




Data unique descriptions:
311
264


Data unique programs:
311
263


This is the simplest way of searching for exact duplicates in the descriptions or the programs.
Within the train data, 47 of the 200 descriptions are exact repeats of another entry (153 unique), and the same holds for the programs. Within the test data, all descriptions and programs are unique. However, once train and test data are combined, one test program also appears in exactly the same form in the train data (263 unique programs out of 311), whereas no test description does (264 unique descriptions).

In [34]:
def extract_descriptions_and_programs(data):
    """
    Build a DataFrame with one (Description, Program) row per dataset item.

    The description is the 'value' of the item's last 'human' message and the program
    the 'value' of its last 'gpt' message. Items lacking either one (or where either
    is empty) are left out.
    """
    rows = []
    for item in data:
        description, program = None, None
        for message in item.get('messages', []):
            sender = message['from']
            if sender == 'human':
                description = message['value']
            elif sender == 'gpt':
                program = message['value']

        if not (description and program):
            continue
        rows.append([description, program])

    return pd.DataFrame(rows, columns=['Description', 'Program'])

# Extract descriptions and programs from both splits
df_train = extract_descriptions_and_programs(train_data)
df_test = extract_descriptions_and_programs(test_data)

# Append the test data to the train data under one fresh 0..n-1 index,
# so row positions and index labels coincide
df_all = pd.concat([df_train, df_test], ignore_index=True)


def _positions_by_value(values):
    """Map each distinct value to the list of row positions at which it occurs."""
    positions = {}
    for position, value in enumerate(values):
        positions.setdefault(value, []).append(position)
    return positions


# For every row, list the row positions that share its description / its program.
# The positions are indexed in a single pass per column instead of rescanning
# the whole column for every row.
description_positions = _positions_by_value(df_all['Description'])
program_positions = _positions_by_value(df_all['Program'])
df_all['Description Repeats'] = df_all['Description'].map(description_positions.get)
df_all['Program Repeats'] = df_all['Program'].map(program_positions.get)

# Keep one row per exact (description, program) pair
df_all = df_all.drop_duplicates(subset=['Description', 'Program'])

# Flag rows whose description and program repeat at exactly the same positions
df_all['Same Index'] = pd.Series(
    [
        desc_rows == prog_rows
        for desc_rows, prog_rows in zip(df_all['Description Repeats'], df_all['Program Repeats'])
    ],
    index=df_all.index,
    dtype=bool,
)

# Show the rows where description duplicates and program duplicates disagree
display(df_all[~df_all['Same Index']])

# Add columns for the count of description repeats and program repeats
df_all['n-Times Desc. Repeated'] = df_all['Description Repeats'].map(len)
df_all['n-Times Prog. Repeated'] = df_all['Program Repeats'].map(len)

# Optional: keep only rows that are repeated at least once
#df_all = df_all[~((df_all['n-Times Prog. Repeated'] == 1) & (df_all['n-Times Desc. Repeated'] == 1))]

# Display the updated DataFrame
display(df_all)
print(df_all['n-Times Desc. Repeated'].value_counts())
print(df_all['n-Times Prog. Repeated'].value_counts())


Unnamed: 0,Description,Program,Description Repeats,Program Repeats,Same Index
43,5 sided snowflake with a short line and a smal...,"for j in range(5):\n embed(""""""penup()\nforw...",[43],"[43, 287]",False
287,5 sided snowflake with a short space and a sho...,"for j in range(5):\n embed(""""""penup()\nforw...",[287],"[43, 287]",False


Unnamed: 0,Description,Program,Description Repeats,Program Repeats,Same Index,n-Times Desc. Repeated,n-Times Prog. Repeated
0,4 concentric square s,"for i in range(5):\n embed(""""""for j in rang...","[0, 34, 80, 103, 180, 183]","[0, 34, 80, 103, 180, 183]",True,6,6
1,6 sided snowflake with a medium line and a med...,"for j in range(6):\n embed(""""""forward(8)\nl...",[1],[1],True,1,1
2,5 sided snowflake with a medium line and a sma...,"for j in range(5):\n embed(""""""forward(8)\nl...",[2],[2],True,1,1
3,6 short line s in a row,"for j in range(6):\n embed(""""""forward(2)\nl...","[3, 113, 118, 164]","[3, 113, 118, 164]",True,4,4
4,a small triangle connected by a big line to a ...,for i in range(3):\n forward(2)\n left(1...,[4],[4],True,1,1
...,...,...,...,...,...,...,...
306,8 sided snowflake with a medium circle and a s...,"for j in range(8):\n embed(""""""penup()\nforw...",[306],[306],True,1,1
307,5 sided snowflake with 2 small circle s as arms,"for j in range(5):\n embed(""""""penup()\nforw...",[307],[307],True,1,1
308,3 sided snowflake with a small square and a sm...,"for j in range(3):\n embed(""""""penup()\nforw...",[308],[308],True,1,1
309,5 sided snowflake with a small 5 gon and a sma...,"for j in range(5):\n embed(""""""penup()\nforw...",[309],[309],True,1,1


n-Times Desc. Repeated
1    237
2     16
3      6
4      3
6      2
Name: count, dtype: int64
n-Times Prog. Repeated
1    235
2     18
3      6
4      3
6      2
Name: count, dtype: int64


## TextCraft

In [8]:
# TextCraft dataset files, relative to external/dependencies/
train_textcraft_data = "python_data/textcraft/gpt-4/train.jsonl"
test_textcraft_data = "python_data/textcraft/gpt-4/test_d2.jsonl"

# Load train and test dataset (JSON Lines: one JSON object per non-empty line).
# NOTE: this rebinds train_data / test_data, replacing the LOGO splits loaded earlier.
# The encoding is explicit so the result does not depend on the platform default;
# blank lines (e.g. a trailing newline at the end of the file) are skipped.
with open(f"external/dependencies/{train_textcraft_data}", 'r', encoding="utf-8") as f:
    train_data = [json.loads(line) for line in f if line.strip()]

with open(f"external/dependencies/{test_textcraft_data}", 'r', encoding="utf-8") as f:
    test_data = [json.loads(line) for line in f if line.strip()]  # there is no 'program' key here, therefore empty results

In [9]:
# TextCraft primitives as listed in the ReGAL paper
primitives = ['get_object', 'craft_object', 'check_inventory']

# Non-primitive operations found in each TextCraft split (train first, then test)
train_result, test_result = (
    extract_non_logo_expressions(split, primitives)
    for split in (train_data, test_data)
)

for found in (train_result, test_result):
    print(found)

[]
[]


In [10]:
# NOTE: this re-defines display_frequency with the same behavior as the definition
# in the LOGO section above.
def display_frequency(data, functions_list):
    """
    Print one frequency table per operation in ``functions_list``.

    Each table is indexed by the number of occurrences of the operation within a
    program and lists how many programs of ``data`` have exactly that many occurrences
    (as returned by count_functions_frequency).
    """
    index_name = "Frequency of function"

    for function in functions_list:
        per_count = count_functions_frequency(data, function)
        rows = list(per_count.items())
        unsorted_table = pd.DataFrame(rows, columns=[index_name, "Frequency of Programs"])
        table = unsorted_table.sort_values(by=index_name).set_index(index_name)

        print(f"Frequency Table for '{function}':")
        print(table)
        print("-" * 50)


# Frequency tables for the TextCraft primitives in each split.
# The heading names TextCraft: `primitives` holds the TextCraft primitives here,
# not the LOGO ones.
print("TextCraft-primitives:")
print("Train data:")
display_frequency(train_data, primitives)
print("Test data:")
display_frequency(test_data, primitives)

Logo-primitives:
Train data:
Frequency Table for 'get_object':
                       Frequency of Programs
Frequency of function                       
1                                         50
2                                         69
3                                         48
4                                         15
5                                          2
6                                          5
--------------------------------------------------
Frequency Table for 'craft_object':
                       Frequency of Programs
Frequency of function                       
2                                         64
3                                         69
4                                         26
5                                         12
6                                          7
7                                          7
8                                          3
10                                         1
----------------------------------------