# IMPORTS

In [1]:
import ast
import json
import os

import pandas as pd
from tqdm import tqdm

# READING DATASET

In [2]:
def process_file(file_path, dataset_type):
    """Flatten one JSON Lines file of drawings into a list of per-step records.

    Each non-blank line is parsed as a JSON object that must provide
    ``index`` and ``drawing_procedure``; ``category`` is optional and
    defaults to ``'NONE'``. Every element of ``drawing_procedure`` is
    unpacked as ``(step_id, instruction, board_state)``.

    Args:
        file_path: Path of the ``.jsonl`` file to read.
        dataset_type: Label stored unchanged in the ``'dataset'`` field of
            every record produced from this file.

    Returns:
        A list of dicts with the keys ``id_of_drawing``, ``step_number``,
        ``instructions``, ``abstraction_level``, ``resulting_labels`` and
        ``dataset``. Steps whose ``step_id`` is 0 are omitted.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If an entry lacks ``index`` or ``drawing_procedure``.
    """
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of letting json.loads fail on an empty string.
            if not line.strip():
                continue

            entry = json.loads(line)
            index = entry['index']  # ID of the drawing
            category = entry.get('category', 'NONE')  # Abstraction level

            for step_id, instruction, board_state in entry['drawing_procedure']:
                if step_id == 0:
                    continue  # Skip initial state if it contains no actionable instruction

                data.append({
                    'id_of_drawing': index,
                    'step_number': step_id,
                    'instructions': instruction,
                    'abstraction_level': category,
                    'resulting_labels': board_state,
                    'dataset': dataset_type
                })
    return data

def create_dataframe():
    """Load the train, dev and test splits and return them as one DataFrame.

    The splits are read in the order train, dev, test via ``process_file``
    and their records are concatenated in that order.
    """
    sources = (
        ('Hexagons/data/train.jsonl', 'train'),
        ('Hexagons/data/dev.jsonl', 'dev'),
        ('Hexagons/data/test.jsonl', 'test'),
    )

    records = []
    for path, split_name in sources:
        records.extend(process_file(path, split_name))

    return pd.DataFrame(records)

In [3]:
# Build the combined train/dev/test table (one row per drawing step).
df = create_dataframe()

In [4]:
# Rows per abstraction level; 'NONE' marks entries that had no 'category' field.
df["abstraction_level"].value_counts()

abstraction_level
NONE                     1223
bounded iteration         694
conditions                641
recursion                 598
conditional iteration     417
symmetry                  258
other                     144
composed objects          120
simple                     82
Name: count, dtype: int64

In [5]:
# Rows per dataset split.
df["dataset"].value_counts()

dataset
train    3278
test      453
dev       446
Name: count, dtype: int64

In [6]:
# Column names of the step table.
df.columns

Index(['id_of_drawing', 'step_number', 'instructions', 'abstraction_level',
       'resulting_labels', 'dataset'],
      dtype='object')

# NO COLOR INSTRUCTIONS

In [None]:
from openai import OpenAI

# Read the key from the environment so that no credential is ever stored in
# the notebook. A missing OPENAI_API_KEY raises KeyError right here instead of
# failing later on the first request.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [None]:
# Collect all the steps of a drawing together

# Marker placed between consecutive steps of one drawing (two spaces on each side).
STEP_SEPARATOR = "  [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW]  "

# One pass over the frame: group the instructions by drawing instead of
# re-filtering the whole frame once per drawing id. sort=False keeps the
# drawings in order of first appearance, and rows keep their original order
# inside each group.
instruction_dict = {
    id_of_drawing: STEP_SEPARATOR.join(steps)
    for id_of_drawing, steps in df.groupby("id_of_drawing", sort=False)["instructions"]
}

In [None]:
# Results of the colour-removal pass, keyed by drawing id:
#   new_instructions_all -> the complete API response object
#   new_instructions     -> only the text of the first choice
new_instructions_all = {}
new_instructions = {}

SYSTEM_MESSAGE = "You are a helpful assistant that only provides what is asked. For the following instructions, transform the instruction so that it does not have any color information. Keep everything else the same. Just write the same instructions with the same words, without the color information."

for drawing_id, joined_steps in tqdm(instruction_dict.items()):
  print(drawing_id)

  # One request per drawing: all of its steps are sent as a single user message.
  prompt = [
      {"role": "system", "content": SYSTEM_MESSAGE},
      {"role": "user", "content": f"{joined_steps}"},
  ]
  response = client.chat.completions.create(messages=prompt, model="gpt-4o")

  new_instructions_all[drawing_id] = response
  new_instructions[drawing_id] = response.choices[0].message.content

In [None]:
# Persist both the raw responses and the extracted texts as pickle files.
for payload, target in (
    (new_instructions_all, "new_instructions_all.pkl"),
    (new_instructions, "new_instructions.pkl"),
):
  pd.to_pickle(payload, target)

In [None]:
# Report every drawing whose rewritten text splits into a different number of
# steps than the original text.

marker = " [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW] "

for drawing_id, original_text in instruction_dict.items():
  original_steps = original_text.split(marker)
  rewritten_steps = new_instructions[drawing_id].split(marker)

  if len(original_steps) == len(rewritten_steps):
    continue

  print(drawing_id)
  print(original_steps)
  print(rewritten_steps)

In [None]:
# Per-drawing list of colour-free step texts, keyed by drawing id.
new_instructions_clean = {}

# Keep a drawing automatically only when the rewritten text splits into the
# same number of steps as the original text. Drawings that fail this check are
# not added here; unless they are patched below they stay absent from the dict.
for k, v in instruction_dict.items():
  previous_length = v.split(" [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW] ")
  current_length = new_instructions[k].split(" [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW] ")

  if len(previous_length) == len(current_length):
    new_instructions_clean[k] = new_instructions[k].split(" [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW] ")

# Hand-written patches for specific drawing ids.
# NOTE(review): presumably these are the ids reported by the step-count check
# for one particular set of model responses; they would need re-checking if the
# responses are regenerated — TODO confirm.
# 144: prepend an empty first step.
new_instructions_clean[144] = [""] + new_instructions[144].split(" [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW] ")
# 313: treat blank lines ("\n\n") as step separators.
new_instructions_clean[313] = new_instructions[313].replace("\n\n", " [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW] ").split(" [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW] ")
# 440: drop blank lines, then split on the marker without surrounding spaces.
new_instructions_clean[440] = new_instructions[440].replace("\n\n", "").split("[END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW]")
# 561: the marker is wrapped in blank lines in this response.
new_instructions_clean[561] = new_instructions[561].split(" \n\n[END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW] \n\n")
# 596: split on the marker without a trailing space.
new_instructions_clean[596] = new_instructions[596].split(" [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW]")
# 183: split on the two-space marker and append the last six segments a second time.
new_instructions_clean[183] = new_instructions[183].split("  [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW]  ") + new_instructions[183].split("  [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW]  ")[-6:]

In [None]:
# Attach the colour-free instruction text to every step row
def get_value_from_dict(row):
    """Return the colour-free instruction for the drawing and step of ``row``.

    Step numbers are 1-based, so step ``n`` is stored at list position ``n - 1``
    in ``new_instructions_clean[id_of_drawing]``.
    """
    position = row['step_number'] - 1
    steps_of_drawing = new_instructions_clean[row['id_of_drawing']]
    return steps_of_drawing[position]

df['no_color'] = df.apply(get_value_from_dict, axis=1)


# SIMPLIFY INSTRUCTIONS

In [None]:
# System prompt for the simplification pass. It ends with "Your Instructions:",
# so the joined steps of one drawing are expected to follow as the user message.
# NOTE(review): the instruction sentence quotes the step marker with one space
# on each side, while the worked Example 2 uses two spaces on each side.
system_message = """Please simplify the following complex instructions into clear, easy-to-understand steps, referring to specific hexagon locations on a board with 18 columns and 10 rows of hexagonal tiles. Each instruction should be standalone and understandable without referring to previous instructions. Avoid using abstract terms like 'flower' or 'pyramid' without an exact description; instead, describe exactly which tiles to color to create these shapes. Do not number the instructions or bullet points, just separate them using the phrase " [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW] ".
Example 1:
Complex Instruction: Make a yellow caterpillar hugging the top of the drawing area, starting from the top left hexagon to the top hexagon of the eighth column.
Simplified Instruction: Starting from the top left corner of the board (Column 1, Row 1), color each hexagon yellow along the top row, stopping at Column 8.
Example 2:
Complex Instruction: Count nine columns from the left, then six hexagons down. Color the sixth hexagon purple.  [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW]  Add purple hexagons to every hexagon connected to that one to create a small purple flower.
Simplified Instruction: Color the hexagon located at Column 9, Row 6 purple.  [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW]  Color the hexagons directly adjacent to the hexagon at Column 9, Row 6, purple.
Your Instructions:
"""

In [None]:
# Results of the simplification pass, keyed by drawing id:
#   simple_instructions_all -> the complete API response object
#   simple_instructions     -> only the text of the first choice
simple_instructions_all = {}
simple_instructions = {}

for drawing_id, joined_steps in tqdm(instruction_dict.items()):
  print(drawing_id)

  # One request per drawing: all of its steps are sent as a single user message.
  prompt = [
      {"role": "system", "content": system_message},
      {"role": "user", "content": f"{joined_steps}"},
  ]
  response = client.chat.completions.create(messages=prompt, model="gpt-4o")

  simple_instructions_all[drawing_id] = response
  simple_instructions[drawing_id] = response.choices[0].message.content

In [None]:
# Persist both the raw responses and the extracted texts as pickle files.
for payload, target in (
    (simple_instructions_all, "simple_instructions_all.pkl"),
    (simple_instructions, "simple_instructions.pkl"),
):
  pd.to_pickle(payload, target)

In [None]:
# Too many instructions for manual changes.
# Lists every drawing whose simplified text has a different number of steps
# than the original; the simplified text is split on the bare marker, the
# original on the marker with one surrounding space.

original_marker = " [END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW] "
simplified_marker = "[END OF CURRENT INSTRUCTION, NEXT INSTRUCTION NOW]"

for drawing_id, original_text in instruction_dict.items():
  original_steps = original_text.split(original_marker)
  simplified_steps = simple_instructions[drawing_id].split(simplified_marker)

  if len(original_steps) == len(simplified_steps):
    continue

  print(drawing_id)
  print(original_steps)
  print(simplified_steps)

# PREPARE INPUT DATA

In [11]:
# Display the current step table.
df

Unnamed: 0,id_of_drawing,step_number,instructions,abstraction_level,resulting_labels,dataset,no_color
0,0,1,Start by coloring the tile in the sixth column...,simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,Start by coloring the tile in the sixth column...
1,0,2,Create a less-than sign (<) using the original...,simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,Create a less-than sign (<) using the origina...
2,0,3,Add two more red tiles to connect the ends of ...,simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,Add two more tiles to connect the ends of the...
3,1,1,"On the eighth row from the left, vertically, a...",simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,"On the eighth row from the left, vertically, a..."
4,1,2,"Above the rightmost two red tiles, add another...",simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,"Above the rightmost two tiles, add another til..."
...,...,...,...,...,...,...,...
4172,506,3,"In the 5th and 13th columns leave one white, t...",bounded iteration,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",test,"In the 5th and 13th columns leave one, then c..."
4173,507,1,"Starting at the bottom left, paint the first, ...",bounded iteration,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",test,"Starting at the bottom left, paint the first, ..."
4174,507,2,Paint the next adjacent tile to the upper righ...,bounded iteration,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",test,Paint the next adjacent tile to the upper rig...
4175,507,3,"In the fourth column from the right, starting ...",bounded iteration,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",test,"In the fourth column from the right, starting..."


In [12]:
# Reload the step table (including the 'no_color' column) from a saved workbook.
# NOTE(review): the commented-out export below presumably produced this file — TODO confirm.
#df.to_excel("df_no_color.xlsx")
# NOTE(review): after an Excel round-trip, list-valued cells such as
# 'resulting_labels' presumably come back as text ("[0, 0, ...]") and the saved
# index as an extra 'Unnamed: 0' column — verify before iterating over labels.
df = pd.read_excel("df_no_color.xlsx")

In [14]:
# Build, for every step row, the model input text: all instructions of the
# drawing up to and including the current step, joined by " [SEP] ".
# Two parallel lists are produced, one from 'instructions' and one from
# 'no_color'. Rows are assumed to be grouped by drawing and ordered by step.
input_strings = []
input_strings_nocolor = []

previous_id = None
previous_string = None
previous_string_nocolor = None

for _, row in df.iterrows():
  current_string = str(row["instructions"])
  current_string_nocolor = str(row["no_color"])

  if row["id_of_drawing"] != previous_id:
    # First row of a new drawing: the history starts from scratch.
    previous_id = row["id_of_drawing"]
    previous_string = current_string
    previous_string_nocolor = current_string_nocolor
  else:
    # Extend the running history. The accumulator itself must be updated,
    # otherwise step k would only see "step 1 [SEP] step k" and lose every
    # step in between.
    previous_string = previous_string + " [SEP] " + current_string
    previous_string_nocolor = previous_string_nocolor + " [SEP] " + current_string_nocolor

  input_strings.append(previous_string)
  input_strings_nocolor.append(previous_string_nocolor)


In [15]:
# Store the per-step input texts as new columns (lists are aligned with df's row order).
df["input_strings"] = input_strings
df["input_strings_nocolor"] = input_strings_nocolor

In [16]:
# Expand every step row into one row per board tile and label each tile with
# the action taken at that step (0 when the tile did not change).

N_BOARD_COLUMNS = 18   # tiles per board row; maps a flat index to (row, column)
N_BOARD_TILES = 180    # size of the all-zero board assumed before step 1


def _as_label_list(labels):
    """Return ``labels`` as a sequence of tile labels.

    A frame reloaded from a spreadsheet stores the board as the text
    ``"[0, 0, ...]"``; iterating that text would walk over characters rather
    than tiles, so text is parsed back into a list. Anything that is not a
    string is returned unchanged.
    """
    if isinstance(labels, str):
        return ast.literal_eval(labels)
    return labels


# Index the board state of every (drawing, step) once, instead of scanning the
# whole frame with a boolean mask for each row. setdefault keeps the first
# occurrence, matching a "first matching row" lookup.
labels_by_step = {}
for drawing_id, step, labels in zip(df['id_of_drawing'], df['step_number'], df['resulting_labels']):
    labels_by_step.setdefault((drawing_id, step), labels)

expanded_data = []

for index, row in df.iterrows():
    # Get the previous state for comparison
    if row['step_number'] == 1:
        previous_labels = [0] * N_BOARD_TILES  # Assume all zeros if first step
    else:
        # State after the preceding step of the same drawing; a missing
        # predecessor raises KeyError here.
        previous_labels = _as_label_list(
            labels_by_step[(row['id_of_drawing'], row['step_number'] - 1)]
        )

    # Get the current state
    current_labels = _as_label_list(row['resulting_labels'])

    # The original columns are identical for all tiles of this step.
    base_record = row.to_dict()

    # Create expanded rows for each element in the resulting_labels
    for i, label in enumerate(current_labels):
        row_number = i // N_BOARD_COLUMNS  # Integer division to find the row
        column_number = i % N_BOARD_COLUMNS  # Modulo to find the column within the row
        action_label = label if label != previous_labels[i] else 0  # Determine action label

        # Append a new record including all previous columns plus the new ones
        expanded_data.append({
            **base_record,
            'row_number': row_number,
            'column_number': column_number,
            'action_label': action_label
        })

expanded_df = pd.DataFrame(expanded_data)
expanded_df.head()

Unnamed: 0,id_of_drawing,step_number,instructions,abstraction_level,resulting_labels,dataset,no_color,input_strings,input_strings_nocolor,row_number,column_number,action_label
0,0,1,Start by coloring the tile in the sixth column...,simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,0,0,0
1,0,1,Start by coloring the tile in the sixth column...,simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,0,1,0
2,0,1,Start by coloring the tile in the sixth column...,simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,0,2,0
3,0,1,Start by coloring the tile in the sixth column...,simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,0,3,0
4,0,1,Start by coloring the tile in the sixth column...,simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,0,4,0


In [17]:
# Every model input starts with "<row> <column> " identifying the tile.
tile_prefix = expanded_df["row_number"].astype(str) + " " + expanded_df["column_number"].astype(str) + " "

# Tile position followed by the (coloured) instruction history.
expanded_df["final_input"] = tile_prefix + expanded_df["input_strings"]
# Binary target: 0 when the tile is unchanged, 1 for any other action label.
expanded_df["action_label_nocolor"] = expanded_df["action_label"].apply(lambda value: 0 if value == 0 else 1)
# Tile position followed by the colour-free instruction history.
expanded_df["final_input_nocolor"] = tile_prefix + expanded_df["input_strings_nocolor"]
# Tile position, abstraction level, then the (coloured) instruction history.
expanded_df["abstraction_input"] = (
    tile_prefix
    + "Abstraction Level: "
    + expanded_df["abstraction_level"]
    + " Instructions: "
    + expanded_df["input_strings"]
)

expanded_df.head()

Unnamed: 0,id_of_drawing,step_number,instructions,abstraction_level,resulting_labels,dataset,no_color,input_strings,input_strings_nocolor,row_number,column_number,action_label,final_input,action_label_nocolor,final_input_nocolor,abstraction_input
0,0,1,Start by coloring the tile in the sixth column...,simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,0,0,0,0 0 Start by coloring the tile in the sixth co...,0,0 0 Start by coloring the tile in the sixth co...,0 0 Abstraction Level: simple Instructions: St...
1,0,1,Start by coloring the tile in the sixth column...,simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,0,1,0,0 1 Start by coloring the tile in the sixth co...,0,0 1 Start by coloring the tile in the sixth co...,0 1 Abstraction Level: simple Instructions: St...
2,0,1,Start by coloring the tile in the sixth column...,simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,0,2,0,0 2 Start by coloring the tile in the sixth co...,0,0 2 Start by coloring the tile in the sixth co...,0 2 Abstraction Level: simple Instructions: St...
3,0,1,Start by coloring the tile in the sixth column...,simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,0,3,0,0 3 Start by coloring the tile in the sixth co...,0,0 3 Start by coloring the tile in the sixth co...,0 3 Abstraction Level: simple Instructions: St...
4,0,1,Start by coloring the tile in the sixth column...,simple,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",train,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,Start by coloring the tile in the sixth column...,0,4,0,0 4 Start by coloring the tile in the sixth co...,0,0 4 Start by coloring the tile in the sixth co...,0 4 Abstraction Level: simple Instructions: St...


In [None]:
# Write the expanded per-tile table to an Excel workbook.
expanded_df.to_excel("expanded_df_final.xlsx")