In [1]:
import os
import sys
from pathlib import Path

CURRENT_DIRECTORY = Path(os.getcwd())
ROOT_DIRECTORY = (CURRENT_DIRECTORY / "..").absolute().resolve()

print(f"Current directory: {CURRENT_DIRECTORY}")
print(f"Root directory: {ROOT_DIRECTORY}")

sys.path.append(str(ROOT_DIRECTORY))

Current directory: /home/ubuntu/arga-arc/tf_coder
Root directory: /home/ubuntu/arga-arc


In [42]:
import typing as t
import json
from pprint import pprint
from dataclasses import dataclass
import tensorflow as tf
import numpy as np
import math
from config import CONFIG
from openai import OpenAI
import re
from collections import Counter
import random
import ast
import traceback
from tf_coder.utils import Example, extract_code, normalize_code, TFOPERATORS, SPARSETF_OPERATORS, ConstantCounts, Task, TaskJSONWithOutput

pprint(CONFIG.__dict__.keys())


dict_keys(['OPENAI_SECRET_KEY', 'OPENAI_ORGANIZATION', 'TOGETHER_SECRET_KEY', 'TOGETHER_BASE_URL', 'OCTO_SECRET_KEY'])


In [43]:
COMPLETIONS_FILE = CONFIG.ROOT_DIR / "tf_coder/tfcoder_output.deepseek-ai__deepseek-coder-33b-instruct.json"

with open(COMPLETIONS_FILE, "r") as f:
    COMPLETIONS_JSON = json.load(f)

COMPLETIONS = sum([output["completions"] for output in COMPLETIONS_JSON], [])
print(len(COMPLETIONS)) 

10350


In [44]:
dataset_json: t.List[TaskJSONWithOutput] = json.loads(COMPLETIONS_FILE.read_text())
tasks = [Task.from_json_with_output(task_json) for task_json in dataset_json]

2024-05-15 00:19:00.030311: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2256] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [56]:
EXAMPLES = [
    "\ndef transform(in1):\n    indices = tf.expand_dims(in1, -1)\n    range_ = tf.expand_dims(tf.range(tf.shape(indices)[0]), -1)\n    return tf.concat([range_, indices], axis=1))\n\nLead by example, here are examples of the coding assistant's role:\n\n[TASK DESCRIPTION]:\nExecute a MatMul of two tensors.\n\n[INPUTS]:\n[[1,2,3],[4,5,6]] and [[1,2],[3,4],[5,6]]\n\n[OUTPUTS]:\n[[22,28],[49,64]]\n\n[PROGRAM]:\ndef transform(in1, in2):\n    return tf.matmul(in1, in2)\n\n\n[TASK DESCRIPTION]:\nCompute the sum of a tensor.\n\n[INPUTS]:\n[[1,2,3],[4,5,6]]\n\n[OUTPUTS]:\n21\n\n[PROGRAM]:\ndef transform(in1):\n    return tf.reduce_sum(in1)\n\n\n[TASK DESCRIPTION]:\nFind the maximum value in a",
    'return tf.stack([tf.math.bincount(in1), in1], axis=1)',
 'return tf.sparse.slice(in1, start=[0,0,0], size=[1,-1,-1])',
 'return tf.sparse.slice(in1, start=[3,800,2], size=[1,-1,-1])',
 'return tf.reshape(in1, [4, 3, 2])',
 'tf.reshape(in1, shape=(4, 3, 2))'
]

## normalize code

In [57]:
EXTRACTED_CODE = [
    extract_code(example) for example in EXAMPLES
]
EXTRACTED_CODE

['def transform(in1):\n    indices = tf.expand_dims(in1, -1)\n    range_ = tf.expand_dims(tf.range(tf.shape(indices)[0]), -1)\n    return tf.concat([range_, indices], axis=1))',
 'return tf.stack([tf.math.bincount(in1), in1], axis=1)',
 'return tf.sparse.slice(in1, start=[0,0,0], size=[1,-1,-1])',
 'return tf.sparse.slice(in1, start=[3,800,2], size=[1,-1,-1])',
 'return tf.reshape(in1, [4, 3, 2])',
 'tf.reshape(in1, shape=(4, 3, 2))']

In [58]:
for code in EXTRACTED_CODE:
    try:
        ast.parse(code)
    except Exception as e:
        print(f"Error: {e}")
        print(code)
        print()

Error: unmatched ')' (<unknown>, line 4)
def transform(in1):
    indices = tf.expand_dims(in1, -1)
    range_ = tf.expand_dims(tf.range(tf.shape(indices)[0]), -1)
    return tf.concat([range_, indices], axis=1))



In [59]:
NORMALIZED_CODE = [
    normalize_code(code, 'def transform(in1):') for code in EXTRACTED_CODE if code is not None
]
NORMALIZED_CODE

['def transform(in1):\n    indices = tf.expand_dims(in1, -1)\n    range_ = tf.expand_dims(tf.range(tf.shape(indices)[0]), -1)\n    return tf.concat([range_, indices], axis=1))',
 'def transform(in1):\n    return tf.stack([tf.math.bincount(in1), in1], axis=1)',
 'def transform(in1):\n    return tf.sparse.slice(in1, start=[0,0,0], size=[1,-1,-1])',
 'def transform(in1):\n    return tf.sparse.slice(in1, start=[3,800,2], size=[1,-1,-1])',
 'def transform(in1):\n    return tf.reshape(in1, [4, 3, 2])',
 'def transform(in1):\n    return tf.reshape(in1, shape=(4, 3, 2))']

In [60]:
for code in NORMALIZED_CODE:
    try:
        ast.parse(code)
    except Exception as e:
        print(f"Error: {e}")
        print(code)
        print()

Error: unmatched ')' (<unknown>, line 4)
def transform(in1):
    indices = tf.expand_dims(in1, -1)
    range_ = tf.expand_dims(tf.range(tf.shape(indices)[0]), -1)
    return tf.concat([range_, indices], axis=1))



## extract operators

In [61]:
import tokenize
import io

for code in NORMALIZED_CODE:
    try:
        for token in tokenize.tokenize(io.BytesIO(code.encode('utf-8')).readline):
            print(token)
    except Exception as e:
        print(f"Error: {e}")
        print(code)
        print()

TokenInfo(type=63 (ENCODING), string='utf-8', start=(0, 0), end=(0, 0), line='')
TokenInfo(type=1 (NAME), string='def', start=(1, 0), end=(1, 3), line='def transform(in1):\n')
TokenInfo(type=1 (NAME), string='transform', start=(1, 4), end=(1, 13), line='def transform(in1):\n')
TokenInfo(type=54 (OP), string='(', start=(1, 13), end=(1, 14), line='def transform(in1):\n')
TokenInfo(type=1 (NAME), string='in1', start=(1, 14), end=(1, 17), line='def transform(in1):\n')
TokenInfo(type=54 (OP), string=')', start=(1, 17), end=(1, 18), line='def transform(in1):\n')
TokenInfo(type=54 (OP), string=':', start=(1, 18), end=(1, 19), line='def transform(in1):\n')
TokenInfo(type=4 (NEWLINE), string='\n', start=(1, 19), end=(1, 20), line='def transform(in1):\n')
TokenInfo(type=5 (INDENT), string='    ', start=(2, 0), end=(2, 4), line='    indices = tf.expand_dims(in1, -1)\n')
TokenInfo(type=1 (NAME), string='indices', start=(2, 4), end=(2, 11), line='    indices = tf.expand_dims(in1, -1)\n')
TokenInfo(

In [62]:
def lex(code: str) -> t.List[tokenize.TokenInfo]:
    ans = []
    try:
        for token in tokenize.tokenize(io.BytesIO(code.encode('utf-8')).readline):
            ans.append(token)
    except Exception as e:
        pass
    return ans

In [63]:
LEXED = [
    lex(code) for code in NORMALIZED_CODE
]

for lexed in LEXED:
    pprint(lexed)
    print()

[TokenInfo(type=63 (ENCODING), string='utf-8', start=(0, 0), end=(0, 0), line=''),
 TokenInfo(type=1 (NAME), string='def', start=(1, 0), end=(1, 3), line='def transform(in1):\n'),
 TokenInfo(type=1 (NAME), string='transform', start=(1, 4), end=(1, 13), line='def transform(in1):\n'),
 TokenInfo(type=54 (OP), string='(', start=(1, 13), end=(1, 14), line='def transform(in1):\n'),
 TokenInfo(type=1 (NAME), string='in1', start=(1, 14), end=(1, 17), line='def transform(in1):\n'),
 TokenInfo(type=54 (OP), string=')', start=(1, 17), end=(1, 18), line='def transform(in1):\n'),
 TokenInfo(type=54 (OP), string=':', start=(1, 18), end=(1, 19), line='def transform(in1):\n'),
 TokenInfo(type=4 (NEWLINE), string='\n', start=(1, 19), end=(1, 20), line='def transform(in1):\n'),
 TokenInfo(type=5 (INDENT), string='    ', start=(2, 0), end=(2, 4), line='    indices = tf.expand_dims(in1, -1)\n'),
 TokenInfo(type=1 (NAME), string='indices', start=(2, 4), end=(2, 11), line='    indices = tf.expand_dims(in1,

In [64]:
ALL_OPERATORS = TFOPERATORS + SPARSETF_OPERATORS
def count_operators(tokens: t.List[tokenize.TokenInfo]) -> t.Dict[str, int]:
    seen_tf = False
    seen_sparse = False

    ans = {}

    for token in tokens:
        if token.string == '.':
            continue
        
        if token.string == 'tf':
            seen_tf = True
            continue

        if seen_tf:
            if seen_sparse:
                if any(f"tf.sparse.{token.string}(" in operator for operator in ALL_OPERATORS):
                    ans[f"tf.sparse.{token.string}"] = ans.get(f"tf.sparse.{token.string}", 0) + 1
                seen_sparse = False
                seen_tf = False
            elif token.string == 'sparse':
                seen_sparse = True
            else:
                if any(f"tf.{token.string}(" in operator for operator in ALL_OPERATORS):
                    ans[f"tf.{token.string}"] = ans.get(f"tf.{token.string}", 0) + 1
                seen_tf = False 
    
    return ans


In [72]:
for example,lexed in zip(NORMALIZED_CODE,LEXED):
    print(example)
    print()
    pprint(count_operators(lexed))
    print()

def transform(in1):
    indices = tf.expand_dims(in1, -1)
    range_ = tf.expand_dims(tf.range(tf.shape(indices)[0]), -1)
    return tf.concat([range_, indices], axis=1))

{'tf.concat': 1, 'tf.expand_dims': 2, 'tf.range': 1, 'tf.shape': 1}

def transform(in1):
    return tf.stack([tf.math.bincount(in1), in1], axis=1)

{'tf.stack': 1}

def transform(in1):
    return tf.sparse.slice(in1, start=[0,0,0], size=[1,-1,-1])

{'tf.sparse.slice': 1}

def transform(in1):
    return tf.sparse.slice(in1, start=[3,800,2], size=[1,-1,-1])

{'tf.sparse.slice': 1}

def transform(in1):
    return tf.reshape(in1, [4, 3, 2])

{'tf.reshape': 1}

def transform(in1):
    return tf.reshape(in1, shape=(4, 3, 2))

{'tf.reshape': 1}



## extract constants

In [66]:
tasks[0].examples

Example(inputs=[array([0, 0, 0, 1, 3, 3])], output=array([[0, 0],
       [0, 1],
       [0, 2],
       [1, 0],
       [3, 0],
       [3, 1]]), input_names=None, json={'inputs': '[[0, 0, 0, 1, 3, 3],]', 'outputs': '[[0, 0], [0, 1], [0, 2], [1, 0], [3, 0], [3, 1]]'})

In [67]:

def add_constant_counts(
    counts1: ConstantCounts, counts2: ConstantCounts
) -> ConstantCounts:
    return {
        key: counts1.get(key, 0) + counts2.get(key, 0)
        for key in set(counts1.keys()) | set(counts2.keys())
    }


def is_common(value: t.Any) -> bool:
    return (type(value) == int or type(value) == bool) and value in [
        0,
        1,
        -1,
        True,
        False,
    ]


def is_axis(value: t.Any, max_input_rank) -> bool:
    return type(value) == int and value in range(2, max_input_rank + 1)


def is_shape(value: t.Any, dimension_lengths) -> bool:
    return type(value) == int and value in dimension_lengths

In [76]:
def get_constants_lexed(tokens: t.List[tokenize.TokenInfo]) -> t.List[int]:
    ans = []
    seen_minus = False
    for token in tokens:
        if token.type == tokenize.OP:
            if token.string == "-":
                seen_minus = True
                continue
        elif token.type == tokenize.NUMBER:
            try:
                value = -int(token.string) if seen_minus else int(token.string)
            except:
                continue
            ans.append(value)
        
        seen_minus = False
    return list(set(ans))


In [77]:
for example,lexed in zip(NORMALIZED_CODE,LEXED):
    print(example)
    print()
    pprint(get_constants_lexed(lexed))
    print()

def transform(in1):
    indices = tf.expand_dims(in1, -1)
    range_ = tf.expand_dims(tf.range(tf.shape(indices)[0]), -1)
    return tf.concat([range_, indices], axis=1))

[0, 1, -1]

def transform(in1):
    return tf.stack([tf.math.bincount(in1), in1], axis=1)

[1]

def transform(in1):
    return tf.sparse.slice(in1, start=[0,0,0], size=[1,-1,-1])

[0, 1, -1]

def transform(in1):
    return tf.sparse.slice(in1, start=[3,800,2], size=[1,-1,-1])

[800, 1, 2, 3, -1]

def transform(in1):
    return tf.reshape(in1, [4, 3, 2])

[2, 3, 4]

def transform(in1):
    return tf.reshape(in1, shape=(4, 3, 2))

[2, 3, 4]



In [68]:
def count_constants(tokens: t.List[tokenize.TokenInfo], example: Example) -> ConstantCounts:
    counts = {
        "common": 0,
        "axis": 0,
        "shape": 0,
        "provided": 0,
        "tf_int32": 0,
        "tf_float32": 0,
        "tf_int64": 0,
        "tf_bool": 0,
        "input_var": 0,
        "shape_tuple": 0,
    }
    # example.max_input_rank,
    #     example.dimension_lengths,
    #     example.output_shape,
    seen_minus = False
    seen_tf = False
    for token in tokens:
        if token.type == tokenize.OP:
            if token.string == "-":
                seen_minus = True
                continue
        elif token.type == tokenize.NUMBER:
            try:
                value = -int(token.string) if seen_minus else int(token.string)
            except:
                continue
            if is_common(value):
                counts["common"] += 1
            elif is_axis(value, example.max_input_rank):
                counts["axis"] += 1
            elif is_shape(value, example.dimension_lengths):
                counts["shape"] += 1
            else:
                counts["provided"] += 1
        elif token.type == tokenize.NAME:
            if token.string == "tf":
                seen_tf = True
                continue
            elif seen_tf:
                if token.string == "int32":
                    counts["tf_int32"] += 1
                elif token.string == "float32":
                    counts["tf_float32"] += 1
                elif token.string == "int64":
                    counts["tf_int64"] += 1
                elif token.string == "bool":
                    counts["tf_bool"] += 1
        elif token.type == tokenize.DOT:
            continue

        # TODO: shape tuple (not sure if it's worth implementing at the lexer level)
        seen_minus = False
        seen_tf = False
    
    return counts

In [73]:
TEST_EXAMPLES = [
    tasks[0].examples,
    tasks[2].examples,
    tasks[2].examples,
    tasks[3].examples,
    tasks[3].examples
]

In [75]:
for code,lexed, example in zip(NORMALIZED_CODE,LEXED, TEST_EXAMPLES):
    print(code)
    print()
    pprint(count_constants(lexed, example))
    print()


def transform(in1):
    indices = tf.expand_dims(in1, -1)
    range_ = tf.expand_dims(tf.range(tf.shape(indices)[0]), -1)
    return tf.concat([range_, indices], axis=1))

{'axis': 0,
 'common': 4,
 'input_var': 0,
 'provided': 0,
 'shape': 0,
 'shape_tuple': 0,
 'tf_bool': 0,
 'tf_float32': 0,
 'tf_int32': 0,
 'tf_int64': 0}

def transform(in1):
    return tf.stack([tf.math.bincount(in1), in1], axis=1)

{'axis': 0,
 'common': 1,
 'input_var': 0,
 'provided': 0,
 'shape': 0,
 'shape_tuple': 0,
 'tf_bool': 0,
 'tf_float32': 0,
 'tf_int32': 0,
 'tf_int64': 0}

def transform(in1):
    return tf.sparse.slice(in1, start=[0,0,0], size=[1,-1,-1])

{'axis': 0,
 'common': 6,
 'input_var': 0,
 'provided': 0,
 'shape': 0,
 'shape_tuple': 0,
 'tf_bool': 0,
 'tf_float32': 0,
 'tf_int32': 0,
 'tf_int64': 0}

def transform(in1):
    return tf.sparse.slice(in1, start=[3,800,2], size=[1,-1,-1])

{'axis': 1,
 'common': 3,
 'input_var': 0,
 'provided': 2,
 'shape': 0,
 'shape_tuple': 0,
 'tf_bool': 0,
 '