In [None]:
import random
import numpy as np
import tensorflow as tf

# Seed every random number generator the notebook touches (Python's random
# module, NumPy and TensorFlow) with one shared value so runs are repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [None]:
import os
import subprocess

# Directory (relative to the notebook) that receives one checkout per repository
DATA_DIR = "data"

# Public AdaCore repositories used as the Ada source corpus
git_repos = [
    "https://github.com/AdaCore/Ada_Drivers_Library.git",
    "https://github.com/AdaCore/gnatstudio.git",
    "https://github.com/AdaCore/spark2014.git",
    "https://github.com/AdaCore/ada_language_server.git",
    "https://github.com/AdaCore/gnat-llvm.git",
    "https://github.com/AdaCore/libadalang.git",
    "https://github.com/AdaCore/aws.git",
    "https://github.com/AdaCore/RecordFlux.git",
    "https://github.com/AdaCore/learn.git",
    "https://github.com/AdaCore/gtkada.git",
    "https://github.com/AdaCore/gprbuild.git",
    "https://github.com/AdaCore/bb-runtimes.git",
    "https://github.com/AdaCore/svd2ada.git",
    "https://github.com/AdaCore/VSS.git",
    "https://github.com/AdaCore/gnatcoll-core.git",
    "https://github.com/AdaCore/Certyflie.git",
    "https://github.com/AdaCore/gnatcoverage.git",
]

# exist_ok makes the cell safe to re-run without a separate existence check
os.makedirs(DATA_DIR, exist_ok=True)

for repo in git_repos:
    # `git clone` names the checkout after the last URL segment minus ".git"
    repo_name = repo.rstrip("/").rsplit("/", 1)[-1]
    if repo_name.endswith(".git"):
        repo_name = repo_name[: -len(".git")]

    if os.path.isdir(os.path.join(DATA_DIR, repo_name)):
        # Already present from an earlier run; cloning again would only fail
        continue

    # Shallow clone: only the latest snapshot of the sources is needed
    result = subprocess.run(["git", "clone", "--depth", "1", repo], cwd=DATA_DIR)
    if result.returncode != 0:
        # Stay best-effort (keep going with the other repositories) but make
        # the failure visible instead of silently ignoring git's exit status
        print(f"Warning: failed to clone {repo} (git exited with {result.returncode})")


In [None]:
import hashlib
import math

from typing import List


def file_hash(file_path: str):
    """Return the SHA-256 hex digest of the file at ``file_path``.

    The file is opened in binary mode and consumed in 8192-byte chunks, so it
    is never held in memory in one piece.
    """
    digest = hashlib.sha256()
    with open(file_path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF
        for block in iter(lambda: stream.read(8192), b""):
            digest.update(block)
    return digest.hexdigest()

def is_file_mostly_space_indented(file_path: str):
    """Report whether a file's leading whitespace is dominated by spaces.

    Leading spaces and leading tabs are tallied over every line of the file.
    The space total is divided by 3 (rounded up) before the comparison,
    because Ada code is conventionally indented in steps of 3 spaces, so one
    tab is weighed against one 3-space indentation step.

    Args:
        file_path: Path of a text file; it is decoded as UTF-8.

    Returns:
        True if the file contains no leading tabs at all (this includes empty
        files and files without any indentation), or if the scaled space
        count is strictly greater than the tab count. False otherwise.

    Raises:
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    space_indent_count = 0
    tab_indent_count = 0
    with open(file_path, "r", encoding="utf-8") as f:
        # Iterate the handle directly instead of materialising every line first
        for line in f:
            indent = line[: len(line) - len(line.lstrip())]
            space_indent_count += indent.count(" ")
            tab_indent_count += indent.count("\t")

    if tab_indent_count == 0:
        # Nothing is tab indented, so the file counts as space indented
        return True
    # In Ada, the convention is to use 3 spaces for indentation
    return math.ceil(space_indent_count / 3) > tab_indent_count


def get_files_to_process(data_dir: str, skip_non_utf8_files: bool = True):
    """Collect the unique Ada source files found below ``data_dir``.

    Only files ending in ``.ads``, ``.adb`` or ``.gpr`` are considered. Files
    with identical content (same SHA-256 digest) are reported once, keeping
    the first path encountered. Directories and file names are visited in
    sorted order so the result does not depend on the order in which the
    filesystem happens to list entries.

    Args:
        data_dir: Root directory to search recursively.
        skip_non_utf8_files: When True (the default), files that cannot be
            decoded as UTF-8 are skipped, and so are files that are not
            mostly space indented. When False, neither check is applied.

    Returns:
        A list of file paths.
    """
    file_types_to_keep = {".ads", ".adb", ".gpr"}
    seen_hashes = set()
    files_to_process = []

    for root, dirs, files in os.walk(data_dir):
        # Sorting `dirs` in place controls the order os.walk descends in
        dirs.sort()
        for file in sorted(files):
            if os.path.splitext(file)[1] not in file_types_to_keep:
                continue

            file_path = os.path.join(root, file)
            content_hash = file_hash(file_path)
            if content_hash in seen_hashes:
                continue
            seen_hashes.add(content_hash)

            if not skip_non_utf8_files:
                files_to_process.append(file_path)
                continue

            try:
                # The indentation check decodes the whole file as UTF-8, so it
                # doubles as the encoding check: undecodable files raise here.
                # We only want files that are mostly space indented.
                if is_file_mostly_space_indented(file_path):
                    files_to_process.append(file_path)
            except UnicodeDecodeError:
                # If the file is not UTF-8, skip it
                continue
    return files_to_process

# Gather the deduplicated Ada sources under DATA_DIR (default settings: skip
# non-UTF-8 files and files that are not mostly space indented) and report
# how many were found.
files_to_process = get_files_to_process(DATA_DIR)
print(f"Number of files to process: {len(files_to_process)}")


In [None]:
from typing import Tuple

def filter_empty_lines(lines: List[str]) -> List[str]:
    """Return, in order, the entries of ``lines`` that are not blank.

    An entry is blank when nothing remains after stripping whitespace.
    """
    kept = []
    for text in lines:
        if text.strip():
            kept.append(text)
    return kept

def label_data(files_to_process: List[str], lines_to_group: int) -> List[Tuple[str, int]]:
    """Build (text, label) samples from sliding windows over each file.

    Each file is read as UTF-8 and its blank lines are dropped. Every run of
    ``lines_to_group`` consecutive remaining lines becomes one sample: the
    text is those lines concatenated, and the label is the number of leading
    space characters on the line that follows the window. The window that
    ends on the last line of a file is labelled 0.
    """
    samples = []
    for path in files_to_process:
        with open(path, "r", encoding="utf-8") as source:
            content_lines = filter_empty_lines(source.readlines())
        # Sentinel "next line" so the final window of the file has a successor
        content_lines.append("")

        window_count = len(content_lines) - lines_to_group
        for start in range(window_count):
            stop = start + lines_to_group
            window_text = "".join(content_lines[start:stop])
            following = content_lines[stop]
            # Only spaces count towards the label; tabs are not stripped
            indent_width = len(following) - len(following.lstrip(" "))
            samples.append((window_text, indent_width))
    return samples

# Each sample is 3 consecutive non-empty lines, labelled with the number of
# leading spaces on the line that follows them.
labelled_data = label_data(files_to_process, 3)

In [None]:
import tiktoken

# Labels are leading-space counts. Since 0 is an indentation too, the largest
# label that is kept is INDENTATION_PREDICTION_CATEGORIES - 1 = 119.
INDENTATION_PREDICTION_CATEGORIES = 120

# For now every sample is limited to exactly this many tokens.
SEQUENCE_LENGTH = 250

enc = tiktoken.get_encoding("cl100k_base")

# Filter, tokenize and pad in a single pass instead of rebuilding the whole
# sample list once per step.
token_rows = []
indent_labels = []
for text, indent in labelled_data:
    # Drop samples whose indentation is outside the range the model predicts
    if indent >= INDENTATION_PREDICTION_CATEGORIES:
        continue

    # disallowed_special=() makes tiktoken encode text that happens to look
    # like a special token (e.g. "<|endoftext|>") as ordinary text; with the
    # default setting such text raises ValueError.
    tokens = enc.encode(text, disallowed_special=())

    if len(tokens) > SEQUENCE_LENGTH:
        # Too long: keep the last tokens, the ones closest to the line whose
        # indentation is being predicted
        tokens = tokens[-SEQUENCE_LENGTH:]
    else:
        # Too short: left pad with enc.eot_token
        tokens = [enc.eot_token] * (SEQUENCE_LENGTH - len(tokens)) + tokens

    token_rows.append(tokens)
    indent_labels.append(indent)

# X: one row of SEQUENCE_LENGTH token ids per sample
X = np.array(token_rows)
# y: one-hot labels, produced for all samples in one vectorised call
y = tf.keras.utils.to_categorical(indent_labels, num_classes=INDENTATION_PREDICTION_CATEGORIES)

# Free the raw text and the intermediate Python lists; only X and y are used
# from here on.
del labelled_data, token_rows, indent_labels

In [None]:
from sklearn.model_selection import train_test_split

# First split: 80% of the samples for training, 20% held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(X, y, train_size=0.8, random_state=42)

# Second split: halve the held-out part into test and validation sets.
X_test, X_val, y_test, y_val = train_test_split(X_holdout, y_holdout, train_size=0.5, random_state=42)

# The intermediate hold-out arrays are not needed any more
del X_holdout, y_holdout

In [None]:
def create_model():
    """Build and compile the indentation classifier.

    Architecture: a 64-dimensional embedding over the tokenizer vocabulary
    (``enc.n_vocab``) for sequences of 250 tokens, a ReLU Dense layer with 64
    units, a Flatten, and a softmax Dense layer with
    ``INDENTATION_PREDICTION_CATEGORIES`` outputs.

    Returns:
        A model compiled with Adam (learning rate 0.0003), categorical
        cross-entropy loss and the accuracy metric.
    """
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Embedding(enc.n_vocab, 64, input_length=250))
    model.add(tf.keras.layers.Dense(64, activation="relu"))
    model.add(tf.keras.layers.Flatten())
    # No input_shape on this layer: only the first layer of a Sequential model
    # defines the model input, and what reaches this layer is the flattened
    # 250 * 64 vector rather than a (250,) input.
    model.add(tf.keras.layers.Dense(INDENTATION_PREDICTION_CATEGORIES, activation="softmax"))
    adam = tf.keras.optimizers.Adam(learning_rate=0.0003)
    model.compile(optimizer=adam, loss="categorical_crossentropy", metrics=["accuracy"])
    return model

# Build a fresh, untrained model and print its layer-by-layer summary
model = create_model()
model.summary()


In [None]:
# Where the training callback stores the model weights
checkpoint_path = "checkpoints/indentation_prediction_v0_10_epoch.ckpt"

# Checkpoint callback: saves the weights only (not the full model), silently
callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=0,
)

In [None]:
# Train on the training split, monitoring the validation split; the
# checkpoint callback writes the weights to checkpoint_path during training.
model.fit(
    X_train,
    y_train,
    epochs=1000,
    batch_size=32,
    verbose=0,
    validation_data=(X_val, y_val),
    callbacks=[callback],
)

In [13]:
# Rebuild the network from scratch and restore the checkpointed weights, so
# the evaluation reflects what was saved to disk rather than in-memory state.
model = create_model()
model.load_weights(checkpoint_path)

# Keep the two result pairs under separate names: assigning both calls to
# `loss, accuracy` would discard the test-set numbers.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=1)
print(f"Test       - loss: {test_loss:.4f}, accuracy: {test_accuracy:.4f}")
print(f"Validation - loss: {val_loss:.4f}, accuracy: {val_accuracy:.4f}")

# `loss` and `accuracy` keep holding the validation metrics for any code that
# reads these names after this cell.
loss, accuracy = val_loss, val_accuracy

