# Get Config


In [1]:
from utils import load_config

# Settings and credentials are loaded from two separate JSON files that sit
# one directory above this notebook.
config = load_config("../config.json")
secret = load_config("../secret.json")

# Corpus Creation


## Source Code Dataset Retrieval


### Steps:

1. Go to [hpc-instruct dataset](https://huggingface.co/datasets/hpcgroup/hpc-instruct)
2. Query the dataset for C++ and Python code using the following:

```sql
SELECT *
FROM train
WHERE (language = 'C++' OR language = 'Python')
AND "problem statement" NOT LIKE '%Translate%'
AND "problem statement" NOT LIKE '%translated%'
AND "problem statement" NOT LIKE '%optimize%'
AND "problem statement" NOT LIKE '%Optimize%';
```

3. Download the query result. It will be saved as `raw_hpc_instruct_cpp.parquet`.


## Data Pre-Processing

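1. Randomly select ~4k samples (up to 2,000 per language).
2. Iteratively strip each solution until only the code remains, and ensure it is C++ or Python.
3. Ask the model to generate a non-optimal version of each solution.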

The output format should look like:

```json
{
	"id": "1",
	"seed": "seed code",
	"chunk": "generate a program that returns 0",
	"positive_sample": "int main() { return 0; }",
	"negative_sample": "int main() { return 1; }"
}
```


In [2]:
import pandas as pd


# The raw parquet export lives one directory up; its file name is read from
# the loaded config under the "raw_code_dataset" key.
raw_dataset_path = "../" + config.get("raw_code_dataset")
df = pd.read_parquet(raw_dataset_path)
print(df.shape)


def extract_code(text):
    """Return the code found inside a ```cpp or ```python Markdown fence.

    The first "```cpp" fence is preferred; a "```python" fence is only used
    when no "```cpp" fence exists. The code runs from just after the opening
    fence tag up to the last "```" in the text, with surrounding whitespace
    stripped. If there is no closing fence after the opening tag, everything
    after the tag is treated as the code.

    Args:
        text: Text that may contain a fenced code block.

    Returns:
        The stripped code (possibly an empty string), or None when ``text``
        is not a string or contains neither fence tag.
    """
    # Missing values (None / NaN) cannot contain a fence.
    if not isinstance(text, str):
        return None

    # Keep the cpp-before-python precedence, but remember which tag matched so
    # the slice below skips the whole tag (6 chars for cpp, 9 for python).
    for fence in ("```cpp", "```python"):
        start = text.find(fence)
        if start != -1:
            break
    else:
        return None

    code_start = start + len(fence)
    end = text.rfind("```")
    # rfind can land on the opening fence itself when no closing fence exists;
    # in that case take everything up to the end of the text.
    if end < code_start:
        end = len(text)
    return text[code_start:end].strip()


# Reduce every solution to its fenced code; rows for which extract_code
# returns None are dropped.
df['solution'] = df['solution'].apply(extract_code)
df = df.dropna(subset=['solution']).reset_index(drop=True)
print(df.shape)
print(df.columns)

# Balance the corpus: draw the same number of rows per language, capped at
# 2000 each, using a fixed seed so the selection is reproducible.
is_cpp = df['language'] == 'C++'
is_python = df['language'] == 'Python'
df_cpp, df_python = df[is_cpp], df[is_python]
n_per_language = min(len(df_cpp), len(df_python), 2000)
sampled_cpp = df_cpp.sample(n=n_per_language, random_state=42)
sampled_python = df_python.sample(n=n_per_language, random_state=42)

# Stack both samples, then shuffle so the two languages are interleaved.
df = (
    pd.concat([sampled_cpp, sampled_python])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)
print(df.shape)
print(df.columns)

(32875, 5)
(23140, 5)
Index(['language', 'seed', 'problem statement', 'solution', 'model'], dtype='object')
(4000, 5)
Index(['language', 'seed', 'problem statement', 'solution', 'model'], dtype='object')


In [3]:
from openai import OpenAI
from few_shots import code_example_1, code_example_2


def generate_nonperformant_code(openai: OpenAI, code: str, model: str = "gpt-4o-mini"):
    """Ask a chat model for a slower but functionally equivalent version of ``code``.

    Args:
        openai: Client whose ``chat.completions.create`` is called.
        code: The performant source code to de-optimize.
        model: Chat model name passed to the API. Defaults to "gpt-4o-mini".

    Returns:
        The fenced code extracted from the reply when ``extract_code`` finds
        one, otherwise the raw reply text. Returns None (after printing an
        ``[ERROR]`` line) when the request raises or the reply carries no
        text content.
    """
    prompt = f"""
        For the following performant code, give me the non-performant version of it.
        You should not change any of the functionality of the code, just change optimizations that make it performant.

        Some dimensions to keep an eye out for are the following:

        - Algorithmic Efficiency: Introduce less efficient algorithms, unnecessary recalculations, or use suboptimal data structures.
        - Memory Usage: Increase memory allocations, introduce redundant data structures, or create fragmented access patterns.
        - I/O Operations: Avoid buffered I/O, use smaller chunks, and insert synchronous operations where asynchronous would work.
        - Compiler Optimizations: Omit compiler optimizations, avoid intrinsics, or use simpler constructs where efficient ones are available.
        - Concurrency and Parallelism: Avoid multi-threading, introduce locking overhead, and serialize tasks unnecessarily.

        Example 1 (unnecessary memory allocation):
        {code_example_1}

        Examples 2 (lack of move semantics):
        {code_example_2}

        My input:
        performant code:
        {code}

        non-performant code:
    """
    try:
        response = openai.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content":  prompt}],
        )
        non_optimal_code = response.choices[0].message.content

        # The API may return a message without text content; report that
        # explicitly instead of failing inside the fence extraction below.
        if non_optimal_code is None:
            print("[ERROR] generate_nonperformant_code: response contained no text content")
            return None

        # Keep only the fenced code when the reply has a fence. If no fence is
        # found the reply is assumed to already be bare code; this is not
        # verified, so non-code text can slip through here.
        fenced_code = extract_code(non_optimal_code)
        if fenced_code is not None:
            non_optimal_code = fenced_code

        return non_optimal_code
    except Exception as e:
        # Best effort: a failed request must not abort the whole batch.
        print(f"[ERROR] generate_nonperformant_code: {e}")
        return None

In [4]:
from concurrent.futures import ThreadPoolExecutor, as_completed

# Rows whose index is >= MAX_RUNS are not submitted for generation.
MAX_RUNS = 4_000

# Single client shared by all worker threads; the key comes from the loaded
# secrets rather than being hard-coded here.
client: OpenAI = OpenAI(api_key=secret.get("OPEN_AI_API_KEY"))


def process_row(idx, row):
    """Build one training record for a dataset row.

    Calls ``generate_nonperformant_code`` on the row's solution using the
    module-level ``client``.

    Args:
        idx: Row index, stored as the record's ``id``.
        row: Row providing 'seed', 'problem statement' and 'solution'.

    Returns:
        A dict with keys id, seed, problem, solution and
        non_optimal_solution, or None when generation yielded nothing.
    """
    non_optimal_code = generate_nonperformant_code(client, row['solution'])

    # Guard clause: an empty or missing generation counts as a failure.
    if not non_optimal_code:
        print(f"[ERROR]: failed on row {idx}")
        return None

    print(f"Generated non-optimal code for idx: {idx}")
    return dict(
        id=idx,
        seed=row['seed'],
        problem=row['problem statement'],
        solution=row['solution'],
        non_optimal_solution=non_optimal_code,
    )


# Fan the rows out over a thread pool and collect the successful records.
processed_data = []
with ThreadPoolExecutor(max_workers=20) as executor:
    # Map each submitted future back to its row index for error reporting.
    futures = {}
    for idx, row in df.iterrows():
        if idx < MAX_RUNS:
            futures[executor.submit(process_row, idx, row)] = idx

    for future in as_completed(futures):
        result = future.result()
        if not result:
            print(f"[ERROR]: Processing failed for idx: {futures[future]}")
            continue
        processed_data.append(result)

# Futures complete in arbitrary order; restore the row order by id.
processed_data.sort(key=lambda record: record['id'])

Generated non-optimal code for idx: 3
Generated non-optimal code for idx: 4
Generated non-optimal code for idx: 17
Generated non-optimal code for idx: 19
Generated non-optimal code for idx: 14
Generated non-optimal code for idx: 6
Generated non-optimal code for idx: 8
Generated non-optimal code for idx: 7
Generated non-optimal code for idx: 10
Generated non-optimal code for idx: 11
Generated non-optimal code for idx: 1
Generated non-optimal code for idx: 15
Generated non-optimal code for idx: 13
Generated non-optimal code for idx: 12
Generated non-optimal code for idx: 0
Generated non-optimal code for idx: 23
Generated non-optimal code for idx: 24
Generated non-optimal code for idx: 18
Generated non-optimal code for idx: 22
Generated non-optimal code for idx: 16
Generated non-optimal code for idx: 25
Generated non-optimal code for idx: 30
Generated non-optimal code for idx: 2
Generated non-optimal code for idx: 5
Generated non-optimal code for idx: 26
Generated non-optimal code for idx

{'id': 0, 'seed': '        // Prefer NULL, otherwise SAFECOOKIE. If a password is provided, use HASHEDPASSWORD\n        /* Authentication:\n         *   cookie:   hex-encoded ~/.tor/control_auth_cookie\n         *   password: "password"\n         */\n        std::string torpassword = GetArg("-torpassword", "");', 'problem': 'You are given a code snippet from a program that handles user authentication using a Tor control authentication cookie or a password. The original code uses the `GetArg` function to retrieve the authentication information, which can be either a hex-encoded `.tor/control_auth_cookie` or a plaintext password provided via command-line arguments.\n\nYour task is to translate this authentication mechanism to use Kokkos for heterogeneous parallel computing. Specifically, you will create a Kokkos-based function that accepts authentication information in the form of a `std::string`, which can represent either a hex-encoded cookie or a hashed password.\n\nAssumptions:\n\n1.

In [5]:
import json

# Write the collected records as indented JSON to the path named by the
# "training_data_path" config entry, one directory above the notebook.
training_data_path = "../" + config.get('training_data_path')
with open(training_data_path, "w") as out_file:
    json.dump(processed_data, out_file, indent=4)