<a href="https://colab.research.google.com/github/shahdhesham/Colab-Thesis/blob/main/Magicoder_Set1_ZeroShot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os

# Configure PyTorch's CUDA caching allocator before torch is imported in the
# following cell: expandable segments are intended to reduce memory
# fragmentation (see the PyTorch CUDA memory-management notes).
os.environ.update(PYTORCH_CUDA_ALLOC_CONF="expandable_segments:True")


In [None]:
import torch

# Report which device the rest of the notebook will run on.
if not torch.cuda.is_available():
    print("CUDA NOT available. Using CPU.")
else:
    print("CUDA is available! Using GPU.")
    # Index 0 is the first visible CUDA device.
    print(f"GPU device name: {torch.cuda.get_device_name(0)}")

CUDA is available! Using GPU.
GPU device name: NVIDIA A100-SXM4-40GB


In [None]:
!free -h

               total        used        free      shared  buff/cache   available
Mem:            83Gi       1.2Gi        76Gi       1.0Mi       5.4Gi        81Gi
Swap:             0B          0B          0B


In [None]:
from google.colab import files
import zipfile
import torch

import os
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# 1. Upload ZIP file
print("Upload your ZIP file containing .c files:")
uploaded = files.upload()
if not uploaded:
    # Nothing came back from the upload dialog (e.g. it was dismissed);
    # fail with a clear message instead of a bare StopIteration below.
    raise ValueError("No file was uploaded; re-run this cell and choose a ZIP archive.")
# Only the first uploaded file is used; any additional uploads are ignored.
zip_name = next(iter(uploaded))

# 2. Extract ZIP
if not zipfile.is_zipfile(zip_name):
    raise ValueError(f"'{zip_name}' is not a valid ZIP archive.")
with zipfile.ZipFile(zip_name, 'r') as z:
    z.extractall('input_folder')
print("Files extracted to 'input_folder/'")

In [None]:
# 3. Load model
# Single source of truth for the checkpoint so model and tokenizer cannot drift apart.
MODEL_ID = "ise-uiuc/Magicoder-S-CL-7B"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Reuse EOS as the pad token so that padding=True works when prompts are
# tokenized as a batch.
tokenizer.pad_token = tokenizer.eos_token
# Decoder-only models continue from the last position of each row, so batched
# generation needs the padding on the left. Set it explicitly rather than
# relying on whatever the checkpoint's tokenizer config specifies.
tokenizer.padding_side = "left"


In [None]:
# # Core translation function (initial, unbatched version)
# def translate(c_code):
#     prompt =  f"""
#     You are an expert code translator. Your ONLY task is to convert c++ code to c code.
#     Rules you MUST follow:
#     1. Output ONLY executable C code
#     2. Never include markdown or explanations
#     3. Preserve all functionality exactly
#     4. Use standard C libraries
#     5. Match the original code's input/output behavior

#     C:
#     {c_code}

#     """

#     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
#     outputs = model.generate(**inputs, max_new_tokens=512) #context window size
#     return tokenizer.decode(outputs[0], skip_special_tokens=True).split("C++:")[-1].strip()
#     # return tokenizer.decode(outputs[0], skip_special_tokens=True).split("@@ Response")[-1].strip()


In [None]:
#BATCHING
def _build_translation_prompt(c_code):
    """Return the zero-shot C -> C++ translation prompt for one source file.

    The prompt ends with a ``C++:`` cue so the model's continuation is the
    translated program rather than more of the C input.
    """
    return f"""
You are an expert code translator. Your ONLY task is to convert C code to C++ code.
Rules you MUST follow:
1. Output ONLY executable C++ code
2. Never include markdown or explanations
3. Preserve all functionality exactly
4. Use standard C++ libraries
5. Match the original code's input/output behavior

C:
{c_code}

C++:
"""


def translate_batch(c_code_list, max_new_tokens=512):
    """Translate a batch of C source strings to C++ with the loaded model.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        c_code_list: list of C source code strings.
        max_new_tokens: upper bound on generated tokens per file
            (default 512, the value previously hard-coded).

    Returns:
        A list of translated strings, one per input and in the same order.
        Only the newly generated text is returned; the prompt is not included.
        An empty input list yields an empty list without touching the model.

    Side effects:
        Sets ``tokenizer.padding_side`` to ``"left"``.
    """
    if not c_code_list:
        return []

    prompts = [_build_translation_prompt(c_code) for c_code in c_code_list]

    # Decoder-only models generate from the end of each row, so shorter
    # prompts must be padded on the left; right padding would put pad tokens
    # between the prompt and the generated continuation.
    tokenizer.padding_side = "left"
    inputs = tokenizer(prompts, padding=True, return_tensors="pt").to(model.device)

    # Generate outputs for the whole batch
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + continuation for each row. With left padding
    # every row's prompt occupies the same number of leading positions, so
    # slicing at the padded prompt length leaves only the new tokens.
    prompt_length = inputs["input_ids"].shape[1]
    return [
        tokenizer.decode(generated, skip_special_tokens=True).strip()
        for generated in outputs[:, prompt_length:]
    ]

In [None]:
# Batching driver: walk input_folder/, translate the .c files in groups of
# `batch_size`, and write each result to the mirrored path under output_folder/.
import gc

batch_size = 4


def cpp_output_path(in_path, input_root='input_folder', output_root='output_folder'):
    """Map ``<input_root>/<rel>.c`` to ``<output_root>/<rel>.cpp``.

    Only the leading root directory and the final extension are changed, so
    other occurrences of '.c' or 'input_folder' inside the path are preserved.
    """
    rel_path = os.path.relpath(in_path, input_root)
    stem, _ = os.path.splitext(rel_path)
    return os.path.join(output_root, stem + '.cpp')


def translate_and_save(path_pairs):
    """Translate one batch of ``(in_path, out_path)`` pairs and write the results.

    Reads every input file, sends the sources to ``translate_batch`` in a single
    call, writes each translation to its output path (creating parent
    directories as needed), then releases cached memory.
    """
    codes = []
    for in_p, _ in path_pairs:
        # errors='replace' keeps one file with stray non-UTF-8 bytes from
        # aborting the whole run.
        with open(in_p, 'r', encoding='utf-8', errors='replace') as f_in:
            codes.append(f_in.read())

    translations = translate_batch(codes)
    for (in_p, out_p), translation in zip(path_pairs, translations):
        os.makedirs(os.path.dirname(out_p), exist_ok=True)
        with open(out_p, 'w', encoding='utf-8') as f_out:
            f_out.write(translation)
        print(f"Translated: {in_p} → {out_p}")

    # Drop Python garbage and return cached GPU blocks between batches.
    gc.collect()
    torch.cuda.empty_cache()


batch_paths = []
# `filenames` (not `files`) so the google.colab `files` module is not shadowed.
for root, dirnames, filenames in os.walk('input_folder'):
    # Sort in place so traversal order, and therefore batch composition,
    # is the same on every run.
    dirnames.sort()
    for filename in sorted(filenames):
        if not filename.endswith('.c'):
            continue
        in_path = os.path.join(root, filename)
        batch_paths.append((in_path, cpp_output_path(in_path)))

        # Once batch is full, translate all at once
        if len(batch_paths) == batch_size:
            translate_and_save(batch_paths)
            batch_paths = []

# Translate any remaining files smaller than batch size
if batch_paths:
    translate_and_save(batch_paths)

In [None]:
# # 5. Process all .c files - INITIAL
# for root, _, files in os.walk('input_folder'):
#     for file in files:
#         if file.endswith('.c'):
#             # Set paths
#             in_path = os.path.join(root, file)
#             out_path = in_path.replace('input_folder', 'output_folder').replace('.c', '.cpp')
#             os.makedirs(os.path.dirname(out_path), exist_ok=True)

#             # Translate
#             with open(in_path, 'r') as f:
#                 c_code = f.read()
#             cpp_code = translate(c_code)

#             # Save
#             with open(out_path, 'w') as f:
#                 f.write(cpp_code)
#             print(f"Translated: {in_path} → {out_path}")

In [None]:
from google.colab import files as colab_files  # CHANGED: Added alias

In [None]:
# 6. Compress and download
import shutil

if not os.path.isdir('output_folder'):
    raise FileNotFoundError("output_folder/ does not exist - run the translation cell first.")

print("\nCreating output ZIP...")
# make_archive writes a fresh output.zip on every run, so entries from an
# earlier run cannot linger in the archive. base_dir keeps the top-level
# 'output_folder/' directory inside the ZIP.
archive_path = shutil.make_archive('output', 'zip', root_dir='.', base_dir='output_folder')
colab_files.download(archive_path)  # alias of google.colab.files
print("Done! Download should start automatically.")

In [None]:
# Print the model's generation_config for the record. translate_batch passes
# only max_new_tokens to generate(), so the remaining decoding settings
# presumably come from this config - confirm against the printed values.
print(model.generation_config)
