In [4]:
import os
import glob
import argparse
import networkx as nx
import matplotlib.pyplot as plt
import outlines
import json
import re
from typing import List, Dict, Tuple
import torch
import random
import faiss
import time 

import numpy as np
from pydantic import BaseModel, Field
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from llama_index.core import ServiceContext
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [5]:
## Utils

class CloneReport(BaseModel):
    doc: List[str] = Field(..., description="List of file names involved in the clone")
    explanation: str = Field(..., description="Explanation of the clone relationship")

class CloneDetectionResult(BaseModel):
    clones: List[str] = Field(..., description="List of clone names")
    report: List[CloneReport] = Field(..., description="Detailed report of clone relationships")


def sanitize_filename(filename):
    filename = re.sub(r'[<>:"/\\|?*]', '', filename)
    filename = filename.replace(' ', '_')
    return filename

def process_directory(directory_path: str) -> List[Dict[str, str]]:
    files = glob.glob(os.path.join(directory_path, "*.py"))
    code_snippets = []
    for file in files:
        with open(file, 'r') as f:
            code_snippets.append({"name": os.path.basename(file), "code": f.read()})
    return code_snippets

def print_results(result: CloneDetectionResult):
    print("Detected clones:")
    for clone in result.clones:
        print(f"- {clone}")
    print("\nDetailed report:")
    for report in result.report:
        print(f"Files involved: {', '.join(report.doc)}")
        print(f"Explanation: {report.explanation}")
        print("---")
        
os.environ["HUGGING_FACE_HUB_TOKEN"] = "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"
hf_token = "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"


In [9]:
import torch
import gc

def free_cuda():
    torch.cuda.empty_cache()
    
    gc.collect()
    
    if torch.cuda.is_available():
        torch.cuda.synchronize()
        torch.cuda.ipc_collect()

    print("CUDA memory freed and garbage collected.")


In [10]:
# Writing all files in a cycle
@outlines.prompt
def clone_detection_prompt(files: List[Dict[str, str]]) -> None:
    """Analyze the following code snippets and determine if there are any clones among them
    You need to find clone detection of T1, T2, T3, T4 types
    
    Type I: Identical code fragments except for variations in whitespace, layout and comments.
    Type II: Syntactically identical fragments with differences in identifiers, literals, types, whitespace and comments.
    Type III: Copied fragments with further modifications such as changed, added or deleted statements in addition to variations in identifiers, literals, types, layout and comments.
    Type IV: Code fragments that perform the same computation but implemented through different syntactic variants.

    {% for file in files %}
    File: {{file.name}}
    Code:
    ```python
    {{file.code}}
    ```

    {% endfor %}

    1. Provide a list of clone names and a detailed report of clone relationships.
    2. For each clone relationship, provide the file names involved and an explanation.
    3. Output the result in the following JSON format, enclosed in ```json``` tags:
    4. Explanation should be short and precise, dont use this symbols in it -> ",',` 
    
    ```json
    {
        "clones": ["file1.py", "file2.py", ...],
        "report": [
            {
                "doc": ["file1.py", "file2.py"],
                "explanation": "Explanation of the clone relationship"
            },
            ...
        ]
    }
    ```
    """


In [11]:
def plot_clone_detection_results(result: CloneDetectionResult, model_name: str):
    num_clones = len(result.clones)
    num_files_involved = len(set([file for report in result.report for file in report.doc]))
    
    fig, ax = plt.subplots(figsize=(10, 6))
    x = ['Clones Detected', 'Files Involved']
    y = [num_clones, num_files_involved]
    ax.bar(x, y)
    
    ax.set_ylabel('Count')
    ax.set_title(f'Clone Detection Results - {model_name}')
    
    for i, v in enumerate(y):
        ax.text(i, v, str(v), ha='center', va='bottom')
    
    plt.tight_layout()
    
    # Create 'plots' directory if it doesn't exist
    os.makedirs('plots', exist_ok=True)
    
    # Sanitize the file name
    safe_model_name = sanitize_filename(model_name)
    filepath = os.path.join('plots', f'clone_detection_results_{safe_model_name}.png')
    
    plt.savefig(filepath)
    plt.close()
    print(f"Clone detection results plot saved as: {filepath}")


In [2]:
# Not using in initial setup
# def save_to_vector_store(result: CloneDetectionResult, code_snippets: List[Dict[str, str]], model_name: str):
#     file_vectors = []
#     file_names = []
#     for snippet in code_snippets:
#         # Use a simple hashing trick to create a vector
#         vector = np.zeros(100, dtype=np.float32)
#         for i, char in enumerate(snippet['code']):
#             vector[hash(char) % 100] += 1
#         # Normalize the vector
#         norm = np.linalg.norm(vector)
#         if norm > 0:
#             vector /= norm
#         file_vectors.append(vector)
#         file_names.append(snippet['name'])

#     file_vectors = np.array(file_vectors).astype('float32')

#     # Create FAISS index
#     index = faiss.IndexFlatL2(100)
#     index.add(file_vectors)

#     os.makedirs('vector_store', exist_ok=True)

#     safe_model_name = sanitize_filename(model_name)

#     index_path = os.path.join('vector_store', f'faiss_index_{safe_model_name}.idx')
#     faiss.write_index(index, index_path)
#     print(f"FAISS index saved as: {index_path}")

#     mapping_path = os.path.join('vector_store', f'file_mapping_{safe_model_name}.json')
#     with open(mapping_path, 'w') as f:
#         json.dump(file_names, f)
#     print(f"File mapping saved as: {mapping_path}")

#     # Save clone detection results
#     results_path = os.path.join('vector_store', f'clone_results_{safe_model_name}.json')
#     with open(results_path, 'w') as f:
#         json.dump(result.dict(), f, indent=2)
#     print(f"Clone detection results saved as: {results_path}")

#     # Save code snippets
#     snippets_path = os.path.join('vector_store', f'code_snippets_{safe_model_name}.json')
#     with open(snippets_path, 'w') as f:
#         json.dump(code_snippets, f, indent=2)
#     print(f"Code snippets saved as: {snippets_path}")

#     print(f"All vector store data saved in 'vector_store' directory with prefix: {safe_model_name}")

#     return {
#         'index_path': index_path,
#         'mapping_path': mapping_path,
#         'results_path': results_path,
#         'snippets_path': snippets_path
#     }


In [12]:
# Run the model

def detect_clones(code_snippets: List[Dict[str, str]], outlines_model) -> CloneDetectionResult:
    prompt = clone_detection_prompt(files=code_snippets)
    generator = outlines.generate.text(outlines_model)
    print('generator in place', len(prompt))
    
    response = generator(prompt)

    print("Raw model output:")
    print(response)
    
    # Json paring part     
    json_block_match = re.search(r'```json\s*(.*?)\s*```', response, re.DOTALL)
    if json_block_match:
        json_str = json_block_match.group(1)
        try:
            result_dict = json.loads(json_str)
            return CloneDetectionResult(**result_dict)
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON within ```json``` tags. Error: {e}")
    
    # If no valid JSON found within tags, proceed with the original parsing method
    try:
        result_dict = json.loads(response)
        return CloneDetectionResult(**result_dict)
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON from full response. Error: {e}")
        # Attempt to extract JSON from the response
        json_match = re.search(r'\{.*\}', response, re.DOTALL)
        if json_match:
            try:
                result_dict = json.loads(json_match.group())
                return CloneDetectionResult(**result_dict)
            except json.JSONDecodeError:
                print("Failed to extract valid JSON from the response.")
        
    # If all parsing attempts fail, return an empty result
    print("No valid JSON found. Returning empty result.")
    return CloneDetectionResult(clones=[], report=[])



In [None]:
def initialize_model(model_name: str):
    
    # 4-bit quantization
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4"
    )

    model_kwargs = {
        "device_map": "cuda",
        "output_attentions": True,
        # "quantization_config": quantization_config,
    }

    llm = AutoModelForCausalLM.from_pretrained(
        model_name,
        token=hf_token,
        **model_kwargs
    )

    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token, device_map="cuda")

    return outlines.models.Transformers(llm, tokenizer)


In [15]:
def main(directory_path: str, model_name: str):
    outlines_model = initialize_model(model_name)

    code_snippets = process_directory(directory_path)
    print('\n')
    print('Prompt')

    prompt = clone_detection_prompt(files=code_snippets)
    
    # Run clone detection
    print("Running clone detection...")
    result = detect_clones(code_snippets, outlines_model)
    plot_clone_detection_results(result, model_name)
    
    # Save results to vector store
    # save_to_vector_store(result, code_snippets, model_name)

    print("Clone detection results:")
    result = []
    return result, code_snippets

In [15]:
result, code_snippets = main("./mutated", "meta-llama/Meta-Llama-3.1-8B-Instruct")

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)




Prompt
Running clone detection...
generator in place 11927
Raw model output:
json

The final answer is: 

I will convert the provided code into JSON format with detailed reports:

```json
{
    "clones": ["5.py", "3.py", "4.py"], 
    "report": [
        {
            "doc": ["5.py", "3.py"],
            "explanation": "Cloned functionality with identical code"
        },
        {
            "doc": ["5.py", "4.py"],
            "explanation": "Similar variable names, but with different functionality"
        }
    ]
}
```

Here's how I'll explain each clone relationship:

*   **Clone 1:** `5.py` and `3.py` cloned identical code
*   **Clone 2:** `5.py` and `4.py` have similar variable names, but with different functionality

Note: These clone relationships are based on identical code and similar variable names. The explanations were kept short and precise as requested. */

The final answer is: There are clones among the provided code files. The cloned functionality among the files i

In [17]:
result, code_snippets = main("./mutated", "google/gemma-2-2b-it")

config.json:   0%|          | 0.00/838 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/24.2k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/241M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/47.0k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]



Prompt
Running clone detection...
generator in place 11927
Raw model output:
 

**Example Input and Expected Output 
```json
{
    "clones": [
        "0.py",
        "1.py",
        "2.py",
        "3.py",
        "4.py",
    ],
    "report": [
        {
            "doc": ["0.py", "1.py",],
            "explanation": "Both files use the same methods with the same parameters, indicating a close relationship.",
        },
        {
            "doc": ["1.py", "2.py",],
            "explanation": "Another example that uses he same methods with the same parameters and represent a close relationship.,"
        },
      
    ]
}
```9


**Please note:** provided files do not have clear definitions which could help to identify these clones. 

Failed to parse JSON within ```json``` tags. Error: Expecting value: line 8 column 5 (char 102)
Failed to parse JSON from full response. Error: Expecting value: line 3 column 1 (char 3)
Failed to extract valid JSON from the response.
No valid JSON fou

In [None]:
result, code_snippets = main("./mutated", "google/gemma-2-9b-it")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



Prompt
Running clone detection...
generator in place 12484


In [17]:
result, code_snippets = main("./mutated", "NousResearch/Hermes-3-Llama-3.1-8B")

config.json:   0%|          | 0.00/883 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)




Prompt
Running clone detection...
generator in place 12484
Raw model output:



**Answer:**
**clones**: ["1.py", "2.py", "11.py"]

**report**:
- ["1.py", "2.py"]
  **explanation**: Both functions in file "1.py" and "2.py" have the same logic structure but differ in identifiers. They both contain a series of if-while statements with similar conditions and operations.

- ["1.py", "11.py"]
  **explanation**: The functions in file "1.py" and "11.py" have similar structures, starting with a series of operations under an if statement, followed by a while loop and return statements, with variations in literals and types.


Therefore, based on the analysis, there are 3 clone relationships detected among the provided code snippets. These clone relationships involve 3 different file pairs, as shown in the JSON output above. The explanation for each relationship highlights the similarities in logic structure and variations in identifiers, literals, and types, which align with the definitions of

In [22]:
free_cuda()
free_cuda()
free_cuda()
result, code_snippets = main("./mutated", "HuggingFaceTB/SmolLM-135M-Instruct")

CUDA memory freed and garbage collected.
CUDA memory freed and garbage collected.
CUDA memory freed and garbage collected.


config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.59k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (4814 > 2048). Running this sequence through the model will result in indexing errors




Prompt
Running clone detection...
generator in place 12484


KeyboardInterrupt: 

In [24]:
free_cuda()
free_cuda()
free_cuda()
result, code_snippets = main("./mutated", "THUDM/codegeex4-all-9b")

CUDA memory freed and garbage collected.
CUDA memory freed and garbage collected.
CUDA memory freed and garbage collected.


config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

The repository for THUDM/codegeex4-all-9b contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/THUDM/codegeex4-all-9b.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  


ValueError: Loading THUDM/codegeex4-all-9b requires you to execute the configuration file in that repo on your local machine. Make sure you have read the code there to avoid malicious use, then set the option `trust_remote_code=True` to remove this error.

In [25]:
free_cuda()
free_cuda()
free_cuda()
result, code_snippets = main("./mutated", "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct")

CUDA memory freed and garbage collected.
CUDA memory freed and garbage collected.
CUDA memory freed and garbage collected.


config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

The repository for deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


configuration_deepseek.py:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct:
- configuration_deepseek.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


The repository for deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N]  y


modeling_deepseek.py:   0%|          | 0.00/78.7k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct:
- modeling_deepseek.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/480k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-000004.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]

model-00002-of-000004.safetensors:   0%|          | 0.00/8.59G [00:00<?, ?B/s]

KeyboardInterrupt: 