In [None]:
import hashlib
import ast
import difflib
import re
from difflib import SequenceMatcher

def remove_comments(code):
    """
    Removes comments from code.

    Every line is cut at the start of its comment (if it has one) and then
    stripped of leading and trailing whitespace.  Lines are kept, so a line
    that held only a comment becomes an empty line.

    Comments are located with Python's own tokenizer, so a '#' that is part
    of a string literal or docstring is left alone.  If the text cannot be
    tokenized as Python, the function falls back to cutting each line at its
    first '#'.

    Args:
        code: Source text to clean.

    Returns:
        The cleaned lines joined with '\\n'.
    """
    # Imported locally so this function only relies on the standard library
    # and not on what the rest of the file happens to import.
    import tokenize

    lines = code.splitlines()

    # Feed the tokenizer exactly the lines produced by splitlines() so that
    # token row numbers map one-to-one onto indexes in `lines`.
    pending_lines = iter(lines)

    def readline():
        line = next(pending_lines, None)
        # An empty string signals end of input, so blank lines must still
        # carry their newline.
        return '' if line is None else line + '\n'

    comment_columns = {}
    try:
        for token in tokenize.generate_tokens(readline):
            if token.type == tokenize.COMMENT:
                row, column = token.start
                comment_columns[row - 1] = column
    except (tokenize.TokenError, SyntaxError):
        # Not valid Python at the lexical level (IndentationError is a
        # SyntaxError): best effort is to cut at the first '#' of each line.
        return '\n'.join(line.split('#', 1)[0].strip() for line in lines)

    return '\n'.join(
        # Slicing with None keeps the whole line when it has no comment.
        line[:comment_columns.get(index)].strip()
        for index, line in enumerate(lines)
    )

def preprocess_code(code):
    """
    Normalizes code into one compact string.

    Comments are stripped with remove_comments(), then every run of
    whitespace (spaces, tabs, line breaks) is dropped, so the result contains
    no whitespace at all.
    """
    pieces = remove_comments(code).split()
    return ''.join(pieces)

# Lexical pattern used by compare_token_similarity().  Alternatives are tried
# in order, so a comment or a quoted string is consumed as a whole before the
# generic word/symbol rules can split it up.
_CODE_TOKEN_PATTERN = re.compile(
    r"""
      \#[^\n]*                  # a comment, up to the end of the line
    | "(?:\\.|[^"\\\n])*"       # a double-quoted string on a single line
    | '(?:\\.|[^'\\\n])*'       # a single-quoted string on a single line
    | \w+                       # identifier, keyword or number
    | [^\w\s]                   # any other single non-blank character
    """,
    re.VERBOSE,
)


def _lex_code(code):
    """Split code into a list of tokens, discarding comments and whitespace."""
    # Only the comment alternative can produce a match starting with '#'.
    return [
        token
        for token in _CODE_TOKEN_PATTERN.findall(code)
        if not token.startswith('#')
    ]


def compare_token_similarity(code1, code2):
    """
    Compares two code snippets based on token similarity.

    Each snippet is split into tokens (words, single-line string literals and
    individual punctuation characters); comments and whitespace are ignored.
    The two token lists are then compared with difflib.SequenceMatcher.

    Tokenization is purely lexical and line-based: a '#' inside a
    triple-quoted string that spans several lines is still treated as the
    start of a comment.

    Args:
        code1: First code snippet.
        code2: Second code snippet.

    Returns:
        Similarity as a percentage between 0 and 100 (100 when both snippets
        yield the same token list, including when both are empty).
    """
    tokens1 = _lex_code(code1)
    tokens2 = _lex_code(code2)

    # Use difflib's SequenceMatcher to compare the token sequences
    matcher = difflib.SequenceMatcher(None, tokens1, tokens2)
    return matcher.ratio() * 100

def calculate_similarity_percentage_hashing(hash1, hash2):
    """
    Calculates the percentage of positions at which two digests agree.

    The two strings are compared character by character; the number of
    matching positions is divided by the length of the longer string.

    NOTE: with a cryptographic hash such as SHA-256, a tiny change in the
    input produces an unrelated digest, so any value other than 100 does not
    indicate how similar the hashed code is.  Only the 100% case (identical
    digests) is meaningful.

    Args:
        hash1: First digest string.
        hash2: Second digest string.

    Returns:
        Percentage between 0 and 100.  Two empty strings count as identical
        and yield 100.0.
    """
    total_chars = max(len(hash1), len(hash2))
    if total_chars == 0:
        # Nothing to compare: treat as identical instead of dividing by zero.
        return 100.0

    similarity_count = sum(a == b for a, b in zip(hash1, hash2))
    return (similarity_count / total_chars) * 100

def _ast_outline(tree):
    """Return one text line per AST node, in depth-first source order.

    Each line is indented by nesting depth and records the field the node
    hangs from, the node type and its plain (non-node) field values.
    """
    lines = []
    # Explicit stack instead of recursion so deeply nested code does not hit
    # the interpreter's recursion limit here.
    stack = [("", tree, 0)]
    while stack:
        label, node, depth = stack.pop()
        prefix = "  " * depth + (f"{label}: " if label else "")

        if not isinstance(node, ast.AST):
            # Plain list items, e.g. the names of a `global` statement or a
            # None placeholder in a list of defaults.
            lines.append(f"{prefix}{node!r}")
            continue

        scalars = []
        children = []
        for field, value in ast.iter_fields(node):
            if isinstance(value, list):
                children.extend((field, item, depth + 1) for item in value)
            elif isinstance(value, ast.AST):
                children.append((field, value, depth + 1))
            else:
                scalars.append(f"{field}={value!r}")

        lines.append(f"{prefix}{type(node).__name__}({', '.join(scalars)})")
        # Reversed so that the first child is popped (and emitted) first.
        stack.extend(reversed(children))
    return lines


def calculate_similarity_percentage_ast(code1, code2):
    """
    Calculates how much of the syntax trees of two Python snippets is shared.

    Both snippets are parsed, each tree is flattened into one line per node,
    and the two line lists are matched with difflib.SequenceMatcher.  The
    number of matching lines is divided by the length of the longer list.
    Formatting and comments do not affect the result because they are not
    part of the syntax tree.

    Args:
        code1: First Python source snippet.
        code2: Second Python source snippet.

    Returns:
        Percentage between 0 and 100; 100 when both outlines are identical.

    Raises:
        SyntaxError: If either snippet is not valid Python.
    """
    outline1 = _ast_outline(ast.parse(code1))
    outline2 = _ast_outline(ast.parse(code2))

    matcher = difflib.SequenceMatcher(None, outline1, outline2)
    common_lines = sum(block.size for block in matcher.get_matching_blocks())

    # Every outline contains at least the root node, so this is never zero.
    total_lines = max(len(outline1), len(outline2))
    return (common_lines / total_lines) * 100

def calculate_similarity_percentage_string(code1, code2):
    """
    Calculates the character-level similarity of two code snippets.

    The raw strings are compared with SequenceMatcher and its ratio is
    scaled to a percentage.
    """
    ratio = SequenceMatcher(None, code1, code2).ratio()
    return ratio * 100

def detect_direct_clone(hash_similarity, ast_similarity, string_similarity, token_similarity):
    """
    Decides whether two snippets count as a direct clone.

    Returns True when the hash, AST or string similarity is exactly 100, or
    when the token similarity is at least 90; otherwise returns False.
    """
    is_exact_match = (
        hash_similarity == 100
        or ast_similarity == 100
        or string_similarity == 100
    )
    return bool(is_exact_match or token_similarity >= 90)

def detect_near_miss(token_similarity, string_similarity,
                     token_threshold=80, string_threshold=80):
    """
    Decides whether two snippets count as near-miss duplicates.

    Args:
        token_similarity: Token similarity percentage.
        string_similarity: String similarity percentage.
        token_threshold: Minimum token similarity required (default 80).
        string_threshold: Minimum string similarity required (default 80).

    Returns:
        True when both similarities reach their thresholds, otherwise False.
    """
    return bool(
        token_similarity >= token_threshold
        and string_similarity >= string_threshold
    )

def read_code_from_file(file_path):
    """
    Reads code from a file.

    The file is decoded as UTF-8 (the default encoding of Python source)
    instead of the platform's locale encoding, so the same file yields the
    same text on every system.  A leading UTF-8 byte order mark, if present,
    is dropped.

    Args:
        file_path: Path of the file to read.

    Returns:
        The full contents of the file as a string.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # 'utf-8-sig' behaves like 'utf-8' but also strips an optional BOM.
    with open(file_path, 'r', encoding='utf-8-sig') as file:
        return file.read()

def _read_snippet_pair():
    """Prompt for two file paths and return the contents of both files."""
    file_path1 = input("Enter the file path for the first code snippet:\n")
    file_path2 = input("Enter the file path for the second code snippet:\n")
    return read_code_from_file(file_path1), read_code_from_file(file_path2)


def _sha256_hexdigest(code):
    """Return the SHA-256 hex digest of a code string."""
    return hashlib.sha256(code.encode()).hexdigest()


def main():
    """
    Runs the interactive similarity checker.

    Shows the menu, reads the chosen method, asks for two file paths and
    prints the result of the selected comparison.  Errors from reading the
    files or from parsing them (AST-based methods) are not caught here and
    propagate to the caller.
    """
    print("Choose the method to detect code similarity:")
    print("1. SHA-256 Hashing")
    print("2. AST-based Comparison")
    print("3. String Similarity")
    print("4. Token Similarity")
    print("5. Hybrid Direct Clone Detection")
    print("6. Hybrid Near-Miss Detection")

    # Tolerate stray whitespace around the menu number.
    choice = input().strip()

    if choice not in {"1", "2", "3", "4", "5", "6"}:
        print("Invalid choice. Please choose 1, 2, 3, 4, 5, or 6.")
        return

    # Every method works on the same two inputs, so read them once here.
    code_snippet1, code_snippet2 = _read_snippet_pair()

    if choice == "1":
        # SHA-256 Hashing
        hash1 = _sha256_hexdigest(code_snippet1)
        hash2 = _sha256_hexdigest(code_snippet2)

        if hash1 == hash2:
            print("Duplicate code detected!")
        else:
            print("Code snippets are different.")

        similarity_percentage = calculate_similarity_percentage_hashing(hash1, hash2)
        print(f"Similarity Percentage (using SHA-256 Hashing): {similarity_percentage:.2f}%")

    elif choice == "2":
        # AST-based Comparison
        similarity_percentage = calculate_similarity_percentage_ast(code_snippet1, code_snippet2)
        print(f"\nSimilarity percentage (using AST-based Comparison): {similarity_percentage:.2f}%")

    elif choice == "3":
        # String Similarity
        similarity_ratio = calculate_similarity_percentage_string(code_snippet1, code_snippet2)
        print(f"\nSimilarity ratio (using String Similarity): {similarity_ratio:.2f}%")

    elif choice == "4":
        # Token Similarity
        token_similarity = compare_token_similarity(code_snippet1, code_snippet2)
        print(f"\nToken Similarity Percentage: {token_similarity:.2f}%")

    elif choice == "5":
        # Hybrid Direct Clone Detection
        hash1 = _sha256_hexdigest(code_snippet1)
        hash2 = _sha256_hexdigest(code_snippet2)

        # The AST helper parses both snippets itself; no separate parse needed.
        ast_similarity = calculate_similarity_percentage_ast(code_snippet1, code_snippet2)
        print(f"AST Similarity Percentage: {ast_similarity:.2f}%")

        hash_similarity = calculate_similarity_percentage_hashing(hash1, hash2)
        string_similarity = calculate_similarity_percentage_string(code_snippet1, code_snippet2)
        token_similarity = compare_token_similarity(code_snippet1, code_snippet2)

        if detect_direct_clone(hash_similarity, ast_similarity, string_similarity, token_similarity):
            print("Direct clone detected!")
        else:
            print("Code snippets are not direct clones.")

    else:
        # choice == "6": Hybrid Near-Miss Detection
        token_similarity = compare_token_similarity(code_snippet1, code_snippet2)
        string_similarity = calculate_similarity_percentage_string(code_snippet1, code_snippet2)

        print(f"Token Similarity Percentage: {token_similarity:.2f}%")
        print(f"String Similarity Percentage: {string_similarity:.2f}%")

        if detect_near_miss(token_similarity, string_similarity):
            print("Near-miss detected! Code snippets are potentially similar.")
        else:
            print("Code snippets are not near-miss duplicates.")


if __name__ == "__main__":
    main()
