In [1]:
import difflib
import tokenize
from io import BytesIO

def tokenize_code(code):
    # Tokenize the code using Python's tokenize module
    tokens = tokenize.tokenize(BytesIO(code.encode('utf-8')).readline)
    return [token.string for token in tokens if token.type != tokenize.COMMENT]

def calculate_token_similarity(code1, code2):
    tokens1 = tokenize_code(code1)
    tokens2 = tokenize_code(code2)

    # Calculate the similarity using the SequenceMatcher from difflib
    matcher = difflib.SequenceMatcher(None, tokens1, tokens2)
    similarity_ratio = matcher.ratio() * 100  # Convert ratio to percentage

    return similarity_ratio

if __name__ == "__main__":
    # Get file paths from the user
    file_path1 = input("Enter the file path for the first code snippet:\n")
    file_path2 = input("Enter the file path for the second code snippet:\n")

    # Read code from files
    with open(file_path1, 'r') as file:
        code_snippet1 = file.read()

    with open(file_path2, 'r') as file:
        code_snippet2 = file.read()

    similarity_percentage = calculate_token_similarity(code_snippet1, code_snippet2)
    print(f"\nToken-based similarity percentage: {similarity_percentage:.2f}%")


Enter the file path for the first code snippet:
/content/abc1.py
Enter the file path for the second code snippet:
/content/abc4.py

Token-based similarity percentage: 97.06%


In [2]:
import difflib

def remove_comments(code):
    """
    Removes comments from code.
    """
    lines = code.splitlines()
    lines_without_comments = []

    for line in lines:
        # Remove inline comments
        line = line.split('#', 1)[0].strip()
        lines_without_comments.append(line)

    return '\n'.join(lines_without_comments)

def preprocess_code(code):
    """
    Preprocesses code by removing comments and whitespace.
    """
    # Remove comments
    code_without_comments = remove_comments(code)

    # Remove whitespace
    code_without_whitespace = ''.join(code_without_comments.split())

    return code_without_whitespace

def compare_token_similarity(code1, code2):
    """
    Compares two code snippets based on token similarity.
    """
    tokens1 = preprocess_code(code1).split()
    tokens2 = preprocess_code(code2).split()

    # Use difflib's SequenceMatcher to compare token similarity
    matcher = difflib.SequenceMatcher(None, tokens1, tokens2)
    similarity_ratio = matcher.ratio() * 100

    return similarity_ratio

def read_code_from_file(file_path):
    """
    Reads code from a file.
    """
    with open(file_path, 'r') as file:
        code = file.read()
    return code

def detect_duplicate_code():
    """
    Takes two file paths from the user, reads code from files,
    and detects if they are duplicates. Also calculates the token similarity percentage.
    """
    print("Enter the file path for the first code snippet:")
    file_path1 = input()

    print("Enter the file path for the second code snippet:")
    file_path2 = input()

    # Read code from files
    code_snippet1 = read_code_from_file(file_path1)
    code_snippet2 = read_code_from_file(file_path2)

    # Compare token similarity
    token_similarity = compare_token_similarity(code_snippet1, code_snippet2)
    print(f"Token Similarity Percentage: {token_similarity:.2f}%")

    # Display the result
    if token_similarity >= 90:  # Adjust the threshold as needed
        print("High token similarity. Possible duplicate code.")
    else:
        print("Code snippets are different.")

if __name__ == "__main__":
    detect_duplicate_code()


Enter the file path for the first code snippet:
/content/abc1.py
Enter the file path for the second code snippet:
/content/abc4.py
Token Similarity Percentage: 0.00%
Code snippets are different.
