In [1]:
# ! pip install -r requirements.txt

In [2]:
import logging
import re

logging.basicConfig(format='%(asctime)s : %(name)s : %(levelname)s : %(message)s')
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

def read_code(file_path, encoding='utf-8'):
    """Read a source file and return its entire contents as one string.

    Args:
        file_path: Path of the file to read.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the result does not depend on the platform's locale encoding.

    Returns:
        The full text of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file contents are not valid in ``encoding``.
    """
    logger.info(f'Reading code from {file_path}')
    # An explicit encoding keeps the read reproducible across machines.
    with open(file_path, 'r', encoding=encoding) as f:
        return f.read()

def tokenize(content):
    """Split text into a list of word tokens.

    A token is a maximal run of regex word characters delimited by word
    boundaries; punctuation and whitespace are discarded.

    Args:
        content: The text to tokenize.

    Returns:
        The tokens in the order they appear in ``content``.
    """
    word_pattern = re.compile(r'\b\w+\b')
    return [match.group(0) for match in word_pattern.finditer(content)]

In [3]:
# Location of the analysed file, relative to each variation's root directory.
common_file_path = 'src/main/java/com/example/glue.java'

# Provenance: variations 1 and 3 were generated by the same GPT,
# variation 2 was generated by a different GPT.
first_variation_path, second_variation_path, third_variation_path = (
    f'./sample_code/variation1/{common_file_path}',
    f'./sample_code/variation2/{common_file_path}',
    f'./sample_code/variation3/{common_file_path}',
)

# Load each variation's source text; read_code logs the path it reads.
variation1 = read_code(first_variation_path)
variation2 = read_code(second_variation_path)
variation3 = read_code(third_variation_path)

2024-05-24 23:26:58,838 : __main__ : INFO : Reading code from ./sample_code/variation1/src/main/java/com/example/glue.java
2024-05-24 23:26:58,841 : __main__ : INFO : Reading code from ./sample_code/variation2/src/main/java/com/example/glue.java
2024-05-24 23:26:58,847 : __main__ : INFO : Reading code from ./sample_code/variation3/src/main/java/com/example/glue.java


### Line count metric
Count the number of non-blank lines in each implementation.

In [4]:
def count_lines(file_path, encoding='utf-8'):
    """Count the non-blank lines in a text file.

    Lines that are empty or contain only whitespace are not counted.

    Args:
        file_path: Path of the file to inspect.
        encoding: Text encoding used to decode the file. Defaults to UTF-8 so
            the count does not depend on the platform's locale encoding.

    Returns:
        The number of lines with at least one non-whitespace character.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file contents are not valid in ``encoding``.
    """
    with open(file_path, 'r', encoding=encoding) as file:
        # line.strip() is empty (falsy) for blank and whitespace-only lines.
        return sum(1 for line in file if line.strip())

# Line totals per variation; count_lines skips blank and whitespace-only lines.
implementation1 = count_lines(first_variation_path)
implementation2 = count_lines(second_variation_path)
implementation3 = count_lines(third_variation_path)

# Log the three totals only after all of them have been computed.
logger.info(f'Lines in first implementation: {implementation1}')
logger.info(f'Lines in second implementation: {implementation2}')
logger.info(f'Lines in third implementation: {implementation3}')

2024-05-24 23:27:03,225 : __main__ : INFO : Lines in first implementation: 74
2024-05-24 23:27:03,225 : __main__ : INFO : Lines in second implementation: 61
2024-05-24 23:27:03,225 : __main__ : INFO : Lines in third implementation: 110


### Cyclomatic complexity

Cyclomatic complexity measures the number of linearly independent paths through a program module. Programs with lower cyclomatic complexity are easier to understand and less risky to modify.

In [8]:
import lizard
import json

In [9]:
# Analyse variation 1 with lizard.
analyze_implementation1 = lizard.analyze_file(first_variation_path)
# Only function_list[0] is dumped; any other functions lizard reports are not shown.
logger.info(json.dumps(analyze_implementation1.function_list[0].__dict__, indent=4))

2024-05-24 23:32:27,707 : __main__ : INFO : {
    "cyclomatic_complexity": 1,
    "nloc": 38,
    "token_count": 354,
    "name": "GlueJob::main",
    "long_name": "GlueJob::main( String [ ] args)",
    "start_line": 30,
    "end_line": 88,
    "full_parameters": [
        "String [ ] args"
    ],
    "filename": "./sample_code/variation1/src/main/java/com/example/glue.java",
    "top_nesting_level": 1,
    "fan_in": 0,
    "fan_out": 0,
    "general_fan_out": 0
}


In [10]:
# Analyse variation 2 with lizard.
analyze_implementation2 = lizard.analyze_file(second_variation_path)
# Only function_list[0] is dumped; any other functions lizard reports are not shown.
logger.info(json.dumps(analyze_implementation2.function_list[0].__dict__, indent=4))

2024-05-24 23:32:28,771 : __main__ : INFO : {
    "cyclomatic_complexity": 2,
    "nloc": 36,
    "token_count": 259,
    "name": "GlueETLJob::main",
    "long_name": "GlueETLJob::main( String [ ] args)",
    "start_line": 18,
    "end_line": 67,
    "full_parameters": [
        "String [ ] args"
    ],
    "filename": "./sample_code/variation2/src/main/java/com/example/glue.java",
    "top_nesting_level": 1,
    "fan_in": 0,
    "fan_out": 0,
    "general_fan_out": 0
}


In [11]:
# Analyse variation 3 with lizard.
analyze_implementation3 = lizard.analyze_file(third_variation_path)
# Only function_list[0] is dumped; any other functions lizard reports are not shown.
logger.info(json.dumps(analyze_implementation3.function_list[0].__dict__, indent=4))

2024-05-24 23:32:29,670 : __main__ : INFO : {
    "cyclomatic_complexity": 1,
    "nloc": 22,
    "token_count": 147,
    "name": "GlueJobKCL::main",
    "long_name": "GlueJobKCL::main( String [ ] args)",
    "start_line": 31,
    "end_line": 62,
    "full_parameters": [
        "String [ ] args"
    ],
    "filename": "./sample_code/variation3/src/main/java/com/example/glue.java",
    "top_nesting_level": 1,
    "fan_in": 0,
    "fan_out": 0,
    "general_fan_out": 0
}


### Cosine similarity using `CountVectorizer`

In [5]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [6]:
def cosine_sim_count(first_code, second_code):
    """Cosine similarity of two code strings based on raw term counts.

    Each input is tokenized and re-joined with single spaces, then the pair is
    vectorized with a default ``CountVectorizer`` fitted on just these two
    documents.

    Args:
        first_code: Text of the first source file.
        second_code: Text of the second source file.

    Returns:
        The cosine similarity between the two count vectors.
    """
    logger.info('Calculating cosine similarity using Count Vectorizer')

    logger.info('Tokenizing first code')
    first_document = ' '.join(tokenize(first_code))
    logger.info('Tokenizing second code')
    second_document = ' '.join(tokenize(second_code))

    term_counts = CountVectorizer().fit_transform([first_document, second_document])
    pairwise = cosine_similarity(term_counts.toarray())

    # Entry [0, 1] of the pairwise matrix compares the first document with the second.
    similarity = pairwise[0, 1]
    logger.info(f'Cosine similarity: {similarity}')
    return similarity

In [7]:
# Count-vectorizer cosine similarity: variation 1 vs. variation 2.
cosine_sim_count_score_12 = cosine_sim_count(variation1, variation2)

2024-05-24 23:16:33,305 : __main__ : INFO : Calculating cosine similarity using Count Vectorizer
2024-05-24 23:16:33,306 : __main__ : INFO : Tokenizing first code
2024-05-24 23:16:33,307 : __main__ : INFO : Tokenizing second code
2024-05-24 23:16:33,310 : __main__ : INFO : Cosine similarity: 0.8512908068064506


In [8]:
# Count-vectorizer cosine similarity: variation 1 vs. variation 3.
cosine_sim_count_score_13 = cosine_sim_count(variation1, variation3)

2024-05-24 23:16:39,605 : __main__ : INFO : Calculating cosine similarity using Count Vectorizer
2024-05-24 23:16:39,606 : __main__ : INFO : Tokenizing first code
2024-05-24 23:16:39,606 : __main__ : INFO : Tokenizing second code
2024-05-24 23:16:39,608 : __main__ : INFO : Cosine similarity: 0.8724804327537654


In [9]:
# Count-vectorizer cosine similarity: variation 2 vs. variation 3.
cosine_sim_count_score_23 = cosine_sim_count(variation2, variation3)

2024-05-24 23:16:41,642 : __main__ : INFO : Calculating cosine similarity using Count Vectorizer
2024-05-24 23:16:41,643 : __main__ : INFO : Tokenizing first code
2024-05-24 23:16:41,643 : __main__ : INFO : Tokenizing second code
2024-05-24 23:16:41,645 : __main__ : INFO : Cosine similarity: 0.8168865148143062


### Cosine similarity using `TfidfVectorizer`

In [10]:
def cosine_sim_tfidf(first_code, second_code):
    """Cosine similarity of two code strings based on TF-IDF weights.

    Each input is tokenized and re-joined with single spaces, then the pair is
    vectorized with a default ``TfidfVectorizer`` fitted on just these two
    documents.

    Args:
        first_code: Text of the first source file.
        second_code: Text of the second source file.

    Returns:
        The cosine similarity between the two TF-IDF vectors.
    """
    logger.info('Calculating cosine similarity using Tfidf Vectorizer')

    logger.info('Tokenizing first code')
    first_document = ' '.join(tokenize(first_code))
    logger.info('Tokenizing second code')
    second_document = ' '.join(tokenize(second_code))

    tfidf_weights = TfidfVectorizer().fit_transform([first_document, second_document])
    pairwise = cosine_similarity(tfidf_weights.toarray())

    # Entry [0, 1] of the pairwise matrix compares the first document with the second.
    similarity = pairwise[0, 1]
    logger.info(f'Cosine similarity: {similarity}')
    return similarity

In [11]:
# TF-IDF cosine similarity: variation 1 vs. variation 2.
cosine_sim_tfidf_score_12 = cosine_sim_tfidf(variation1, variation2)

2024-05-24 23:16:47,144 : __main__ : INFO : Calculating cosine similarity using Tfidf Vectorizer
2024-05-24 23:16:47,144 : __main__ : INFO : Tokenizing first code
2024-05-24 23:16:47,145 : __main__ : INFO : Tokenizing second code
2024-05-24 23:16:47,147 : __main__ : INFO : Cosine similarity: 0.7674965341545423


In [12]:
# TF-IDF cosine similarity: variation 1 vs. variation 3.
cosine_sim_tfidf_score_13 = cosine_sim_tfidf(variation1, variation3)

2024-05-24 23:16:49,481 : __main__ : INFO : Calculating cosine similarity using Tfidf Vectorizer
2024-05-24 23:16:49,482 : __main__ : INFO : Tokenizing first code
2024-05-24 23:16:49,482 : __main__ : INFO : Tokenizing second code
2024-05-24 23:16:49,484 : __main__ : INFO : Cosine similarity: 0.8163420018711519


In [13]:
# TF-IDF cosine similarity: variation 2 vs. variation 3.
cosine_sim_tfidf_score_23 = cosine_sim_tfidf(variation2, variation3)

2024-05-24 23:16:50,906 : __main__ : INFO : Calculating cosine similarity using Tfidf Vectorizer
2024-05-24 23:16:50,907 : __main__ : INFO : Tokenizing first code
2024-05-24 23:16:50,907 : __main__ : INFO : Tokenizing second code
2024-05-24 23:16:50,909 : __main__ : INFO : Cosine similarity: 0.739502588116892


### Levenshtein Distance

In [14]:
import Levenshtein

def levenshtein_distance(first_code, second_code):
    """Levenshtein distance between the token streams of two code strings.

    Each input is tokenized and re-joined with single spaces, and the two
    resulting strings are compared. The Levenshtein ratio of the same pair is
    logged for reference but is not returned.

    Args:
        first_code: Text of the first source file.
        second_code: Text of the second source file.

    Returns:
        The value of ``Levenshtein.distance`` for the two joined token strings.
    """
    logger.info('Calculating Levenshtein distance')

    logger.info('Tokenizing first code')
    first_text = ' '.join(tokenize(first_code))
    logger.info('Tokenizing second code')
    second_text = ' '.join(tokenize(second_code))

    distance = Levenshtein.distance(first_text, second_text)
    logger.info(f'Levenshtein distance: {distance}')

    # The ratio is informational only; callers receive just the distance.
    ratio = Levenshtein.ratio(first_text, second_text)
    logger.info(f'Levenshtein ratio: {ratio}')

    return distance

In [15]:
# Levenshtein distance: variation 1 vs. variation 2.
levenshtein_distance_score_12 = levenshtein_distance(variation1, variation2)

2024-05-24 23:16:56,834 : __main__ : INFO : Calculating Levenshtein distance
2024-05-24 23:16:56,835 : __main__ : INFO : Tokenizing first code
2024-05-24 23:16:56,836 : __main__ : INFO : Tokenizing second code
2024-05-24 23:16:56,837 : __main__ : INFO : Levenshtein distance: 1841
2024-05-24 23:16:56,838 : __main__ : INFO : Levenshtein ratio: 0.5758945386064029


In [16]:
# Levenshtein distance: variation 1 vs. variation 3.
levenshtein_distance_score_13 = levenshtein_distance(variation1, variation3)

2024-05-24 23:17:00,392 : __main__ : INFO : Calculating Levenshtein distance
2024-05-24 23:17:00,393 : __main__ : INFO : Tokenizing first code
2024-05-24 23:17:00,394 : __main__ : INFO : Tokenizing second code
2024-05-24 23:17:00,395 : __main__ : INFO : Levenshtein distance: 1606
2024-05-24 23:17:00,395 : __main__ : INFO : Levenshtein ratio: 0.7330156569094622


In [17]:
# Levenshtein distance: variation 2 vs. variation 3.
levenshtein_distance_score_23 = levenshtein_distance(variation2, variation3)

2024-05-24 23:17:09,681 : __main__ : INFO : Calculating Levenshtein distance
2024-05-24 23:17:09,682 : __main__ : INFO : Tokenizing first code
2024-05-24 23:17:09,682 : __main__ : INFO : Tokenizing second code
2024-05-24 23:17:09,683 : __main__ : INFO : Levenshtein distance: 2719
2024-05-24 23:17:09,684 : __main__ : INFO : Levenshtein ratio: 0.5058511468247777


In [13]:
def jaccard_similarity(code1, code2):
    """Jaccard similarity of the whitespace-separated tokens of two strings.

    Each string is split on whitespace and the unique tokens are compared as
    sets: ``|A & B| / |A | B|``.

    Args:
        code1: Text of the first source file.
        code2: Text of the second source file.

    Returns:
        A float between 0.0 and 1.0. When neither input contains any token,
        the two (empty) token sets are treated as identical and 1.0 is returned.
    """
    set1, set2 = set(code1.split()), set(code2.split())
    union = set1 | set2
    if not union:
        # Both token sets are empty; avoid dividing by zero.
        return 1.0
    return len(set1 & set2) / len(union)

In [14]:
# Jaccard similarity of whitespace-separated tokens: variation 1 vs. variation 2.
jaccard_sim = jaccard_similarity(variation1, variation2)
jaccard_sim  # bare expression so the notebook displays the value

0.24242424242424243