## CodeBERT: Evaluates code quality and relevance.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Function to evaluate the code with CodeBERT
def evaluate_with_codebert(code: str):
    """Score *code* with a CodeBERT sequence classifier and print the result.

    The tokenizer and model are loaded from the ``microsoft/codebert-base``
    checkpoint on every call. The snippet is tokenized (truncated to the
    model's maximum length), run through the classifier without gradient
    tracking, and the logits are turned into probabilities with a softmax.

    NOTE(review): ``microsoft/codebert-base`` ships without a fine-tuned
    classification head; Transformers warns that the ``classifier.*`` weights
    are newly initialized. Until the model is fine-tuned on a downstream task
    the reported number should not be read as a real quality measurement.

    Args:
        code: Source code to score.

    Returns:
        The largest class probability as a Python float. The same value is
        printed as ``CodeBERT Quality Score``.
    """
    tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
    model = AutoModelForSequenceClassification.from_pretrained("microsoft/codebert-base")

    inputs = tokenizer(code, return_tensors="pt", padding=True, truncation=True)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = model(**inputs).logits

    # Normalize across the class dimension so each row sums to 1.
    probabilities = torch.softmax(logits, dim=1)
    max_prob = probabilities.max().item()

    print(f"CodeBERT Quality Score: {max_prob}")
    return max_prob

# Sample snippet to score: a recursive factorial implementation, kept as a
# string because the evaluator takes source text rather than a callable.
generated_code = '''
def factorial(num):
    if num == 0:
        return 1
    else:
        return num * factorial(num-1)
'''

# Prints the "CodeBERT Quality Score" line for the sample snippet.
evaluate_with_codebert(generated_code)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CodeBERT Quality Score: 0.5823578238487244


## Pylint: Checks code style and standards.

In [None]:
!pip install pylint

Collecting pylint
  Downloading pylint-3.3.1-py3-none-any.whl.metadata (12 kB)
Collecting astroid<=3.4.0-dev0,>=3.3.4 (from pylint)
  Downloading astroid-3.3.5-py3-none-any.whl.metadata (4.5 kB)
Collecting isort!=5.13.0,<6,>=4.2.5 (from pylint)
  Downloading isort-5.13.2-py3-none-any.whl.metadata (12 kB)
Collecting mccabe<0.8,>=0.6 (from pylint)
  Downloading mccabe-0.7.0-py2.py3-none-any.whl.metadata (5.0 kB)
Collecting tomlkit>=0.10.1 (from pylint)
  Downloading tomlkit-0.13.2-py3-none-any.whl.metadata (2.7 kB)
Collecting dill>=0.2 (from pylint)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Downloading pylint-3.3.1-py3-none-any.whl (521 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.8/521.8 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading astroid-3.3.5-py3-none-any.whl (274 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m274.6/274.6 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.9-py3-non

In [None]:
import pylint.lint
from io import StringIO
import sys

# Function to evaluate code using Pylint
def evaluate_with_pylint(code: str):
    """Lint *code* with Pylint and return its rating line.

    The code is written to ``temp_code.py`` in the current working directory
    (overwriting any existing file) and linted with the missing-docstring
    checks (C0114, C0115, C0116) disabled. Pylint's report is captured rather
    than shown; only the rating line is printed.

    Args:
        code: Python source code to lint.

    Returns:
        The stripped line starting with ``"Your code has been rated at"``,
        or ``None`` if Pylint's output contains no such line.
    """
    # Local import keeps this cell runnable without touching the import cell.
    from contextlib import redirect_stdout

    # Write as UTF-8 so the file matches Python's default source encoding
    # regardless of the platform's locale.
    with open('temp_code.py', 'w', encoding='utf-8') as f:
        f.write(code)

    pylint_opts = ['--disable=C0114', '--disable=C0115', '--disable=C0116']

    # redirect_stdout restores whatever stream was active before, even if
    # Pylint raises. Resetting to sys.__stdout__ by hand would disconnect a
    # notebook's own stdout replacement and lose later print() output.
    pylint_output = StringIO()
    with redirect_stdout(pylint_output):
        pylint.lint.Run(['temp_code.py'] + pylint_opts, exit=False)

    for line in pylint_output.getvalue().splitlines():
        if line.startswith("Your code has been rated at"):
            score_line = line.strip()
            print(score_line)
            return score_line

    # No rating line found (e.g. Pylint produced no score).
    return None


# Sample snippet to lint: the same recursive factorial, passed as source text.
generated_code = '''
def factorial(num):
    if num == 0:
        return 1
    else:
        return num * factorial(num-1)
'''

# Prints Pylint's rating line; the returned string is also the cell's result.
evaluate_with_pylint(generated_code)


'Your code has been rated at 7.50/10 (previous run: 7.50/10, +0.00)'

## Flake8: Enforces style, linting, and complexity.

In [None]:
pip install flake8

In [None]:
import subprocess

def evaluate_with_flake8(code: str):
    """Run Flake8 on *code* and turn the number of findings into a score.

    The code is written to ``temp_code.py`` in the current working directory
    (overwriting any existing file) and checked with the ``flake8``
    executable. Each line of Flake8's standard output counts as one issue.
    The score is ``(1 - issues / lines) * 100``, floored at 0, where
    ``lines`` is the number of lines in *code*. Empty input scores 100.

    Args:
        code: Python source code to check.

    Returns:
        The score as a percentage between 0 and 100.

    Raises:
        RuntimeError: If Flake8 exits with a non-zero status without
            reporting any issue, i.e. the tool itself failed. Previously
            this case was silently reported as a perfect score.
        FileNotFoundError: If the ``flake8`` executable cannot be found.
    """
    # Write as UTF-8 so the file matches Python's default source encoding
    # regardless of the platform's locale.
    with open("temp_code.py", "w", encoding="utf-8") as f:
        f.write(code)

    result = subprocess.run(
        ['flake8', 'temp_code.py'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )

    output = result.stdout.decode('utf-8')

    # Findings go to stdout. A failing exit status with nothing on stdout
    # means Flake8 could not do its job, so do not pretend the code is clean.
    if not output and result.returncode != 0:
        details = result.stderr.decode('utf-8', errors='replace').strip()
        raise RuntimeError(
            f"flake8 failed with exit code {result.returncode}: {details}"
        )

    if output:
        print("Flake8 Issues Found:\n", output)
        num_issues = len(output.splitlines())
    else:
        print("No style issues found with Flake8!")
        num_issues = 0

    total_lines = len(code.splitlines())
    if total_lines == 0:
        score = 100
    else:
        # More issues than lines would go negative; clamp at 0.
        score = max(0, (1 - (num_issues / total_lines)) * 100)

    print(f"Flake8 Score: {score:.2f}%")
    return score

# Sample snippet to check: the same recursive factorial, passed as source text.
generated_code = '''
def factorial(num):
    if num == 0:
        return 1
    else:
        return num * factorial(num-1)
'''

# Prints the Flake8 findings and score; the returned score is the cell's result.
evaluate_with_flake8(generated_code)



100.0

## Success Rate: Measures test case accuracy.

In [1]:

def factorial(num):
    """Return ``num!`` for a non-negative whole number.

    The product is computed iteratively, so large inputs are not limited by
    the interpreter's recursion depth.

    Args:
        num: A non-negative whole number. Integral floats such as ``3.0``
            are accepted and produce a float result.

    Returns:
        The product ``num * (num - 1) * ... * 1``; ``1`` when ``num`` is 0.

    Raises:
        ValueError: If ``num`` is negative or not a whole number. Such inputs
            never reach the ``num == 0`` base case, so the recursive version
            ran until ``RecursionError``.
    """
    if num < 0:
        raise ValueError("factorial() is not defined for negative values")
    if num != int(num):
        raise ValueError("factorial() only accepts whole numbers")

    result = 1
    while num > 0:
        result *= num
        num -= 1
    return result

# (input, expected factorial) pairs used to check factorial()
test_cases = [
    (0, 1),
    (3, 6),
    (6, 720),
    (9, 362880),
    (12, 479001600),
    (15, 1307674368000),
    (18, 6402373705728000),
    (21, 51090942171709440000),
    (25, 15511210043330985984000000),
    (30, 265252859812191058636308480000000)
]


successful_cases = 0  # Number of cases whose result matched the expectation

for n, expected in test_cases:
    result = factorial(n)
    if result != expected:
        # Report the mismatch and move on without counting this case.
        print(f"Test case failed for input {n}: Expected {expected}, got {result}")
        continue
    successful_cases += 1

# Share of passing cases, as a percentage
success_rate = (successful_cases / len(test_cases)) * 100
print(f"Success Rate: {success_rate}%")

Success Rate: 100.0%
