<a href="https://colab.research.google.com/github/rumeshsmrr/GEN_AI_BASE_PROGRAMMING_AUTOGRADER/blob/main/approach2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import subprocess
import difflib
from transformers import RobertaTokenizer, RobertaForSequenceClassification
import torch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
model = RobertaForSequenceClassification.from_pretrained("microsoft/codebert-base")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/498 [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
# Step 2: Error Handling Evaluation
def evaluate_error_handling(code_snippet):
    if "try" in code_snippet and "catch" in code_snippet:
        return 100  # Full score for error handling
    return 50  # Partial score (basic error handling or missing try-catch)


# Step 3: Boundary Conditions Evaluation
def evaluate_boundary_conditions(code_snippet):
    if any(boundary_check in code_snippet for boundary_check in ["< 0", "== 0", "> 1000000"]):
        return 100  # Full score for boundary condition handling
    return 50  # Partial score (some conditions missing)


# Step 4: Code Quality Evaluation
def evaluate_code_quality(code_snippet):
    has_comments = "//" in code_snippet or "/*" in code_snippet
    proper_indentation = "    " in code_snippet or "\t" in code_snippet
    meaningful_variable_names = any(var in code_snippet for var in ["int ", "String ", "double "])

    score = 0
    if has_comments:
        score += 40  # Comments are well-formed
    if proper_indentation:
        score += 30  # Indentation is proper
    if meaningful_variable_names:
        score += 30  # Variable names are meaningful
    return score  # Total out of 100


# Step 5: Code Robustness Evaluation
def evaluate_code_robustness(code_snippet):
    if any(condition in code_snippet for condition in ["if", "while", "for", "validate"]):
        return 100  # Full score for robustness (input validation, condition handling)
    return 50  # Partial score (some robustness missing)


# Step 6: OOP Principles Evaluation (Java Specific)
def evaluate_oop_principles(code_snippet):
    has_encapsulation = "private" in code_snippet and ("get" in code_snippet or "set" in code_snippet)
    has_inheritance = "extends" in code_snippet
    has_polymorphism = "interface" in code_snippet or "abstract" in code_snippet

    score = 0
    if has_encapsulation:
        score += 40  # Encapsulation followed
    if has_inheritance:
        score += 30  # Inheritance used
    if has_polymorphism:
        score += 30  # Polymorphism applied
    return score  # Total out of 100


# Step 7: Syntax, Output, and Code Similarity Evaluation
def evaluate_java_code(reference_code, answer_code, input_data=""):
    # 1. Check for syntax errors using CodeBERT
    prompt = f"Evaluate the following Java code for syntax errors and correctness:\n{answer_code}"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=512)

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs.logits
    predicted_label = torch.argmax(logits).item()

    # Placeholder syntax error detection based on model output
    syntax_feedback = "No syntax errors detected." if predicted_label == 1 else "Syntax error found."
    syntax_errors_count = 0 if predicted_label == 1 else 1  # Simplified syntax error counting

    # 2. Run the Java code for both reference and answer codes
    ref_output = run_java_code(reference_code, input_data, "Reference")
    ans_output = run_java_code(answer_code, input_data, "Answer")

    # 3. Compare outputs line by line
    output_match_percentage = compare_outputs(ref_output, ans_output)

    # 4. Calculate code similarity between reference and answer codes
    code_match_percentage = compare_code_similarity(reference_code, answer_code)

    # Return the results
    return {
        "syntax_feedback": syntax_feedback,
        "syntax_errors_count": syntax_errors_count,
        "output_match_percentage": output_match_percentage,
        "code_match_percentage": code_match_percentage
    }

def run_java_code(code, input_data, filename):
    filename = f"{filename}.java"
    with open(filename, "w") as f:
        f.write(code)

    # Compile the Java code
    compile_process = subprocess.run(["javac", filename], capture_output=True, text=True)
    if compile_process.returncode != 0:
        return f"Compilation Error: {compile_process.stderr}"

    # Run the Java program
    run_process = subprocess.run(["java", filename.replace(".java", "")], capture_output=True, text=True, input=input_data)
    if run_process.returncode != 0:
        return f"Runtime Error: {run_process.stderr}"

    return run_process.stdout

def compare_outputs(ref_output, ans_output):
    ref_lines = ref_output.splitlines()
    ans_lines = ans_output.splitlines()
    total_lines = max(len(ref_lines), len(ans_lines))
    matching_lines = sum(1 for ref_line, ans_line in zip(ref_lines, ans_lines) if ref_line == ans_line)

    if total_lines == 0:
        return 100.0 if ref_output == ans_output else 0.0
    return (matching_lines / total_lines) * 100

def compare_code_similarity(ref_code, ans_code):
    vectorizer = TfidfVectorizer().fit_transform([ref_code, ans_code])
    vectors = vectorizer.toarray()
    cosine_sim = cosine_similarity(vectors)[0, 1]
    return cosine_sim * 100


# Step 8: Overall Evaluation Function
def evaluate_code(reference_code, answer_code, input_data=""):
    # Perform Syntax, Output, and Code Similarity evaluation
    syntax_results = evaluate_java_code(reference_code, answer_code, input_data)

    # Evaluate other criteria (Error Handling, Boundary, Code Quality, etc.)
    error_handling_score = evaluate_error_handling(answer_code)
    boundary_conditions_score = evaluate_boundary_conditions(answer_code)
    code_quality_score = evaluate_code_quality(answer_code)
    code_robustness_score = evaluate_code_robustness(answer_code)
    oop_principles_score = evaluate_oop_principles(answer_code)

    # Calculate final weighted score
    final_score = (
        error_handling_score * criteria_weights["error_handling"] +
        boundary_conditions_score * criteria_weights["boundary_conditions"] +
        code_quality_score * criteria_weights["code_quality"] +
        code_robustness_score * criteria_weights["code_robustness"] +
        oop_principles_score * criteria_weights["oop_principles"] +
        syntax_results["syntax_errors_count"] * criteria_weights["syntax_correctness"] +
        syntax_results["output_match_percentage"] * criteria_weights["output_match"] +
        syntax_results["code_match_percentage"] * criteria_weights["code_similarity"]
    )

    return {
        "final_score": round(final_score, 2),
        "detailed_results": syntax_results,
        "error_handling_score": error_handling_score,
        "boundary_conditions_score": boundary_conditions_score,
        "code_quality_score": code_quality_score,
        "code_robustness_score": code_robustness_score,
        "oop_principles_score": oop_principles_score
    }


In [4]:
reference_code = """
public class Reference {
    public static int factorial(int n) {
        if (n < 0) {
            return -1; // Error handling for negative input
        } else if (n == 0) {
            return 1;
        } else {
            return n * factorial(n - 1);
        }
    }

    public static void main(String[] args) {
        System.out.println(factorial(5));
    }
}
"""

answer_code = """
public class Answer {
    public static int factorial(int n) {
        if (n < 0) {
            return 0; // Error handling for negative input
        } else if (n == 0) {
            return 1;
        } else {
            return n * factorial(n - 1);
        }
    }

    public static void main(String[] args) {
        System.out.println(factorial(5));
    }
}
"""


In [15]:
# Step 1: Define the Evaluation Criteria and Weights
criteria_weights = {
    "error_handling":0,     # 15%
    "boundary_conditions": 0.15, # 15%
    "code_quality": 0.15,        # 15%
    "code_robustness": 0.3,     # 15%
    "oop_principles": 0,       # 10%
    "syntax_correctness": 0.1,   # 10%
    "output_match": 0.2,         # 10%
    "code_similarity": 0.1       # 10%
}




In [16]:
import json
evaluation_result = evaluate_code(reference_code, answer_code)

# Convert the result to JSON format
evaluation_result_json = json.dumps(evaluation_result, indent=4)

# Output the JSON result
print(evaluation_result_json)

{
    "final_score": 89.76,
    "detailed_results": {
        "syntax_feedback": "Syntax error found.",
        "syntax_errors_count": 1,
        "output_match_percentage": 100.0,
        "code_match_percentage": 96.59280577416355
    },
    "error_handling_score": 50,
    "boundary_conditions_score": 100,
    "code_quality_score": 100,
    "code_robustness_score": 100,
    "oop_principles_score": 0
}
