# Java Metrics Analyzer (No Radon)

This notebook reads Java file paths from `labels.csv`, parses the Java code, and calculates:
- Methods per class
- Parameters per method
- Cyclomatic complexity (custom logic)
- Comment density (excluding commented-out code)
- LOC per method
- Number of public methods
- Halstead volume

It saves results in `metrics.csv`.

In [1]:
import pandas as pd
import javalang
import os
import re

def read_java_file(file_path):
    """Return the text of a Java source file, or "" if it cannot be read.

    Args:
        file_path: Path of the file to open; the content is decoded as UTF-8.

    Returns:
        The file contents as a string, or an empty string when the path is
        unusable, the file cannot be opened, or its bytes are not valid UTF-8.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except (OSError, ValueError, TypeError):
        # Best-effort read: OSError covers missing/unreadable files, ValueError
        # covers UnicodeDecodeError and malformed paths, TypeError covers a
        # non-path argument such as None. Anything else (KeyboardInterrupt,
        # SystemExit, ...) is deliberately allowed to propagate.
        return ""

In [2]:
import re
import math

def calculate_halstead_volume(code):
    """Estimate the Halstead volume ``N * log2(n)`` of Java source text.

    The estimate is purely lexical: regular expressions are run over the raw
    text, so tokens inside comments and string literals are counted as well.
    Operands are identifier-like words (which includes Java keywords);
    numeric literals are not counted.

    Args:
        code: Java source code as a string.

    Returns:
        The volume as a float, where ``N`` is the total number of operator and
        operand occurrences and ``n`` the number of distinct ones. Returns 0.0
        when no operator or operand is found.
    """
    operators = (
        '+', '-', '*', '/', '%', '++', '--', '==', '!=', '>', '<', '>=', '<=',
        '&&', '||', '!', '&', '|', '^', '~', '<<', '>>', '>>>', '=', '+=', '-=',
        '*=', '/=', '%=', '&=', '|=', '^=', '<<=', '>>=', '>>>=',
    )

    # Regex alternation takes the first alternative that matches, so longer
    # operators must come first; otherwise '+=' is read as '+' then '=' and
    # '<<=' as '<' then '<='.
    operator_pattern = '|'.join(
        re.escape(op) for op in sorted(operators, key=len, reverse=True)
    )
    operand_pattern = r'\b[_a-zA-Z][_a-zA-Z0-9]*\b'

    operator_matches = re.findall(operator_pattern, code)
    operand_matches = re.findall(operand_pattern, code)

    n1 = len(set(operator_matches))   # distinct operators
    n2 = len(set(operand_matches))    # distinct operands
    N1 = len(operator_matches)        # total operator occurrences
    N2 = len(operand_matches)         # total operand occurrences

    n = n1 + n2
    N = N1 + N2

    if n == 0:
        return 0.0  # Avoid log2(0)

    return N * math.log2(n)


In [3]:
def get_nesting_depth(statements, current_depth=1):
    """Return the deepest nesting level reached inside ``statements``.

    The walk is duck-typed on attribute names rather than on concrete node
    classes:

    * ``statements`` / ``catches`` hold members that live at the *same* depth
      as their owner (a brace-delimited block wrapper, the catch clauses of a
      try statement).
    * ``body`` / ``then_statement`` open one more level whenever the attribute
      exists, even if it is empty.
    * ``block`` / ``finally_block`` / ``cases`` open one more level when they
      are non-empty.
    * ``else_statement`` opens one more level, except for an ``else if``
      (an else branch that itself has ``then_statement``), which is treated
      as a sibling of the owning ``if``.

    NOTE(review): these attribute names follow javalang's tree classes as
    understood here (block statements expose ``.statements``, try statements
    hold ``.block`` as a plain list) - confirm against the installed version.

    Args:
        statements: A statement node, a list of statement nodes, or an
            empty/None value.
        current_depth: Depth assigned to ``statements`` themselves.

    Returns:
        The maximum depth found, never less than ``current_depth``.
    """
    if not statements:
        return current_depth

    if not isinstance(statements, list):
        statements = [statements]

    missing = object()  # distinguishes "attribute absent" from "attribute is None"
    max_depth = current_depth

    def as_list(value):
        return value if isinstance(value, list) else [value]

    for stmt in statements:
        if stmt is None:
            continue

        # Containers whose members sit at the same level as the container.
        for attr in ('statements', 'catches'):
            members = getattr(stmt, attr, None)
            if members:
                max_depth = max(max_depth, get_nesting_depth(as_list(members), current_depth))

        # Loop bodies and if-branches: the mere presence of the attribute
        # opens a level, so an empty body still counts.
        for attr in ('body', 'then_statement'):
            child = getattr(stmt, attr, missing)
            if child is not missing:
                max_depth = max(max_depth, get_nesting_depth(as_list(child), current_depth + 1))

        # Optional nested regions: only counted when actually present.
        for attr in ('block', 'finally_block', 'cases'):
            child = getattr(stmt, attr, None)
            if child:
                max_depth = max(max_depth, get_nesting_depth(as_list(child), current_depth + 1))

        else_branch = getattr(stmt, 'else_statement', None)
        if else_branch is not None:
            # An 'else if' chain is flat, not progressively deeper.
            is_else_if = hasattr(else_branch, 'then_statement')
            branch_depth = current_depth if is_else_if else current_depth + 1
            max_depth = max(max_depth, get_nesting_depth([else_branch], branch_depth))

    return max_depth


def _estimate_cyclomatic_complexity(code):
    """Return 1 plus the number of decision points found lexically in ``code``."""
    # Keywords need word boundaries so that identifiers merely containing
    # them (e.g. 'format', 'notify') are not counted.
    keyword_hits = re.findall(r'\b(?:if|for|while|case|catch)\b', code)
    # Operators must not be wrapped in \b: a word boundary next to '&', '|'
    # or '?' only exists when an identifier character touches the operator,
    # so 'a && b' would never match. Every '?' is counted, which includes
    # generic wildcards as well as ternaries.
    operator_hits = re.findall(r'&&|\|\||\?', code)
    return 1 + len(keyword_hits) + len(operator_hits)


def _comment_density(lines):
    """Return comment lines / (comment lines + non-blank code lines), or 0."""
    comment_lines = 0
    code_lines = 0
    in_block_comment = False

    for line in lines:
        stripped = line.strip()
        if in_block_comment:
            comment_lines += 1
            if '*/' in stripped:
                in_block_comment = False
        elif stripped.startswith('//'):
            comment_lines += 1
        elif '/*' in stripped:
            # Any line containing '/*' is counted as a comment line, even if
            # it also carries code.
            comment_lines += 1
            if '*/' not in stripped:
                in_block_comment = True
        elif stripped:
            code_lines += 1

    total_lines = comment_lines + code_lines
    return comment_lines / total_lines if total_lines > 0 else 0


def calculate_metrics(code):
    """Compute size and complexity metrics for a piece of Java source code.

    Args:
        code: Java source text to parse with javalang.

    Returns:
        A dict with the keys ``methods_per_class``, ``params_per_method``,
        ``cyclomatic_complexity``, ``comment_density``, ``loc_per_method``,
        ``public_methods``, ``nesting_depth`` and ``halstead_volume``.
        If the code cannot be parsed, the defaults are returned unchanged
        (cyclomatic complexity 1, everything else 0).
    """
    metrics = {
        'methods_per_class': 0,
        'params_per_method': 0,
        'cyclomatic_complexity': 1,  # base = 1
        'comment_density': 0,
        'loc_per_method': 0,
        'public_methods': 0,
        'nesting_depth': 0,
        'halstead_volume': 0,
    }

    try:
        tree = javalang.parse.parse(code)
    except Exception:
        # Unparseable input keeps the defaults; KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return metrics

    total_classes = 0
    total_methods = 0
    total_params = 0
    total_locs = 0
    public_method_count = 0

    lines = code.splitlines()

    for _, node in tree:
        if not isinstance(node, javalang.tree.ClassDeclaration):
            continue

        total_classes += 1
        methods = node.methods
        total_methods += len(methods)

        for m in methods:
            total_params += len(m.parameters)

            if 'public' in m.modifiers:
                public_method_count += 1

            if m.body:
                metrics['nesting_depth'] = max(
                    metrics['nesting_depth'], get_nesting_depth(m.body)
                )

            # LOC per method is an estimate: from the declaration line to the
            # starting line of the last top-level statement in the body.
            position = getattr(m, 'position', None)
            method_start = position.line if position else None
            method_end = None
            if m.body and isinstance(m.body, list):
                body_lines = [
                    stmt.position.line
                    for stmt in m.body
                    if getattr(stmt, 'position', None)
                ]
                if body_lines:
                    method_end = max(body_lines)
            if method_start and method_end:
                total_locs += len(lines[method_start - 1:method_end])

    metrics['cyclomatic_complexity'] = _estimate_cyclomatic_complexity(code)
    metrics['comment_density'] = _comment_density(lines)

    # Averages over everything found in the file
    metrics['methods_per_class'] = total_methods / total_classes if total_classes > 0 else 0
    metrics['params_per_method'] = total_params / total_methods if total_methods > 0 else 0
    metrics['loc_per_method'] = total_locs / total_methods if total_methods > 0 else 0
    metrics['public_methods'] = public_method_count
    metrics['halstead_volume'] = calculate_halstead_volume(code)

    return metrics


In [5]:


def normalize(value, min_val, max_val):
    """Rescale ``value`` linearly from [min_val, max_val] onto the 1-10 range.

    The result is clamped to [1.0, 10.0] and rounded to two decimals. When
    the range is degenerate (``min_val == max_val``) the neutral midpoint
    5.5 is returned.
    """
    if max_val != min_val:
        fraction = (value - min_val) / (max_val - min_val)
        scaled = 1 + 9 * fraction
        capped = min(10.0, scaled)
        floored = max(1.0, capped)
        return round(floored, 2)

    # No variation in the data: every value gets the neutral score.
    return 5.5

# Step 1: Read the generated code (columns used: 'Example', 'Generated Java Code')
INPUT_CSV = "Ollama_output.csv"
OUTPUT_CSV = "reports/llma3.2_Matrics.csv"

df = pd.read_csv(INPUT_CSV)

# Step 2: Calculate metrics for every row that actually contains code
metrics_list = []

for _, row in df.iterrows():
    example = row['Example']
    code = row['Generated Java Code']

    # Skip if code is empty or only whitespace or NaN
    if pd.isna(code) or not str(code).strip():
        print(f"⚠️ Skipping Example {example} (no Java code)")
        continue

    metrics = calculate_metrics(code)

    # Put the example identifier first, followed by its metrics
    metrics_with_example = {'Example': example}
    metrics_with_example.update(metrics)

    metrics_list.append(metrics_with_example)

# Step 3: Create DataFrame
metrics_df = pd.DataFrame(metrics_list)
if metrics_df.empty:
    # Every row was skipped: keep the 'Example' column so the column
    # selection below does not raise KeyError.
    print("⚠️ No rows with Java code were found; writing an empty report")
    metrics_df = pd.DataFrame(columns=['Example'])

# Step 4: Normalize each metric (except Example) to the 1-10 scale
for column in metrics_df.columns:
    if column == 'Example':
        continue
    min_val = metrics_df[column].min()
    max_val = metrics_df[column].max()
    # Bind the bounds as defaults so the lambda does not depend on loop state
    metrics_df[column] = metrics_df[column].apply(
        lambda x, lo=min_val, hi=max_val: normalize(x, lo, hi)
    )

# Step 5: Ensure 'Example' is first column
columns_order = ['Example'] + [col for col in metrics_df.columns if col != 'Example']
metrics_df = metrics_df[columns_order]

# Step 6: Save the normalized metrics, creating the target folder if needed
os.makedirs(os.path.dirname(OUTPUT_CSV) or '.', exist_ok=True)
metrics_df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Normalized metrics saved to '{OUTPUT_CSV}'")


✅ Normalized metrics saved to 'output.csv'


In [20]:
import pandas as pd
import re

def clean_java_code(code, collapse_doubled_quotes=False):
    """Strip Markdown code fences from a generated Java snippet.

    Args:
        code: The raw cell value; NaN/None is treated as "no code".
        collapse_doubled_quotes: When True, also replace ``""`` with ``"``
            and ``''`` with ``'`` (the previous unconditional behaviour).
            Off by default because it turns a Java empty-string literal
            such as ``String s = "";`` into invalid code.

    Returns:
        The snippet without a leading fence (three backticks, optionally
        followed by ``java`` in any letter case), without a trailing fence,
        and without surrounding whitespace. Returns '' for missing input.
    """
    if pd.isna(code):
        return ''

    code = str(code).strip()

    # Opening fence: bare, or tagged 'java' / 'Java' / 'JAVA'
    code = re.sub(r'^```(?:java)?', '', code, count=1, flags=re.IGNORECASE).strip()

    # Closing fence
    if code.endswith('```'):
        code = code[:-3].strip()

    if collapse_doubled_quotes:
        code = code.replace('""', '"').replace("''", "'")

    return code

# Example usage: clean the fenced code in the Gemini output and save a copy
CLEANED_CSV = 'cleinput.csv'

df = pd.read_csv('Gemini_output200.csv')

# Apply the cleaning function
df['Generated Java Code'] = df['Generated Java Code'].apply(clean_java_code)

# Optionally save the cleaned file to check it
df.to_csv(CLEANED_CSV, index=False)

# Report the path that was actually written
print(f"✅ Cleaned Java code saved to '{CLEANED_CSV}'")


✅ Cleaned Java code saved to 'cleaned_input.csv'
