In [12]:
!pip install javalang



In [14]:
import pandas as pd
import math
import re
from statistics import pstdev
from collections import Counter
import javalang
from typing import Dict, Any

class CodeMetricsCalculator:
    """Heuristic, regex-based code metrics for Java source snippets.

    Nothing here parses Java: every metric is obtained by pattern matching on
    the text, so all values are approximations.  Before keywords, braces,
    operators or call sites are counted, comments and string/char literals are
    blanked out so that text inside them cannot influence the result.
    """

    # Columns contributed by one leading tab when measuring indentation.
    _TAB_WIDTH = 8

    # String literal | char literal | line comment | block comment, matched in a
    # single pass so that "//" or "/*" inside a string is not taken for a comment.
    _NOISE_RE = re.compile(
        r'"(?:\\.|[^"\\\n])*"'
        r"|'(?:\\.|[^'\\\n])*'"
        r'|//[^\n]*'
        r'|/\*.*?\*/',
        re.DOTALL,
    )

    # Operator tokens for the Halstead approximation.  Two-character operators
    # come first so "==" is not also counted as two "=" (same for "++", "<=", ...).
    _OPERATOR_RE = re.compile(r'\+\+|--|==|!=|<=|>=|&&|\|\||[=+\-*/%<>!]')

    # Words that are never counted as Halstead operands.
    _NON_OPERAND_WORDS = frozenset([
        'if', 'else', 'for', 'while', 'do', 'switch', 'case', 'default',
        'return', 'class', 'void', 'public', 'private', 'protected', 'static',
        'final', 'int', 'double', 'float', 'boolean', 'char', 'String',
    ])

    # Words that may be followed by "(" without being a method call.
    _NOT_CALLS = frozenset(["if", "for", "while", "switch", "catch", "return", "new"])

    def __init__(self):
        pass

    # ------------------------------------------------------------------ helpers

    @classmethod
    def _strip_noise(cls, code: str) -> str:
        """Return `code` with comments removed and string/char literals emptied.

        Literals are replaced by an empty literal of the same kind (`""` / `''`);
        comments are replaced by a space followed by the newlines they contained,
        so the line structure of the remaining code is preserved.
        """
        def _placeholder(match):
            text = match.group(0)
            if text[0] in '"\'':
                return text[0] * 2
            return ' ' + '\n' * text.count('\n')

        return cls._NOISE_RE.sub(_placeholder, code)

    @staticmethod
    def _parenthesized_after(keyword: str, src: str) -> list:
        """Return the balanced `( ... )` contents following each `keyword (`.

        Parentheses are matched by depth, so nested calls such as
        `if (a.equals(b) && c)` yield the whole condition.  Occurrences whose
        parenthesis is never closed are skipped.
        """
        groups = []
        for match in re.finditer(r'\b' + keyword + r'\s*\(', src):
            depth = 1
            pos = match.end()
            while pos < len(src) and depth:
                ch = src[pos]
                if ch == '(':
                    depth += 1
                elif ch == ')':
                    depth -= 1
                pos += 1
            if depth == 0:
                groups.append(src[match.end():pos - 1])
        return groups

    # ------------------------------------------------------------------ metrics

    def calculate_all_metrics(self, code: str) -> Dict[str, Any]:
        """Compute every metric for one Java snippet.

        Returns a dict with the keys `sloc`, `proxy_indentation`, `mcCabe`,
        `nested_block_depth`, `mcClure`, `mcClure_NVAR`, `mcClure_NCOMP`,
        `difficulty`, `effort`, `maintainability_index`, `readability` and
        `fan_out`.
        """
        metrics = {}

        metrics['sloc'] = self.calculate_sloc(code)
        metrics['proxy_indentation'] = self.calculate_proxy_indentation(code)
        metrics['mcCabe'] = self.calculate_mcCabe(code)
        metrics['nested_block_depth'] = self.calculate_nbd(code)

        mcclure_metrics = self.calculate_mcClure(code)
        metrics['mcClure'] = mcclure_metrics['MCLC']
        metrics['mcClure_NVAR'] = mcclure_metrics['NVAR']
        metrics['mcClure_NCOMP'] = mcclure_metrics['NCOMP']

        halstead_metrics = self.calculate_halstead_metrics(code)
        metrics['difficulty'] = halstead_metrics.get('difficulty', 0)
        metrics['effort'] = halstead_metrics.get('effort', 0)

        # The maintainability index reuses the Halstead volume, McCabe and SLOC
        # values computed above.
        metrics['maintainability_index'] = self.calculate_maintenance_index(
            code, halstead_metrics, metrics['mcCabe'], metrics['sloc']
        )

        metrics['readability'] = self.calculate_readability(code)
        metrics['fan_out'] = self.calculate_fan_out(code)

        return metrics

    def calculate_sloc(self, code: str) -> int:
        """Count source lines: lines that are neither blank nor `//`-only.

        Block comments (`/* ... */`) are deleted first; lines that still carry
        code next to a trailing `//` comment are counted.
        """
        code = re.sub(r"/\*.*?\*/", "", code, flags=re.DOTALL)
        return sum(
            1
            for stripped in (line.strip() for line in code.splitlines())
            if stripped and not stripped.startswith("//")
        )

    def calculate_proxy_indentation(self, code: str) -> float:
        """Population standard deviation of the indentation of code lines.

        Blank lines and `//`-only lines are ignored; block comments are removed
        first.  A leading space counts 1 column and a leading tab 8.  Returns
        0.0 when no code line is left; the result is rounded to 2 decimals.
        """
        code_wo_block = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
        indent_values = []

        for line in code_wo_block.splitlines():
            stripped = line.strip()
            if not stripped or stripped.startswith("//"):
                continue
            # Leading run made only of spaces and tabs.
            leading = line[:len(line) - len(line.lstrip(' \t'))]
            indent_values.append(
                leading.count(' ') + self._TAB_WIDTH * leading.count('\t')
            )

        if not indent_values:
            return 0.0

        return round(pstdev(indent_values), 2)

    def calculate_mcCabe(self, code: str) -> int:
        """Approximate McCabe cyclomatic complexity by counting keywords.

        The result is 1 plus the number of `if`, `for` (classic and for-each),
        `while`, `do`, `case`, `catch`, ternary, `break`, `continue` and `throw`
        occurrences outside comments and string literals.  Each loop is counted
        once: a do-while contributes through its `do` only.
        """
        src = self._strip_noise(code)

        def count(pattern):
            return len(re.findall(pattern, src))

        if_stmts = count(r'\bif\b')
        # `\bfor\b` already covers both classic and for-each loops; the former
        # separate `for\s*\(` count made every loop contribute twice.
        for_stmts = count(r'\bfor\b')
        do_stmts = count(r'\bdo\b')
        # Every `do` is closed by a `while`; remove those so a do-while loop is
        # a single decision point.
        while_stmts = max(0, count(r'\bwhile\b') - do_stmts)
        catch_clauses = count(r'\bcatch\b')
        ternary_exprs = count(r'\?.*?:')
        switch_entries = count(r'\bcase\b')
        break_stmts = count(r'\bbreak\b')
        continue_stmts = count(r'\bcontinue\b')
        throw_stmts = count(r'\bthrow\b')

        return (
            if_stmts + for_stmts + while_stmts + do_stmts +
            switch_entries + catch_clauses + ternary_exprs +
            break_stmts + continue_stmts + throw_stmts + 1
        )

    def calculate_nbd(self, code: str) -> int:
        """Nested block depth: deepest `{ }` nesting minus the outermost level.

        Depth is tracked with braces only (outside comments and literals), so
        sibling blocks do not accumulate.  Control statements written without
        braces are not seen by this approximation.  Returns 0 for code without
        any brace.
        """
        depth = 0
        max_depth = 0

        for ch in self._strip_noise(code):
            if ch == '{':
                depth += 1
                if depth > max_depth:
                    max_depth = depth
            elif ch == '}':
                # Clamp at zero so a stray closing brace cannot go negative.
                depth = max(0, depth - 1)

        return max_depth - 1 if max_depth > 0 else 0

    def calculate_mcClure(self, code: str) -> Dict[str, int]:
        """Approximate McClure complexity.

        Conditions are taken from `if`, `while` (including do-while), classic
        `for` headers, `switch` selectors and the text between `?` and the next
        `:`.  Each condition counts 1 comparison plus one per `&&` / `||`.
        Switch selectors are then replaced by the number of `case` labels.

        Returns a dict with:
            NVAR        -- distinct identifiers (not all-uppercase) in conditions
            NCOMP       -- total number of comparisons
            MCLC        -- NVAR + NCOMP
            max_compare -- largest comparison count of a single condition
        """
        src = self._strip_noise(code)

        switch_selectors = self._parenthesized_after('switch', src)
        conditions = (
            self._parenthesized_after('if', src)
            # The `while` of a do-while is found here too, so no separate
            # do-while pattern is needed (it used to count the condition twice).
            + self._parenthesized_after('while', src)
            # For-each headers contain no ';' and carry no condition.
            + [head for head in self._parenthesized_after('for', src) if ';' in head]
            + re.findall(r'\?(.*?)\:', src, re.DOTALL)
            + switch_selectors
        )

        names = set()
        total_compares = 0
        max_in_one_expression = 0

        for expr in conditions:
            compares = 1 + expr.count("&&") + expr.count("||")
            max_in_one_expression = max(max_in_one_expression, compares)
            total_compares += compares
            names.update(
                token
                for token in re.findall(r'[A-Za-z_]\w*', expr)
                if not token.isupper()
            )

        # A switch is scored by its case labels rather than by its selector.
        num_cases = len(re.findall(r'case\s+[^:]+:', src))
        total_compares = (total_compares - len(switch_selectors)) + num_cases

        NVAR = len(names)
        NCOMP = total_compares
        MCLC = NVAR + NCOMP

        return {
            'NVAR': NVAR,
            'NCOMP': NCOMP,
            'MCLC': MCLC,
            'max_compare': max_in_one_expression
        }

    def calculate_halstead_metrics(self, code: str) -> Dict[str, float]:
        """Approximate Halstead volume, difficulty and effort.

        Operators are the arithmetic/comparison/logical tokens of
        `_OPERATOR_RE`; operands are identifiers (minus a small keyword list),
        numeric literals and string/char literals.  Comments are ignored.  Only
        operators that actually occur count as distinct operators.

        Returns a dict with `difficulty`, `effort` and `volume`, each rounded to
        2 decimals, or all zeros when the snippet has no operator and no
        operand.  If `code` is not a string the error is printed and zeros are
        returned.
        """
        try:
            literals = []

            def _collect(match):
                text = match.group(0)
                if text[0] in '"\'':
                    literals.append(text)
                return ' '

            # Pull literals out as operands and drop comments in one pass.
            src = self._NOISE_RE.sub(_collect, code)

            operators = Counter(self._OPERATOR_RE.findall(src))

            operands = Counter(literals)
            operands.update(
                word
                for word in re.findall(r'\b[a-zA-Z_][a-zA-Z0-9_]*\b', src)
                if word not in self._NON_OPERAND_WORDS
            )
            operands.update(re.findall(r'\b\d+\.?\d*\b', src))

            n1 = len(operators)            # distinct operators
            n2 = len(operands)             # distinct operands
            N1 = sum(operators.values())   # total operators
            N2 = sum(operands.values())    # total operands

            if n1 + n2 == 0:
                return {'difficulty': 0, 'effort': 0, 'volume': 0}

            volume = (N1 + N2) * math.log2(n1 + n2)
            difficulty = (n1 / 2) * (N2 / n2) if n2 > 0 else 0
            effort = difficulty * volume

            return {
                'difficulty': round(difficulty, 2),
                'effort': round(effort, 2),
                'volume': round(volume, 2)
            }

        except (TypeError, ValueError) as e:
            print(f"Error calculating Halstead metrics: {e}")
            return {'difficulty': 0, 'effort': 0, 'volume': 0}

    def calculate_maintenance_index(self, code: str, halstead_metrics: Dict[str, float], mcCabe: int, sloc: int) -> float:
        """Maintainability index: 171 - 5.2*ln(V) - 0.23*G - 16.2*ln(SLOC).

        `V` is `halstead_metrics['volume']` (treated as 1 when missing or not
        positive), `G` is `mcCabe`.  Returns 0 when `sloc` is 0 or when the
        inputs are not numeric; otherwise the value rounded to 2 decimals.
        `code` is accepted for interface compatibility and is not used.
        """
        volume = halstead_metrics.get('volume', 0)
        if sloc == 0:
            return 0

        try:
            value = 171 - (5.2 * math.log(volume if volume > 0 else 1)) \
                    - (0.23 * mcCabe) \
                    - (16.2 * math.log(sloc if sloc > 0 else 1))
            return round(value, 2)
        except (TypeError, ValueError):
            return 0

    def calculate_readability(self, code: str) -> float:
        """Average length of the non-blank lines after stripping whitespace.

        Returns 0.0 when there is no non-blank line; rounded to 2 decimals.
        """
        lengths = [
            len(stripped)
            for stripped in (line.strip() for line in code.splitlines())
            if stripped
        ]
        if not lengths:
            return 0.0

        return round(sum(lengths) / len(lengths), 2)

    def calculate_fan_out(self, code: str) -> int:
        """Count `name(` occurrences that are not control keywords.

        Every occurrence is counted (not distinct callees), outside comments
        and string literals.  The keyword check is done on the bare name, so
        `if (x)` is excluded regardless of spacing.
        """
        names = re.findall(r'\b(\w+)\s*\(', self._strip_noise(code))
        return sum(1 for name in names if name not in self._NOT_CALLS)

# Main processing function
def process_data(csv_path, code_column, verbose=True):
    """Read a CSV and append one column per code metric to every row.

    Parameters
    ----------
    csv_path : path of the CSV file to read with `pd.read_csv`.
    code_column : name of the column holding the Java source of each row.
    verbose : when True (default) print one progress line per row; pass False
        to keep the notebook output short on large files.

    Returns
    -------
    DataFrame with the original columns followed by the metric columns.  Rows
    whose metrics could not be computed keep their data and get NaN metrics.

    Raises
    ------
    KeyError if `code_column` is not a column of the CSV.
    """
    df = pd.read_csv(csv_path)

    # Fail once, with a clear message, instead of on the first row lookup.
    if code_column not in df.columns:
        raise KeyError(f"Column {code_column!r} not found in {csv_path}")

    calculator = CodeMetricsCalculator()
    total_rows = len(df)
    all_metrics = []

    for position, (index, row) in enumerate(df.iterrows(), start=1):
        if verbose:
            print(f"Processing row {position}/{total_rows}")

        # Source text taken from the caller-selected column
        java_code = row[code_column]

        try:
            all_metrics.append(calculator.calculate_all_metrics(java_code))
        except Exception as e:
            # Best effort: report the row and keep going with empty metrics
            print(f"Error processing row {index}: {e}")
            all_metrics.append({})

    # Share df's index so the column-wise concat lines up row for row
    metrics_df = pd.DataFrame(all_metrics, index=df.index)

    return pd.concat([df, metrics_df], axis=1)

In [15]:
import pandas as pd

# Input CSVs with the synthetic samples (absolute Kaggle dataset paths).
# They are read with code_column='synthetic_code' in the usage lines of this cell.
csv_files = [
    "/kaggle/input/synthetic-data/synthetic_data1.csv",
    "/kaggle/input/synthetic-data/synthetic_data3.csv",
    "/kaggle/input/synthetic-data/synthetic_data3_5.csv",
    "/kaggle/input/synthetic-data/synthetic_data4.csv"
]

# Assuming process_data(csv_path, code_column) is already defined
def process_all_files(csv_files, code_column='synthetic_code', output_file="synthetic_data_with_metrics.csv"):
    """Compute metrics for several CSV files and stack the results.

    Each path in `csv_files` is passed to `process_data` together with
    `code_column`.  The per-file frames are concatenated under a fresh index,
    written to `output_file` (without the index), summarised on stdout and
    returned.
    """
    per_file_frames = []
    for csv_path in csv_files:
        print(f"Processing {csv_path} ...")
        per_file_frames.append(process_data(csv_path, code_column))

    # One frame for all files, renumbered 0..n-1
    combined = pd.concat(per_file_frames, ignore_index=True)
    combined.to_csv(output_file, index=False)

    # Summary of the combined frame (its columns include the metric columns)
    report = "\n".join([
        "Processing completed!",
        f"Original columns: {list(combined.columns)}",
        f"Number of rows processed: {len(combined)}",
        "\nSample of the results:",
    ])
    print(report)
    print(combined.head())

    return combined

# Usage: final_df holds the stacked synthetic CSVs plus their metric columns.
# process_all_files also writes that frame to its default output_file.
if __name__ == "__main__":
    final_df = process_all_files(csv_files, code_column='synthetic_code')


Processing /kaggle/input/synthetic-data/synthetic_data1.csv ...
Processing row 1/328
Processing row 2/328
Processing row 3/328
Processing row 4/328
Processing row 5/328
Processing row 6/328
Processing row 7/328
Processing row 8/328
Processing row 9/328
Processing row 10/328
Processing row 11/328
Processing row 12/328
Processing row 13/328
Processing row 14/328
Processing row 15/328
Processing row 16/328
Processing row 17/328
Processing row 18/328
Processing row 19/328
Processing row 20/328
Processing row 21/328
Processing row 22/328
Processing row 23/328
Processing row 24/328
Processing row 25/328
Processing row 26/328
Processing row 27/328
Processing row 28/328
Processing row 29/328
Processing row 30/328
Processing row 31/328
Processing row 32/328
Processing row 33/328
Processing row 34/328
Processing row 35/328
Processing row 36/328
Processing row 37/328
Processing row 38/328
Processing row 39/328
Processing row 40/328
Processing row 41/328
Processing row 42/328
Processing row 43/328

In [16]:
# Save the synthetic frame (with metrics) under a second file name.
# Note: process_all_files already wrote the same frame to its output_file.
final_df.to_csv("final_synthetic_data.csv", index=False)

In [17]:
import pandas as pd

# Train/validation/test CSVs of the scaled dataset (absolute Kaggle paths).
# This rebinds `csv_files`, replacing the synthetic list defined in an earlier cell.
csv_files = [
    "/kaggle/input/dataset-intern/train_scaled.csv",
    "/kaggle/input/dataset-intern/valid_scaled.csv",
    "/kaggle/input/dataset-intern/test_scaled.csv"
]

# Assuming process_data(csv_path, code_column) is already defined
def process_all_files(csv_files, code_column='code', output_file="scaled_data_with_metrics.csv"):
    """Compute metrics for several CSV files and stack the results.

    Same procedure as the earlier definition of this name, which this one
    replaces in the kernel; only the defaults differ (`code` column,
    `scaled_data_with_metrics.csv`).  Returns the concatenated frame after
    writing it to `output_file` and printing a short summary.
    """
    frames = []
    for source_path in csv_files:
        print(f"Processing {source_path} ...")
        # Metrics are computed from the column named by `code_column`
        frames.append(process_data(source_path, code_column))

    stacked = pd.concat(frames, ignore_index=True)
    stacked.to_csv(output_file, index=False)

    column_names = list(stacked.columns)
    row_count = len(stacked)
    print(
        "Processing completed!",
        f"Original columns: {column_names}",
        f"Number of rows processed: {row_count}",
        "\nSample of the results:",
        sep="\n",
    )
    print(stacked.head())

    return stacked

# Usage: final_df1 holds the stacked train/valid/test rows plus metric columns
# computed from the 'code' column.
if __name__ == "__main__":
    final_df1 = process_all_files(csv_files, code_column='code')

Processing /kaggle/input/dataset-intern/train_scaled.csv ...
Processing row 1/2414
Processing row 2/2414
Processing row 3/2414
Processing row 4/2414
Processing row 5/2414
Processing row 6/2414
Processing row 7/2414
Processing row 8/2414
Processing row 9/2414
Processing row 10/2414
Processing row 11/2414
Processing row 12/2414
Processing row 13/2414
Processing row 14/2414
Processing row 15/2414
Processing row 16/2414
Processing row 17/2414
Processing row 18/2414
Processing row 19/2414
Processing row 20/2414
Processing row 21/2414
Processing row 22/2414
Processing row 23/2414
Processing row 24/2414
Processing row 25/2414
Processing row 26/2414
Processing row 27/2414
Processing row 28/2414
Processing row 29/2414
Processing row 30/2414
Processing row 31/2414
Processing row 32/2414
Processing row 33/2414
Processing row 34/2414
Processing row 35/2414
Processing row 36/2414
Processing row 37/2414
Processing row 38/2414
Processing row 39/2414
Processing row 40/2414
Processing row 41/2414
Proce

In [18]:
# Short-named columns that are removed wherever they occur
columns_to_drop = ['lc', 'pi', 'ma', 'nbd', 'ml', 'd', 'mi', 'fo', 'r', 'e']

def drop_short_column_names(df):
    """Return a copy of `df` without the columns listed in `columns_to_drop`.

    Names from the list that `df` does not have are skipped silently; `df`
    itself is left unchanged.
    """
    # errors='ignore' makes drop skip labels that are absent from df
    return df.drop(columns=columns_to_drop, errors='ignore')

# Apply to final_df (the synthetic frame). Only this frame is processed here;
# NOTE(review): final_df1 is not passed through this step — confirm whether it should be.
final_df = drop_short_column_names(final_df)

# Verify the results
print("final_df columns after dropping short names:", list(final_df.columns))

final_df columns after dropping short names: ['orig_code', 'orig_label', 'orig_reason', 'synthetic_id', 'synthetic_code', 'synthetic_label', 'sloc', 'proxy_indentation', 'mcCabe', 'nested_block_depth', 'mcClure', 'mcClure_NVAR', 'mcClure_NCOMP', 'difficulty', 'effort', 'maintainability_index', 'readability', 'fan_out']


In [19]:
# Writes the current final_df to final_data.csv without the index.
# NOTE(review): in this notebook final_df holds the synthetic columns at this point,
# while the final_data.csv loaded later has project_name/label/code columns — confirm
# which frame this file is meant to contain.
final_df.to_csv("final_data.csv", index=False)

In [20]:
# Reload both datasets from the Kaggle input directory.
# This rebinds final_df: from here on it is whatever final_data.csv contains,
# not the frame built by the cells above.
final_df=pd.read_csv("/kaggle/input/dataset/final_data.csv")
final_synthetic_df=pd.read_csv("/kaggle/input/dataset/final_synthetic_data.csv")

In [23]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler

def _add_scaled_columns(df, metrics_columns, scaler, fit, suffix):
    """Scale `metrics_columns` of `df` and return (copy with new columns, scaler).

    `metrics_columns=None` selects every numeric column of `df`.  The scaler is
    fitted on the data when `fit` is true, otherwise only applied.  Each scaled
    column is stored under the original name plus `suffix`; `df` is not
    modified.
    """
    if metrics_columns is None:
        metrics_columns = df.select_dtypes(include=['number']).columns.tolist()

    features = df[metrics_columns]
    scaled = scaler.fit_transform(features) if fit else scaler.transform(features)

    scaled_df = df.copy()
    for i, col in enumerate(metrics_columns):
        scaled_df[col + suffix] = scaled[:, i]

    return scaled_df, scaler


def standardize_metrics(df, metrics_columns=None, fit_scaler=True, scaler=None):
    """
    Standardize metrics using sklearn's StandardScaler.
    Adds new columns with suffix '_std'.

    With `fit_scaler=True` (default) a new StandardScaler is fitted on `df`;
    any `scaler` argument is ignored in that case.  With `fit_scaler=False`
    the given, already fitted `scaler` is applied.

    Returns (DataFrame copy with the '_std' columns, scaler used).
    Raises ValueError if `fit_scaler=False` and no scaler is given.
    """
    if fit_scaler:
        return _add_scaled_columns(df, metrics_columns, StandardScaler(), True, "_std")

    if scaler is None:
        raise ValueError("Scaler must be provided if fit_scaler=False")
    return _add_scaled_columns(df, metrics_columns, scaler, False, "_std")


def normalize_metrics(df, metrics_columns=None, scaler=None):
    """
    Normalize metrics using sklearn's MinMaxScaler (default 0–1).
    Adds new columns with suffix '_norm'.

    Without `scaler` a new MinMaxScaler is fitted on `df`; a given scaler is
    applied as is.  Returns (DataFrame copy with the '_norm' columns, scaler).
    """
    if scaler is None:
        return _add_scaled_columns(df, metrics_columns, MinMaxScaler(), True, "_norm")
    return _add_scaled_columns(df, metrics_columns, scaler, False, "_norm")


def robust_standardize_metrics(df, metrics_columns=None, scaler=None):
    """
    Scale metrics using sklearn's RobustScaler (robust to outliers).
    Adds new columns with suffix '_robust'.

    Without `scaler` a new RobustScaler is fitted on `df`; a given scaler is
    applied as is.  Returns (DataFrame copy with the '_robust' columns, scaler).
    """
    if scaler is None:
        return _add_scaled_columns(df, metrics_columns, RobustScaler(), True, "_robust")
    return _add_scaled_columns(df, metrics_columns, scaler, False, "_robust")


In [21]:
def combine_all_scalings(df, metrics_columns=None):
    """
    Run standardize_metrics, normalize_metrics, and robust_standardize_metrics
    and combine their outputs into a single DataFrame.

    `metrics_columns=None` scales every numeric column of `df`.  Each scaler
    is fitted on `df` itself.

    Returns (combined_df, std_scaler, norm_scaler, robust_scaler) where
    combined_df is `df` followed by the '_std', '_norm' and '_robust' columns
    of the scaled metrics.
    """
    # Resolve the default once so all three scalers and the column selection
    # below work on exactly the same metric list.
    if metrics_columns is None:
        metrics_columns = df.select_dtypes(include=['number']).columns.tolist()

    # StandardScaler
    std_df, std_scaler = standardize_metrics(df, metrics_columns=metrics_columns, fit_scaler=True)

    # MinMaxScaler
    norm_df, norm_scaler = normalize_metrics(df, metrics_columns=metrics_columns)

    # RobustScaler
    robust_df, robust_scaler = robust_standardize_metrics(df, metrics_columns=metrics_columns)

    # Take exactly the columns the scalers created.  Selecting by suffix would
    # also pick up columns of `df` that already end in '_std'/'_norm'/'_robust'
    # and duplicate them in the result.
    std_cols = [col + "_std" for col in metrics_columns]
    norm_cols = [col + "_norm" for col in metrics_columns]
    robust_cols = [col + "_robust" for col in metrics_columns]

    # Combine everything into one DF
    combined_df = pd.concat([df, std_df[std_cols], norm_df[norm_cols], robust_df[robust_cols]], axis=1)

    return combined_df, std_scaler, norm_scaler, robust_scaler


In [24]:
# Each result is the tuple (combined_df, std_scaler, norm_scaler, robust_scaler).
# With no metrics_columns given, every numeric column of the frame is scaled.
obj1 = combine_all_scalings(final_df)
obj2 = combine_all_scalings(final_synthetic_df)

In [25]:
# Keep only the combined frames (element 0 of each tuple); the fitted scalers are not used further.
df_org=obj1[0]
df_syn=obj2[0]

In [26]:
# Inspect the columns of the original dataset after scaling
df_org.columns

Index(['project_name', 'project_version', 'label', 'code', 'code_comment',
       'code_no_comment', 'sloc', 'proxy_indentation', 'mcCabe',
       'nested_block_depth', 'mcClure', 'mcClure_NVAR', 'mcClure_NCOMP',
       'difficulty', 'effort', 'maintainability_index', 'readability',
       'fan_out', 'label_std', 'sloc_std', 'proxy_indentation_std',
       'mcCabe_std', 'nested_block_depth_std', 'mcClure_std',
       'mcClure_NVAR_std', 'mcClure_NCOMP_std', 'difficulty_std', 'effort_std',
       'maintainability_index_std', 'readability_std', 'fan_out_std',
       'label_norm', 'sloc_norm', 'proxy_indentation_norm', 'mcCabe_norm',
       'nested_block_depth_norm', 'mcClure_norm', 'mcClure_NVAR_norm',
       'mcClure_NCOMP_norm', 'difficulty_norm', 'effort_norm',
       'maintainability_index_norm', 'readability_norm', 'fan_out_norm',
       'label_robust', 'sloc_robust', 'proxy_indentation_robust',
       'mcCabe_robust', 'nested_block_depth_robust', 'mcClure_robust',
       'mcClure_N

In [27]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

# Define feature groups: the 12 raw metrics and their three scaled variants
raw_feats = ['sloc','proxy_indentation','mcCabe','nested_block_depth','mcClure',
             'mcClure_NVAR','mcClure_NCOMP','difficulty','effort',
             'maintainability_index','readability','fan_out']

std_feats = [f"{col}_std" for col in raw_feats]
norm_feats = [f"{col}_norm" for col in raw_feats]
robust_feats = [f"{col}_robust" for col in raw_feats]

feature_sets = {
    "raw": raw_feats,
    "std": std_feats,
    "norm": norm_feats,
    "robust": robust_feats
}

# Target
y = df_org['label']

# scale name -> {"accuracy": ..., "f1": ...}
results = {}

# Train and evaluate one XGBoost model per feature set.
# NOTE(review): the scaled columns were produced by scalers fitted on the whole
# frame, i.e. before this train/test split.
for scale, feats in feature_sets.items():
    X = df_org[feats]

    # Train-test split (same seed and stratification for every feature set)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train XGBoost
    model = xgb.XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Metrics (weighted F1 averages the per-class scores by class support)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    results[scale] = {"accuracy": acc, "f1": f1}

# NOTE(review): the recorded output shows identical scores for all four feature
# sets. That is what one would expect from a tree ensemble, since rescaling a
# feature does not change the ordering its splits rely on — confirm.
print("📊 Results Comparison:")
for scale, metrics in results.items():
    print(f"{scale}: Acc={metrics['accuracy']:.4f}, F1={metrics['f1']:.4f}")

📊 Results Comparison:
raw: Acc=0.6532, F1=0.5993
std: Acc=0.6532, F1=0.5993
norm: Acc=0.6532, F1=0.5993
robust: Acc=0.6532, F1=0.5993


In [28]:
# Inspect the columns of the synthetic dataset after scaling
df_syn.columns

Index(['orig_code', 'orig_label', 'orig_reason', 'synthetic_id',
       'synthetic_code', 'synthetic_label', 'sloc', 'proxy_indentation',
       'mcCabe', 'nested_block_depth', 'mcClure', 'mcClure_NVAR',
       'mcClure_NCOMP', 'difficulty', 'effort', 'maintainability_index',
       'readability', 'fan_out', 'orig_label_std', 'synthetic_id_std',
       'sloc_std', 'proxy_indentation_std', 'mcCabe_std',
       'nested_block_depth_std', 'mcClure_std', 'mcClure_NVAR_std',
       'mcClure_NCOMP_std', 'difficulty_std', 'effort_std',
       'maintainability_index_std', 'readability_std', 'fan_out_std',
       'orig_label_norm', 'synthetic_id_norm', 'sloc_norm',
       'proxy_indentation_norm', 'mcCabe_norm', 'nested_block_depth_norm',
       'mcClure_norm', 'mcClure_NVAR_norm', 'mcClure_NCOMP_norm',
       'difficulty_norm', 'effort_norm', 'maintainability_index_norm',
       'readability_norm', 'fan_out_norm', 'orig_label_robust',
       'synthetic_id_robust', 'sloc_robust', 'proxy_indentat

In [29]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder

# Encode labels as consecutive integers starting at 0 (binary here: two classes)
# NOTE(review): the target is orig_label while the features are the metrics
# stored next to synthetic_code — confirm this is the intended label.
le = LabelEncoder()
y = le.fit_transform(df_syn['orig_label'])
print("Classes mapped:", le.classes_)  # e.g. [0 2] → mapped to [0 1]

# Define feature groups: the 12 raw metrics and their three scaled variants
raw_feats = ['sloc','proxy_indentation','mcCabe','nested_block_depth','mcClure',
             'mcClure_NVAR','mcClure_NCOMP','difficulty','effort',
             'maintainability_index','readability','fan_out']

std_feats = [f"{col}_std" for col in raw_feats]
norm_feats = [f"{col}_norm" for col in raw_feats]
robust_feats = [f"{col}_robust" for col in raw_feats]

feature_sets = {
    "raw": raw_feats,
    "std": std_feats,
    "norm": norm_feats,
    "robust": robust_feats
}

# scale name -> {"accuracy": ..., "f1": ...}
results = {}

# Train and evaluate one XGBoost model per feature set
for scale, feats in feature_sets.items():
    X = df_syn[feats]

    # Train-test split (same seed and stratification for every feature set)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Train XGBoost
    # NOTE(review): use_label_encoder is deprecated in recent XGBoost releases —
    # confirm it is still needed for the installed version.
    model = xgb.XGBClassifier(
        n_estimators=200,
        learning_rate=0.1,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        use_label_encoder=False,
        eval_metric="logloss"
    )
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)

    # Metrics (binary F1 scores the class encoded as 1)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="binary")

    results[scale] = {"accuracy": acc, "f1": f1}

# NOTE(review): the recorded output shows identical scores for all four feature
# sets, as one would expect from a tree ensemble on rescaled features — confirm.
print("📊 Results Comparison (Binary Classes):")
for scale, metrics in results.items():
    print(f"{scale}: Acc={metrics['accuracy']:.4f}, F1={metrics['f1']:.4f}")


Classes mapped: [0 2]
📊 Results Comparison (Binary Classes):
raw: Acc=0.7481, F1=0.7571
std: Acc=0.7481, F1=0.7571
norm: Acc=0.7481, F1=0.7571
robust: Acc=0.7481, F1=0.7571


In [None]:
!pip install tabpfn

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from tabpfn import TabPFNClassifier

# Define feature groups: the 12 raw metrics and their three scaled variants
raw_feats = ['sloc','proxy_indentation','mcCabe','nested_block_depth','mcClure',
             'mcClure_NVAR','mcClure_NCOMP','difficulty','effort',
             'maintainability_index','readability','fan_out']

std_feats = [f"{col}_std" for col in raw_feats]
norm_feats = [f"{col}_norm" for col in raw_feats]
robust_feats = [f"{col}_robust" for col in raw_feats]

feature_sets = {
    "raw": raw_feats,
    "std": std_feats,
    "norm": norm_feats,
    "robust": robust_feats
}

# Target: the original dataset lives in df_org. There is no module-level `df`
# in this notebook, so reading `df` here fails with NameError on a fresh kernel.
y = df_org['label'].astype(int)   # make sure label is int

# scale name -> {"accuracy": ..., "f1": ...}
results = {}

# Train and evaluate one TabPFN classifier per feature set
for scale, feats in feature_sets.items():
    X = df_org[feats].to_numpy()

    # Train-test split (same seed and stratification for every feature set)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # TabPFN classifier
    clf = TabPFNClassifier()
    clf.fit(X_train, y_train)

    # Predictions
    y_pred = clf.predict(X_test)

    # Metrics (weighted F1 averages the per-class scores by class support)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="weighted")

    results[scale] = {"accuracy": acc, "f1": f1}

print("📊 TabPFN Results by Scaling:")
for scale, metrics in results.items():
    print(f"{scale}: Acc={metrics['accuracy']:.4f}, F1={metrics['f1']:.4f}")

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
from tabpfn import TabPFNClassifier

# Encode labels as consecutive integers starting at 0 (binary here: two classes)
le = LabelEncoder()
y = le.fit_transform(df_syn['orig_label'])
print("Classes mapped:", le.classes_)  # e.g. [0 2] → mapped to [0 1]
print("Unique y values after encoding:", np.unique(y))

# Define feature groups: the 12 raw metrics and their three scaled variants
raw_feats = ['sloc','proxy_indentation','mcCabe','nested_block_depth','mcClure',
             'mcClure_NVAR','mcClure_NCOMP','difficulty','effort',
             'maintainability_index','readability','fan_out']

std_feats = [f"{col}_std" for col in raw_feats]
norm_feats = [f"{col}_norm" for col in raw_feats]
robust_feats = [f"{col}_robust" for col in raw_feats]

feature_sets = {
    "raw": raw_feats,
    "std": std_feats,
    "norm": norm_feats,
    "robust": robust_feats
}

# scale name -> {"accuracy": ..., "f1": ...}
results = {}

# Train and evaluate one TabPFN classifier per feature set on the synthetic data
for scale, feats in feature_sets.items():
    X = df_syn[feats].to_numpy()

    # Train-test split (same seed and stratification for every feature set)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # TabPFN classifier
    clf = TabPFNClassifier()
    clf.fit(X_train, y_train)

    # Predictions
    y_pred = clf.predict(X_test)

    # Metrics (binary F1 scores the class encoded as 1)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average="binary")

    results[scale] = {"accuracy": acc, "f1": f1}

print("📊 TabPFN Results Comparison (Binary Classes):")
for scale, metrics in results.items():
    print(f"{scale}: Acc={metrics['accuracy']:.4f}, F1={metrics['f1']:.4f}")

In [None]:
# Export the scaled frames. index=False matches every other export in this
# notebook and keeps the row index from being written as an extra unnamed column.
df_org.to_csv("/kaggle/working/df_org.csv", index=False)
df_syn.to_csv("/kaggle/working/df_syn.csv", index=False)