In [25]:
import os
import pandas as pd
import javalang
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [26]:
def read_java_file(filepath):
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()

def clean_code(code):
    import re
    code = re.sub(r'//.*?\n', '\n', code)
    code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)
    code = re.sub(r'\s+', ' ', code).strip()
    return code

def tokenize_code(code):
    try:
        tokens = list(javalang.tokenizer.tokenize(code))
        return ' '.join(token.value for token in tokens)
    except:
        return ''

def preprocess_file(filepath):
    raw = read_java_file(filepath)
    cleaned = clean_code(raw)
    tokens = tokenize_code(cleaned)
    return tokens

In [27]:
def process_plag_dataset(base_path, csv_path, output_csv='similitud_todo.csv'):
    df = pd.read_csv(csv_path)
    df = df[df['source_dataset'].isin(['ir_plag', 'conplag'])]

    resultados = []

    for _, row in df.iterrows():
        dataset = row['source_dataset']
        plagio = row['label']
        file1 = row['file1']
        file2 = row['file2']
        file1_base = os.path.splitext(file1)[0]
        file2_base = os.path.splitext(file2)[0]

        if dataset == 'ir_plag':
            folder = row['folder_name']
            folder_path = os.path.join(base_path, folder)
            path1 = os.path.join(folder_path, 'original.java')
            path2 = os.path.join(folder_path, 'compared.java')

        elif dataset == 'conplag':
            # folder_name not reliable – construct both variants
            folder1 = os.path.join(base_path, f"{file1_base}_{file2_base}")
            folder2 = os.path.join(base_path, f"{file2_base}_{file1_base}")

            if os.path.isdir(folder1):
                folder_path = folder1
            elif os.path.isdir(folder2):
                folder_path = folder2
            else:
                print(f"[❌] Carpeta no encontrada para {file1} y {file2}")
                continue

            path1 = os.path.join(folder_path, file1)
            path2 = os.path.join(folder_path, file2)

        else:
            continue

        if not (os.path.exists(path1) and os.path.exists(path2)):
            print(f"[⚠️] Archivos no encontrados: {path1}, {path2}")
            continue

        code1 = preprocess_file(path1)
        code2 = preprocess_file(path2)

        if not code1 or not code2:
            print(f"[⚠️] Código vacío: {folder_path}")
            continue

        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform([code1, code2])
        sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]

        resultados.append({
            'folder': os.path.basename(folder_path),
            'file1': os.path.basename(path1),
            'file2': os.path.basename(path2),
            'similaridad': sim,
            'es_plagio': plagio,
            'dataset': dataset
        })

    pd.DataFrame(resultados).to_csv(output_csv, index=False)
    # Print head of the DataFrame for verification
    df_resultados = pd.DataFrame(resultados)
    print(df_resultados.head())
    print(f"\n✅ Resultados guardados en {output_csv}")

In [None]:
def normalize_code_ast(code):
    try:
        tree = javalang.parse.parse(code)
        # Normalizar nombres de variables a VAR1, VAR2, etc.
        var_counter = 0
        var_map = {}

        # Recorrer el AST y normalizar
        for path, node in tree.filter(javalang.tree.VariableDeclarator):
            if node.name not in var_map:
                var_map[node.name] = f"VAR{var_counter}"
                var_counter += 1

        # Similar para métodos, clases, etc.
        return normalized_code
    except:
        return code

In [28]:
if __name__ == '__main__':
    BASE_PATH = 'data/splits/train'
    CSV_PATH = 'data/splits/train.csv'
    process_plag_dataset(BASE_PATH, CSV_PATH, output_csv='similitud_todo.csv')

              folder          file1          file2  similaridad  es_plagio  \
0  c57a973e_fa484fdd  c57a973e.java  fa484fdd.java     0.477170          1   
1  44428e63_c850e422  44428e63.java  c850e422.java     0.403690          0   
2  2ff0355e_83935617  2ff0355e.java  83935617.java     0.430788          0   
3  0017d438_9852706b  0017d438.java  9852706b.java     0.755271          1   
4  bdfe8110_c57a973e  bdfe8110.java  c57a973e.java     0.803951          1   

   dataset  
0  conplag  
1  conplag  
2  conplag  
3  conplag  
4  conplag  

✅ Resultados guardados en similitud_todo.csv


In [31]:
import os
import pandas as pd
import javalang
import re
import numpy as np
from collections import Counter, defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from typing import List, Dict, Tuple, Any

class OptimizedJavaAnalyzer:
    """
    Analizador optimizado que combina TF-IDF + AST de manera inteligente
    """

    def __init__(self):
        self.java_keywords = {
            'abstract', 'assert', 'boolean', 'break', 'byte', 'case', 'catch',
            'char', 'class', 'const', 'continue', 'default', 'do', 'double',
            'else', 'enum', 'extends', 'final', 'finally', 'float', 'for',
            'goto', 'if', 'implements', 'import', 'instanceof', 'int',
            'interface', 'long', 'native', 'new', 'package', 'private',
            'protected', 'public', 'return', 'short', 'static', 'strictfp',
            'super', 'switch', 'synchronized', 'this', 'throw', 'throws',
            'transient', 'try', 'void', 'volatile', 'while'
        }

    def read_java_file(self, filepath: str) -> str:
        try:
            with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        except Exception as e:
            print(f"❌ Error leyendo {filepath}: {e}")
            return ""

    def clean_code(self, code: str) -> str:
        if not code:
            return ""

        # Eliminar comentarios
        code = re.sub(r'//.*?\n', '\n', code)
        code = re.sub(r'/\*.*?\*/', '', code, flags=re.DOTALL)

        # Normalizar espacios pero mantener estructura
        code = re.sub(r'[ \t]+', ' ', code)
        code = re.sub(r'\n\s*\n', '\n', code)

        return code.strip()

    def tokenize_code_advanced(self, code: str) -> Tuple[str, bool]:
        """Tokenización avanzada con mejor normalización"""
        if not code:
            return "", False

        try:
            tokens = list(javalang.tokenizer.tokenize(code))
            normalized_tokens = []

            # Mapeos para normalización consistente
            identifier_map = {}
            var_counter = 1
            method_counter = 1
            class_counter = 1

            for i, token in enumerate(tokens):
                token_value = token.value
                token_type = type(token).__name__

                # Preservar palabras clave de Java
                if token_value.lower() in self.java_keywords:
                    normalized_tokens.append(token_value.lower())

                # Normalizar identificadores por contexto
                elif token_type == 'Identifier':
                    if token_value not in identifier_map:
                        # Determinar contexto más robusto
                        prev_token = tokens[i-1].value if i > 0 else ""
                        next_token = tokens[i+1].value if i < len(tokens)-1 else ""

                        if prev_token in ['class', 'interface', 'enum']:
                            identifier_map[token_value] = f'CLASS{class_counter}'
                            class_counter += 1
                        elif next_token == '(' or (i < len(tokens)-2 and tokens[i+1].value == ' ' and tokens[i+2].value == '('):
                            identifier_map[token_value] = f'METHOD{method_counter}'
                            method_counter += 1
                        else:
                            identifier_map[token_value] = f'VAR{var_counter}'
                            var_counter += 1

                    normalized_tokens.append(identifier_map[token_value])

                # Preservar estructura importante
                elif token_type in ['Operator', 'Separator']:
                    # Agrupar operadores similares
                    if token_value in ['==', '!=', '<', '>', '<=', '>=']:
                        normalized_tokens.append('COMPARISON')
                    elif token_value in ['+', '-', '*', '/', '%']:
                        normalized_tokens.append('ARITHMETIC')
                    elif token_value in ['&&', '||', '!']:
                        normalized_tokens.append('LOGICAL')
                    else:
                        normalized_tokens.append(token_value)

                # Normalizar literales
                elif token_type in ['Integer', 'FloatingPoint']:
                    normalized_tokens.append('NUMBER')
                elif token_type in ['String', 'Character']:
                    normalized_tokens.append('STRING')
                elif token_type == 'Boolean':
                    normalized_tokens.append('BOOLEAN')
                else:
                    normalized_tokens.append(token_value)

            return ' '.join(normalized_tokens), True

        except Exception as e:
            print(f"⚠️ Error en tokenización: {e}")
            # Fallback simple
            words = re.findall(r'\b\w+\b|[+\-*/=<>!&|{}()\[\];,.]', code)
            return ' '.join(words), False

    def extract_ast_features(self, code: str) -> Dict[str, Any]:
        """Extrae características AST corregidas"""
        features = {
            'ast_success': False,
            'num_classes': 0,
            'num_methods': 0,
            'num_fields': 0,
            'num_statements': 0,
            'num_loops': 0,
            'num_conditionals': 0,
            'num_method_calls': 0,
            'num_assignments': 0,
            'max_depth': 0,
            'total_lines': len([l for l in code.split('\n') if l.strip()]),
            'cyclomatic_complexity': 1
        }

        try:
            tree = javalang.parse.parse(code)
            features['ast_success'] = True
            self._count_ast_nodes(tree, features, 0)
        except Exception as e:
            # Fallback léxico
            features.update(self._lexical_fallback_analysis(code))

        return features

    def _count_ast_nodes(self, node, features: Dict, depth: int):
        """Cuenta nodos del AST de manera robusta"""
        if node is None:
            return

        features['max_depth'] = max(features['max_depth'], depth)
        node_type = type(node).__name__

        # Contar nodos específicos
        if node_type == 'ClassDeclaration':
            features['num_classes'] += 1
        elif node_type == 'MethodDeclaration':
            features['num_methods'] += 1
        elif node_type == 'FieldDeclaration':
            features['num_fields'] += 1
        elif node_type in ['BlockStatement', 'ExpressionStatement', 'ReturnStatement',
                           'LocalVariableDeclaration']:
            features['num_statements'] += 1
        elif node_type in ['ForStatement', 'WhileStatement', 'DoStatement', 'EnhancedForStatement']:
            features['num_loops'] += 1
            features['cyclomatic_complexity'] += 1
        elif node_type in ['IfStatement', 'SwitchStatement']:
            features['num_conditionals'] += 1
            features['cyclomatic_complexity'] += 1
        elif node_type == 'MethodInvocation':
            features['num_method_calls'] += 1
        elif node_type == 'Assignment':
            features['num_assignments'] += 1

        # Recursión
        if hasattr(node, 'children'):
            for child in node.children:
                if child is not None:
                    if isinstance(child, list):
                        for subchild in child:
                            if subchild is not None:
                                self._count_ast_nodes(subchild, features, depth + 1)
                    else:
                        self._count_ast_nodes(child, features, depth + 1)

    def _lexical_fallback_analysis(self, code: str) -> Dict[str, int]:
        """Análisis léxico cuando AST falla"""
        features = {}

        features['num_classes'] = len(re.findall(r'\bclass\s+\w+', code))
        features['num_methods'] = len(re.findall(r'\b\w+\s*\([^)]*\)\s*\{', code))
        features['num_loops'] = (code.count('for(') + code.count('for (') +
                                 code.count('while(') + code.count('while ('))
        features['num_conditionals'] = code.count('if(') + code.count('if (')
        features['num_method_calls'] = len(re.findall(r'\w+\s*\(', code)) - features['num_methods']
        features['num_assignments'] = code.count('=') - code.count('==') - code.count('!=')
        features['cyclomatic_complexity'] = 1 + features['num_loops'] + features['num_conditionals']

        # Profundidad por llaves
        max_braces = 0
        current_braces = 0
        for char in code:
            if char == '{':
                current_braces += 1
                max_braces = max(max_braces, current_braces)
            elif char == '}':
                current_braces -= 1
        features['max_depth'] = max_braces

        return features

    def compute_intelligent_ast_similarity(self, features1: Dict, features2: Dict,
                                           dataset_type: str) -> float:
        """
        Similitud AST inteligente que considera el tipo de dataset
        """
        structural_features = [
            'num_classes', 'num_methods', 'num_fields', 'num_statements',
            'num_loops', 'num_conditionals', 'num_method_calls',
            'num_assignments', 'max_depth', 'cyclomatic_complexity'
        ]

        similarities = []

        for feature in structural_features:
            val1 = features1.get(feature, 0)
            val2 = features2.get(feature, 0)

            if val1 == 0 and val2 == 0:
                similarity = 1.0
            else:
                max_val = max(val1, val2)
                min_val = min(val1, val2)

                if max_val == 0:
                    similarity = 1.0
                else:
                    similarity = min_val / max_val

            similarities.append(similarity)

        # Pesos adaptativos según el dataset
        if dataset_type == 'conplag':
            # Para ConPlag, dar menos peso a similitud estructural
            # (códigos similares por resolver mismo problema)
            weights = [0.05, 0.10, 0.05, 0.15, 0.15, 0.15, 0.10, 0.10, 0.10, 0.05]
        else:  # ir_plag
            # Para IR-Plag, la estructura puede ser más discriminativa
            weights = [0.15, 0.20, 0.10, 0.15, 0.10, 0.10, 0.05, 0.05, 0.05, 0.05]

        weighted_similarity = np.average(similarities, weights=weights)
        return weighted_similarity

def enhanced_tfidf_similarity(code1: str, code2: str,
                              dataset_type: str = 'general') -> float:
    """TF-IDF optimizado con configuración adaptativa"""
    if not code1 or not code2:
        return 0.0

    # Configuración adaptativa
    if dataset_type == 'conplag':
        # Para ConPlag: más n-grams para capturar patrones de código
        ngram_range = (1, 3)
        max_features = 4000
    else:  # ir_plag
        # Para IR-Plag: configuración balanceada
        ngram_range = (1, 2)
        max_features = 3000

    vectorizer = TfidfVectorizer(
        ngram_range=ngram_range,
        max_features=max_features,
        min_df=1,
        max_df=1.0,
        lowercase=False,
        token_pattern=r'\b\w+\b'
    )

    try:
        tfidf_matrix = vectorizer.fit_transform([code1, code2])
        similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
        return similarity
    except:
        return 0.0

def process_dataset_optimized(base_path: str, csv_path: str,
                              output_csv: str = 'similitud_optimized.csv'):
    """Procesa dataset con análisis optimizado"""
    print("🚀 PROCESAMIENTO OPTIMIZADO DEL DATASET")
    print("=" * 45)

    analyzer = OptimizedJavaAnalyzer()
    df = pd.read_csv(csv_path)
    df = df[df['source_dataset'].isin(['ir_plag', 'conplag'])]

    print(f"📊 Procesando {len(df)} pares...")

    resultados = []
    processed = 0

    for _, row in df.iterrows():
        dataset = row['source_dataset']
        plagio = row['label']
        file1 = row['file1']
        file2 = row['file2']
        file1_base = os.path.splitext(file1)[0]
        file2_base = os.path.splitext(file2)[0]

        # Determinar rutas
        if dataset == 'ir_plag':
            folder = row['folder_name']
            folder_path = os.path.join(base_path, folder)
            path1 = os.path.join(folder_path, 'original.java')
            path2 = os.path.join(folder_path, 'compared.java')
        else:  # conplag
            folder1 = os.path.join(base_path, f"{file1_base}_{file2_base}")
            folder2 = os.path.join(base_path, f"{file2_base}_{file1_base}")

            if os.path.isdir(folder1):
                folder_path = folder1
            elif os.path.isdir(folder2):
                folder_path = folder2
            else:
                continue

            path1 = os.path.join(folder_path, file1)
            path2 = os.path.join(folder_path, file2)

        if not (os.path.exists(path1) and os.path.exists(path2)):
            continue

        # Procesar archivos
        raw_code1 = analyzer.read_java_file(path1)
        raw_code2 = analyzer.read_java_file(path2)

        cleaned_code1 = analyzer.clean_code(raw_code1)
        cleaned_code2 = analyzer.clean_code(raw_code2)

        tokens1, success1 = analyzer.tokenize_code_advanced(cleaned_code1)
        tokens2, success2 = analyzer.tokenize_code_advanced(cleaned_code2)

        if not tokens1 or not tokens2:
            continue

        # Calcular similitudes
        tfidf_sim = enhanced_tfidf_similarity(tokens1, tokens2, dataset)

        ast_features1 = analyzer.extract_ast_features(cleaned_code1)
        ast_features2 = analyzer.extract_ast_features(cleaned_code2)
        ast_sim = analyzer.compute_intelligent_ast_similarity(
            ast_features1, ast_features2, dataset
        )

        # Similitud combinada adaptativa
        if dataset == 'conplag':
            # Para ConPlag: más peso a TF-IDF
            combined_sim = 0.85 * tfidf_sim + 0.15 * ast_sim
        else:  # ir_plag
            # Para IR-Plag: balance 70-30
            combined_sim = 0.70 * tfidf_sim + 0.30 * ast_sim

        # Características adicionales para ML
        length_ratio = min(len(tokens1.split()), len(tokens2.split())) / max(len(tokens1.split()), len(tokens2.split()), 1)
        complexity_ratio = min(ast_features1.get('cyclomatic_complexity', 1), ast_features2.get('cyclomatic_complexity', 1)) / max(ast_features1.get('cyclomatic_complexity', 1), ast_features2.get('cyclomatic_complexity', 1))

        resultado = {
            'folder': os.path.basename(folder_path),
            'file1': os.path.basename(path1),
            'file2': os.path.basename(path2),
            'tfidf_similarity': round(tfidf_sim, 4),
            'ast_similarity': round(ast_sim, 4),
            'combined_similarity': round(combined_sim, 4),
            'length_ratio': round(length_ratio, 4),
            'complexity_ratio': round(complexity_ratio, 4),
            'es_plagio': plagio,
            'dataset': dataset,
            'tokenize_success': success1 and success2,
            'ast_success': ast_features1['ast_success'] and ast_features2['ast_success']
        }

        resultados.append(resultado)
        processed += 1

        if processed % 100 == 0:
            print(f"✅ Procesados {processed}/{len(df)} pares...")

    # Guardar y analizar resultados
    df_resultados = pd.DataFrame(resultados)
    df_resultados.to_csv(output_csv, index=False)

    print(f"\n📊 RESULTADOS OPTIMIZADOS:")
    print(f"✅ Pares procesados: {len(df_resultados)}")

    # Análisis por dataset
    for dataset in ['ir_plag', 'conplag']:
        subset = df_resultados[df_resultados['dataset'] == dataset]
        if len(subset) > 0:
            plagiados = subset[subset['es_plagio'] == 1]
            no_plagiados = subset[subset['es_plagio'] == 0]

            print(f"\n📈 {dataset.upper()}:")
            if len(plagiados) > 0:
                print(f"  Plagiados - TF-IDF: {plagiados['tfidf_similarity'].mean():.3f}")
                print(f"  Plagiados - AST: {plagiados['ast_similarity'].mean():.3f}")
                print(f"  Plagiados - Combinado: {plagiados['combined_similarity'].mean():.3f}")
            if len(no_plagiados) > 0:
                print(f"  No plagiados - TF-IDF: {no_plagiados['tfidf_similarity'].mean():.3f}")
                print(f"  No plagiados - AST: {no_plagiados['ast_similarity'].mean():.3f}")
                print(f"  No plagiados - Combinado: {no_plagiados['combined_similarity'].mean():.3f}")

    print(f"\n💾 Resultados guardados en: {output_csv}")
    print(df_resultados.head())

    return df_resultados

def evaluate_thresholds(df_results: pd.DataFrame):
    """Evalúa diferentes thresholds para encontrar el óptimo"""
    print(f"\n🎯 EVALUACIÓN DE THRESHOLDS")
    print("-" * 30)

    features = ['tfidf_similarity', 'ast_similarity', 'combined_similarity']
    thresholds = np.arange(0.3, 0.8, 0.05)

    best_results = {}

    for feature in features:
        best_acc = 0
        best_thresh = 0

        print(f"\n📊 {feature.upper()}:")
        for thresh in thresholds:
            predictions = (df_results[feature] > thresh).astype(int)
            accuracy = (predictions == df_results['es_plagio']).mean()

            if accuracy > best_acc:
                best_acc = accuracy
                best_thresh = thresh

            print(f"  Threshold {thresh:.2f}: Accuracy {accuracy:.3f}")

        best_results[feature] = {'threshold': best_thresh, 'accuracy': best_acc}
        print(f"  🏆 Mejor: {best_thresh:.2f} (Accuracy: {best_acc:.3f})")

    return best_results

if __name__ == '__main__':
    BASE_PATH = 'data/splits/train'
    CSV_PATH = 'data/splits/train.csv'

    # Procesamiento optimizado
    df_results = process_dataset_optimized(BASE_PATH, CSV_PATH)

    # Evaluación de thresholds
    best_thresholds = evaluate_thresholds(df_results)

🚀 PROCESAMIENTO OPTIMIZADO DEL DATASET
📊 Procesando 957 pares...
✅ Procesados 100/957 pares...
✅ Procesados 200/957 pares...
✅ Procesados 300/957 pares...
✅ Procesados 400/957 pares...
✅ Procesados 500/957 pares...
✅ Procesados 600/957 pares...
✅ Procesados 700/957 pares...
✅ Procesados 800/957 pares...
✅ Procesados 900/957 pares...

📊 RESULTADOS OPTIMIZADOS:
✅ Pares procesados: 957

📈 IR_PLAG:
  Plagiados - TF-IDF: 0.678
  Plagiados - AST: 0.866
  Plagiados - Combinado: 0.734
  No plagiados - TF-IDF: 0.667
  No plagiados - AST: 0.885
  No plagiados - Combinado: 0.732

📈 CONPLAG:
  Plagiados - TF-IDF: 0.584
  Plagiados - AST: 0.894
  Plagiados - Combinado: 0.631
  No plagiados - TF-IDF: 0.423
  No plagiados - AST: 0.722
  No plagiados - Combinado: 0.468

💾 Resultados guardados en: similitud_optimized.csv
              folder          file1          file2  tfidf_similarity  \
0  c57a973e_fa484fdd  c57a973e.java  fa484fdd.java            0.3820   
1  44428e63_c850e422  44428e63.java  c85