# This is a sample Jupyter Notebook

Below is an example of a code cell.
Put your cursor into the cell and press Shift+Enter to execute it and select the next one, or click 'Run Cell' button.

Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.

To learn more about Jupyter Notebooks in PyCharm, see [help](https://www.jetbrains.com/help/pycharm/ipython-notebook-support.html).
For an overview of PyCharm, go to Help -> Learn IDE features or refer to [our documentation](https://www.jetbrains.com/help/pycharm/getting-started.html).

In [1]:
print("Hello World!")


Hello World!


In [2]:
import pandas as pd
from pathlib import Path

# Ruta base
BASE_PATH = Path("data/splits")

# Cargar CSV
df = pd.read_csv(BASE_PATH / "train.csv")

# Mostrar primeras filas
df.head()

Unnamed: 0,pair_id,folder_name,case_id,case_orig,file1,file2,label,plagiarism_level,source_dataset,comparison_type
0,pair_000857,c57a973e_fa484fdd,p08,problem-8,c57a973e.java,fa484fdd.java,1,contest_level,conplag,contest_submission_pair
1,pair_000806,44428e63_c850e422,p18,problem-18,44428e63.java,c850e422.java,0,none,conplag,contest_submission_pair
2,pair_001063,2ff0355e_83935617,p09,problem-9,2ff0355e.java,83935617.java,0,none,conplag,contest_submission_pair
3,pair_001031,0017d438_9852706b,p09,problem-9,0017d438.java,9852706b.java,1,contest_level,conplag,contest_submission_pair
4,pair_000899,bdfe8110_c57a973e,p08,problem-8,bdfe8110.java,c57a973e.java,1,contest_level,conplag,contest_submission_pair


In [3]:
# Función para leer código
def read_code(folder_name, filename):
    path = BASE_PATH / "train" / folder_name / filename
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        return f.read()

# Agregar columnas con el texto fuente
df['code1'] = df.apply(lambda row: read_code(row['folder_name'], row['file1']), axis=1)
df['code2'] = df.apply(lambda row: read_code(row['folder_name'], row['file2']), axis=1)

df[['code1', 'code2']].head()


Unnamed: 0,code1,code2
0,import java.io.BufferedReader;\nimport java.io...,import java.io.IOException;\nimport java.io.In...
1,import java.util.*;\nimport java.io.*;\n\npubl...,import java.io.*;\nimport java.util.*;\n\npubl...
2,import java.io.*;\nimport java.util.*;\n\npubl...,import java.util.*;\nimport java.lang.*;\nimpo...
3,import java.io.BufferedReader;\nimport java.io...,import java.io.BufferedReader;\nimport java.io...
4,import java.io.BufferedReader;\nimport java.io...,import java.io.BufferedReader;\nimport java.io...


In [4]:
import re
import pandas as pd

def preprocess(code):
    # Eliminar comentarios y normalizar espacios
    code = re.sub(r"//.*|/\*[\s\S]*?\*/", "", code)
    code = re.sub(r"\s+", " ", code)
    return code.strip()

# Tokenizador simple basado en Java
TOKEN_SPECIFICATION = [
    ('KEYWORD',     r'\b(?:abstract|assert|boolean|break|byte|case|catch|char|class|const|continue|default|do|double|else|enum|extends|final|finally|float|for|goto|if|implements|import|instanceof|int|interface|long|native|new|null|package|private|protected|public|return|short|static|strictfp|super|switch|synchronized|this|throw|throws|transient|try|void|volatile|while|true|false)\b'),
    ('IDENTIFIER',  r'\b[a-zA-Z_][a-zA-Z0-9_]*\b'),
    ('NUMBER',      r'\b\d+(\.\d+)?\b'),
    ('STRING',      r'"(\\.|[^"\\])*"'),
    ('CHAR',        r"'(\\.|[^'\\])'"),
    ('OPERATOR',    r'==|!=|<=|>=|\+\+|--|&&|\|\||[-+*/%<>=!&|^~]'),
    ('SEPARATOR',   r'[()\[\]{};,\.]'),
]

token_regex = '|'.join(f'(?P<{name}>{pattern})' for name, pattern in TOKEN_SPECIFICATION)
compiled_re = re.compile(token_regex)

def tokenize(code):
    tokens = []
    for match in compiled_re.finditer(code):
        tokens.append(match.group())
    return tokens

# Aplicar a DataFrame
def preprocess_and_tokenize(code):
    code = preprocess(code)
    tokens = tokenize(code)
    return tokens  # o ' '.join(tokens) si deseas una cadena en lugar de lista

# Suponiendo que df ya está cargado
df['tokens1'] = df['code1'].apply(preprocess_and_tokenize)
df['tokens2'] = df['code2'].apply(preprocess_and_tokenize)


In [5]:
# Convertir listas de tokens en cadenas de texto
df['tokens1_str'] = df['tokens1'].apply(lambda tokens: ' '.join(tokens))
df['tokens2_str'] = df['tokens2'].apply(lambda tokens: ' '.join(tokens))


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Unir textos ya tokenizados
all_texts = df['tokens1_str'].tolist() + df['tokens2_str'].tolist()

# TF-IDF con tokens, respetando mayúsculas y minúsculas
vectorizer = TfidfVectorizer(token_pattern=r'\b\w+\b', lowercase=False)
tfidf_matrix = vectorizer.fit_transform(all_texts)

n = len(df)
tfidf1 = tfidf_matrix[:n]
tfidf2 = tfidf_matrix[n:]

# Similitud del coseno entre cada par
similarities = cosine_similarity(tfidf1, tfidf2).diagonal()
df['similarity'] = similarities

df[['similarity']]


Unnamed: 0,similarity
0,0.272003
1,0.154999
2,0.215748
3,0.388949
4,0.878248
...,...
952,0.978233
953,0.751777
954,0.512311
955,0.389717


In [7]:

from sklearn.metrics import classification_report

# Clasificación simple por umbral
threshold = 0.8
df['predicted'] = (df['similarity'] > threshold).astype(int)

# Evaluación
print(classification_report(df['label'], df['predicted']))


              precision    recall  f1-score   support

           0       0.65      0.96      0.77       534
           1       0.87      0.34      0.48       423

    accuracy                           0.68       957
   macro avg       0.76      0.65      0.63       957
weighted avg       0.74      0.68      0.64       957

