In [None]:
import requests
import time 

BASE_URL = "https://api.hackupm2025.workers.dev"
train_list_endpoint = "/api/v1/patients/train"
i = 1
lista=[]
while True:
    try:
        params_consulta = {
            'page': i,
            'limit': 20,
        }

        url_completa = BASE_URL + train_list_endpoint

        response = requests.get(url_completa, params=params_consulta, timeout=10)

        # 5. Comprueba si la petici√≥n fue exitosa (c√≥digo 200)
        if response.status_code == 200:
            datos = response.json()
            # print(f"URL final solicitada: {response.url}")
            for fila in datos['data']:
                lista.append((fila['patient_id'], fila['has_diabetes'], fila['medical_note']))
            # (Opcional) Muestra cu√°ntos datos reales vinieron
            # Asumiendo que los datos est√°n en una clave 'data')
            
            # ---
            # 1. CORRECCI√ìN DE SINTAXIS Y L√ìGICA:
            #    Mueve el 'break' DENTRO del if de √©xito.
            #    Corrige la sintaxis de acceso al diccionario.
            #    Compara con el booleano 'False', no con el string "false".
            # ---
            if not datos["pagination"]["hasNextPage"]:
                print("No hay m√°s p√°ginas. Saliendo del bucle.")
                break # ¬°√âxito! Salimos del bucle.
            
            # Si llegamos aqu√≠, es que hay m√°s p√°ginas. Incrementamos.
            i += 1

        else:
            # 2. CORRECCI√ìN DE ERROR:
            #    Si la API da un error (ej. 404, 500), debemos parar el bucle.
            print(f"Error: La API devolvi√≥ el c√≥digo {response.status_code}")
            print(f"Respuesta: {response.text}")
            print("Saliendo del bucle debido a un error de la API.")
            break # Salimos del bucle si la API falla

    # 3. CORRECCI√ìN DE EXCEPCI√ìN:
    #    La sintaxis 'Exception or ...' es incorrecta.
    #    Es mejor capturar la excepci√≥n base de 'requests'.
    except requests.exceptions.RequestException as e: 
        print(f"Error de conexi√≥n o red: {e}")
        print(f"No se pudo conectar a '{BASE_URL}'. Saliendo del bucle.")
        break # Salimos si hay un error de conexi√≥n

    # A√±ade una peque√±a pausa para no saturar la API
    #time.sleep(0.5)

print("¬°Datos de entrenamiento obtenidos con √©xito!")

KeyboardInterrupt: 

In [None]:
!pip install scikit-learn

In [None]:
!pip install nltk svgling
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')

In [None]:
!pip install medspacy

In [None]:
import spacy
import medspacy
from medspacy.ner import TargetRule
from spacy.language import Language
from spacy.tokens import Span
from loguru import logger
from spacy.util import filter_spans

logger.disable("PyRuSH")
logger.disable("medspacy")

# ---  Cargar modelo base de spaCy con POS tagging ---
base_nlp = spacy.load("en_core_web_sm")  # o "es_core_news_sm" si es espa√±ol

# ---  Integrar MedSpaCy SOBRE ese modelo y guardar el resultado ---
nlp = medspacy.load(enable=["target_matcher", "context"], nlp=base_nlp)

print("Pipeline despu√©s de cargar MedSpaCy:", nlp.pipe_names)

# ---  A√±adir tus reglas de entidades cl√≠nicas ---
target_rules = [
    TargetRule("HbA1c", "MARKER"),
    TargetRule("BMI", "BMI"),
    TargetRule("glucose", "GLUCOSE"),
    TargetRule("year", "AGE"),
    TargetRule("female", "GENDER"),
    TargetRule("male", "GENDER"),

    # Para determinar fumadores
    TargetRule("smoker", "SMOKE"),
    TargetRule("smoke", "SMOKE"),
    TargetRule("smoking", "SMOKE"),
    TargetRule("smokin", "SMOKE"),

    # Hipertensi√≥n
    TargetRule("hypertension", "HYPERTENSION"),
    TargetRule("hypertensive", "HYPERTENSION"),
    TargetRule("high blood pressure", "HYPERTENSION"),
    TargetRule("HTN", "HYPERTENSION"),

    TargetRule("heart disease", "HEART_DISEASE"),
    TargetRule("coronary artery disease", "HEART_DISEASE"),
    TargetRule("ischemic heart disease", "HEART_DISEASE"),
    TargetRule("cardiovascular disease", "HEART_DISEASE"),
    TargetRule("CVD", "HEART_DISEASE"),
    TargetRule("IHD", "HEART_DISEASE"),
    TargetRule("CAD", "HEART_DISEASE"),
]

nlp.get_pipe("medspacy_target_matcher").add(target_rules)

def span_overlaps_any(span_start, span_end, ents):
    """Devuelve True si el span [span_start, span_end) solapa con alguna entidad en ents."""
    for e in ents:
        # entidad e cubre [e.start, e.end)
        if not (span_end <= e.start or span_start >= e.end):
            return True
    return False

# ---  Tu componente que busca valores adjetivales ---
@Language.component("find_marker_value_bidirectional_safe")
def find_marker_value_bidirectional_safe(doc):
    current_ents = list(doc.ents)  # entidades originales
    new_ents = []

    # √≠ndices ocupados por entidades originales (r√°pido para checks)
    occupied_tokens = set()
    for e in current_ents:
        occupied_tokens.update(range(e.start, e.end))

    for ent in current_ents:
        if ent.label_ not in ["MARKER", "BMI", "GLUCOSE", "AGE"]:
            continue

        # -------- B√öSQUEDA HACIA ADELANTE --------
        window_start = ent.end
        window_end = min(ent.end + 5, len(doc))
        for token in doc[window_start:window_end]:
            # candidate span: [token.i, token.i+1) o incluir adv antes si procede
            start = token.i
            if token.i - 1 >= 0 and doc[token.i - 1].pos_ == "ADV":
                start = token.i - 1
            end = token.i + 1

            # comprobar condiciones pos/num
            is_value = token.like_num or token.pos_ == "ADJ" or token.lower_ in {"high","low","normal","elevated","increased","decreased"}
            if not is_value:
                continue

            # NO crear si solapa con entidades existentes
            if span_overlaps_any(start, end, current_ents):
                # si solapa, saltamos (no intentamos recortar autom√°ticamente)
                continue

            # NO solapar con nuevas entidades que ya hemos a√±adido
            if span_overlaps_any(start, end, new_ents):
                continue

            new_ents.append(Span(doc, start, end, label=f"{ent.label_}_VALUE"))
            break

        # -------- B√öSQUEDA HACIA ATR√ÅS --------
        window_start_back = max(ent.start - 5, 0)
        window_end_back = ent.start
        # iteramos en orden inverso para pillar el adjetivo m√°s cercano
        for token in reversed(doc[window_start_back:window_end_back]):
            start = token.i
            # incluir adv antes si hay (ej. "very high")
            if token.i - 1 >= 0 and doc[token.i - 1].pos_ == "ADV":
                start = token.i - 1
            end = token.i + 1

            is_value = token.like_num or token.pos_ == "ADJ" or token.lower_ in {"high","low","normal","elevated","increased","decreased"}
            if not is_value:
                continue

            # evitar solapamientos con entidades originales/nuevas
            if span_overlaps_any(start, end, current_ents):
                continue
            if span_overlaps_any(start, end, new_ents):
                continue

            new_ents.append(Span(doc, start, end, label=f"{ent.label_}_VALUE"))
            break

    # Busca las unidades de la concentraci√≥n de glucosa
    extra_ents = []
    for ent in new_ents:
        if ent.label_ == "GLUCOSE_VALUE":
            try:
                float(ent.text)
                #print(f"'{ent.text}' es un n√∫mero float.")

                possible_units = {"mg/dl", "mg/dL", "mg / dL", "mmol/L", "mmol/l", "g/L", "mg%", "mg dl", "mg per dL", "mg"}
                
                # Despu√©s de detectar que 'ent.text' es un n√∫mero:
                if ent.end < len(doc):
                    next_token = doc[ent.end]
                
                    # üîç Intentamos varias formas:
                    combined = next_token.text
                    # incluye tambi√©n dos tokens seguidos ("mg" + "/" + "dL")
                    if ent.end + 2 < len(doc):
                        combined2 = next_token.text + doc[ent.end + 1].text + doc[ent.end + 2].text
                        combined2 = combined2.replace(" ", "")
                    else:
                        combined2 = ""
                
                    # Normalizar a min√∫sculas y sin espacios
                    combined = combined.lower().replace(" ", "")
                    if combined in possible_units or combined2.lower() in possible_units:
                        #print(f" ‚Üí Se detecta unidad '{combined}'")
                        new_label = "GLUCOSE_UNITS"
                        extra_ents.append(Span(doc, next_token.i, min(len(doc), next_token.i + 3), label=new_label))

            except ValueError:
                # No es float
                pass
    
    # Combina y filtra solapamientos (filter_spans tambi√©n ayuda si hay igualdad/duplas)
    all_ents = current_ents + new_ents + extra_ents
    try:
        doc.ents = filter_spans(all_ents)
    except Exception as e:
        # diagn√≥stico detallado para depuraci√≥n: imprime spans problem√°ticos
        print("ERROR al asignar doc.ents:", e)
        print("Entidades actuales:")
        for e0 in current_ents:
            print(f"  - {e0.text} [{e0.start},{e0.end}) {e0.label_}")
        print("Entidades nuevas propuestas:")
        for e1 in new_ents:
            print(f"  - {e1.text} [{e1.start},{e1.end}) {e1.label_}")
        # re-raise para que no se silencie
        raise

    return doc


In [None]:
from medspacy.context import ConTextRule
import re

# ---------------------------------------------------
# Asegurar ConText justo despu√©s del target matcher
# ---------------------------------------------------
if "medspacy_context" not in nlp.pipe_names:
    nlp.add_pipe("medspacy_context", after="medspacy_target_matcher")
else:
    nlp.remove_pipe("medspacy_context")
    nlp.add_pipe("medspacy_context", after="medspacy_target_matcher")

# ------------------------------------
# Reglas ConText extra para la estructura de las medical notes
# ------------------------------------
context = nlp.get_pipe("medspacy_context")
context.add([
    # Negaciones gen√©ricas √∫tiles
    ConTextRule("no", "NEGATED_EXISTENCE", direction="FORWARD", max_scope=6),
    ConTextRule("without", "NEGATED_EXISTENCE", direction="FORWARD", max_scope=6),
    ConTextRule("free of", "NEGATED_EXISTENCE", direction="FORWARD", max_scope=6),
    ConTextRule("denies", "NEGATED_EXISTENCE", direction="FORWARD", max_scope=8),
    ConTextRule("denies any", "NEGATED_EXISTENCE", direction="FORWARD", max_scope=8),
    ConTextRule("never", "NEGATED_EXISTENCE", direction="FORWARD", max_scope=6),
    ConTextRule("no history of", "NEGATED_EXISTENCE", direction="FORWARD", max_scope=10),
    ConTextRule("negative for", "NEGATED_EXISTENCE", direction="FORWARD", max_scope=6),

    # ‚Äúnon ‚Ä¶‚Äù (variaciones t√≠picas)
    ConTextRule("non", "NEGATED_EXISTENCE", direction="FORWARD", max_scope=5),
    ConTextRule("non-", "NEGATED_EXISTENCE", direction="FORWARD", max_scope=5),
    ConTextRule("non smoking", "NEGATED_EXISTENCE", direction="BIDIRECTIONAL", max_scope=3),
    ConTextRule("non-smoking", "NEGATED_EXISTENCE", direction="BIDIRECTIONAL", max_scope=3),
    ConTextRule("non smoker", "NEGATED_EXISTENCE", direction="BIDIRECTIONAL", max_scope=3),
    ConTextRule("non-smoker", "NEGATED_EXISTENCE", direction="BIDIRECTIONAL", max_scope=3),
    ConTextRule("non smokin", "NEGATED_EXISTENCE", direction="BIDIRECTIONAL", max_scope=3),

    # Hist√≥rico (no actual) ‚Äî cuenta como presente para ENFERMEDADES pero no para SMOKE
    ConTextRule("past", "HISTORICAL", direction="FORWARD", max_scope=6),
    ConTextRule("former", "HISTORICAL", direction="FORWARD", max_scope=6),
    ConTextRule("formerly", "HISTORICAL", direction="FORWARD", max_scope=6),
    ConTextRule("ex-smoker", "HISTORICAL", direction="BIDIRECTIONAL", max_scope=3),
    ConTextRule("history of", "HISTORICAL", direction="FORWARD", max_scope=8),
    ConTextRule("hx of", "HISTORICAL", direction="FORWARD", max_scope=8),
    ConTextRule("h/o", "HISTORICAL", direction="FORWARD", max_scope=8),
    ConTextRule("PMH of", "HISTORICAL", direction="FORWARD", max_scope=8),
])

# ----------------------------------------------------
# votaci√≥n por regex del SMOKE en funci√≥n de lo detectado
# ----------------------------------------------------
NEG_SMOKE_RE = re.compile(
    r"\b(non[-\s]smok\w|never\s+smok\w*|denies\s+smok\w*|no\s+(history\s+of\s+)?smok\w*|not\s+a\s+smoker)\b",
    re.IGNORECASE,
)
HIST_SMOKE_RE = re.compile(
    r"\b(former(ly)?\s+smok\w*|past\s+smok\w*|ex[-\s]smok\w|history\s+of\s+smok\w*)\b",
    re.IGNORECASE,
)
POS_SMOKE_RE = re.compile(
    r"\b(current(ly)?\s+a?\s*smok\w*|smokes\b|smoking\b|is\s+a\s+smoker|smoker\b)\b",
    re.IGNORECASE,
)

@Language.component("smoking_flag_classifier")
def smoking_flag_classifier(doc):
    text = doc.text.lower()
    doc._.smoking_vote = None  # -1=neg, 0=hist, 1=pos, None=indeterminado

    if NEG_SMOKE_RE.search(text):
        doc._.smoking_vote = -1
        return doc
    if HIST_SMOKE_RE.search(text):
        doc._.smoking_vote = 1   
        return doc
    if POS_SMOKE_RE.search(text):
        if not NEG_SMOKE_RE.search(text):
            doc._.smoking_vote = 1
            return doc
    return doc

# Registrar extensi√≥n de Doc para el voto
if not spacy.tokens.Doc.has_extension("smoking_vote"):
    spacy.tokens.Doc.set_extension("smoking_vote", default=None)

In [None]:
nlp.add_pipe("find_marker_value_bidirectional_safe", after="medspacy_target_matcher")
nlp.add_pipe("smoking_flag_classifier", after="medspacy_context")

In [None]:
# --------------------------------------------
# 8) Funciones auxiliares para los flags
# --------------------------------------------
# Para SMOKE (actual): requerimos NO negado, NO familiar, NO hist√≥rico, NO hipot√©tico
def is_asserted_current(ent):
    return not getattr(ent._, "is_negated", False) \
        and not getattr(ent._, "is_family", False) \
        and not getattr(ent._, "is_historical", False) \
        and not getattr(ent._, "is_hypothetical", False)

# Para ENFERMEDADES: contar como presente si NO est√° negado, NO familiar, NO hipot√©tico
def is_present_condition(ent):
    return not getattr(ent._, "is_negated", False) \
        and not getattr(ent._, "is_family", False) \
        and not getattr(ent._, "is_hypothetical", False)

# ----------------------------------------------------------
# 9) C√°lculo de flags (combina voto y entidades ConText)
# ----------------------------------------------------------
def compute_smoker_flag(doc):
    # Prioridad al voto del regex si existe
    if doc._.smoking_vote == -1:
        return 0
    if doc._.smoking_vote == 0:
        return 0
    if doc._.smoking_vote == 1:
        return 1
    # Si no hubo voto, usa entidades + ConText (solo actuales)
    return 1 if any(ent.label_ == "SMOKE" and is_asserted_current(ent) for ent in doc.ents) else 0

def compute_hypertension_flag(doc):
    # Cuenta hist√≥rico como presente mientras no est√© negado/familiar/hipot√©tico
    return 1 if any(ent.label_ == "HYPERTENSION" and is_present_condition(ent) for ent in doc.ents) else 0

def compute_heart_disease_flag(doc):
    # Cuenta hist√≥rico como presente mientras no est√© negado/familiar/hipot√©tico
    return 1 if any(ent.label_ == "HEART_DISEASE" and is_present_condition(ent) for ent in doc.ents) else 0

In [None]:

print("Pipeline actualizado:", nlp.pipe_names)

for i in range(len(lista)):
    if lista[i][0]==4941:
        text = lista[i][2]
        doc = nlp(text)
        print(f"Textos encontrados en el paciente con ID {lista[i][0]}:")
        for ent in doc.ents:
            print(f"Texto: '{ent.text}', Etiqueta: '{ent.label_}'\n")
    
        smoker_flag = compute_smoker_flag(doc)
        hypertension_flag = compute_hypertension_flag(doc)
        heart_disease_flag = compute_heart_disease_flag(doc)
    
        print(f"--> Fumador: {smoker_flag}")
        print(f"--> Hypertension: {hypertension_flag}")
        print(f"--> Heart disease: {heart_disease_flag}")

In [None]:
import numpy as np
import pandas as pd
from word2number import w2n

data_rows = []

for i in range(len(lista)):
    patient_id = lista[i][0]
    has_diabetes = lista[i][1]
    text = lista[i][2]
    doc = nlp(text)

    smoker_flag = compute_smoker_flag(doc)
    hypertension_flag = compute_hypertension_flag(doc)
    heart_disease_flag = compute_heart_disease_flag(doc)
    
    # Creamos un diccionario con el ID y luego rellenamos con las entidades
    row = {"patient_id": patient_id, "has_diabetes": has_diabetes, "smoker": smoker_flag, "hypertension": hypertension_flag, "heart_disease": heart_disease_flag}

    for ent in doc.ents:
        label = ent.label_
        value = ent.text

        # Si esa etiqueta a√∫n no est√° guardada, la a√±adimos
        # (as√≠ no se sobrescribe si ya tiene valor)
        if label not in row and label in ["GENDER", "BMI_VALUE", "MARKER_VALUE", "GLUCOSE_VALUE", "GLUCOSE_UNITS"]:
            row[label] = value
        if label not in row and label in ["AGE_VALUE"]:
            if not isinstance(value, (int, float)):
                row[label] = w2n.word_to_num(value)
            else:
                row[label] = value
    data_rows.append(row)

# Convertimos a DataFrame (las columnas se crear√°n autom√°ticamente)
df = pd.DataFrame(data_rows)

# --- Mapeos definidos ---
bmi_map = {
    "low": 16,
    "decreased": 16
    "normal": 22.5,
    "high": 30,
    "increased": 30,
    "elevated": 30
}

marker_map = {
    "low": 4,
    "decreased": 4,
    "normal": 5,
    "high": 7,
    "increased": 7,
    "elevated": 7
}

glucose_map = {
    "low": 100,
    "low": 100,
    "normal": 150,
    "high": 250,
    "increased": 250,
    "elevated": 250
}

def convert_value(val, mapping):
    """
    Convierte texto seg√∫n el mapeo. 
    Si ya es num√©rico o convertible, devuelve el n√∫mero.
    """
    if pd.isna(val):
        return np.nan
    val_str = str(val).strip().lower()
    # Si es texto conocido ‚Üí asignar n√∫mero
    if val_str in mapping:
        return mapping[val_str]
    # Si es n√∫mero ‚Üí devolver como float
    try:
        return float(val)
    except ValueError:
        return np.nan

# --- Aplicar conversiones ---
df["BMI_VALUE"] = df["BMI_VALUE"].apply(lambda x: convert_value(x, bmi_map))
df["MARKER_VALUE"] = df["MARKER_VALUE"].apply(lambda x: convert_value(x, marker_map))
df["GLUCOSE_VALUE"] = df["GLUCOSE_VALUE"].apply(lambda x: convert_value(x, glucose_map))

# Exportamos
df.to_csv("entidades_por_paciente.csv", index=False, sep=";")
print(" CSV generado: entidades_por_paciente.csv")

#### Entrenamiento del Random Forest

In [110]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

#  # Cargar el archivo CSV
df = pd.read_csv("/kaggle/input/train-2/dataset_2.csv", sep=";")

#  # Eliminar columnas que no aportan informaci√≥n
df = df.drop(columns=["patient_id", "GLUCOSE_UNITS"])

#  # Codificar la variable categ√≥rica GENDER
if "GENDER" in df.columns:
    df["GENDER"] = LabelEncoder().fit_transform(df["GENDER"].astype(str))

#  # Convertir todas las columnas posibles a formato num√©rico (valores no num√©ricos se transforman en NaN)
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors="coerce")

#  # Rellenar los valores faltantes con la media de cada columna num√©rica
df = df.fillna(df.mean(numeric_only=True))

#  # Separar las variables independientes (X) de la variable objetivo (y)
X_train = df.drop(columns=["has_diabetes"])
y_train = df["has_diabetes"]

#  # Dividir los datos en conjuntos de entrenamiento y prueba
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0, random_state=42)

#  # Entrenar el modelo Random Forest con los par√°metros definidos
clf = RandomForestClassifier(n_estimators=431, min_samples_leaf=1, min_samples_split=8, max_depth=8, random_state=0, max_features=None)
clf.fit(X_train, y_train)

#  # Evaluar el rendimiento del modelo con el conjunto de prueba
#accuracy = clf.score(X_test, y_test)
#print(f" Precisi√≥n en test = {accuracy * 100:.2f}%")

# # Mostrar un informe completo de clasificaci√≥n si se desea
#y_pred = clf.predict(X_test)
#print("\nReporte de clasificaci√≥n:")
#print(classification_report(y_test, y_pred))

#### Carga de datos del conjunto de prueba

In [None]:
import requests
import time # Buena pr√°ctica a√±adir una pausa

BASE_URL = "https://api.hackupm2025.workers.dev"
train_list_endpoint = "/api/v1/patients/test"
i = 1
lista=[]
while True:
    try:
        params_consulta = {
            'page': i,
            'limit': 20,
        }

        url_completa = BASE_URL + train_list_endpoint

        response = requests.get(url_completa, params=params_consulta, timeout=10)

        # 5. Comprueba si la petici√≥n fue exitosa (c√≥digo 200)
        if response.status_code == 200:
            datos = response.json()
            # print(f"URL final solicitada: {response.url}")
            for fila in datos['data']:
                lista.append((fila['patient_id'], fila['medical_note']))
            # (Opcional) Muestra cu√°ntos datos reales vinieron
            # Asumiendo que los datos est√°n en una clave 'data')
            
            # ---
            # 1. CORRECCI√ìN DE SINTAXIS Y L√ìGICA:
            #    Mueve el 'break' DENTRO del if de √©xito.
            #    Corrige la sintaxis de acceso al diccionario.
            #    Compara con el booleano 'False', no con el string "false".
            # ---
            if not datos["pagination"]["hasNextPage"]:
                print("No hay m√°s p√°ginas. Saliendo del bucle.")
                break # ¬°√âxito! Salimos del bucle.
            
            # Si llegamos aqu√≠, es que hay m√°s p√°ginas. Incrementamos.
            i += 1

        else:
            # 2. CORRECCI√ìN DE ERROR:
            #    Si la API da un error (ej. 404, 500), debemos parar el bucle.
            print(f"Error: La API devolvi√≥ el c√≥digo {response.status_code}")
            print(f"Respuesta: {response.text}")
            print("Saliendo del bucle debido a un error de la API.")
            break # Salimos del bucle si la API falla

    # 3. CORRECCI√ìN DE EXCEPCI√ìN:
    #    La sintaxis 'Exception or ...' es incorrecta.
    #    Es mejor capturar la excepci√≥n base de 'requests'.
    except requests.exceptions.RequestException as e: 
        print(f"Error de conexi√≥n o red: {e}")
        print(f"No se pudo conectar a '{BASE_URL}'. Saliendo del bucle.")
        break # Salimos si hay un error de conexi√≥n

    # A√±ade una peque√±a pausa para no saturar la API
    #time.sleep(0.5)

print("¬°Datos de prueba obtenidos con √©xito!")

#### Aplicaci√≥n del NLP

In [None]:
import numpy as np
import pandas as pd
from word2number import w2n

data_rows = []

for i in range(len(lista)):
    patient_id = lista[i][0]
    text = lista[i][1]
    doc = nlp(text)

    smoker_flag = compute_smoker_flag(doc)
    hypertension_flag = compute_hypertension_flag(doc)
    heart_disease_flag = compute_heart_disease_flag(doc)
    
    # Creamos un diccionario con el ID y luego rellenamos con las entidades
    row = {"patient_id": patient_id, "smoker": smoker_flag, "hypertension": hypertension_flag, "heart_disease": heart_disease_flag}

    for ent in doc.ents:
        label = ent.label_
        value = ent.text

        # Si esa etiqueta a√∫n no est√° guardada, la a√±adimos
        # (as√≠ no se sobrescribe si ya tiene valor)
        if label not in row and label in ["GENDER", "BMI_VALUE", "MARKER_VALUE", "GLUCOSE_VALUE", "GLUCOSE_UNITS"]:
            row[label] = value
        if label not in row and label in ["AGE_VALUE"]:
            if not isinstance(value, (int, float)):
                row[label] = w2n.word_to_num(value)
            else:
                row[label] = value
    data_rows.append(row)

In [None]:
# Convertimos a DataFrame (las columnas se crear√°n autom√°ticamente)
df = pd.DataFrame(data_rows)

# --- Mapeos definidos ---
bmi_map = {
    "low": 16,
    "decreased": 16,
    "normal": 22.5,
    "high": 30,
    "increased": 30,
    "elevated": 30
}

marker_map = {
    "low": 4,
    "decreased": 4,
    "normal": 5,
    "high": 7,
    "increased": 7,
    "elevated": 7
}

glucose_map = {
    "low": 100,
    "low": 100,
    "normal": 150,
    "high": 250,
    "increased": 250,
    "elevated": 250
}

def convert_value(val, mapping):
    """
    Convierte texto seg√∫n el mapeo. 
    Si ya es num√©rico o convertible, devuelve el n√∫mero.
    """
    if pd.isna(val):
        return np.nan
    val_str = str(val).strip().lower()
    # Si es texto conocido ‚Üí asignar n√∫mero
    if val_str in mapping:
        return mapping[val_str]
    # Si es n√∫mero ‚Üí devolver como float
    try:
        return float(val)
    except ValueError:
        return np.nan

# --- Aplicar conversiones ---
df["BMI_VALUE"] = df["BMI_VALUE"].apply(lambda x: convert_value(x, bmi_map))
df["MARKER_VALUE"] = df["MARKER_VALUE"].apply(lambda x: convert_value(x, marker_map))
df["GLUCOSE_VALUE"] = df["GLUCOSE_VALUE"].apply(lambda x: convert_value(x, glucose_map))

# Exportamos
df.to_csv("test.csv", index=False, sep=";")
print(" CSV generado: test.csv")

#### Inferencia del modelo

In [111]:
df = pd.read_csv("/kaggle/input/test-1/test.csv", sep=";")

patient_ids = df["patient_id"].tolist()

if "GENDER" in df.columns:
    df["GENDER"] = LabelEncoder().fit_transform(df["GENDER"].astype(str))

#  # Convertir todas las columnas posibles a formato num√©rico (valores no num√©ricos se transforman en NaN)
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors="coerce")

#  # Rellenar los valores faltantes con la media de cada columna num√©rica
df = df.fillna(df.mean(numeric_only=True))

X_test = df.drop(columns=["patient_id", "GLUCOSE_UNITS"])

y_pred = clf.predict(X_test)

pred_df = pd.DataFrame({
    "patient_id": [f"patient_{str(p).zfill(5)}" for p in patient_ids],
    "has_diabetes": y_pred
})
print(pred_df.head())
pred_df.to_csv("predicciones.csv", index=False)

      patient_id  has_diabetes
0  patient_37551             0
1  patient_24430             0
2  patient_89346             0
3  patient_88818             1
4  patient_00139             0


#### Optimizaci√≥n de par√°metros con Optuna

In [109]:
import optuna
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# --- (Asumimos que X_train, y_train, X_test, Y_test ya existen) ---
# X_train, y_train = ...
# X_test, Y_test = ...
# -----------------------------------------------------------------

# 1. DEFINE LA FUNCI√ìN OBJETIVO
# Optuna llamar√° a esta funci√≥n en cada "intento" (trial)
def objective(trial):
    
    # --- A. Sugiere los hiperpar√°metros a probar ---
    # Usamos 'suggest_int' para enteros y 'suggest_float' para decimales
    # 'log=True' es bueno para par√°metros que var√≠an exponencialmente
    
    n_estimators = trial.suggest_int('n_estimators', 50, 1000)
    max_depth = trial.suggest_int('max_depth', 2, 32, log=True)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 20)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)
    
    # Tambi√©n podemos probar 'max_features' que mencionaste antes
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])

    # --- B. Crea el clasificador con esos par√°metros ---
    clf = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        random_state=0  # Fija el random_state para reproducibilidad
    )

    # --- C. Eval√∫a el modelo usando Cross-Validation ---
    # Usamos SOLO X_train y y_train. 
    # cv=3 significa 3-fold cross-validation.
    score = cross_val_score(clf, X_train, y_train, n_jobs=-1, cv=3, scoring='accuracy')
    accuracy = score.mean()
    
    # --- D. Devuelve el score que Optuna debe maximizar ---
    return accuracy

# 2. CREA Y EJECUTA EL ESTUDIO
# direction="maximize" porque queremos la mayor accuracy posible
study = optuna.create_study(direction='maximize')

# n_trials=100 significa que probar√° 100 combinaciones de par√°metros
study.optimize(objective, n_trials=100)

# 3. MUESTRA LOS RESULTADOS
print("Optimizaci√≥n finalizada.")
print("Mejor score (accuracy media en CV):", study.best_value)
print("Mejores par√°metros encontrados:")
print(study.best_params)


# --- 4. PASO FINAL: Entrena el modelo final y eval√∫a en Test ---

# Obtiene los mejores par√°metros del estudio
best_params = study.best_params

# Crea el clasificador final con esos par√°metros
final_clf = RandomForestClassifier(**best_params, random_state=0)

# Entrena con TODOS los datos de entrenamiento
final_clf.fit(X_train, y_train)

# Eval√∫a (¬°ahora s√≠!) en el conjunto de Test
final_accuracy = final_clf.score(X_test, Y_test)

print(f"\nPrecisi√≥n final del modelo optimizado en TEST = {final_accuracy * 100:.2f}%")

[I 2025-11-04 18:12:49,137] A new study created in memory with name: no-name-82ab72d4-7f64-4a01-9c0f-528c9a020343
[I 2025-11-04 18:12:53,129] Trial 0 finished with value: 0.8569999999999999 and parameters: {'n_estimators': 605, 'max_depth': 18, 'min_samples_split': 14, 'min_samples_leaf': 8, 'max_features': 'log2'}. Best is trial 0 with value: 0.8569999999999999.
[I 2025-11-04 18:12:56,028] Trial 1 finished with value: 0.8513333333333333 and parameters: {'n_estimators': 512, 'max_depth': 7, 'min_samples_split': 6, 'min_samples_leaf': 7, 'max_features': None}. Best is trial 0 with value: 0.8569999999999999.
[I 2025-11-04 18:12:58,083] Trial 2 finished with value: 0.835 and parameters: {'n_estimators': 668, 'max_depth': 3, 'min_samples_split': 16, 'min_samples_leaf': 20, 'max_features': 'log2'}. Best is trial 0 with value: 0.8569999999999999.
[I 2025-11-04 18:13:00,434] Trial 3 finished with value: 0.8483333333333333 and parameters: {'n_estimators': 754, 'max_depth': 10, 'min_samples_spl

Optimizaci√≥n finalizada.
Mejor score (accuracy media en CV): 0.8613333333333334
Mejores par√°metros encontrados:
{'n_estimators': 757, 'max_depth': 8, 'min_samples_split': 12, 'min_samples_leaf': 2, 'max_features': 'log2'}


NameError: name 'Y_test' is not defined