In [None]:
import requests
import time 

#------------------------------------------[DATA EXTRACTION]------------------------------------------------

# Definitions and initializations
BASE_URL = "https://api.hackupm2025.workers.dev"
train_list_endpoint = "/api/v1/patients/train"
i = 1        # page number requested next
lista = []   # accumulated (patient_id, has_diabetes, medical_note) tuples

# Set to True only when the API reports that the last page has been read, so the
# final message can distinguish a full download from an early exit on error.
download_complete = False

# The URL is the same for every page; only the query parameters change.
url_completa = BASE_URL + train_list_endpoint

while True:
    params_consulta = {
        'page': i,
        'limit': 20,
    }

    try:
        # Launch the GET request to the API
        response = requests.get(url_completa, params=params_consulta, timeout=10)

        # Any non-200 response aborts the download
        if response.status_code != 200:
            print(f"Error: La API devolvió el código {response.status_code}")
            print(f"Respuesta: {response.text}")
            print("Saliendo del bucle debido a un error de la API.")
            break

        datos = response.json()

        # Keep only the fields used downstream
        lista.extend(
            (fila['patient_id'], fila['has_diabetes'], fila['medical_note'])
            for fila in datos['data']
        )

        # Stop once the API says there is no further page
        if not datos["pagination"]["hasNextPage"]:
            print("No hay más páginas. Saliendo del bucle.")
            download_complete = True
            break

        # Move on to the next page
        i += 1

    # Connection/network problems (anything requests reports as RequestException)
    except requests.exceptions.RequestException as e:
        print(f"Error de conexión o red: {e}")
        print(f"No se pudo conectar a '{BASE_URL}'. Saliendo del bucle.")
        break

    # A small delay (e.g. time.sleep(0.5)) can be added here if the API rate-limits.

# Report success only if every page was actually read
if download_complete:
    print("¡Datos de entrenamiento obtenidos con éxito!")
else:
    print(f"Descarga incompleta: solo se obtuvieron {len(lista)} registros.")

In [None]:
!pip install scikit-learn

In [None]:
# Install nltk and svgling into the notebook environment, then download the
# NLTK resources named below.
# NOTE(review): no other visible cell references nltk — confirm these downloads are still needed.
!pip install nltk svgling
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')

In [None]:
!pip install medspacy


In [None]:
import spacy
import medspacy
from medspacy.ner import TargetRule
from spacy.language import Language
from spacy.tokens import Span
from loguru import logger
from spacy.util import filter_spans

#------------------------------------------[NLP PIPELINE SETUP]------------------------------------------------

# Silence the PyRuSH / medspacy loggers so the notebook output stays readable
for noisy_logger in ("PyRuSH", "medspacy"):
    logger.disable(noisy_logger)

# Base spaCy model (English; "es_core_news_sm" would be the Spanish counterpart)
base_nlp = spacy.load("en_core_web_sm")

# Load MedSpaCy on top of the base model, enabling the target matcher and ConText
nlp = medspacy.load(enable=["target_matcher", "context"], nlp=base_nlp)

print("Pipeline después de cargar MedSpaCy:", nlp.pipe_names)

# ------------------------------------------[TARGET RULES DEFINITION]------------------------------------------------
# (literal, category) pairs; the list order is the order in which rules are created.
_TARGET_TERMS = [
    # Markers and demographics
    ("HbA1c", "MARKER"),
    ("BMI", "BMI"),
    ("glucose", "GLUCOSE"),
    ("year", "AGE"),
    ("female", "GENDER"),
    ("male", "GENDER"),

    # Smoking status
    ("smoker", "SMOKE"),
    ("smoke", "SMOKE"),
    ("smoking", "SMOKE"),
    ("smokin", "SMOKE"),

    # Hypertension
    ("hypertension", "HYPERTENSION"),
    ("hypertensive", "HYPERTENSION"),
    ("high blood pressure", "HYPERTENSION"),
    ("HTN", "HYPERTENSION"),

    # Heart disease
    ("heart disease", "HEART_DISEASE"),
    ("coronary artery disease", "HEART_DISEASE"),
    ("ischemic heart disease", "HEART_DISEASE"),
    ("cardiovascular disease", "HEART_DISEASE"),
    ("CVD", "HEART_DISEASE"),
    ("IHD", "HEART_DISEASE"),
    ("CAD", "HEART_DISEASE"),
]

target_rules = [TargetRule(literal, category) for literal, category in _TARGET_TERMS]

# Register the rules with the medspacy target matcher
nlp.get_pipe("medspacy_target_matcher").add(target_rules)

# ------------------------------------------[CUSTOM COMPONENT DEFINITION]------------------------------------------------

# Helper: overlap test between a half-open token span and a collection of entities
def span_overlaps_any(span_start, span_end, ents):
    """Return True if the span [span_start, span_end) overlaps any entity in ents.

    Each entity is treated as the half-open interval [e.start, e.end); spans that
    merely touch (one ends where the other starts) do not count as overlapping.
    An empty `ents` yields False.
    """
    return any(span_end > other.start and span_start < other.end for other in ents)

# Custom component to find marker values in both directions with overlap checks

# Lower-cased qualitative words accepted as a marker value when the token is not numeric
_QUALITATIVE_VALUES = frozenset({"high", "low", "normal", "elevated", "increased", "decreased"})

# Entity labels for which a value is searched in the surrounding tokens
_VALUE_BEARING_LABELS = frozenset({"MARKER", "BMI", "GLUCOSE", "AGE"})

# Number of tokens inspected on each side of a marker entity
_VALUE_WINDOW = 5

# Glucose unit spellings, stored lower-cased and without spaces because candidate
# text is normalized the same way before the lookup
# ("mg / dL" -> "mg/dl", "mg dl" -> "mgdl", "mg per dL" -> "mgperdl").
_GLUCOSE_UNITS = frozenset({"mg/dl", "mmol/l", "g/l", "mg%", "mgdl", "mgperdl", "mg"})


def _find_value_span(doc, ent, candidates, current_ents, new_ents):
    """Return a `<label>_VALUE` span for the first usable token in `candidates`, or None.

    A token is usable when it looks numeric, is tagged ADJ, or is one of the
    qualitative words, and the resulting span overlaps neither the existing
    entities nor the value spans created so far. A directly preceding adverb
    is included in the span.
    """
    for token in candidates:
        is_value = token.like_num or token.pos_ == "ADJ" or token.lower_ in _QUALITATIVE_VALUES
        if not is_value:
            continue

        # Extend the span one token to the left when that token is an adverb
        start = token.i - 1 if token.i >= 1 and doc[token.i - 1].pos_ == "ADV" else token.i
        end = token.i + 1

        if span_overlaps_any(start, end, current_ents) or span_overlaps_any(start, end, new_ents):
            continue

        return Span(doc, start, end, label=f"{ent.label_}_VALUE")
    return None


def _find_glucose_units(doc, value_ent):
    """Return a GLUCOSE_UNITS span for the tokens right after `value_ent`, or None.

    Units are only looked for when the value text parses as a float. The longest
    run of 3, 2 or 1 following tokens whose concatenated, lower-cased text is a
    known unit wins, so the span covers exactly the tokens that matched.
    """
    try:
        float(value_ent.text)
    except ValueError:
        return None

    for width in (3, 2, 1):
        stop = value_ent.end + width
        if stop > len(doc):
            continue
        candidate = "".join(tok.text for tok in doc[value_ent.end:stop])
        candidate = candidate.replace(" ", "").lower()
        if candidate in _GLUCOSE_UNITS:
            return Span(doc, value_ent.end, stop, label="GLUCOSE_UNITS")
    return None


@Language.component("find_marker_value_bidirectional_safe")
def find_marker_value_bidirectional_safe(doc):
    """Attach `*_VALUE` (and GLUCOSE_UNITS) entities next to marker entities.

    For every MARKER / BMI / GLUCOSE / AGE entity already in `doc.ents`, up to
    `_VALUE_WINDOW` tokens after it and then up to `_VALUE_WINDOW` tokens before
    it (nearest first) are searched for a value. Both directions are always
    searched, so a single marker can yield two value spans.

    The original entities, the value spans and any unit spans are merged with
    `filter_spans` and written back to `doc.ents`. If that assignment fails,
    the involved spans are printed and the exception is re-raised.

    Returns the same `doc`.
    """
    current_ents = list(doc.ents)  # snapshot; doc.ents is replaced at the end
    new_ents = []                  # *_VALUE spans found so far

    for ent in current_ents:
        if ent.label_ not in _VALUE_BEARING_LABELS:
            continue

        following = doc[ent.end:min(ent.end + _VALUE_WINDOW, len(doc))]
        preceding = reversed(doc[max(ent.start - _VALUE_WINDOW, 0):ent.start])

        # Forward first, then backward; each direction contributes at most one span
        for candidates in (following, preceding):
            value_span = _find_value_span(doc, ent, candidates, current_ents, new_ents)
            if value_span is not None:
                new_ents.append(value_span)

    # Look for units directly after each numeric glucose value
    extra_ents = []
    for ent in new_ents:
        if ent.label_ == "GLUCOSE_VALUE":
            units_span = _find_glucose_units(doc, ent)
            if units_span is not None:
                extra_ents.append(units_span)

    # Combine all entities; filter_spans removes overlapping spans before assignment
    all_ents = current_ents + new_ents + extra_ents
    try:
        doc.ents = filter_spans(all_ents)

    # Diagnostic output for debugging: show the spans involved, then re-raise
    except Exception as e:
        print("ERROR al asignar doc.ents:", e)
        print("Entidades actuales:")
        for e0 in current_ents:
            print(f"  - {e0.text} [{e0.start},{e0.end}) {e0.label_}")
        print("Entidades nuevas propuestas:")
        for e1 in new_ents:
            print(f"  - {e1.text} [{e1.start},{e1.end}) {e1.label_}")

        raise

    return doc


In [None]:
from medspacy.context import ConTextRule
import re

#-------------------------------------------[PIPELINE CUSTOMIZATION]------------------------------------------------
# Ensure the ConText component sits right after the target matcher: remove an
# existing instance if there is one, then add it again at that position.
if "medspacy_context" in nlp.pipe_names:
    nlp.remove_pipe("medspacy_context")
nlp.add_pipe("medspacy_context", after="medspacy_target_matcher")

#--------------------------------------------[CONTEXT RULES DEFINITION]------------------------------------------------
# (literal, category, direction, max_scope) for every extra ConText rule, in registration order
_CONTEXT_RULE_SPECS = [
    # Generic useful negations
    ("no", "NEGATED_EXISTENCE", "FORWARD", 6),
    ("without", "NEGATED_EXISTENCE", "FORWARD", 6),
    ("free of", "NEGATED_EXISTENCE", "FORWARD", 6),
    ("denies", "NEGATED_EXISTENCE", "FORWARD", 8),
    ("denies any", "NEGATED_EXISTENCE", "FORWARD", 8),
    ("never", "NEGATED_EXISTENCE", "FORWARD", 6),
    ("no history of", "NEGATED_EXISTENCE", "FORWARD", 10),
    ("negative for", "NEGATED_EXISTENCE", "FORWARD", 6),

    # "non ..." (typical variations)
    ("non", "NEGATED_EXISTENCE", "FORWARD", 5),
    ("non-", "NEGATED_EXISTENCE", "FORWARD", 5),
    ("non smoking", "NEGATED_EXISTENCE", "BIDIRECTIONAL", 3),
    ("non-smoking", "NEGATED_EXISTENCE", "BIDIRECTIONAL", 3),
    ("non smoker", "NEGATED_EXISTENCE", "BIDIRECTIONAL", 3),
    ("non-smoker", "NEGATED_EXISTENCE", "BIDIRECTIONAL", 3),
    ("non smokin", "NEGATED_EXISTENCE", "BIDIRECTIONAL", 3),

    # Historical indicators
    ("past", "HISTORICAL", "FORWARD", 6),
    ("former", "HISTORICAL", "FORWARD", 6),
    ("formerly", "HISTORICAL", "FORWARD", 6),
    ("ex-smoker", "HISTORICAL", "BIDIRECTIONAL", 3),
    ("history of", "HISTORICAL", "FORWARD", 8),
    ("hx of", "HISTORICAL", "FORWARD", 8),
    ("h/o", "HISTORICAL", "FORWARD", 8),
    ("PMH of", "HISTORICAL", "FORWARD", 8),
]

context = nlp.get_pipe("medspacy_context")
context.add([
    ConTextRule(literal, category, direction=direction, max_scope=max_scope)
    for literal, category, direction, max_scope in _CONTEXT_RULE_SPECS
])

#--------------------------------------------------[SMOKING STATUS CLASSIFIER (regular expressions)]--------------------------------------------------
# All three patterns are case-insensitive and anchored on word boundaries.
# "smok\w*" covers smoke / smoker / smokes / smoking; a bare "smok\w" followed by
# "\b" would only accept exactly one extra letter and so miss "non-smoker" and
# "ex-smoker". The separator after "non"/"ex" is optional so "nonsmoker" also counts.

# Explicit denial of smoking
NEG_SMOKE_RE = re.compile(
    r"\b(non[-\s]?smok\w*|never\s+smok\w*|denies\s+smok\w*|no\s+(history\s+of\s+)?smok\w*|not\s+a\s+smoker)\b",
    re.IGNORECASE,
)
# Past smoking
HIST_SMOKE_RE = re.compile(
    r"\b(former(ly)?\s+smok\w*|past\s+smok\w*|ex[-\s]?smok\w*|history\s+of\s+smok\w*)\b",
    re.IGNORECASE,
)
# Current smoking
POS_SMOKE_RE = re.compile(
    r"\b(current(ly)?\s+a?\s*smok\w*|smokes\b|smoking\b|is\s+a\s+smoker|smoker\b)\b",
    re.IGNORECASE,
)

# Custom component to classify smoking status based on regex patterns
@Language.component("smoking_flag_classifier")
def smoking_flag_classifier(doc):
    """Store a regex-based smoking vote on `doc._.smoking_vote` and return `doc`.

    The note text is checked against the negative, historical and positive
    patterns in that order; the first one that matches decides the vote:

        -1  smoking explicitly denied
         0  past / former smoking
         1  current smoking
        None  no pattern matched (undetermined)
    """
    text = doc.text.lower()

    if NEG_SMOKE_RE.search(text):
        vote = -1
    elif HIST_SMOKE_RE.search(text):
        # Historical smoking is coded 0 (not 1) so it is not counted as current smoking
        vote = 0
    elif POS_SMOKE_RE.search(text):
        vote = 1
    else:
        vote = None

    doc._.smoking_vote = vote
    return doc

# Register the custom Doc extension that holds the smoking vote (default None).
# The has_extension guard keeps the registration from being repeated when this cell is run again.
if not spacy.tokens.Doc.has_extension("smoking_vote"):
    spacy.tokens.Doc.set_extension("smoking_vote", default=None)

In [None]:
# Add custom components to the pipeline.
# Each component is added only if it is not already present, so this cell can be
# re-run without spaCy rejecting a second component under the same name.
for component_name, anchor in (
    ("find_marker_value_bidirectional_safe", "medspacy_target_matcher"),
    ("smoking_flag_classifier", "medspacy_context"),
):
    if component_name not in nlp.pipe_names:
        nlp.add_pipe(component_name, after=anchor)

In [None]:
#----------------------------------------------------------[AUXILIARY FUNCTIONS FOR FLAGS]----------------------------------------------------------
# Smoke (current): require NOT negated, NOT family, NOT historical, NOT hypothetical
def is_asserted_current(ent):
    """Return True when none of the four context flags is set on `ent._`.

    The flags checked are is_negated, is_family, is_historical and
    is_hypothetical; a flag that is missing on `ent._` counts as False.
    """
    blocking_flags = ("is_negated", "is_family", "is_historical", "is_hypothetical")
    return not any(getattr(ent._, flag, False) for flag in blocking_flags)

# Illnesses (hypertension, heart disease): count historical as present if NOT negated, NOT family, NOT hypothetical
def is_present_condition(ent):
    """Return True unless `ent._` is flagged negated, family or hypothetical.

    The historical flag is deliberately not consulted here; a flag that is
    missing on `ent._` counts as False.
    """
    context_flags = ent._
    if getattr(context_flags, "is_negated", False):
        return False
    if getattr(context_flags, "is_family", False):
        return False
    if getattr(context_flags, "is_hypothetical", False):
        return False
    return True

# ---------------------------------------------------------[CALCULATION OF FLAGS (combines vote and ConText entities)]---------------------------------------------------------
def compute_smoker_flag(doc):
    """Return 1 if the note indicates a current smoker, otherwise 0.

    The regex vote stored on `doc._.smoking_vote` takes priority: -1 and 0 give
    0, 1 gives 1. With no vote, the result is 1 only if some SMOKE entity passes
    `is_asserted_current`.
    """
    vote = doc._.smoking_vote
    if vote == -1 or vote == 0:
        return 0
    if vote == 1:
        return 1

    # No regex vote: fall back to the entities and their ConText flags (current only)
    for ent in doc.ents:
        if ent.label_ == "SMOKE" and is_asserted_current(ent):
            return 1
    return 0

def compute_hypertension_flag(doc):
    """Return 1 if any HYPERTENSION entity in `doc.ents` passes `is_present_condition`, else 0."""
    # Historical mentions count as present; negated/family/hypothetical ones do not
    for ent in doc.ents:
        if ent.label_ == "HYPERTENSION" and is_present_condition(ent):
            return 1
    return 0

def compute_heart_disease_flag(doc):
    """Return 1 if any HEART_DISEASE entity in `doc.ents` passes `is_present_condition`, else 0."""
    # Historical mentions count as present; negated/family/hypothetical ones do not
    for ent in doc.ents:
        if ent.label_ == "HEART_DISEASE" and is_present_condition(ent):
            return 1
    return 0

In [None]:
print("Pipeline actualizado:", nlp.pipe_names)

#------------------------------------------[TESTING ON PATIENTS]------------------------------------------------
# Spot check: run the pipeline on the note of patient 4941 and print the
# detected entities together with the three derived flags.
for i, record in enumerate(lista):
    if record[0] != 4941:
        continue

    text = record[2]
    doc = nlp(text)
    print(f"Textos encontrados en el paciente con ID {record[0]}:")
    for ent in doc.ents:
        print(f"Texto: '{ent.text}', Etiqueta: '{ent.label_}'\n")

    smoker_flag = compute_smoker_flag(doc)
    hypertension_flag = compute_hypertension_flag(doc)
    heart_disease_flag = compute_heart_disease_flag(doc)

    print(f"--> Fumador: {smoker_flag}")
    print(f"--> Hypertension: {hypertension_flag}")
    print(f"--> Heart disease: {heart_disease_flag}")

In [None]:
import numpy as np
import pandas as pd
from word2number import w2n

#------------------------------------------[DATA NORMALIZATION AND EXPORT]------------------------------------------------

data_rows = []

for i in range(len(lista)):
    patient_id = lista[i][0]
    has_diabetes = lista[i][1]
    text = lista[i][2]
    doc = nlp(text)

    smoker_flag = compute_smoker_flag(doc)
    hypertension_flag = compute_hypertension_flag(doc)
    heart_disease_flag = compute_heart_disease_flag(doc)

    # Start the row with the ID, the target and the flags; entity columns are added below
    row = {"patient_id": patient_id, "has_diabetes": has_diabetes, "smoker": smoker_flag, "hypertension": hypertension_flag, "heart_disease": heart_disease_flag}

    for ent in doc.ents:
        label = ent.label_
        value = ent.text

        # Only the first entity of each label is kept; later ones never overwrite it
        if label in row:
            continue

        if label in ["GENDER", "BMI_VALUE", "MARKER_VALUE", "GLUCOSE_VALUE", "GLUCOSE_UNITS"]:
            row[label] = value
        elif label == "AGE_VALUE":
            # ent.text is always a string, so it is always parsed as a number here.
            # A candidate that is not a number (e.g. an adjective picked up as the
            # value) is skipped instead of aborting the whole loop, which leaves
            # the column free for a later AGE_VALUE entity in the same note.
            try:
                row[label] = w2n.word_to_num(value)
            except ValueError:
                continue
    data_rows.append(row)

# Convert to DataFrame (columns will be created automatically)
df = pd.DataFrame(data_rows)

# -------------------------------------[ NORMALIZATION MAPS AND FUNCTIONS ]------------------------------------------------
# Numeric stand-ins for qualitative BMI descriptions
bmi_map = {
    "low": 16,
    "decreased": 16,
    "normal": 22.5,
    "high": 30,
    "increased": 30,
    "elevated": 30
}

# Numeric stand-ins for qualitative MARKER (HbA1c) descriptions
marker_map = {
    "low": 4,
    "decreased": 4,
    "normal": 5,
    "high": 7,
    "increased": 7,
    "elevated": 7
}

# Numeric stand-ins for qualitative glucose descriptions.
# "decreased" shares the "low" value, mirroring the other two maps.
glucose_map = {
    "low": 100,
    "decreased": 100,
    "normal": 150,
    "high": 250,
    "increased": 250,
    "elevated": 250
}

# Helper: turn an extracted value into a number via a lookup table or float parsing
def convert_value(val, mapping):
    """Convert an extracted value to a number.

    Missing values give NaN. Otherwise the stripped, lower-cased string form of
    `val` is looked up in `mapping`; if it is not a key there, `val` is parsed
    with float(), and a ValueError from that parse gives NaN.
    """
    if pd.isna(val):
        return np.nan

    normalized = str(val).strip().lower()
    if normalized not in mapping:
        # Not a known qualitative word: treat it as a plain number if possible
        try:
            return float(val)
        except ValueError:
            return np.nan

    return mapping[normalized]

# Apply conversions: each value column is normalized with its own lookup table
for column, mapping in (
    ("BMI_VALUE", bmi_map),
    ("MARKER_VALUE", marker_map),
    ("GLUCOSE_VALUE", glucose_map),
):
    df[column] = df[column].apply(convert_value, args=(mapping,))

# Export as a semicolon-separated CSV without the index column
df.to_csv("entidades_por_paciente.csv", index=False, sep=";")
print(" CSV generado: entidades_por_paciente.csv")

#### Entrenamiento del Random Forest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Load the training dataset (semicolon-separated)
df = pd.read_csv("/kaggle/input/train-2/dataset_2.csv", sep=";")

# Drop the columns that are not used as features
df = df.drop(columns=["patient_id", "GLUCOSE_UNITS"])

# Encode the categorical GENDER column as integers (missing values become the string "nan" first)
if "GENDER" in df.columns:
    gender_as_text = df["GENDER"].astype(str)
    df["GENDER"] = LabelEncoder().fit_transform(gender_as_text)

# Coerce every column to numeric; anything that cannot be parsed becomes NaN
df = df.apply(pd.to_numeric, errors="coerce")

# Replace NaN with the mean of the corresponding column
column_means = df.mean(numeric_only=True)
df = df.fillna(column_means)

# Split into features (X) and target (y); the whole dataset is used for training
y_train = df["has_diabetes"]
X_train = df.drop(columns=["has_diabetes"])

# Random Forest with fixed hyperparameters
rf_params = dict(
    n_estimators=431,
    min_samples_leaf=1,
    min_samples_split=8,
    max_depth=8,
    random_state=0,
    max_features=None,
)
clf = RandomForestClassifier(**rf_params)
clf.fit(X_train, y_train)

# Hold-out evaluation is currently disabled. To re-enable it, split with
# train_test_split before fitting, then report clf.score(X_test, y_test) and
# classification_report(y_test, clf.predict(X_test)).

#### Carga de datos del conjunto de prueba

In [None]:
import requests
import time 

BASE_URL = "https://api.hackupm2025.workers.dev"
# NOTE: the variable keeps its original name but points at the *test* endpoint here
train_list_endpoint = "/api/v1/patients/test"
i = 1        # page number requested next
lista = []   # accumulated (patient_id, medical_note) tuples; replaces the training list

# Set to True only when the API reports that the last page has been read, so the
# final message can distinguish a full download from an early exit on error.
download_complete = False

# The URL is the same for every page; only the query parameters change.
url_completa = BASE_URL + train_list_endpoint

while True:
    params_consulta = {
        'page': i,
        'limit': 20,
    }

    try:
        response = requests.get(url_completa, params=params_consulta, timeout=10)

        # Any non-200 response aborts the download
        if response.status_code != 200:
            print(f"Error: La API devolvió el código {response.status_code}")
            print(f"Respuesta: {response.text}")
            print("Saliendo del bucle debido a un error de la API.")
            break

        datos = response.json()

        # Test records carry no label, only the ID and the note
        lista.extend((fila['patient_id'], fila['medical_note']) for fila in datos['data'])

        # Stop once the API says there is no further page
        if not datos["pagination"]["hasNextPage"]:
            print("No hay más páginas. Saliendo del bucle.")
            download_complete = True
            break

        i += 1

    # Connection/network problems (anything requests reports as RequestException)
    except requests.exceptions.RequestException as e:
        print(f"Error de conexión o red: {e}")
        print(f"No se pudo conectar a '{BASE_URL}'. Saliendo del bucle.")
        break

    # A small delay (e.g. time.sleep(0.5)) can be added here if the API rate-limits.

# Report success only if every page was actually read
if download_complete:
    print("¡Datos de prueba obtenidos con éxito!")
else:
    print(f"Descarga incompleta: solo se obtuvieron {len(lista)} registros.")

#### Aplicación del NLP

In [None]:
import numpy as np
import pandas as pd
from word2number import w2n

data_rows = []

for i in range(len(lista)):
    patient_id = lista[i][0]
    text = lista[i][1]
    doc = nlp(text)

    smoker_flag = compute_smoker_flag(doc)
    hypertension_flag = compute_hypertension_flag(doc)
    heart_disease_flag = compute_heart_disease_flag(doc)

    # Start the row with the ID and the flags; entity columns are added below
    row = {"patient_id": patient_id, "smoker": smoker_flag, "hypertension": hypertension_flag, "heart_disease": heart_disease_flag}

    for ent in doc.ents:
        label = ent.label_
        value = ent.text

        # Only the first entity of each label is kept; later ones never overwrite it
        if label in row:
            continue

        if label in ["GENDER", "BMI_VALUE", "MARKER_VALUE", "GLUCOSE_VALUE", "GLUCOSE_UNITS"]:
            row[label] = value
        elif label == "AGE_VALUE":
            # ent.text is always a string, so it is always parsed as a number here.
            # A candidate that is not a number (e.g. an adjective picked up as the
            # value) is skipped instead of aborting the whole loop, which leaves
            # the column free for a later AGE_VALUE entity in the same note.
            try:
                row[label] = w2n.word_to_num(value)
            except ValueError:
                continue
    data_rows.append(row)

In [None]:

# Build the test table from the collected row dicts (columns come from the dict keys)
df = pd.DataFrame(data_rows)


# Numeric stand-ins for qualitative BMI descriptions
bmi_map = {
    "low": 16,
    "decreased": 16,
    "normal": 22.5,
    "high": 30,
    "increased": 30,
    "elevated": 30
}

# Numeric stand-ins for qualitative MARKER (HbA1c) descriptions
marker_map = {
    "low": 4,
    "decreased": 4,
    "normal": 5,
    "high": 7,
    "increased": 7,
    "elevated": 7
}

# Numeric stand-ins for qualitative glucose descriptions.
# "decreased" shares the "low" value, mirroring the other two maps.
glucose_map = {
    "low": 100,
    "decreased": 100,
    "normal": 150,
    "high": 250,
    "increased": 250,
    "elevated": 250
}

def convert_value(val, mapping):
    """Convert an extracted value to a number.

    Missing values give NaN. Otherwise the stripped, lower-cased string form of
    `val` is looked up in `mapping`; if it is not a key there, `val` is parsed
    with float(), and a ValueError from that parse gives NaN.
    """
    if pd.isna(val):
        return np.nan

    normalized = str(val).strip().lower()
    if normalized not in mapping:
        # Not a known qualitative word: treat it as a plain number if possible
        try:
            return float(val)
        except ValueError:
            return np.nan

    return mapping[normalized]


# Normalize each value column with its own lookup table
for column, mapping in (
    ("BMI_VALUE", bmi_map),
    ("MARKER_VALUE", marker_map),
    ("GLUCOSE_VALUE", glucose_map),
):
    df[column] = df[column].apply(convert_value, args=(mapping,))

# Export as a semicolon-separated CSV without the index column
df.to_csv("test.csv", index=False, sep=";")
print(" CSV generado: test.csv")

#### Inferencia del modelo

In [None]:

# Read the CSV file using ';' as separator and load it into a DataFrame
df = pd.read_csv("/kaggle/input/test-1/test.csv", sep=";")

# Keep the patient IDs aside before the columns are coerced to numeric
patient_ids = df["patient_id"].tolist()

# If the column 'GENDER' exists, encode it into numeric values using LabelEncoder
if "GENDER" in df.columns:
    df["GENDER"] = LabelEncoder().fit_transform(df["GENDER"].astype(str))

# Convert all columns to numeric format
# Non-numeric values will be converted to NaN
for col in df.columns:
    df[col] = pd.to_numeric(df[col], errors="coerce")

# Fill missing values (NaN) with the mean of each numeric column
df = df.fillna(df.mean(numeric_only=True))

# Drop the columns that are not features; errors="ignore" because GLUCOSE_UNITS
# only exists when at least one note produced a unit entity
X_test = df.drop(columns=["patient_id", "GLUCOSE_UNITS"], errors="ignore")

# The test table's column order depends on which entities appeared first in the
# notes, so it need not match the training table. Select the columns in the
# order the model recorded at fit time; a column missing from the test table
# raises a KeyError naming it.
trained_features = getattr(clf, "feature_names_in_", None)
if trained_features is not None:
    X_test = X_test[list(trained_features)]

# Predict using the pre-trained classifier (clf)
y_pred = clf.predict(X_test)

# Create a DataFrame with the prediction results:
# Format patient IDs with leading zeros and include the prediction
pred_df = pd.DataFrame({
    "patient_id": [f"patient_{str(p).zfill(5)}" for p in patient_ids],
    "has_diabetes": y_pred
})

