## 👥 Autores

Este proyecto fue desarrollado por:

| Nombre Completo                     | Número de Identificación | Carrera Universitaria              |
| ----------------------------------- | -----------------------: | ---------------------------------- |
| **Yorladys Argumedo Lozano**        | `1038824209`            | Ingeniería Industrial Virtual      |
| **Sebastian Gabriel Castro**        | `1029720632`            | Ingeniería Industrial Virtual      |

---


In [None]:
# ==============================================================================
# NOTEBOOK 04: MODELO CON KNN (K-Nearest Neighbors)
# ==============================================================================

# ------------------------------------------------------------------------------
# JUSTIFICACIÓN
# ------------------------------------------------------------------------------
# "Probamos un algoritmo basado en instancia (vecinos cercanos) para ver si
# estudiantes con características similares tienden a tener el mismo rendimiento."

import pandas as pd
import numpy as np
import os
import warnings

# Librería específica del modelo
from sklearn.neighbors import KNeighborsClassifier

# Preprocesamiento
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Configuration
# Install a blanket "ignore" filter so no Python warning is displayed from this
# point on (presumably to keep the notebook output uncluttered).
warnings.filterwarnings('ignore')

# ==============================================================================
# 1. CARGA DE DATOS (L√≥gica Colab)
# ==============================================================================
print("üìÇ Configurando entorno de datos...")

if os.path.exists('train.csv') and os.path.exists('test.csv'):
    print("‚úÖ Archivos detectados localmente.")
    base_path = '.'
else:
    print("‚ö†Ô∏è Archivos no encontrados. Descargando de Kaggle...")
    if not os.path.exists('kaggle.json'):
        print("Por favor, sube el archivo 'kaggle.json':")
        from google.colab import files
        uploaded = files.upload()

    !mkdir -p ~/.kaggle
    !cp kaggle.json ~/.kaggle/
    !chmod 600 ~/.kaggle/kaggle.json
    !kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia
    !unzip -q -o udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip
    base_path = '.'

df_train = pd.read_csv(os.path.join(base_path, 'train.csv'))
df_test = pd.read_csv(os.path.join(base_path, 'test.csv'))
test_ids = df_test['ID']

# ==============================================================================
# 2. PREPROCESAMIENTO (Igual al modelo solución)
# ==============================================================================
def _yes_no_flag(value):
    """Translate a yes/no answer into 1/0.

    The comparison ignores case, surrounding whitespace and accents, so
    'Si', 'SI', 'sí' and 'SÍ' all count as yes and 'No', 'NO', 'no' as no.

    Args:
        value: a raw cell value.

    Returns:
        1 for yes, 0 for no, None for non-strings or unrecognised text.
    """
    # Local import: unicodedata is not among the file-level imports.
    import unicodedata

    if not isinstance(value, str):
        return None
    # NFKD splits an accented letter into base letter + combining mark, so
    # dropping the combining marks strips the accent.
    decomposed = unicodedata.normalize('NFKD', value)
    plain = ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
    return {'si': 1, 'no': 0}.get(plain.strip().casefold())


def enrich_data(df):
    """Return a copy of *df* with two engineered features added.

    - ``NUM_NULOS``: number of missing values in each row, counted before any
      column is re-encoded.
    - ``INDICE_RIQUEZA``: number of "yes" answers across the columns whose
      name contains ``TIENE`` (case-insensitive).

    Every ``TIENE`` column is also replaced by its 1/0 encoding; missing or
    unrecognised answers become NaN and add nothing to the index.

    Args:
        df: input DataFrame; it is not modified.

    Returns:
        A new DataFrame with the re-encoded columns and the two features.
    """
    df_eng = df.copy()
    df_eng['NUM_NULOS'] = df_eng.isnull().sum(axis=1)

    cols_tiene = [c for c in df.columns if 'TIENE' in c.upper()]
    df_eng['INDICE_RIQUEZA'] = 0
    for col in cols_tiene:
        # Normalise each distinct answer once instead of once per row.
        distinct = df_eng[col].dropna().unique()
        flags = ((value, _yes_no_flag(value)) for value in distinct)
        mapper = {value: flag for value, flag in flags if flag is not None}
        df_eng[col] = df_eng[col].map(mapper)
        df_eng['INDICE_RIQUEZA'] += df_eng[col].fillna(0)
    return df_eng

# Apply the same feature engineering to the training and the test data.
df_train, df_test = enrich_data(df_train), enrich_data(df_test)

# Target column and the two-way mapping between its text labels and the
# integer codes the classifier is trained on.
target_col = 'RENDIMIENTO_GLOBAL'
label_map = dict(zip(('bajo', 'medio-bajo', 'medio-alto', 'alto'), range(4)))
inverse_map = {code: name for name, code in label_map.items()}

# Feature matrices: drop the identifier (and the target, for training).
y = df_train[target_col].map(label_map)
X = df_train.drop(columns=[target_col, 'ID'], errors='ignore')
X_kaggle = df_test.drop(columns=['ID'], errors='ignore')

# Split the feature names by dtype so each group gets its own treatment.
num_cols = list(X.select_dtypes(include=['number']).columns)
cat_cols = list(X.select_dtypes(include=['object', 'category']).columns)

# Numeric features: median imputation, then standardisation (KNN is
# distance-based, so feature scale matters).
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with a 'MISSING' category, then encode each
# category as an integer; categories unseen during fit are encoded as -1.
# NOTE(review): these integer codes are not scaled, so they may weigh more than
# the standardised numeric features in the KNN distance -- confirm intended.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='MISSING')),
    ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value',
                               unknown_value=-1)),
])

# Route each column group through its transformer, keeping plain column names.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_transformer, num_cols),
        ('cat', categorical_transformer, cat_cols),
    ],
    verbose_feature_names_out=False,
)

# ==============================================================================
# 3. KNN MODEL DEFINITION
# ==============================================================================
print("⚙️ Configurando KNN...")

# Five neighbours, as requested for this experiment.
# n_jobs=-1 uses every available processor for the neighbour search.
knn_model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

# Chain preprocessing and classifier so both are fitted and applied together.
pipeline_knn = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', knn_model)
])

# ==============================================================================
# 4. TRAINING AND PREDICTION
# ==============================================================================
print("🏋️ Iniciando entrenamiento KNN...")
pipeline_knn.fit(X, y)
print("✅ Entrenamiento completado.")

print("🔮 Generando predicciones...")
y_pred_indices = pipeline_knn.predict(X_kaggle)
# Turn the predicted integer codes back into the original text labels.
y_pred_text = [inverse_map[v] for v in y_pred_indices]

# Two-column submission table: test identifier and predicted label.
submission = pd.DataFrame({
    'ID': test_ids,
    'RENDIMIENTO_GLOBAL': y_pred_text
})

filename = 'submission_knn.csv'
# index=False keeps the DataFrame index out of the CSV file.
submission.to_csv(filename, index=False)
print(f"🏆 Archivo '{filename}' generado.")