# **99 - Solución del Modelo**

In [1]:
!wget --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/ai4eng.v1/main/content/init.py
import init; init.init(force_download=False); init.get_weblink()

replicating local resources


In [3]:
import os
# Point the Kaggle CLI at the current directory as its configuration folder,
# i.e. it will look for ./kaggle.json.
os.environ['KAGGLE_CONFIG_DIR'] = '.'
# Restrict the credentials file to owner read/write only.
!chmod 600 ./kaggle.json
# Download the competition archive (a single .zip) into the working directory.
!kaggle competitions download -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia

Downloading udea-ai-4-eng-20252-pruebas-saber-pro-colombia.zip to /content
  0% 0.00/29.9M [00:00<?, ?B/s]
100% 29.9M/29.9M [00:00<00:00, 1.25GB/s]


In [4]:
!unzip udea*.zip > /dev/null

In [5]:
# Sanity check of the extracted files: line, word and byte counts per CSV.
!wc *.csv

   296787    296787   4716673 submission_example.csv
   296787   4565553  59185238 test.csv
   692501  10666231 143732437 train.csv
  1286075  15528571 207634348 total


# **Importación de librerías y carga de archivos**

In [6]:
# Librerías básicas
import pandas as pd
import numpy as np

# Preprocesamiento y modelo
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import HistGradientBoostingClassifier

# Subir archivos desde tu PC (train.csv, test.csv, submission_example.csv)
#from google.colab import files
#uploaded = files.upload()

# **Carga de datos y exploración básica**

In [7]:
# Read the three competition files from the working directory: the labelled
# training set, the unlabelled test set and the submission template.
train, test, sample_submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "submission_example.csv")
)

# Report (rows, columns) for both feature tables.
print("Dimensiones train:", train.shape)
print("Dimensiones test:", test.shape)

# Last expression: preview of the first training rows.
train.head()

Dimensiones train: (692500, 21)
Dimensiones test: (296786, 20)


Unnamed: 0,ID,PERIODO_ACADEMICO,E_PRGM_ACADEMICO,E_PRGM_DEPARTAMENTO,E_VALORMATRICULAUNIVERSIDAD,E_HORASSEMANATRABAJA,F_ESTRATOVIVIENDA,F_TIENEINTERNET,F_EDUCACIONPADRE,F_TIENELAVADORA,...,E_PRIVADO_LIBERTAD,E_PAGOMATRICULAPROPIO,F_TIENECOMPUTADOR,F_TIENEINTERNET.1,F_EDUCACIONMADRE,RENDIMIENTO_GLOBAL,INDICADOR_1,INDICADOR_2,INDICADOR_3,INDICADOR_4
0,904256,20212,ENFERMERIA,BOGOTÁ,Entre 5.5 millones y menos de 7 millones,Menos de 10 horas,Estrato 3,Si,Técnica o tecnológica incompleta,Si,...,N,No,Si,Si,Postgrado,medio-alto,0.322,0.208,0.31,0.267
1,645256,20212,DERECHO,ATLANTICO,Entre 2.5 millones y menos de 4 millones,0,Estrato 3,No,Técnica o tecnológica completa,Si,...,N,No,Si,No,Técnica o tecnológica incompleta,bajo,0.311,0.215,0.292,0.264
2,308367,20203,MERCADEO Y PUBLICIDAD,BOGOTÁ,Entre 2.5 millones y menos de 4 millones,Más de 30 horas,Estrato 3,Si,Secundaria (Bachillerato) completa,Si,...,N,No,No,Si,Secundaria (Bachillerato) completa,bajo,0.297,0.214,0.305,0.264
3,470353,20195,ADMINISTRACION DE EMPRESAS,SANTANDER,Entre 4 millones y menos de 5.5 millones,0,Estrato 4,Si,No sabe,Si,...,N,No,Si,Si,Secundaria (Bachillerato) completa,alto,0.485,0.172,0.252,0.19
4,989032,20212,PSICOLOGIA,ANTIOQUIA,Entre 2.5 millones y menos de 4 millones,Entre 21 y 30 horas,Estrato 3,Si,Primaria completa,Si,...,N,No,Si,Si,Primaria completa,medio-bajo,0.316,0.232,0.285,0.294


# **Instalación de CatBoost con soporte para GPU**

In [8]:
!pip install catboost -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25h

# **Importación del modelo CatBoost y definición de variables base**

In [9]:
from catboost import CatBoostClassifier, Pool

target_col = "RENDIMIENTO_GLOBAL"

# Columns treated as numeric; every other feature column is categorical.
# NOTE(review): "ID" looks like a row identifier rather than a predictor --
# confirm that it is meant to be fed to the model as a numeric feature.
num_cols = ["ID", "INDICADOR_1", "INDICADOR_2", "INDICADOR_3", "INDICADOR_4"]

# Coerce the numeric columns BEFORE deriving X. `train.drop(...)` returns a
# copy, so coercing `train` afterwards would never reach the feature matrix.
# Values that cannot be parsed become NaN (errors="coerce").
train[num_cols] = train[num_cols].apply(pd.to_numeric, errors="coerce")
test[num_cols] = test[num_cols].apply(pd.to_numeric, errors="coerce")

# Split the labelled table into target vector and feature matrix.
y = train[target_col]
X = train.drop(columns=[target_col])

# Identify the categorical columns (everything in X that is not numeric).
cat_cols = [c for c in X.columns if c not in num_cols]


def as_category_strings(column: pd.Series) -> pd.Series:
    """Return `column` as strings, with missing values labelled "MISSING".

    The missing-value mask is taken from the original column. Calling
    `astype(str)` first and `fillna` afterwards does not work: NaN has already
    become the literal string "nan" by then, so nothing is left to fill.
    """
    return column.astype(str).mask(column.isna(), "MISSING")


# Categorical features as strings with no NaN, applied identically to the
# training features and to the test set so both share the same labels.
for c in cat_cols:
    X[c] = as_category_strings(X[c])
    test[c] = as_category_strings(test[c])

# **División del dataset en entrenamiento y validación**

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation. The split is stratified on
# the target and uses a fixed seed so it is repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=42
)

# **Preparación de Pools para CatBoost (manejo de categóricas)**

In [11]:
# Positional index (within X) of every categorical column; CatBoost is told
# which features are categorical through these indices.
cat_features_idx = list(map(X.columns.get_loc, cat_cols))

# Wrap each split in a CatBoost Pool carrying its labels and the categorical
# feature positions.
train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features_idx)
valid_pool = Pool(data=X_valid, label=y_valid, cat_features=cat_features_idx)

# **Definición del modelo CatBoost con GPU y entrenamiento**

In [12]:
# Hyper-parameters gathered in one mapping so they can be read (and tuned) at a
# glance before the classifier is built.
catboost_params = dict(
    # Objective and the metric reported on the learn / validation sets.
    loss_function="MultiClass",
    eval_metric="Accuracy",
    # Boosting schedule and tree shape.
    iterations=1200,
    learning_rate=0.05,
    depth=8,
    l2_leaf_reg=3,
    # Row subsampling: each iteration uses 80% of the rows.
    bootstrap_type="Bernoulli",
    subsample=0.8,
    # Train on the first GPU.
    task_type="GPU",
    devices="0",
    # Reproducibility and logging (one progress line every 200 iterations).
    random_seed=42,
    verbose=200,
)
model = CatBoostClassifier(**catboost_params)

# Fit on the training pool while tracking the metric on the validation pool.
model.fit(train_pool, eval_set=valid_pool)

0:	learn: 0.3778285	test: 0.3757978	best: 0.3757978 (0)	total: 60.2ms	remaining: 1m 12s
200:	learn: 0.4292816	test: 0.4244910	best: 0.4245993 (199)	total: 7.49s	remaining: 37.2s
400:	learn: 0.4394116	test: 0.4295812	best: 0.4296606 (397)	total: 13.2s	remaining: 26.2s
600:	learn: 0.4460054	test: 0.4311408	best: 0.4311480 (527)	total: 20.3s	remaining: 20.3s
800:	learn: 0.4521697	test: 0.4315379	best: 0.4320433 (738)	total: 26s	remaining: 12.9s
1000:	learn: 0.4582022	test: 0.4317617	best: 0.4322888 (934)	total: 33.2s	remaining: 6.6s
1199:	learn: 0.4639892	test: 0.4318917	best: 0.4322888 (934)	total: 38.9s	remaining: 0us
bestTest = 0.4322888087
bestIteration = 934
Shrink model to first 935 iterations.


<catboost.core.CatBoostClassifier at 0x780ca33c4740>

# **Evaluación del modelo en el conjunto de validación**

In [13]:
# Predict the validation rows, then report overall accuracy followed by the
# per-class precision / recall / F1 table.
y_pred_valid = model.predict(X_valid)

valid_accuracy = accuracy_score(y_valid, y_pred_valid)
print("Accuracy:", valid_accuracy)

valid_report = classification_report(y_valid, y_pred_valid)
print(valid_report)

Accuracy: 0.43228880866425995
              precision    recall  f1-score   support

        alto       0.55      0.62      0.58     35124
        bajo       0.46      0.55      0.50     34597
  medio-alto       0.33      0.28      0.30     34324
  medio-bajo       0.34      0.27      0.30     34455

    accuracy                           0.43    138500
   macro avg       0.42      0.43      0.42    138500
weighted avg       0.42      0.43      0.42    138500



# **Entrenamiento final con todos los datos (modelo definitivo)**

In [14]:
# Refit the same classifier on every labelled row (training + validation).
# No eval_set is passed, so only the learn metric is logged.
# NOTE(review): this overwrites the model fitted on the training split, so
# re-running the validation cell after this one would score rows the model has
# already been trained on.
full_pool = Pool(data=X, label=y, cat_features=cat_features_idx)
model.fit(full_pool)

0:	learn: 0.3775249	total: 59.7ms	remaining: 1m 11s
200:	learn: 0.4289040	total: 6.44s	remaining: 32s
400:	learn: 0.4393776	total: 14.3s	remaining: 28.5s
600:	learn: 0.4446671	total: 20.6s	remaining: 20.5s
800:	learn: 0.4497300	total: 28.3s	remaining: 14.1s
1000:	learn: 0.4546426	total: 36.2s	remaining: 7.19s
1199:	learn: 0.4598556	total: 42.5s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x780ca33c4740>

# **Predicción sobre el conjunto test y preparación del archivo de envío**

In [15]:
# Predict a class label for every test row with the fitted model.
test_pool = Pool(test, cat_features=cat_features_idx)

# Flatten the prediction array so there is exactly one label per row.
test_pred = np.asarray(model.predict(test_pool)).ravel()

print("Shape de test_pred:", test_pred.shape)

# The predictions are written into the submission template by position, so the
# template must list the same IDs as `test`, in the same order. Fail loudly
# rather than silently attaching labels to the wrong IDs.
if not np.array_equal(sample_submission["ID"].to_numpy(), test["ID"].to_numpy()):
    raise ValueError(
        "sample_submission and test do not list the same IDs in the same order"
    )

# Fill the template's target column and write the submission file.
sample_submission[target_col] = test_pred
sample_submission.to_csv("my_submission_catboost.csv", index=False)

print("Archivo listo: my_submission_catboost.csv")

# Last expression: preview of the submission.
sample_submission.head()

Shape de test_pred: (296786,)
Archivo listo: my_submission_catboost.csv


Unnamed: 0,ID,RENDIMIENTO_GLOBAL
0,550236,bajo
1,98545,medio-alto
2,499179,alto
3,782980,bajo
4,785185,bajo


# **Envío de la predicción a Kaggle**

In [None]:
# Upload the generated CSV (-f) to the competition (-c) with a short
# description of this submission (-m).
!kaggle competitions submit \
  -c udea-ai-4-eng-20252-pruebas-saber-pro-colombia \
  -f my_submission_catboost.csv \
  -m "versión catboost GPU"

  0% 0.00/4.06M [00:00<?, ?B/s]100% 4.06M/4.06M [00:00<00:00, 16.2MB/s]
Successfully submitted to UDEA/ai4eng 20252 - Pruebas Saber Pro Colombia