# 02 - Preprocesado de Datos

In [1]:
!wget --no-cache -O init.py -q https://raw.githubusercontent.com/rramosp/ai4eng.v1/main/content/init.py
import init; init.init(force_download=False); init.get_weblink()

# Cargar el archivo CSV

In [2]:
# Importar librerías necesarias

import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

from google.colab import files
uploaded = files.upload()

# Cargar el dataset
train = pd.read_csv("train.csv")

print("Dimensiones del dataset:", train.shape)
train.head()


Saving train.csv to train (1).csv
Dimensiones del dataset: (692500, 21)


Unnamed: 0,ID,PERIODO_ACADEMICO,E_PRGM_ACADEMICO,E_PRGM_DEPARTAMENTO,E_VALORMATRICULAUNIVERSIDAD,E_HORASSEMANATRABAJA,F_ESTRATOVIVIENDA,F_TIENEINTERNET,F_EDUCACIONPADRE,F_TIENELAVADORA,...,E_PRIVADO_LIBERTAD,E_PAGOMATRICULAPROPIO,F_TIENECOMPUTADOR,F_TIENEINTERNET.1,F_EDUCACIONMADRE,RENDIMIENTO_GLOBAL,INDICADOR_1,INDICADOR_2,INDICADOR_3,INDICADOR_4
0,904256,20212,ENFERMERIA,BOGOTÁ,Entre 5.5 millones y menos de 7 millones,Menos de 10 horas,Estrato 3,Si,Técnica o tecnológica incompleta,Si,...,N,No,Si,Si,Postgrado,medio-alto,0.322,0.208,0.31,0.267
1,645256,20212,DERECHO,ATLANTICO,Entre 2.5 millones y menos de 4 millones,0,Estrato 3,No,Técnica o tecnológica completa,Si,...,N,No,Si,No,Técnica o tecnológica incompleta,bajo,0.311,0.215,0.292,0.264
2,308367,20203,MERCADEO Y PUBLICIDAD,BOGOTÁ,Entre 2.5 millones y menos de 4 millones,Más de 30 horas,Estrato 3,Si,Secundaria (Bachillerato) completa,Si,...,N,No,No,Si,Secundaria (Bachillerato) completa,bajo,0.297,0.214,0.305,0.264
3,470353,20195,ADMINISTRACION DE EMPRESAS,SANTANDER,Entre 4 millones y menos de 5.5 millones,0,Estrato 4,Si,No sabe,Si,...,N,No,Si,Si,Secundaria (Bachillerato) completa,alto,0.485,0.172,0.252,0.19
4,989032,20212,PSICOLOGIA,ANTIOQUIA,Entre 2.5 millones y menos de 4 millones,Entre 21 y 30 horas,Estrato 3,Si,Primaria completa,Si,...,N,No,Si,Si,Primaria completa,medio-bajo,0.316,0.232,0.285,0.294


# Revisión rápida de estructura

In [3]:
print("\nInformación general del dataset:\n")
print(train.info())

print("\nValores nulos por columna:\n")
print(train.isnull().sum())


Información general del dataset:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 692500 entries, 0 to 692499
Data columns (total 21 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   ID                           692500 non-null  int64  
 1   PERIODO_ACADEMICO            692500 non-null  int64  
 2   E_PRGM_ACADEMICO             692500 non-null  object 
 3   E_PRGM_DEPARTAMENTO          692500 non-null  object 
 4   E_VALORMATRICULAUNIVERSIDAD  686213 non-null  object 
 5   E_HORASSEMANATRABAJA         661643 non-null  object 
 6   F_ESTRATOVIVIENDA            660363 non-null  object 
 7   F_TIENEINTERNET              665871 non-null  object 
 8   F_EDUCACIONPADRE             669322 non-null  object 
 9   F_TIENELAVADORA              652727 non-null  object 
 10  F_TIENEAUTOMOVIL             648877 non-null  object 
 11  E_PRIVADO_LIBERTAD           692500 non-null  object 
 12  E_PAGOMATRICULAPROPIO  

# Eliminación de columnas irrelevantes o duplicadas

In [4]:
# Eliminamos ID, ya que no aporta a la predicción
if 'ID' in train.columns:
    train.drop('ID', axis=1, inplace=True)

# Eliminamos duplicado de internet si existe
if 'F_TIENEINTERNET.1' in train.columns:
    train.drop('F_TIENEINTERNET.1', axis=1, inplace=True)

# Limpieza de valores faltantes

In [5]:
for col in train.columns:
    if train[col].dtype == 'object':
        train[col].fillna(train[col].mode()[0], inplace=True)
    else:
        train[col].fillna(train[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].mode()[0], inplace=True)


# Corrección de tipos numéricos

In [6]:
numeric_candidates = ['E_VALORMATRICULAUNIVERSIDAD', 'E_HORASSEMANATRABAJA']
for col in numeric_candidates:
    if col in train.columns:
        train[col] = (
            train[col].astype(str)
            .replace('[^0-9]', '', regex=True)
            .replace('', np.nan)
            .astype(float)
        )
        train[col].fillna(train[col].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train[col].fillna(train[col].median(), inplace=True)


# Estandarizar texto

In [7]:
train = train.apply(lambda x: x.str.strip().str.lower() if x.dtype == "object" else x)

# Separar variable objetivo

In [8]:
y = train['RENDIMIENTO_GLOBAL']
X = train.drop('RENDIMIENTO_GLOBAL', axis=1)

# Codificar variables categóricas

In [9]:
X = pd.get_dummies(X, drop_first=True)

# Normalización numérica

In [10]:
num_cols = train.select_dtypes(include=['float64', 'int64']).columns

scaler = StandardScaler()
train[num_cols] = scaler.fit_transform(train[num_cols])

# Codificar etiquetas (target)

In [14]:
y = y.map({
    'bajo': 0,
    'medio-bajo': 1,
    'medio-alto': 2,
    'alto': 3
})

# División entrenamiento / validación

In [13]:
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nTamaños:")
print("Entrenamiento:", X_train.shape)
print("Validación:", X_val.shape)


Tamaños:
Entrenamiento: (554000, 1017)
Validación: (138500, 1017)


# Guardar archivos procesados

In [15]:
X_train.to_csv("X_train_preprocesado.csv", index=False)
X_val.to_csv("X_val_preprocesado.csv", index=False)
y_train.to_csv("y_train_preprocesado.csv", index=False)
y_val.to_csv("y_val_preprocesado.csv", index=False)

print("\n Preprocesamiento completado correctamente.")


 Preprocesamiento completado correctamente.
