# Desafío de Predicción de Salarios

## Tabla de Contenidos

1. [Introducción](#introduccion)
2. [Configuración Inicial](#configuracion)
3. [Carga y Descripción de Datos](#datos)
4. [Análisis Exploratorio](#analisis)
5. [Preprocesamiento y Features](#preprocesamiento)
6. [Modelo Base](#baseline)
7. [Modelo Principal](#modelo)
8. [Evaluación y Métricas](#evaluacion)
9. [Visualización de Resultados](#resultados)
10. [Conclusiones](#conclusiones)

In [4]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configurar paths
notebook_path = os.path.abspath('')
project_root = os.path.dirname(notebook_path)

# Agregar el directorio raíz al path
if project_root not in sys.path:
    sys.path.append(project_root)

# Imprimir información de debug
print("=== Configuración de Paths ===")
print(f"Notebook path: {notebook_path}")
print(f"Project root: {project_root}")

# Importar funciones del proyecto
try:
    from src.data.make_dataset import load_data, verify_data_integrity
    from src.data.preprocess import clean_data, encode_categorical_variables, scale_numerical_features, split_data
    from src.features.build_features import preprocess_and_engineer_features
    from src.models.train_model import train_random_forest_model, create_dummy_model
    from src.models.evaluate_model import evaluate_model
    print("\n✅ Módulos importados correctamente")
except Exception as e:
    print(f"\n❌ Error importando módulos: {e}")
    raise

# Configuración de visualización
sns.set_palette('husl')
%matplotlib inline

=== Configuración de Paths ===
Notebook path: /Users/sredondo/Desarrollo/HYMIE/testDS/salary-prediction-challenge/notebooks
Project root: /Users/sredondo/Desarrollo/HYMIE/testDS/salary-prediction-challenge

✅ Módulos importados correctamente


In [5]:
# Cargar los datos
data_path = os.path.join(project_root,  'data', 'raw', 'salary_data.csv')
print(f"Intentando cargar datos desde: {data_path}")

try:
    df = pd.read_csv(data_path)
    print("\n✅ Datos cargados correctamente")
    
    print("\nInformación del Dataset:")
    print("-" * 50)
    print(f"Número de filas: {df.shape[0]}")
    print(f"Número de columnas: {df.shape[1]}")
    print("\nColumnas disponibles:")
    for col in df.columns:
        print(f"- {col}: {df[col].dtype}")
    
    print("\nPrimeras filas del dataset:")
    display(df.head())
    
    print("\nEstadísticas descriptivas:")
    display(df.describe())
    
    print("\nValores faltantes:")
    missing_values = df.isnull().sum()
    display(missing_values[missing_values > 0])
    
except Exception as e:
    print(f"\n❌ Error durante la carga de datos: {e}")
    raise

Intentando cargar datos desde: /Users/sredondo/Desarrollo/HYMIE/testDS/salary-prediction-challenge/data/raw/salary_data.csv

✅ Datos cargados correctamente

Información del Dataset:
--------------------------------------------------
Número de filas: 388
Número de columnas: 18

Columnas disponibles:
- Job Title: object
- Rating: float64
- Company Name: object
- Location: object
- Size: object
- Type of ownership: object
- Industry: object
- Sector: object
- Competitors: object
- hourly: int64
- employer_provided: int64
- min_salary: int64
- max_salary: int64
- avg_salary: float64
- company_txt: object
- job_state: object
- same_state: int64
- age: int64

Primeras filas del dataset:


Unnamed: 0,Job Title,Rating,Company Name,Location,Size,Type of ownership,Industry,Sector,Competitors,hourly,employer_provided,min_salary,max_salary,avg_salary,company_txt,job_state,same_state,age
0,Data Scientist,3.8,Tecolote Research\n3.8,"Albuquerque, NM",501 to 1000 employees,Company - Private,Aerospace & Defense,Aerospace & Defense,-1,0,0,53,91,72.0,Tecolote Research\n,NM,0,47
1,Healthcare Data Scientist,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD",10000+ employees,Other Organization,Health Care Services & Hospitals,Health Care,-1,0,0,63,112,87.5,University of Maryland Medical System\n,MD,0,36
2,Data Scientist,4.8,KnowBe4\n4.8,"Clearwater, FL",501 to 1000 employees,Company - Private,Security Services,Business Services,-1,0,0,80,90,85.0,KnowBe4,FL,1,10
3,Data Scientist,3.8,PNNL\n3.8,"Richland, WA",1001 to 5000 employees,Government,Energy,"Oil, Gas, Energy & Utilities","Oak Ridge National Laboratory, National Renewa...",0,0,56,97,76.5,PNNL\n,WA,1,55
4,Data Scientist,2.9,Affinity Solutions\n2.9,"New York, NY",51 to 200 employees,Company - Private,Advertising & Marketing,Business Services,"Commerce Signals, Cardlytics, Yodlee",0,0,86,143,114.5,Affinity Solutions\n,NY,1,22



Estadísticas descriptivas:


Unnamed: 0,Rating,hourly,employer_provided,min_salary,max_salary,avg_salary,same_state,age
count,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0
mean,3.682732,0.028351,0.018041,74.159794,127.368557,100.764175,0.559278,47.146907
std,0.732265,0.166186,0.133272,31.476585,45.610991,37.961686,0.497115,56.80891
min,-1.0,0.0,0.0,15.0,16.0,15.5,0.0,-1.0
25%,3.4,0.0,0.0,53.0,97.0,76.5,0.0,10.0
50%,3.7,0.0,0.0,71.0,124.0,97.75,1.0,24.0
75%,4.1,0.0,0.0,91.0,159.0,124.0,1.0,56.5
max,5.0,1.0,1.0,202.0,306.0,254.0,1.0,276.0



Valores faltantes:


Series([], dtype: int64)

In [6]:
# Preprocesamiento y features
try:
    print("Iniciando preprocesamiento...")
    df_processed = preprocess_and_engineer_features(df.copy())
    
    print("\n✅ Preprocesamiento completado")
    print(f"Shape original: {df.shape}")
    print(f"Shape después del preprocesamiento: {df_processed.shape}")
    
    print("\nColumnas generadas:")
    for col in df_processed.columns:
        print(f"- {col}: {df_processed[col].dtype}")
        
except Exception as e:
    print(f"\n❌ Error durante el preprocesamiento: {e}")
    raise

Iniciando preprocesamiento...

Iniciando preprocesamiento con shape: (388, 18)
Columnas iniciales: ['Job Title', 'Rating', 'Company Name', 'Location', 'Size', 'Type of ownership', 'Industry', 'Sector', 'Competitors', 'hourly', 'employer_provided', 'min_salary', 'max_salary', 'avg_salary', 'company_txt', 'job_state', 'same_state', 'age']

=== Normalizando nombres de columnas ===
Nombres originales: ['Job Title', 'Rating', 'Company Name', 'Location', 'Size', 'Type of ownership', 'Industry', 'Sector', 'Competitors', 'hourly', 'employer_provided', 'min_salary', 'max_salary', 'avg_salary', 'company_txt', 'job_state', 'same_state', 'age']
Nombres normalizados: ['job_title', 'rating', 'company_name', 'location', 'size', 'type_of_ownership', 'industry', 'sector', 'competitors', 'hourly', 'employer_provided', 'min_salary', 'max_salary', 'avg_salary', 'company_txt', 'job_state', 'same_state', 'age']

=== Columnas luego de normalizar en preprocess_and_engineer_features ===
Columnas disponibles:
[

ValueError: Las siguientes columnas aún no son numéricas: ['experience_level_Junior', 'experience_level_Mid', 'experience_level_Senior', 'salary_category_Very Low', 'salary_category_Low', 'salary_category_Medium', 'salary_category_High', 'salary_category_Very High']

In [None]:
# Dividir datos y entrenar modelos
try:
    # Split de datos
    print("Dividiendo datos...")
    X_train, X_test, y_train, y_test = split_data(df_processed)
    print("✅ Datos divididos correctamente")
    print(f"X_train shape: {X_train.shape}")
    print(f"X_test shape: {X_test.shape}")
    
    # Modelo base
    print("\nEntrenando modelo base...")
    dummy_model = create_dummy_model(X_train, y_train)
    baseline_metrics = evaluate_model(dummy_model, X_test, y_test)
    print("✅ Modelo base evaluado")
    
    # Modelo principal
    print("\nEntrenando Random Forest...")
    rf_model = train_random_forest_model(X_train, y_train)
    rf_metrics = evaluate_model(rf_model, X_test, y_test)
    print("✅ Random Forest evaluado")
    
except Exception as e:
    print(f"\n❌ Error durante el entrenamiento/evaluación: {e}")
    raise

In [None]:
# Visualización de resultados
try:
    # Predicciones vs Valores Reales
    y_pred = rf_model.predict(X_test)
    
    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.xlabel('Salarios Reales')
    plt.ylabel('Salarios Predichos')
    plt.title('Predicciones vs Valores Reales')
    
    # Calcular y mostrar métricas en el gráfico
    plt.text(0.05, 0.95, 
             f"RMSE: {rf_metrics['rmse']:.2f}\n" 
             f"R²: {rf_metrics['r2']:.3f}\n"
             f"MAPE: {rf_metrics['mape']:.2f}%",
             transform=plt.gca().transAxes,
             bbox=dict(facecolor='white', alpha=0.8))
    
    plt.tight_layout()
    plt.show()
    
    print("\nIntervalos de Confianza:")
    print(f"RMSE: {rf_metrics['rmse_ci']}")
    print(f"R²: {rf_metrics['r2_ci']}")
    print(f"MAPE: {rf_metrics['mape_ci']}")
    
except Exception as e:
    print(f"\n❌ Error durante la visualización: {e}")
    raise

## Conclusiones

1. El modelo Random Forest muestra una mejora significativa sobre el modelo base, como se evidencia en las métricas:
   - RMSE más bajo
   - R² más alto
   - MAPE más bajo

2. Los intervalos de confianza confirman la robustez de los resultados

3. Las predicciones son más precisas en el rango medio de salarios

Todo el código detallado está disponible en los módulos del directorio `src/`.