In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler


In [3]:
import os

# Folder that holds the input CSV. The default is the original author's local
# path; set the GEIH_DATA_DIR environment variable to run the notebook on
# another machine without editing this cell.
DATA_DIR = os.environ.get("GEIH_DATA_DIR", r"C:\Users\dsala\Downloads")
os.chdir(DATA_DIR)

In [30]:

# Read the survey extract (path is resolved against the current working directory)
base = pd.read_csv('GEIH2018_filtered_data.csv')

# Dependent variable: natural log of the 'y_total_m_ha' column
base['Log_salario'] = np.log(base['y_total_m_ha'])

# Columns kept out of the predictor table. The original note describes them as
# collinear with the dependent variable; 'Log_salario' itself is dropped here
# and attached again in a later cell, after the predictors are prepared.
columnas_excluidas = [
    'y_total_m_ha', 'y_total_m', 'ingtot', 'Log_salario',
    'dominio', 'orden', 'secuencia_p', 'directorio', 'clase',
    'p6240', 'p7040', 'depto', 'mes',
]
base_reducida = base.drop(columns=columnas_excluidas)

In [31]:


# Columns to be expanded into indicator (dummy) columns
categorical_vars = [
    'estrato1',
    'p6210',
    'relab',
    'sizeFirm',
]

# One-hot encode only the listed columns. The source columns are replaced by
# their dummies, and drop_first=True omits the first level of each one.
base_dummies = pd.get_dummies(
    data=base_reducida,
    columns=categorical_vars,
    drop_first=True,
)



In [34]:
# Display the 'p6920' column, which was not dummy-encoded above
base_dummies.loc[:, 'p6920']

0        1
1        1
2        1
3        2
4        1
        ..
12821    2
12822    2
12823    1
12824    1
12825    1
Name: p6920, Length: 12826, dtype: int64

In [35]:
# Numeric predictors that also enter the design matrix as squared terms
numericas = [
    'age', 'p6426', 'p6870', 'p7070',
    'maxEducLevel', 'totalHoursWorked',
]

# Add one '<name>^2' column per numeric predictor
for columna in numericas:
    base_dummies[f'{columna}^2'] = base_reducida[columna].pow(2)

In [37]:
# Attach the dependent variable back onto the predictor table (aligned on the row index)
base_dummies['Log_salario'] = base.loc[:, 'Log_salario']


In [38]:
# Seed that makes the split reproducible
seed = 202013547

# Target (Log_salario) and predictors (every other column)
y = base_dummies['Log_salario']
X = base_dummies.drop('Log_salario', axis=1)

# Hold out 30% of the rows for testing and train on the remaining 70%
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=seed,
)

In [44]:
# Cast every boolean column to int in both splits; other dtypes are left as they are.
# Each split is inspected on its own, mirroring a per-column dtype check.
X_train = X_train.astype({col: int for col in X_train.select_dtypes(include='bool').columns})
X_test = X_test.astype({col: int for col in X_test.select_dtypes(include='bool').columns})




backward elimination para reducir X

In [45]:
def backward_elimination(X, y, significance_level=0.05):
    """Select features by iteratively dropping the least significant one.

    An OLS model with an intercept is refitted on the current feature set.
    While the largest feature p-value is above ``significance_level``, the
    feature that owns it is removed and the model is fitted again.

    Parameters
    ----------
    X : pandas.DataFrame
        Candidate predictors, one column per feature.
    y : array-like
        Dependent variable, aligned with the rows of ``X``.
    significance_level : float, default 0.05
        A feature is removed while its p-value is strictly greater than this.

    Returns
    -------
    list
        Column labels of ``X`` that survived, in their original order.
    """
    features = list(X.columns)
    while features:
        X_with_const = sm.add_constant(X[features])
        model = sm.OLS(y, X_with_const).fit()
        # Leave the intercept out of the comparison by label, not by position.
        # When X already contains a constant column, add_constant does not
        # prepend another one, so slicing off the first p-value would exempt a
        # real feature from elimination.
        pvalues = model.pvalues.drop('const', errors='ignore')
        max_pval = pvalues.max()
        # An empty or all-NaN p-value vector yields NaN here; the comparison is
        # then False and the loop stops.
        if max_pval > significance_level:
            excluded_feature = pvalues.idxmax()
            features.remove(excluded_feature)
        else:
            break
    return features

# First backward-elimination pass on the training split; print the surviving columns
selected_features_backward = backward_elimination(X=X_train, y=y_train)
print("Variables seleccionadas por Backward Elimination:", selected_features_backward)


Variables seleccionadas por Backward Elimination: ['const', 'age', 'sex', 'p6426', 'p6870', 'p7070', 'maxEducLevel', 'totalHoursWorked', 'formal', 'informal', 'cuentaPropia', 'microEmpresa', 'estrato1_3', 'estrato1_4', 'estrato1_5', 'estrato1_6', 'p6210_3', 'p6210_4', 'p6210_6', 'p6210_9', 'relab_2', 'relab_4', 'relab_5', 'sizeFirm_3', 'sizeFirm_4', 'sizeFirm_5', 'age^2', 'p6426^2', 'p6870^2', 'p7070^2', 'maxEducLevel^2', 'totalHoursWorked^2']


In [64]:
# Second backward-elimination pass, restricted to the columns kept by the first one
segundo_back = backward_elimination(
    X=X_train[selected_features_backward],
    y=y_train,
)

In [73]:
# Display the feature list returned by the second backward-elimination pass
segundo_back

['const',
 'age',
 'sex',
 'p6426',
 'p6870',
 'p7070',
 'maxEducLevel',
 'totalHoursWorked',
 'formal',
 'informal',
 'cuentaPropia',
 'microEmpresa',
 'estrato1_3',
 'estrato1_4',
 'estrato1_5',
 'estrato1_6',
 'p6210_3',
 'p6210_4',
 'p6210_6',
 'p6210_9',
 'relab_2',
 'relab_4',
 'relab_5',
 'sizeFirm_3',
 'sizeFirm_4',
 'sizeFirm_5',
 'age^2',
 'p6426^2',
 'p6870^2',
 'maxEducLevel^2',
 'totalHoursWorked^2']

LASSO para reducir más las variables y empezar a probar modelos

In [74]:

# Keep every selected predictor except the intercept label. The filter is by
# name rather than position, so no real predictor is lost when 'const' is
# absent from the list or is not its first entry.
predictores_backward = [col for col in segundo_back if col != 'const']
X_train_selected = X_train[predictores_backward]
X_test_selected = X_test[predictores_backward]

In [79]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
import numpy as np
import statsmodels.api as sm

# Standardize the selected predictors; the scaler is fitted on the training split only
scaler = StandardScaler()
scaler.fit(X_train_selected)
X_train_scaled = scaler.transform(X_train_selected)
X_test_scaled = scaler.transform(X_test_selected)

# Cross-validated Lasso (5 folds) on the standardized training data
lasso = LassoCV(cv=5, random_state=0)
lasso.fit(X_train_scaled, y_train)

# Predictors whose Lasso coefficient is not exactly zero
coeficientes_no_nulos = lasso.coef_ != 0
variables_lasso = np.array(X_train_selected.columns)[coeficientes_no_nulos]
print("Variables seleccionadas por Lasso:", variables_lasso)

# Unscaled design matrices restricted to the Lasso survivors
X_train_lasso = X_train[variables_lasso]
X_test_lasso = X_test[variables_lasso]

# Refit a plain OLS (with intercept) on the surviving predictors and show its summary
modelo_lasso = sm.OLS(y_train, sm.add_constant(X_train_lasso)).fit()
print(modelo_lasso.summary())

# Out-of-sample predictions and their RMSE
y_pred_lasso = modelo_lasso.predict(sm.add_constant(X_test_lasso))
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
rmse_lasso = np.sqrt(mse_lasso)
print(f"RMSE del modelo con Lasso: {rmse_lasso:.4f}")


Variables seleccionadas por Lasso: ['age' 'sex' 'p6426' 'p6870' 'p7070' 'maxEducLevel' 'totalHoursWorked'
 'formal' 'informal' 'cuentaPropia' 'microEmpresa' 'estrato1_3'
 'estrato1_4' 'estrato1_5' 'estrato1_6' 'p6210_3' 'p6210_4' 'p6210_6'
 'relab_2' 'relab_4' 'relab_5' 'sizeFirm_3' 'sizeFirm_5' 'age^2' 'p6426^2'
 'p6870^2' 'totalHoursWorked^2']
                            OLS Regression Results                            
Dep. Variable:            Log_salario   R-squared:                       0.583
Model:                            OLS   Adj. R-squared:                  0.582
Method:                 Least Squares   F-statistic:                     501.6
Date:                Sun, 15 Sep 2024   Prob (F-statistic):               0.00
Time:                        12:05:56   Log-Likelihood:                -6774.7
No. Observations:                8978   AIC:                         1.360e+04
Df Residuals:                    8952   BIC:                         1.379e+04
Df Model:           

Se volvió a correr un backward sobre las sobrevivientes de lasso para reducir aún más

In [80]:
# Third backward-elimination pass, restricted to the predictors kept by the Lasso
tercer_back = backward_elimination(
    X=X_train[variables_lasso],
    y=y_train,
)

In [81]:
# Number of features left after the third backward-elimination pass
len(tercer_back)

23

finalmente se quitaron variables con una alta correlación

In [82]:
# Absolute pairwise correlations between the remaining predictors (training split)
correlacion = X_train[tercer_back].corr().abs()

# A column is marked for removal when its absolute correlation with any column
# that precedes it in the matrix is above the threshold.
umbral_correlacion = 0.9
columnas_eliminar = {
    columna
    for posicion, columna in enumerate(correlacion.columns)
    if (correlacion.iloc[posicion, :posicion] > umbral_correlacion).any()
}

# Training matrix without the highly correlated columns
X_train_ultimo = X_train[tercer_back].drop(columns=columnas_eliminar)
print(f"Variables después de eliminar alta correlación: {X_train_ultimo.columns.tolist()}")


Variables después de eliminar alta correlación: ['age', 'sex', 'p6426', 'p6870', 'p7070', 'maxEducLevel', 'totalHoursWorked', 'formal', 'cuentaPropia', 'microEmpresa', 'estrato1_3', 'estrato1_4', 'estrato1_5', 'estrato1_6', 'p6210_6', 'relab_2', 'relab_5']


In [102]:
# Display the columns that remain after the correlation filter
X_train_ultimo.columns

Index(['age', 'sex', 'p6426', 'p6870', 'p7070', 'maxEducLevel',
       'totalHoursWorked', 'formal', 'cuentaPropia', 'microEmpresa',
       'estrato1_3', 'estrato1_4', 'estrato1_5', 'estrato1_6', 'p6210_6',
       'relab_2', 'relab_5'],
      dtype='object')

Con las variables sobrevivientes de todos estos procesos se empiezan a probar los siguientes modelos

modelo 0

In [85]:
# Model 0: OLS on every predictor that survived the selection steps
columnas_finales = X_train_ultimo.columns.tolist()

# Select the same variables in the test set
X_test_reducido = X_test[columnas_finales]

# Add the intercept column to both design matrices
X_train_reducido_const = sm.add_constant(X_train_ultimo)
X_test_reducido_const = sm.add_constant(X_test_reducido)

# Fit the linear regression
modelo_0 = sm.OLS(y_train, X_train_reducido_const).fit()

# Regression table
print(modelo_0.summary())

# Predict on the test set
y_pred = modelo_0.predict(X_test_reducido_const)

# RMSE as the square root of the MSE. The `squared=False` argument of
# mean_squared_error is deprecated and removed in recent scikit-learn releases;
# this form works on every version and matches the Lasso cell.
rmse_0 = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE modelo 0 : {rmse_0:.4f}")


                            OLS Regression Results                            
Dep. Variable:            Log_salario   R-squared:                       0.561
Model:                            OLS   Adj. R-squared:                  0.560
Method:                 Least Squares   F-statistic:                     673.0
Date:                Sun, 15 Sep 2024   Prob (F-statistic):               0.00
Time:                        12:10:26   Log-Likelihood:                -7012.8
No. Observations:                8978   AIC:                         1.406e+04
Df Residuals:                    8960   BIC:                         1.419e+04
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                8.1305      0.063  



In [86]:
# Display the predictors available for the hand-built models below
X_train_ultimo.columns

Index(['age', 'sex', 'p6426', 'p6870', 'p7070', 'maxEducLevel',
       'totalHoursWorked', 'formal', 'cuentaPropia', 'microEmpresa',
       'estrato1_3', 'estrato1_4', 'estrato1_5', 'estrato1_6', 'p6210_6',
       'relab_2', 'relab_5'],
      dtype='object')

modelo 1

In [87]:
# Model 1: 'age' and 'totalHoursWorked' with quadratic terms, plus 'sex',
# 'p6210_6' and 'formal'
columnas_nl1 = ['age', 'totalHoursWorked', 'sex', 'p6210_6', 'formal']
X_train_nl1 = X_train[columnas_nl1].copy()
X_test_nl1 = X_test[columnas_nl1].copy()

# Quadratic terms, built the same way for both splits
X_train_nl1['age^2'] = X_train_nl1['age'] ** 2
X_train_nl1['totalHoursWorked^2'] = X_train_nl1['totalHoursWorked'] ** 2

X_test_nl1['age^2'] = X_test_nl1['age'] ** 2
X_test_nl1['totalHoursWorked^2'] = X_test_nl1['totalHoursWorked'] ** 2

# Add the intercept column
X_train_nl1_const = sm.add_constant(X_train_nl1)
X_test_nl1_const = sm.add_constant(X_test_nl1)

modelo_1 = sm.OLS(y_train, X_train_nl1_const).fit()
y_pred_1 = modelo_1.predict(X_test_nl1_const)
# RMSE as sqrt(MSE): `squared=False` is deprecated and removed in recent
# scikit-learn releases
rmse_1 = np.sqrt(mean_squared_error(y_test, y_pred_1))

print("Modelo 1 Summary:")
print(modelo_1.summary())
print(f"RMSE modelo 1 : {rmse_1:.4f}")

Modelo 1 Summary:
                            OLS Regression Results                            
Dep. Variable:            Log_salario   R-squared:                       0.421
Model:                            OLS   Adj. R-squared:                  0.421
Method:                 Least Squares   F-statistic:                     933.3
Date:                Sun, 15 Sep 2024   Prob (F-statistic):               0.00
Time:                        12:13:07   Log-Likelihood:                -8250.2
No. Observations:                8978   AIC:                         1.652e+04
Df Residuals:                    8970   BIC:                         1.657e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const             



Modelo 2

In [88]:
# Model 2: 'p7070' and 'maxEducLevel' with quadratic terms, plus 'cuentaPropia'
columnas_nl2 = ['p7070', 'maxEducLevel', 'cuentaPropia']
X_train_nl2 = X_train[columnas_nl2].copy()
X_test_nl2 = X_test[columnas_nl2].copy()

# Quadratic terms, built the same way for both splits
X_train_nl2['p7070^2'] = X_train_nl2['p7070'] ** 2
X_train_nl2['maxEducLevel^2'] = X_train_nl2['maxEducLevel'] ** 2

X_test_nl2['p7070^2'] = X_test_nl2['p7070'] ** 2
X_test_nl2['maxEducLevel^2'] = X_test_nl2['maxEducLevel'] ** 2

# Add the intercept column
X_train_nl2_const = sm.add_constant(X_train_nl2)
X_test_nl2_const = sm.add_constant(X_test_nl2)

modelo_2 = sm.OLS(y_train, X_train_nl2_const).fit()
y_pred_2 = modelo_2.predict(X_test_nl2_const)
# RMSE as sqrt(MSE): `squared=False` is deprecated and removed in recent
# scikit-learn releases
rmse_2 = np.sqrt(mean_squared_error(y_test, y_pred_2))

print("Modelo 2 Summary:")
print(modelo_2.summary())
print(f"RMSE Modelo 2: {rmse_2:.4f}")

Modelo 2 Summary:
                            OLS Regression Results                            
Dep. Variable:            Log_salario   R-squared:                       0.261
Model:                            OLS   Adj. R-squared:                  0.261
Method:                 Least Squares   F-statistic:                     634.4
Date:                Sun, 15 Sep 2024   Prob (F-statistic):               0.00
Time:                        12:14:03   Log-Likelihood:                -9347.4
No. Observations:                8978   AIC:                         1.871e+04
Df Residuals:                    8972   BIC:                         1.875e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              9.1993 



Modelo 3

In [94]:
# Model 3: 'age' and 'p6426' with quadratic terms, plus 'sex' and 'formal'
columnas_nl3 = ['age', 'p6426', 'sex', 'formal']
X_train_nl3 = X_train[columnas_nl3].copy()
X_test_nl3 = X_test[columnas_nl3].copy()

# Quadratic terms, built the same way for both splits
X_train_nl3['age^2'] = X_train_nl3['age'] ** 2
X_train_nl3['p6426^2'] = X_train_nl3['p6426'] ** 2

X_test_nl3['age^2'] = X_test_nl3['age'] ** 2
X_test_nl3['p6426^2'] = X_test_nl3['p6426'] ** 2

# Add the intercept column
X_train_nl3_const = sm.add_constant(X_train_nl3)
X_test_nl3_const = sm.add_constant(X_test_nl3)

modelo_3 = sm.OLS(y_train, X_train_nl3_const).fit()
y_pred_3 = modelo_3.predict(X_test_nl3_const)
# RMSE as sqrt(MSE): `squared=False` is deprecated and removed in recent
# scikit-learn releases
rmse_3 = np.sqrt(mean_squared_error(y_test, y_pred_3))

print("Modelo 3 Summary:")
print(modelo_3.summary())
print(f"RMSE Modelo 3: {rmse_3:.4f}")


Modelo 3 Summary:
                            OLS Regression Results                            
Dep. Variable:            Log_salario   R-squared:                       0.231
Model:                            OLS   Adj. R-squared:                  0.231
Method:                 Least Squares   F-statistic:                     450.1
Date:                Sun, 15 Sep 2024   Prob (F-statistic):               0.00
Time:                        12:21:23   Log-Likelihood:                -9525.1
No. Observations:                8978   AIC:                         1.906e+04
Df Residuals:                    8971   BIC:                         1.911e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.2991      0.070  



modelo 7

In [98]:
# Model 7: main effects plus three interaction terms.
COLUMNAS_NL7 = [
    'age', 'totalHoursWorked', 'formal',
    'estrato1_3', 'estrato1_4', 'estrato1_5', 'estrato1_6',
    'p6210_6', 'relab_2', 'relab_5', 'p6426',
    'age^2', 'totalHoursWorked^2',
]


def _build_nl7_features(X):
    """Return the Model 7 design matrix (main effects + interactions) built from X.

    Both splits go through this one function so that their columns have the
    same names and order. Previously the hours-squared x estrato1_3 interaction
    was named 'totalHoursWorked2_estrato3' in the training matrix but
    'totalHoursWorked2_estrato1' in the test matrix.
    """
    X_nl7 = X[COLUMNAS_NL7].copy()
    X_nl7['age_formal'] = X['age'] * X['formal']
    X_nl7['totalHoursWorked2_estrato3'] = X['totalHoursWorked^2'] * X['estrato1_3']
    X_nl7['p6426_age2'] = X['p6426'] * X['age^2']
    return X_nl7


X_train_nl7 = _build_nl7_features(X_train)
X_test_nl7 = _build_nl7_features(X_test)

# Fit the model and predict on the test split
modelo_7 = sm.OLS(y_train, sm.add_constant(X_train_nl7)).fit()
y_pred_7 = modelo_7.predict(sm.add_constant(X_test_nl7))

# RMSE as sqrt(MSE): `squared=False` is deprecated and removed in recent
# scikit-learn releases
rmse_7 = np.sqrt(mean_squared_error(y_test, y_pred_7))

print(f"RMSE Modelo 7: {rmse_7:.4f}")

RMSE Modelo 7: 0.5553




In [108]:
# Print the OLS regression table of Model 7
print(modelo_7.summary())

                            OLS Regression Results                            
Dep. Variable:            Log_salario   R-squared:                       0.540
Model:                            OLS   Adj. R-squared:                  0.539
Method:                 Least Squares   F-statistic:                     658.3
Date:                Sun, 15 Sep 2024   Prob (F-statistic):               0.00
Time:                        12:31:28   Log-Likelihood:                -7217.6
No. Observations:                8978   AIC:                         1.447e+04
Df Residuals:                    8961   BIC:                         1.459e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               

In [100]:
# Test-set RMSE of each fitted model, keyed by its display name
rmse_por_modelo = {
    'Modelo 0': rmse_0,
    'Modelo 1': rmse_1,
    'Modelo 2': rmse_2,
    'Modelo 3': rmse_3,
    'Modelo 7': rmse_7,
}
modelos = list(rmse_por_modelo.keys())
rmses = list(rmse_por_modelo.values())

# Two-column comparison table: model name and RMSE
tabla_rmse = pd.DataFrame({'Nombre del Modelo': modelos, 'RMSE': rmses})

In [101]:
# Show the comparison table ordered from lowest to highest RMSE
tabla_rmse.sort_values('RMSE')

Unnamed: 0,Nombre del Modelo,RMSE
0,Modelo 0,0.546178
4,Modelo 7,0.555257
1,Modelo 1,0.617017
2,Modelo 2,0.700284
3,Modelo 3,0.7038


Con estos resultados se utilizan el modelo 0 y el 7 para el siguiente punto de LOOCV

In [110]:
# Print the OLS regression table of Model 0
print(modelo_0.summary())

                            OLS Regression Results                            
Dep. Variable:            Log_salario   R-squared:                       0.561
Model:                            OLS   Adj. R-squared:                  0.560
Method:                 Least Squares   F-statistic:                     673.0
Date:                Sun, 15 Sep 2024   Prob (F-statistic):               0.00
Time:                        13:26:48   Log-Likelihood:                -7012.8
No. Observations:                8978   AIC:                         1.406e+04
Df Residuals:                    8960   BIC:                         1.419e+04
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
const                8.1305      0.063  

In [111]:
# Print the OLS regression table of Model 1
print(modelo_1.summary())

                            OLS Regression Results                            
Dep. Variable:            Log_salario   R-squared:                       0.421
Model:                            OLS   Adj. R-squared:                  0.421
Method:                 Least Squares   F-statistic:                     933.3
Date:                Sun, 15 Sep 2024   Prob (F-statistic):               0.00
Time:                        13:27:02   Log-Likelihood:                -8250.2
No. Observations:                8978   AIC:                         1.652e+04
Df Residuals:                    8970   BIC:                         1.657e+04
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                  8.0285      0

In [112]:
# Print the OLS regression table of Model 3
print(modelo_3.summary())

                            OLS Regression Results                            
Dep. Variable:            Log_salario   R-squared:                       0.231
Model:                            OLS   Adj. R-squared:                  0.231
Method:                 Least Squares   F-statistic:                     450.1
Date:                Sun, 15 Sep 2024   Prob (F-statistic):               0.00
Time:                        13:27:12   Log-Likelihood:                -9525.1
No. Observations:                8978   AIC:                         1.906e+04
Df Residuals:                    8971   BIC:                         1.911e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          7.2991      0.070    103.536      0.0

In [113]:
# Print the OLS regression table of Model 7 (same table as shown earlier)
print(modelo_7.summary())

                            OLS Regression Results                            
Dep. Variable:            Log_salario   R-squared:                       0.540
Model:                            OLS   Adj. R-squared:                  0.539
Method:                 Least Squares   F-statistic:                     658.3
Date:                Sun, 15 Sep 2024   Prob (F-statistic):               0.00
Time:                        13:27:44   Log-Likelihood:                -7217.6
No. Observations:                8978   AIC:                         1.447e+04
Df Residuals:                    8961   BIC:                         1.459e+04
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------------
const               