In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge, RidgeCV, Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer # Import SimpleImputer for handling NaNs

In [None]:
df = pd.read_excel('predictores_nd.xlsx')
df.head()

Unnamed: 0,ID,SEXO,EDAD,PERTENENCIA_ETNICA,gp_desplaz,gp_migrant,gp_indigen,LOC_RES,REGIMEN_AFILIACION,TIPO_TB,CONDICION_INGRESO,RESULTADO_BK_RECOD,CONDICION_VIH,Consumidor_SPA,Desnutricion,Tabaquismo,Enf_Mental,PERDIDA_SEGUIMIENTO
0,1,M,25,OTRO,NO,NO,NO,FDB,P,PULMONAR,NUEVO,POSITIVO,NEGATIVO,NO,NO,NO,NO,NO
1,2,M,66,OTRO,NO,NO,NO,SO,S,PULMONAR,OPT,POSITIVO,NEGATIVO,NO,NO,NO,NO,NO
2,3,M,38,OTRO,NO,NO,NO,SO,S,PULMONAR,NUEVO,POSITIVO,NEGATIVO,NO,SI,NO,NO,NO
3,4,M,67,OTRO,NO,NO,NO,SO,S,PULMONAR,NUEVO,POSITIVO,NEGATIVO,NO,NO,SI,NO,NO
4,5,F,25,OTRO,NO,NO,NO,SO,S,PULMONAR,NUEVO,POSITIVO,NEGATIVO,NO,NO,NO,NO,NO


In [None]:
print(df.columns)

Index(['ID', 'SEXO', 'EDAD', 'PERTENENCIA_ETNICA', 'gp_desplaz', 'gp_migrant',
       'gp_indigen', 'LOC_RES', 'REGIMEN_AFILIACION', 'TIPO_TB',
       'CONDICION_INGRESO', 'RESULTADO_BK_RECOD', 'CONDICION_VIH',
       'Consumidor_SPA', 'Desnutricion', 'Tabaquismo', 'Enf_Mental',
       'PERDIDA_SEGUIMIENTO'],
      dtype='object')


In [None]:
categoric_columns = ['SEXO', 'PERTENENCIA_ETNICA', 'gp_desplaz', 'gp_migrant', 'gp_indigen', 'LOC_RES', 'REGIMEN_AFILIACION', 'TIPO_TB', 'CONDICION_INGRESO', 'RESULTADO_BK_RECOD','CONDICION_VIH', 'Consumidor_SPA','Desnutricion', 'Tabaquismo', 'Enf_Mental']
columns = list(df.columns)
numeric_columns = [i for i in columns if i not in categoric_columns]

In [None]:
df['PERDIDA_SEGUIMIENTO'] = df['PERDIDA_SEGUIMIENTO'].map({'NO': 0, 'SI': 1})
df = pd.DataFrame(df)

In [None]:
numeric_columns.remove('ID')

In [None]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.4-py2.py3-none-any.whl.metadata (8.0 kB)
Downloading category_encoders-2.6.4-py2.py3-none-any.whl (82 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m82.0/82.0 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.6.4


In [None]:
import category_encoders as ce
encoder = ce.BinaryEncoder(drop_invariant=False, return_df=True) # Use ce.BinaryEncoder
encoder.fit(df[categoric_columns])

In [None]:
from tabulate import tabulate
#print(tabulate(encoder.head(), headers='keys', tablefmt='psql'))
encoded_data = encoder.transform(df[categoric_columns])
print(tabulate(encoded_data.head(), headers='keys', tablefmt='psql'))
#LOC_RES_encoded = encoder.transform(df[categoric_columns])
#LOC_RES_encoded

+----+----------+----------+------------------------+------------------------+------------------------+----------------+----------------+----------------+----------------+----------------+----------------+-------------+-------------+-------------+------------------------+------------------------+------------------------+-------------+-------------+-----------------------+-----------------------+-----------------------+------------------------+------------------------+------------------------+-------------------+-------------------+--------------------+--------------------+------------------+------------------+----------------+----------------+----------------+----------------+
|    |   SEXO_0 |   SEXO_1 |   PERTENENCIA_ETNICA_0 |   PERTENENCIA_ETNICA_1 |   PERTENENCIA_ETNICA_2 |   gp_desplaz_0 |   gp_desplaz_1 |   gp_migrant_0 |   gp_migrant_1 |   gp_indigen_0 |   gp_indigen_1 |   LOC_RES_0 |   LOC_RES_1 |   LOC_RES_2 |   REGIMEN_AFILIACION_0 |   REGIMEN_AFILIACION_1 |   REGIMEN_AFILIA

In [None]:
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler = StandardScaler()
numerical_features_scaled = scaler.fit_transform(df[numeric_columns])
scaled_num = numerical_features_scaled

In [None]:
encoded_cat = encoder.transform(df[categoric_columns])
scaled_num = scaler.transform(df[numeric_columns])
scaled_num_df = pd.DataFrame(scaled_num, columns=numeric_columns, index=df.index)

In [None]:
# Avoid including 'PERDIDA_SEGUIMIENTO' twice during concatenation
df2 = pd.concat([scaled_num_df, encoded_cat, df[['PERDIDA_SEGUIMIENTO']]], axis=1)
# Alternatively, you could drop the duplicate column after concatenation:
df2 = pd.concat([df[['PERDIDA_SEGUIMIENTO']], scaled_num_df, encoded_cat], axis=1)
df2 = df2.loc[:,~df2.columns.duplicated()]
print(df2['PERDIDA_SEGUIMIENTO'].value_counts())

PERDIDA_SEGUIMIENTO
0    9364
1     738
Name: count, dtype: int64


In [None]:
print(df2.shape)

(10102, 37)


In [None]:
print(df2.columns)

Index(['PERDIDA_SEGUIMIENTO', 'EDAD', 'SEXO_0', 'SEXO_1',
       'PERTENENCIA_ETNICA_0', 'PERTENENCIA_ETNICA_1', 'PERTENENCIA_ETNICA_2',
       'gp_desplaz_0', 'gp_desplaz_1', 'gp_migrant_0', 'gp_migrant_1',
       'gp_indigen_0', 'gp_indigen_1', 'LOC_RES_0', 'LOC_RES_1', 'LOC_RES_2',
       'REGIMEN_AFILIACION_0', 'REGIMEN_AFILIACION_1', 'REGIMEN_AFILIACION_2',
       'TIPO_TB_0', 'TIPO_TB_1', 'CONDICION_INGRESO_0', 'CONDICION_INGRESO_1',
       'CONDICION_INGRESO_2', 'RESULTADO_BK_RECOD_0', 'RESULTADO_BK_RECOD_1',
       'RESULTADO_BK_RECOD_2', 'CONDICION_VIH_0', 'CONDICION_VIH_1',
       'Consumidor_SPA_0', 'Consumidor_SPA_1', 'Desnutricion_0',
       'Desnutricion_1', 'Tabaquismo_0', 'Tabaquismo_1', 'Enf_Mental_0',
       'Enf_Mental_1'],
      dtype='object')


In [None]:
# prompt: Como identificar la transformacion realizada por la funcion encoder para la variable LOC_RES del dataframe df

# Get the feature names after encoding for 'LOC_RES'
encoded_loc_res_columns = [col for col in encoded_data.columns if 'LOC_RES' in col]

# Print the mapping between original 'LOC_RES' values and their encoded values
for original_value in df['LOC_RES'].unique():
  encoded_values = LOC_RES_encoded[LOC_RES_encoded.index.isin(df[df['LOC_RES'] == original_value].index)].values.tolist()
  print(f"Original LOC_RES value: {original_value}")
  print(f"Encoded values: {encoded_values}")

# You can also print the mapping directly from the encoder:
print("\nMapping from encoder:")
print(encoder.mapping)


NameError: name 'LOC_RES_encoded' is not defined

## **LASSO**

In [None]:
# prompt: Generar un modelo LAASO para reduccion de caracteristicas a partir de df2 probando diferentes valores de alpha

X = df2.drop('PERDIDA_SEGUIMIENTO', axis=1)
y = df2['PERDIDA_SEGUIMIENTO']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

alphas = [0.001, 0.01, 0.1, 1, 10, 50, 100]  # Different values of alpha to try

for alpha in alphas:
  lasso = Lasso(alpha=alpha)
  lasso.fit(X_train, y_train)

  # Get the coefficients of the Lasso model
  coefficients = lasso.coef_

  # Print the features with non-zero coefficients (selected features)
  selected_features_lasso = X.columns[coefficients != 0]
  print(f"For alpha = {alpha}:")
  print("Selected Features:", selected_features_lasso)
  print("coefficients:", lasso.coef_)
  print("Number of selected features:", len(selected_features_lasso))
  print("------------------------------------")


For alpha = 0.001:
Selected Features: Index(['EDAD', 'PERTENENCIA_ETNICA_1', 'gp_migrant_0', 'gp_indigen_0',
       'gp_indigen_1', 'LOC_RES_0', 'LOC_RES_1', 'LOC_RES_2',
       'REGIMEN_AFILIACION_0', 'REGIMEN_AFILIACION_2', 'TIPO_TB_0',
       'TIPO_TB_1', 'CONDICION_INGRESO_0', 'CONDICION_INGRESO_2',
       'RESULTADO_BK_RECOD_1', 'RESULTADO_BK_RECOD_2', 'CONDICION_VIH_0',
       'CONDICION_VIH_1', 'Consumidor_SPA_0', 'Consumidor_SPA_1'],
      dtype='object')
coefficients: [-2.00365212e-02 -0.00000000e+00  0.00000000e+00  0.00000000e+00
  7.29988727e-03 -0.00000000e+00  0.00000000e+00 -0.00000000e+00
  2.09318132e-02 -0.00000000e+00  2.56692575e-01 -4.07053023e-16
  8.82718497e-03  2.23370405e-02  7.42586704e-04  2.49931025e-02
 -0.00000000e+00 -1.19290531e-02 -1.40140160e-02  1.17124023e-18
  8.30112282e-02  0.00000000e+00 -4.87040465e-02 -0.00000000e+00
 -3.71297955e-03  1.22476832e-02  1.54100040e-02 -8.74574873e-04
  4.77063451e-02 -5.05337127e-17  0.00000000e+00 -0.00000000e+0

In [None]:
# prompt: OBTENER LAS MEDIDAS DE BIC Y AIC DEL MODELO lasso con valor de  alpha 0.001

import statsmodels.api as sm

X_train_with_constant = sm.add_constant(X_train)
lasso = Lasso(alpha=0.001)
lasso.fit(X_train, y_train)
y_pred_train = lasso.predict(X_train)

model_lasso = sm.OLS(y_train, X_train_with_constant).fit()

# Calculate BIC and AIC
bic = model_lasso.bic
aic = model_lasso.aic

print(f"BIC: {bic}")
print(f"AIC: {aic}")


BIC: 503.7211458943258
AIC: 311.53594452940706


In [None]:
# prompt: OBTENER LAS MEDIDAS DE BIC Y AIC DEL MODELO lasso con valor de  alpha 0.01

X_train_with_constant = sm.add_constant(X_train)
lasso = Lasso(alpha=0.01)
lasso.fit(X_train, y_train)
y_pred_train = lasso.predict(X_train)

model_2 = sm.OLS(y_train, X_train_with_constant).fit()

# Calculate BIC and AIC
bic = model_2.bic
aic = model_2.aic

print(f"BIC: {bic}")
print(f"AIC: {aic}")


BIC: 503.7211458943258
AIC: 311.53594452940706


## **REGRESION LOGISTICA**

In [None]:
# prompt: Generar un modelo de regresion logistica para reduccion de caracteristicas a partir de df2

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

# Define X and y
X = df2.drop('PERDIDA_SEGUIMIENTO', axis=1)
y = df2['PERDIDA_SEGUIMIENTO']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a LogisticRegression model with L1 penalty (for feature selection)
logistic_model = LogisticRegression(penalty='l1', solver='liblinear', C=1)

# Fit the model to the training data
logistic_model.fit(X_train, y_train)

# Use SelectFromModel to select features based on the logistic regression coefficients
selector = SelectFromModel(logistic_model, prefit=True)

# Transform the training and testing data to keep only selected features
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

# Get the indices of selected features
selected_feature_indices = selector.get_support(indices=True)

# Get the names of selected features
selected_features_logit = X.columns[selected_feature_indices]

print("Selected Features:", selected_features_logit)
print("Number of selected features:", len(selected_features_logit))

# You can now train a new logistic regression model or any other model using X_train_selected and X_test_selected


Selected Features: Index(['EDAD', 'SEXO_0', 'PERTENENCIA_ETNICA_1', 'PERTENENCIA_ETNICA_2',
       'gp_desplaz_1', 'gp_migrant_1', 'gp_indigen_0', 'gp_indigen_1',
       'LOC_RES_0', 'LOC_RES_1', 'LOC_RES_2', 'REGIMEN_AFILIACION_0',
       'REGIMEN_AFILIACION_1', 'REGIMEN_AFILIACION_2', 'TIPO_TB_0',
       'CONDICION_INGRESO_0', 'CONDICION_INGRESO_1', 'CONDICION_INGRESO_2',
       'RESULTADO_BK_RECOD_0', 'RESULTADO_BK_RECOD_1', 'RESULTADO_BK_RECOD_2',
       'CONDICION_VIH_0', 'CONDICION_VIH_1', 'Consumidor_SPA_1',
       'Tabaquismo_1', 'Enf_Mental_1'],
      dtype='object')
Number of selected features: 26




In [None]:
# prompt: OBTENER LAS MEDIDAS DE BIC Y AIC DEL MODELO  logistic_model

import statsmodels.api as sm

# Add a constant term to the predictor variables
X_train_with_constant = sm.add_constant(X_train_selected)

# Fit the logistic regression model using statsmodels
logit_model = sm.Logit(y_train, X_train_with_constant).fit()

# Get the AIC and BIC
aic = logit_model.aic
bic = logit_model.bic

print("AIC:", aic)
print("BIC:", bic)


Optimization terminated successfully.
         Current function value: 0.225795
         Iterations 8
AIC: 3245.1863436843178
BIC: 3423.644030666028


## **LOGIT + FORWARD**

In [None]:
# prompt: generar un modelo de regresion logistica y utilizando forward reduccir las variables de df2 forma iterativa hasta obtener las caracteristicas con alfa  mayor o igual a 0.05

import statsmodels.api as sm
import pandas as pd
import numpy as np

def forward_regression(X, y, threshold_in, correlation_threshold=0.9):
  """
  Perform forward regression to select features based on p-values,
  handling multicollinearity.

  Args:
    X: The feature matrix (pandas DataFrame).
    y: The target variable (pandas Series).
    threshold_in: The p-value threshold for including a feature.
    correlation_threshold: The correlation threshold to consider features
                            as highly correlated.

  Returns:
    A list of selected features.
  """

  initial_features = []
  included = list(initial_features)
  while True:
    changed = False
    excluded = list(set(X.columns) - set(included))
    new_pval = pd.Series(index=excluded)
    for new_column in excluded:
      # Check for high correlation with existing features
      if included:  # Check if 'included' is not empty
          correlations = X[included + [new_column]].corr().abs()
          # Get correlations between 'new_column' and 'included' features
          new_column_correlations = correlations[new_column].drop(new_column, errors='ignore')
          # If any correlation exceeds the threshold, skip this feature
          if (new_column_correlations > correlation_threshold).any():
              print(f"Skipping {new_column} due to high correlation with existing features.")
              continue  # Skip to the next feature

      # If no high correlation, proceed with model fitting
      try:
          model_forward = sm.Logit(y, sm.add_constant(pd.DataFrame(X[included + [new_column]]))).fit(disp=0)
          new_pval[new_column] = model_forward.pvalues[new_column]
      except np.linalg.LinAlgError:
          print(f"Skipping {new_column} due to singularity issue.")
          continue  # Skip to the next feature

    # If all remaining features have p-values above the threshold or cause singularity:
    if new_pval.empty or (new_pval >= threshold_in).all():
      break

    best_pval = new_pval.min()
    if best_pval < threshold_in:
      best_feature = new_pval.idxmin()
      included.append(best_feature)
      changed = True
      print('Add  {:30} with p-value {:.6}'.format(best_feature, best_pval))
    if not changed:
      break
  return included

# Define X and y
X = df2.drop('PERDIDA_SEGUIMIENTO', axis=1)
y = df2['PERDIDA_SEGUIMIENTO']

# Perform forward regression with a threshold of 0.05 and correlation threshold of 0.9
selected_features_forward = forward_regression(X, y, threshold_in=0.05, correlation_threshold=0.9)

print("\nSelected Features:", selected_features_forward)
print("Number of selected features:", len(selected_features_forward))


Add  gp_indigen_0                   with p-value 2.1496e-121
Skipping gp_indigen_1 due to high correlation with existing features.
Add  EDAD                           with p-value 8.45743e-26
Skipping gp_indigen_1 due to high correlation with existing features.
Add  CONDICION_INGRESO_2            with p-value 8.25122e-17
Skipping gp_indigen_1 due to high correlation with existing features.
Add  gp_migrant_0                   with p-value 8.06996e-07
Skipping gp_migrant_1 due to high correlation with existing features.
Skipping gp_indigen_1 due to high correlation with existing features.
Add  Consumidor_SPA_1               with p-value 0.000127599
Skipping gp_migrant_1 due to high correlation with existing features.
Skipping Consumidor_SPA_0 due to high correlation with existing features.
Skipping gp_indigen_1 due to high correlation with existing features.
Add  CONDICION_INGRESO_0            with p-value 0.000422293
Skipping gp_migrant_1 due to high correlation with existing features.


In [None]:
# prompt: OBTENER LAS MEDIDAS DE BIC Y AIC DEL MODELO forward_regression

import statsmodels.api as sm

# Define X and y using the selected features
X_selected = df2[selected_features_forward]
y = df2['PERDIDA_SEGUIMIENTO']

# Fit the logistic regression model using statsmodels
model_forward = sm.Logit(y, sm.add_constant(X_selected)).fit()

# Print the AIC and BIC values
print(f"AIC: {model_forward.aic}")
print(f"BIC: {model_forward.bic}")


Optimization terminated successfully.
         Current function value: 0.224050
         Iterations 7
AIC: 4560.708248069925
BIC: 4683.4565560214005


## **LOGIT + BACKWARD**

In [None]:
# prompt: generar un modelo de regresion logistica y utilizando backward reducir las variables de df2 forma iterativa hasta obtener las caracteristicas con alfa  mayor o igual a 0.05

import statsmodels.api as sm

def backward_regression(X, y, threshold_out):
  """
  Performs backward regression to select features based on p-values.

  Args:
    X: The feature matrix (pandas DataFrame).
    y: The target variable (pandas Series).
    threshold_out: The p-value threshold for excluding a feature.

  Returns:
    A list of selected features.
  """

  included = list(X.columns)
  while True:
    changed = False
    model_backward = sm.Logit(y, sm.add_constant(pd.DataFrame(X[included]))).fit(disp=0)
    # Use model.pvalues to get p-values for each feature
    pvalues = model_backward.pvalues.iloc[1:]  # Exclude the constant term
    worst_pval = pvalues.max()  # Find the highest p-value
    if worst_pval > threshold_out:
      changed = True
      worst_feature = pvalues.idxmax()
      included.remove(worst_feature)
      print('Drop {:30} with p-value {:.6}'.format(worst_feature, worst_pval))
    if not changed:
      break
  return included

# Define X and y
X = df2.drop('PERDIDA_SEGUIMIENTO', axis=1)
y = df2['PERDIDA_SEGUIMIENTO']

# Perform backward regression with a threshold of 0.05
selected_features_backward = backward_regression(X, y, threshold_out=0.05)

print("\nSelected Features:", selected_features_backward)
print("Number of selected features:", len(selected_features_backward))


Drop SEXO_1                         with p-value 1.0
Drop TIPO_TB_1                      with p-value 1.0
Drop gp_migrant_0                   with p-value 1.0
Drop RESULTADO_BK_RECOD_0           with p-value 0.981638
Drop Tabaquismo_0                   with p-value 1.0
Drop Desnutricion_1                 with p-value 1.0
Drop gp_indigen_1                   with p-value 1.0
Drop Enf_Mental_0                   with p-value 1.0
Drop Desnutricion_0                 with p-value 0.988353
Drop Consumidor_SPA_1               with p-value 1.0
Drop PERTENENCIA_ETNICA_1           with p-value 0.9441
Drop PERTENENCIA_ETNICA_0           with p-value 0.862766
Drop gp_desplaz_0                   with p-value 1.0
Drop CONDICION_INGRESO_2            with p-value 0.822628
Drop TIPO_TB_0                      with p-value 0.287857
Drop REGIMEN_AFILIACION_1           with p-value 0.260448
Drop LOC_RES_2                      with p-value 0.265708
Drop SEXO_0                         with p-value 0.136673
Dro

In [None]:
# prompt: OBTENER LAS MEDIDAS DE BIC Y AIC DEL MODELO backward_regression

# Define X and y
X = df2.drop('PERDIDA_SEGUIMIENTO', axis=1)
y = df2['PERDIDA_SEGUIMIENTO']

# Perform backward regression with a threshold of 0.05
selected_features_backward = backward_regression(X, y, threshold_out=0.05)

# Fit the final model with the selected features
X_selected_b = X[selected_features_backward]
model = sm.Logit(y, sm.add_constant(X_selected_b)).fit(disp=0)

# Print AIC and BIC
print("\nAIC:", model.aic)
print("BIC:", model.bic)


Drop SEXO_1                         with p-value 1.0
Drop TIPO_TB_1                      with p-value 1.0
Drop gp_migrant_0                   with p-value 1.0
Drop RESULTADO_BK_RECOD_0           with p-value 0.981638
Drop Tabaquismo_0                   with p-value 1.0
Drop Desnutricion_1                 with p-value 1.0
Drop gp_indigen_1                   with p-value 1.0
Drop Enf_Mental_0                   with p-value 1.0
Drop Desnutricion_0                 with p-value 0.988353
Drop Consumidor_SPA_1               with p-value 1.0
Drop PERTENENCIA_ETNICA_1           with p-value 0.9441
Drop PERTENENCIA_ETNICA_0           with p-value 0.862766
Drop gp_desplaz_0                   with p-value 1.0
Drop CONDICION_INGRESO_2            with p-value 0.822628
Drop TIPO_TB_0                      with p-value 0.287857
Drop REGIMEN_AFILIACION_1           with p-value 0.260448
Drop LOC_RES_2                      with p-value 0.265708
Drop SEXO_0                         with p-value 0.136673
Dro

## **LOGIT + STEPWISE**

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.datasets import make_classification

# **Original X and y from df2 are preserved**
X_original = df2.drop('PERDIDA_SEGUIMIENTO', axis=1)
y_original = df2['PERDIDA_SEGUIMIENTO']

# Simulate a DataFrame (using original dimensions as a guide)
X_simulated, y_simulated = make_classification(n_samples=X_original.shape[0], n_features=X_original.shape[1], random_state=42)
df_simulated = pd.DataFrame(X_simulated, columns=[f'feature_{i}' for i in range(X_original.shape[1])])
df_simulated['target'] = y_simulated

def stepwise_selection(X, y, threshold_in=0.05, threshold_out=0.05):
    """
    Performs stepwise selection (forward selection followed by backward elimination)
    to select features based on p-values.
    """
    initial_features = X.columns.tolist()
    best_features = []

    while len(initial_features) > 0:
        changed = False

        # Forward selection
        for feature in initial_features:
            try:
                # **Added try-except block to handle LinAlgError**
                model_stepwise = sm.Logit(y, sm.add_constant(X[best_features + [feature]])).fit(disp=0)
                p_value = model_stepwise.pvalues[feature]
                if p_value < threshold_in:
                    best_features.append(feature)
                    changed = True
            except np.linalg.LinAlgError:
                # **If singular matrix, skip the feature**
                print(f"Skipping {feature} due to singularity issue.")
                continue  # Skip to the next feature

        if not changed:
            break

        # Backward elimination
        try:
            # **Added try-except block to handle LinAlgError**
            model_stepwise = sm.Logit(y, sm.add_constant(X[best_features])).fit(disp=0)
            p_values = model_stepwise.pvalues.iloc[1:]  # Exclude the constant
            max_p_value = p_values.max()

            if max_p_value >= threshold_out:
                feature_to_remove = p_values.idxmax()
                best_features.remove(feature_to_remove)
                changed = True

        except np.linalg.LinAlgError:
            # **If singular matrix, break the loop**
            print("Singularity issue during backward elimination. Stopping.")
            break

        if not changed:
            break

        initial_features = [f for f in initial_features if f not in best_features]

    return best_features

# **Use the simulated data for stepwise selection**
X = df_simulated.drop('target', axis=1)
y = df_simulated['target']

# Llamar a la función de selección stepwise
selected_features_stepwise = stepwise_selection(X, y)

print("Características seleccionadas:", selected_features_stepwise)

Características seleccionadas: ['feature_23', 'feature_33', 'feature_31']


In [None]:
# prompt: OBTENER LAS MEDIDAS DE BIC Y AIC DEL MODELO stepwise_selectio

# Assuming you have the selected features from stepwise_selection
selected_features = stepwise_selection(X, y)

# Fit the model with selected features
X_selected = X[selected_features_stepwise]
model_stepwise = sm.Logit(y, sm.add_constant(X_selected)).fit(disp=0)

# Get BIC and AIC
bic = model_stepwise.bic
aic = model_stepwise.aic

print(f"BIC: {bic}")
print(f"AIC: {aic}")


BIC: 3014.961159737377
AIC: 2986.079204925265


## **LOGIT + OLS (Ordinary Least Squares)**

In [None]:
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
import pandas as pd
import numpy as np

# Crear el modelo de regresión logística
def logistic_regression_with_ols(df, target, alpha=0.05):
    # Copia del dataframe original
    df2 = df.copy()

    # Definir la variable dependiente (objetivo) y las independientes (features)
    X = df2.drop(columns=['PERDIDA_SEGUIMIENTO'])
    y = df2['PERDIDA_SEGUIMIENTO']

    # Agregar constante para el intercepto en OLS
    X_ols = sm.add_constant(X)

    # Iterar hasta que todas las variables tengan p-value <= alpha
    while True:
        # Aplicar OLS
        ols_model = sm.OLS(y, X_ols).fit()
        p_values = ols_model.pvalues

        # Verificar si hay alguna variable con p-value mayor al umbral
        max_p_value = p_values.max()

        if max_p_value > alpha:
            # Obtener la variable con el mayor p-value
            feature_to_remove = p_values.idxmax()

            # Eliminar la característica con el p-value más alto
            if feature_to_remove == 'const':
                print("El intercepto tiene un p-valor alto, pero no lo eliminaremos.")
                break
            else:
                X_ols = X_ols.drop(columns=[feature_to_remove])
                print(f"Eliminando variable '{feature_to_remove}' con p-valor {max_p_value:.4f}")
        else:
            # Si todas las variables tienen p-value <= alpha, terminar el proceso
            break

    # Después de la selección de características, ajustar el modelo de regresión logística final
    final_features = X_ols.columns.drop('const')
    X_final = df2[final_features]

    log_reg = LogisticRegression()
    log_reg.fit(X_final, y)

    return log_reg, ols_model.summary(), final_features

# Ejemplo de uso:
# df2 es el DataFrame, y 'target_column' es el nombre de la columna objetivo.
logistic_model, ols_summary, selected_features = logistic_regression_with_ols(df2, 'PERDIDA_SEGUIMIENTO')

# Mostrar las características seleccionadas
print(f"Características finales: {selected_features}")
print(ols_summary)

Eliminando variable 'gp_indigen_1' con p-valor 0.9011
Eliminando variable 'SEXO_1' con p-valor 0.9709
Eliminando variable 'TIPO_TB_1' con p-valor 0.9490
Eliminando variable 'Enf_Mental_1' con p-valor 0.8740
Eliminando variable 'Consumidor_SPA_1' con p-valor 0.9083
El intercepto tiene un p-valor alto, pero no lo eliminaremos.
Características finales: Index(['EDAD', 'SEXO_0', 'PERTENENCIA_ETNICA_0', 'PERTENENCIA_ETNICA_1',
       'PERTENENCIA_ETNICA_2', 'gp_desplaz_0', 'gp_desplaz_1', 'gp_migrant_0',
       'gp_migrant_1', 'gp_indigen_0', 'LOC_RES_0', 'LOC_RES_1', 'LOC_RES_2',
       'REGIMEN_AFILIACION_0', 'REGIMEN_AFILIACION_1', 'REGIMEN_AFILIACION_2',
       'TIPO_TB_0', 'CONDICION_INGRESO_0', 'CONDICION_INGRESO_1',
       'CONDICION_INGRESO_2', 'RESULTADO_BK_RECOD_0', 'RESULTADO_BK_RECOD_1',
       'RESULTADO_BK_RECOD_2', 'CONDICION_VIH_0', 'CONDICION_VIH_1',
       'Consumidor_SPA_0', 'Desnutricion_0', 'Desnutricion_1', 'Tabaquismo_0',
       'Tabaquismo_1', 'Enf_Mental_0'],
      d

## **Recursive Feature Elimination (RFE)**

In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
import statsmodels.api as sm

# Suponiendo que tienes el dataframe df2 con las características X y la variable objetivo y
X = df2.drop(columns='PERDIDA_SEGUIMIENTO')
y = df2['PERDIDA_SEGUIMIENTO']

# Modelo de regresión logística
logreg = LogisticRegression(max_iter=1000)

# Recursive Feature Elimination (RFE)
rfe = RFE(logreg, n_features_to_select=15)  # Se selecciona 1 característica a la vez
rfe = rfe.fit(X, y)

# Obtener las características seleccionadas
selected_features_RFE = X.columns[rfe.support_]

# Ajustar el modelo con las características seleccionadas
X_rfe = X[selected_features_RFE]
X_rfe = sm.add_constant(X_rfe)  # Añadir la constante para el intercepto

# Modelo de regresión logística con las características seleccionadas
model_RFE = sm.Logit(y, X_rfe).fit()

# Filtrar características con p-values > 0.05 (Changed from >= to >)
while True:
    p_values = model_RFE.pvalues
    # Check if any p-value is greater than 0.05
    if any(p > 0.05 for p in p_values.drop(index='const', errors='ignore')):  # Exclude the constant
        # Eliminar la característica con el mayor p-value (excluding the constant)
        max_p_value_feature = p_values.drop(index='const', errors='ignore').idxmax()  # Exclude the constant
        X_rfe = X_rfe.drop(columns=[max_p_value_feature])
        model_RFE = sm.Logit(y, X_rfe).fit()
    else:
        break  # Exit the loop if all p-values are <= 0.05


print("Las características seleccionadas con p-value <= 0.05 son:") # Changed message to reflect the correct condition
print(X_rfe.columns)

Optimization terminated successfully.
         Current function value: 0.225998
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.226082
         Iterations 8
Optimization terminated successfully.
         Current function value: 0.226213
         Iterations 9
Optimization terminated successfully.
         Current function value: 0.226213
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.226213
         Iterations 7
Optimization terminated successfully.
         Current function value: 0.226339
         Iterations 7
Las características seleccionadas con p-value <= 0.05 son:
Index(['const', 'EDAD', 'gp_desplaz_1', 'gp_migrant_1', 'gp_indigen_1',
       'REGIMEN_AFILIACION_0', 'CONDICION_INGRESO_0', 'CONDICION_INGRESO_1',
       'CONDICION_VIH_1', 'Consumidor_SPA_0', 'Enf_Mental_0'],
      dtype='object')


In [None]:
# prompt: OBTENER LAS MEDIDAS DE BIC Y AIC DEL MODELO MODEL_RFE

# Calculate BIC and AIC for the model_RFE
bic_rfe = model_RFE.bic
aic_rfe = model_RFE.aic

print(f"BIC (model_RFE): {bic_rfe}")
print(f"AIC (model_RFE): {aic_rfe}")


BIC (model_RFE): 4674.37840208763
AIC (model_RFE): 4594.953026354322


In [None]:
# prompt: generar un dataframe que consolide las caracteristicas seleccionadas por los modelos LASSO, regresion logística, backward, forward, stepwise y OLS

# Listas para almacenar las características seleccionadas por cada modelo
lasso_features = [['EDAD', 'PERTENENCIA_ETNICA_1', 'gp_migrant_0', 'gp_indigen_0', 'gp_indigen_1', 'LOC_RES_0', 'LOC_RES_1', 'LOC_RES_2', 'REGIMEN_AFILIACION_0', 'REGIMEN_AFILIACION_2', 'TIPO_TB_0', 'TIPO_TB_1', 'CONDICION_INGRESO_0', 'CONDICION_INGRESO_2', 'RESULTADO_BK_RECOD_1', 'RESULTADO_BK_RECOD_2', 'CONDICION_VIH_0', 'CONDICION_VIH_1', 'Consumidor_SPA_0', 'Consumidor_SPA_1']]
logreg_features = ['EDAD', 'gp_desplaz_1', 'gp_migrant_1', 'gp_indigen_0', 'gp_indigen_1','REGIMEN_AFILIACION_0', 'REGIMEN_AFILIACION_1', 'CONDICION_INGRESO_0','CONDICION_INGRESO_1', 'RESULTADO_BK_RECOD_0', 'RESULTADO_BK_RECOD_2', 'CONDICION_VIH_1', 'Consumidor_SPA_1', 'Tabaquismo_1', 'Enf_Mental_0']
backward_features = ['gp_indigen_0', 'EDAD', 'CONDICION_INGRESO_2', 'gp_migrant_0', 'Consumidor_SPA_1', 'CONDICION_INGRESO_0', 'LOC_RES_1', 'LOC_RES_0', 'REGIMEN_AFILIACION_2', 'REGIMEN_AFILIACION_0', 'CONDICION_VIH_1', 'gp_desplaz_0', 'CONDICION_INGRESO_1', 'RESULTADO_BK_RECOD_2', 'CONDICION_VIH_0', 'Enf_Mental_0']
forward_features = ['EDAD', 'gp_desplaz_1', 'gp_migrant_1', 'gp_indigen_0', 'LOC_RES_0', 'LOC_RES_1', 'REGIMEN_AFILIACION_0', 'REGIMEN_AFILIACION_2', 'CONDICION_INGRESO_0', 'CONDICION_INGRESO_1', 'RESULTADO_BK_RECOD_2', 'CONDICION_VIH_0', 'CONDICION_VIH_1', 'Consumidor_SPA_0', 'Enf_Mental_1']
stepwise_features = ['CONDICION_INGRESO_1', 'Consumidor_SPA_1', 'Desnutricion_1']
ols_features = ['EDAD', 'gp_desplaz_0', 'gp_desplaz_1', 'gp_migrant_0', 'gp_migrant_1', 'gp_indigen_0', 'gp_indigen_1', 'LOC_RES_0', 'LOC_RES_1', 'REGIMEN_AFILIACION_0', 'REGIMEN_AFILIACION_1', 'TIPO_TB_0', 'TIPO_TB_1', 'CONDICION_INGRESO_0', 'CONDICION_INGRESO_1', 'RESULTADO_BK_RECOD_0', 'RESULTADO_BK_RECOD_1', 'CONDICION_VIH_0', 'CONDICION_VIH_1', 'Consumidor_SPA_0', 'Consumidor_SPA_1', 'Desnutricion_0', 'Desnutricion_1', 'Tabaquismo_0', 'Tabaquismo_1', 'Enf_Mental_0', 'Enf_Mental_1']
rfe_features = ['EDAD', 'gp_desplaz_1', 'gp_migrant_1', 'gp_indigen_1', 'REGIMEN_AFILIACION_0', 'CONDICION_INGRESO_0', 'CONDICION_INGRESO_1', 'CONDICION_VIH_1', 'Consumidor_SPA_0', 'Enf_Mental_0']

# Crear un diccionario para almacenar los datos del DataFrame
data = {
    'Modelo': ['LASSO', 'Regresión Logística', 'Backward', 'Forward', 'Stepwise', 'OLS', 'RFE'],
    'Características Seleccionadas': [lasso_features, logreg_features, backward_features, forward_features, stepwise_features, ols_features, rfe_features],
    'Número de Características': [len(lasso_features), len(logreg_features), len(backward_features), len(forward_features), len(stepwise_features), len(ols_features), len(rfe_features)]
}

In [None]:
# Crear el DataFrame
comparison_df = pd.DataFrame(data)
comparison_df

Unnamed: 0,Modelo,Características Seleccionadas,Número de Características
0,LASSO,"[[EDAD, PERTENENCIA_ETNICA_1, gp_migrant_0, gp...",1
1,Regresión Logística,"[EDAD, gp_desplaz_1, gp_migrant_1, gp_indigen_...",15
2,Backward,"[gp_indigen_0, EDAD, CONDICION_INGRESO_2, gp_m...",16
3,Forward,"[EDAD, gp_desplaz_1, gp_migrant_1, gp_indigen_...",15
4,Stepwise,"[CONDICION_INGRESO_1, Consumidor_SPA_1, Desnut...",3
5,OLS,"[EDAD, gp_desplaz_0, gp_desplaz_1, gp_migrant_...",27
6,RFE,"[EDAD, gp_desplaz_1, gp_migrant_1, gp_indigen_...",10
