# DATOS DE PANEL

Para mantener la interpretabilidad del modelo, pero siendo conscientes de la alta correlación entre variables, se decidió probar modelos econométricos de Efectos Fijos (FE) y Efectos Aleatorios (RE), y se implementó un test de Hausman para respaldar la selección entre ellos. Con estos resultados se espera identificar, a partir de un modelo mixto, las variables que realmente afectan la publicación de datos.

In [2]:
# =============================================================================
# 1. IMPORTACIÓN DE LIBRERÍAS
# =============================================================================
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import statsmodels.api as sm
from linearmodels.panel import PanelOLS, RandomEffects
from linearmodels.panel import compare

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

# Métricas
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# =============================================================================
# 2. DATA LOADING AND PREPARATION
# =============================================================================

# Settings used by this section: data source, response column, columns to discard.
url = "https://raw.githubusercontent.com/rortizgeo/Maestria_CD_Proyecto-Aplicado/main/Data_final.csv"
TARGET = "occurrenceCount_publisher"
cols_drop = [ "countryCode", "areas_protegidas"]

df = pd.read_csv(url)

# Parse the "year" column (cast to string) as a %Y date into a new "ds" column.
df["ds"] = pd.to_datetime(df["year"].astype(str), format="%Y")

# Model the response on the log(1 + x) scale.
df[TARGET] = np.log1p(df[TARGET])

# Remove the unused columns, then order rows by country and date with a fresh
# 0..n-1 index.
df = (
    df.drop(columns=cols_drop)
    .sort_values(["country", "ds"])
    .reset_index(drop=True)
)

# =============================================================================
# 3. FEATURE ENGINEERING TEMPORAL
# =============================================================================

def create_temporal_features(df, features, lags=(1, 3, 5), roll=(1, 3, 5)):
    """Return a copy of *df* with per-country lag and rolling-mean columns.

    For every name ``f`` in *features* the copy gains:

    * ``f"{f}_lag{L}"`` for each ``L`` in *lags*: the value of ``f`` shifted
      ``L`` rows within the same ``"country"`` group.
    * ``f"{f}_rollmean{r}"`` for each ``r`` in *roll*: the mean of up to ``r``
      previous rows (current row excluded) within the same ``"country"`` group.

    Both the shift and the rolling window are evaluated inside each country
    group, so values never leak from one country into the next.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``"country"`` column and every column in *features*.
        Rows are used in their existing order within each country; the
        function does not sort (assumes the caller has already ordered the
        rows chronologically).
    features : iterable of str
        Column names to derive lag / rolling features from.
    lags : iterable of int, default (1, 3, 5)
        Row offsets for the lag columns.
    roll : iterable of int, default (1, 3, 5)
        Window sizes for the rolling-mean columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input *df* is left unmodified.
    """
    df_copy = df.copy()
    for f in features:
        for L in lags:
            df_copy[f"{f}_lag{L}"] = df_copy.groupby("country")[f].shift(L)
        for r in roll:
            # shift + rolling must both happen inside the group: rolling over
            # the flat result of a grouped shift would let a window straddle
            # two consecutive countries.
            df_copy[f"{f}_rollmean{r}"] = df_copy.groupby("country")[f].transform(
                lambda s, window=r: s.shift(1).rolling(window, min_periods=1).mean()
            )
    return df_copy

# Columns for which lagged and rolling-mean versions are generated.
features_to_lag = [
    "gasto_RD_pib",
    "efectividad_gobierno",
    "uso_internet",
    "pib_per_capita",
    "gbif_member",
    "ogp_membership",
]

# Working frame: the prepared data plus the derived temporal columns
# (default lag offsets and window sizes).
df_feat = create_temporal_features(df, features=features_to_lag)

# =============================================================================
# 4. FE/RE VARIABLES
# =============================================================================

# Candidate regressors for the panel models.
vars_validas = [
    "gasto_RD_pib",
    "efectividad_gobierno",
    "art_cientificos",
    "uso_internet",
    "pib_per_capita",
    "inscripcion_primaria",
    "inscripcion_secundaria",
    "inscripcion_terciaria",
    "gasto_educacion_gobierno",
    "gasto_educacion_pib",
    "investigadores_RD",
]

# Keep only the candidates that actually exist in the feature frame,
# preserving the order above.
columnas_disponibles = set(df_feat.columns)
vars_validas = [nombre for nombre in vars_validas if nombre in columnas_disponibles]

# (entity, time) MultiIndex, the layout the panel estimators are fed with.
panel_df = df_feat.set_index(["country", "year"])

# Design matrix with an added constant column, and the response series.
X = sm.add_constant(panel_df[vars_validas])
y = panel_df[TARGET]

# =============================================================================
# 5. FE/RE MODELS + HAUSMAN
# =============================================================================

def _hausman_test(fe_fit, re_fit):
    """Compute the Hausman specification test between an FE and an RE fit.

    Parameters
    ----------
    fe_fit, re_fit
        Fitted results exposing ``params`` (Series indexed by regressor name)
        and ``cov`` (DataFrame with the same labels). Both should use the
        conventional (unadjusted) covariance, otherwise the variance
        difference below is not the variance of the coefficient difference.

    Returns
    -------
    tuple
        ``(statistic, degrees_of_freedom, p_value)``.
    """
    from scipy import stats  # local import: only this helper needs it

    # Compare slope coefficients only; the constant is excluded.
    shared = [
        name for name in fe_fit.params.index
        if name in re_fit.params.index and name != "const"
    ]
    diff = (fe_fit.params[shared] - re_fit.params[shared]).to_numpy()
    cov_diff = (
        fe_fit.cov.loc[shared, shared] - re_fit.cov.loc[shared, shared]
    ).to_numpy()

    # Pseudo-inverse keeps the computation defined when the variance
    # difference is rank deficient; the degrees of freedom follow its rank.
    statistic = float(diff @ np.linalg.pinv(cov_diff) @ diff)
    dof = int(np.linalg.matrix_rank(cov_diff))
    # In finite samples the statistic can come out negative; chi2.sf then
    # returns 1.0, i.e. no evidence against RE.
    p_value = float(stats.chi2.sf(statistic, dof))
    return statistic, dof, p_value


print("\n===== MODELO FE =====")
fe_mod = PanelOLS(y, X, entity_effects=True)
# Reported FE fit: standard errors clustered by entity.
fe_res = fe_mod.fit(cov_type="clustered", cluster_entity=True)
print(fe_res)

print("\n===== MODELO RE =====")
re_mod = RandomEffects(y, X)
re_res = re_mod.fit()
print(re_res)

print("\n===== PRUEBA DE HAUSMAN =====")
# `compare` only tabulates the two fits side by side; it does not compute a
# Hausman statistic, so the test itself is calculated explicitly below.
print(compare({"FE": fe_res, "RE": re_res}))

# The Hausman test is computed from an FE fit with the conventional covariance
# (the clustered fit above is kept for reporting).
fe_res_unadjusted = fe_mod.fit(cov_type="unadjusted")
hausman_stat, hausman_dof, hausman_pval = _hausman_test(fe_res_unadjusted, re_res)
print(f"Hausman chi2({hausman_dof}) = {hausman_stat:.4f}, p-value = {hausman_pval:.4f}")

# =============================================================================
# 6. GLOBAL LASSO
# =============================================================================

print("\n===== LASSO GLOBAL =====")

# Drop only rows with missing values in the columns the model actually uses.
# A bare dropna() would also discard every row whose lag/rolling columns are
# NaN (at least the first 5 rows of each country), although those columns are
# not part of vars_validas.
lasso_df = df_feat.dropna(subset=vars_validas + [TARGET])
X_lasso = lasso_df[vars_validas]
y_lasso = lasso_df[TARGET]

# Standardise regressors so the L1 penalty treats them on a common scale.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_lasso)

# Penalty strength chosen by 5-fold cross-validation.
lasso_global = LassoCV(cv=5, random_state=42).fit(X_scaled, y_lasso)

print("Mejor alpha global:", lasso_global.alpha_)

# Coefficients ordered by absolute magnitude, largest first.
coef_global = pd.DataFrame({
    "variable": vars_validas,
    "coef": lasso_global.coef_
}).sort_values("coef", key=abs, ascending=False)

print("\n===== COEFICIENTES LASSO GLOBAL =====")
print(coef_global)

# =============================================================================
# 7. LASSO PER ENTITY (COUNTRY)
# =============================================================================

print("\n===== LASSO POR PAÍS =====")
# Maps country -> DataFrame of coefficients sorted by absolute magnitude.
coef_por_pais = {}

for pais, df_country in lasso_df.groupby("country"):
    # Skip countries with too few observations before doing any work on them.
    if len(df_country) < 10:
        continue

    Xc = df_country[vars_validas]
    yc = df_country[TARGET]

    # Scaling is fitted per country, independently of the global scaler.
    Xc_scaled = StandardScaler().fit_transform(Xc)
    lasso_c = LassoCV(cv=3).fit(Xc_scaled, yc)

    coef_df = pd.DataFrame({
        "variable": vars_validas,
        "coef": lasso_c.coef_
    }).sort_values("coef", key=abs, ascending=False)

    coef_por_pais[pais] = coef_df

if coef_por_pais:
    # Show the first fitted country as an example.
    pais_ejemplo = next(iter(coef_por_pais))
    print("LASSO por país generado. Ejemplo país:", pais_ejemplo)
    print(coef_por_pais[pais_ejemplo])
else:
    # Without this guard, indexing the first key raised IndexError when every
    # country was skipped.
    print("LASSO por país: ningún país tiene al menos 10 observaciones.")





===== MODELO FE =====
                              PanelOLS Estimation Summary                              
Dep. Variable:     occurrenceCount_publisher   R-squared:                        0.5217
Estimator:                          PanelOLS   R-squared (Between):             -0.1871
No. Observations:                        656   R-squared (Within):               0.5217
Date:                       Sat, Nov 29 2025   R-squared (Overall):              0.0960
Time:                               19:44:48   Log-likelihood                   -1505.6
Cov. Estimator:                    Clustered                                           
                                               F-statistic:                      59.881
Entities:                                 41   P-value                           0.0000
Avg Obs:                              16.000   Distribution:                  F(11,604)
Min Obs:                              16.000                                           
Max Obs: 

In [7]:
# Print the installed linearmodels version.
import linearmodels
print(linearmodels.__version__)

6.1
