In [5]:
# =============================================================================
# 1. IMPORTACIÓN DE LIBRERÍAS
# =============================================================================
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Modelos de panel
import statsmodels.api as sm
from linearmodels.panel import PanelOLS, RandomEffects
from linearmodels.panel import compare

# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.model_selection import TimeSeriesSplit

# Métricas
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# =============================================================================
# 2. CARGA Y PREPARACIÓN DE DATOS
# =============================================================================

url = "https://raw.githubusercontent.com/rortizgeo/Maestria_CD_Proyecto-Aplicado/main/Data_final.csv"
df = pd.read_csv(url)

df["ds"] = pd.to_datetime(df["year"].astype(str), format="%Y")

# Log-transform del target
TARGET = "occurrenceCount_publisher"
df[TARGET] = np.log1p(df[TARGET])

# Eliminamos columnas usadas en PCA y otras que ya no corresponden
cols_drop = ["Overall score", "countryCode", "areas_protegidas"]
df = df.drop(columns=cols_drop)

df = df.sort_values(["country", "ds"]).reset_index(drop=True)

# =============================================================================
# 3. FEATURE ENGINEERING TEMPORAL
# =============================================================================

def create_temporal_features(df, features, lags=[1,3,5], roll=[1,3,5]):
    df_copy = df.copy()
    for f in features:
        for L in lags:
            df_copy[f"{f}_lag{L}"] = df_copy.groupby("country")[f].shift(L)
        for r in roll:
            df_copy[f"{f}_rollmean{r}"] = (
                df_copy.groupby("country")[f].shift(1).rolling(r, min_periods=1).mean()
            )
    return df_copy

features_to_lag = [
    'gasto_RD_pib', 'efectividad_gobierno', 'uso_internet', 'pib_per_capita',
    'gbif_member', 'ogp_membership'
]

df_feat = create_temporal_features(df, features_to_lag)

# =============================================================================
# 4. SELECCIÓN DE VARIABLES PARA FE/RE (Opción A)
# =============================================================================

vars_validas_fe = [
    'areas_protegidas',
    'gasto_RD_pib',
    'efectividad_gobierno',
    'art_cientificos',
    'uso_internet',
    'pib_per_capita',
    'inscripcion_primaria',
    'inscripcion_secundaria',
    'inscripcion_terciaria',
    'gasto_educacion_gobierno',
    'gasto_educacion_pib',
    'investigadores_RD'
]

# Filtramos columnas realmente presentes
vars_validas_fe = [v for v in vars_validas_fe if v in df_feat.columns]

print("Variables válidas para FE/RE:")
print(vars_validas_fe)

# =============================================================================
# 5. PREPARACIÓN PARA MODELOS DE PANEL
# =============================================================================

panel_df = df_feat.set_index(["country", "year"])

y = panel_df[TARGET]
X = panel_df[vars_validas_fe]

X = sm.add_constant(X)

# =============================================================================
# 6. MODELO FE, RE Y HAUSMAN
# =============================================================================

print("\n===== MODELO FE =====")
fe = PanelOLS(y, X, entity_effects=True)
fe_res = fe.fit(cov_type="clustered", cluster_entity=True)
print(fe_res)

print("\n===== MODELO RE =====")
re = RandomEffects(y, X)
re_res = re.fit()
print(re_res)

# Prueba de Hausman
print("\n===== PRUEBA DE HAUSMAN =====")
from linearmodels.panel import compare
print(compare({"FE": fe_res, "RE": re_res}))

# =============================================================================
# 7. MODELO LASSO CON VALIDACIÓN DE SERIES DE TIEMPO
# =============================================================================

print("\n===== MODELO LASSO =====")

lasso_df = df_feat.copy()
lasso_df = lasso_df.dropna()

X_lasso = lasso_df[vars_validas_fe]
y_lasso = lasso_df[TARGET]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_lasso)

tscv = TimeSeriesSplit(n_splits=5)

lasso = LassoCV(cv=tscv, random_state=42).fit(X_scaled, y_lasso)

print("Mejor alpha encontrado:", lasso.alpha_)

# Predicciones in-sample
y_pred_lasso = lasso.predict(X_scaled)

# Revertimos transformación del target
y_true_orig = np.expm1(y_lasso)
y_pred_orig = np.expm1(y_pred_lasso)

print("\n===== MÉTRICAS LASSO =====")
print("MAE  :", mean_absolute_error(y_true_orig, y_pred_orig))
print("RMSE :", np.sqrt(mean_squared_error(y_true_orig, y_pred_orig)))
print("R2   :", r2_score(y_true_orig, y_pred_orig))

# Ranking de importancia
coef_df = pd.DataFrame({
    "variable": vars_validas_fe,
    "coef": lasso.coef_
}).sort_values("coef", key=abs, ascending=False)

print("\n===== IMPORTANCIA DE VARIABLES (LASSO) =====")
print(coef_df)



Variables válidas para FE/RE:
['gasto_RD_pib', 'efectividad_gobierno', 'art_cientificos', 'uso_internet', 'pib_per_capita', 'inscripcion_primaria', 'inscripcion_secundaria', 'inscripcion_terciaria', 'gasto_educacion_gobierno', 'gasto_educacion_pib', 'investigadores_RD']

===== MODELO FE =====
                              PanelOLS Estimation Summary                              
Dep. Variable:     occurrenceCount_publisher   R-squared:                        0.5217
Estimator:                          PanelOLS   R-squared (Between):             -0.1871
No. Observations:                        656   R-squared (Within):               0.5217
Date:                       Mon, Nov 17 2025   R-squared (Overall):              0.0960
Time:                               07:55:43   Log-likelihood                   -1505.6
Cov. Estimator:                    Clustered                                           
                                               F-statistic:                      59.881
En

In [2]:
# =============================================================================
# 1. IMPORTACIÓN DE LIBRERÍAS
# =============================================================================
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

import statsmodels.api as sm
from linearmodels.panel import PanelOLS, RandomEffects
from linearmodels.panel import compare


from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV

# Métricas
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# =============================================================================
# 2. CARGA Y PREPARACIÓN DE DATOS
# =============================================================================

url = "https://raw.githubusercontent.com/rortizgeo/Maestria_CD_Proyecto-Aplicado/main/Data_final.csv"
df = pd.read_csv(url)

df["ds"] = pd.to_datetime(df["year"].astype(str), format="%Y")

TARGET = "occurrenceCount_publisher"
df[TARGET] = np.log1p(df[TARGET])

cols_drop = ["Overall score", "countryCode", "areas_protegidas"]
df = df.drop(columns=cols_drop)

df = df.sort_values(["country", "ds"]).reset_index(drop=True)

# =============================================================================
# 3. FEATURE ENGINEERING TEMPORAL
# =============================================================================

def create_temporal_features(df, features, lags=[1,3,5], roll=[1,3,5]):
    df_copy = df.copy()
    for f in features:
        for L in lags:
            df_copy[f"{f}_lag{L}"] = df_copy.groupby("country")[f].shift(L)
        for r in roll:
            df_copy[f"{f}_rollmean{r}"] = (
                df_copy.groupby("country")[f].shift(1).rolling(r, min_periods=1).mean()
            )
    return df_copy

features_to_lag = [
    'gasto_RD_pib', 'efectividad_gobierno', 'uso_internet', 'pib_per_capita',
    'gbif_member', 'ogp_membership'
]

df_feat = create_temporal_features(df, features_to_lag)

# =============================================================================
# 4. VARIABLES FE/RE
# =============================================================================

vars_validas = [
    'gasto_RD_pib', 'efectividad_gobierno',
    'art_cientificos', 'uso_internet', 'pib_per_capita',
    'inscripcion_primaria', 'inscripcion_secundaria',
    'inscripcion_terciaria', 'gasto_educacion_gobierno',
    'gasto_educacion_pib', 'investigadores_RD'
]

vars_validas = [v for v in vars_validas if v in df_feat.columns]

panel_df = df_feat.set_index(["country", "year"])

y = panel_df[TARGET]
X = sm.add_constant(panel_df[vars_validas])

# =============================================================================
# 5. MODELOS FE/RE + HAUSMAN
# =============================================================================

print("\n===== MODELO FE =====")
fe_mod = PanelOLS(y, X, entity_effects=True)
fe_res = fe_mod.fit(cov_type="clustered", cluster_entity=True)
print(fe_res)

print("\n===== MODELO RE =====")
re_mod = RandomEffects(y, X)
re_res = re_mod.fit()
print(re_res)

print("\n===== PRUEBA DE HAUSMAN =====")
print(compare({"FE": fe_res, "RE": re_res}))

# =============================================================================
# 6. LASSO GLOBAL
# =============================================================================

print("\n===== LASSO GLOBAL =====")

lasso_df = df_feat.dropna()
X_lasso = lasso_df[vars_validas]
y_lasso = lasso_df[TARGET]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_lasso)

lasso_global = LassoCV(cv=5, random_state=42).fit(X_scaled, y_lasso)

print("Mejor alpha global:", lasso_global.alpha_)

coef_global = pd.DataFrame({
    "variable": vars_validas,
    "coef": lasso_global.coef_
}).sort_values("coef", key=abs, ascending=False)

print("\n===== COEFICIENTES LASSO GLOBAL =====")
print(coef_global)

# =============================================================================
# 7. LASSO POR ENTIDAD (COUNTRY)
# =============================================================================

print("\n===== LASSO POR PAÍS =====")
coef_por_pais = {}

for pais, df_country in lasso_df.groupby("country"):
    Xc = df_country[vars_validas]
    yc = df_country[TARGET]

    if len(df_country) < 10:
        continue  # ignorar países con pocas observaciones

    Xc_scaled = StandardScaler().fit_transform(Xc)
    lasso_c = LassoCV(cv=3).fit(Xc_scaled, yc)

    coef_df = pd.DataFrame({
        "variable": vars_validas,
        "coef": lasso_c.coef_
    }).sort_values("coef", key=abs, ascending=False)

    coef_por_pais[pais] = coef_df

print("LASSO por país generado. Ejemplo país:", list(coef_por_pais.keys())[0])
print(coef_por_pais[list(coef_por_pais.keys())[0]])

# =============================================================================
# ARELLANO–BOND MANUAL (compatibilidad con linearmodels antiguos)
# =============================================================================

import numpy as np
from linearmodels.panel import PanelGMM

print("\n===== MODELO ARELLANO–BOND (Manual con PanelGMM) =====")

# Usamos panel_df (ya indexado por country, year)
df_ab = panel_df.dropna().copy()

# Variable dependiente y explicativas
y = df_ab[TARGET]
X = df_ab[vars_validas_fe]

# Construimos diferencias: Δy y ΔX (primeras diferencias)
dy = y.groupby(level=0).diff()
dX = X.groupby(level=0).diff()

# Eliminamos primeras observaciones por país (NaN)
dy = dy.dropna()
dX = dX.dropna()

# -----------------------------------------------------------------------------
# Construcción de instrumentos:
#   Para Arellano–Bond, Δy_{it−1} se instrumenta con niveles y_{it-2}, y_{it-3}, …
# -----------------------------------------------------------------------------

# Creamos el modelo GMM
abond = PanelGMM(dy, dX, None, weights='robust')

# Para cada variable en X:
for col in X.columns:
    # Agregamos instrumentos internos: niveles rezagados 2+ (mínimo estándar AB)
    abond.addinstruments(X[col].groupby(level=0).shift(2))

# Instrumentos para la variable dependiente rezagada
abond.addinstruments(y.groupby(level=0).shift(2))

# Ajustamos modelo GMM en diferencias
abond_res = abond.fit()
print(abond_res)



===== MODELO FE =====
                              PanelOLS Estimation Summary                              
Dep. Variable:     occurrenceCount_publisher   R-squared:                        0.5217
Estimator:                          PanelOLS   R-squared (Between):             -0.1871
No. Observations:                        656   R-squared (Within):               0.5217
Date:                       Mon, Nov 17 2025   R-squared (Overall):              0.0960
Time:                               09:21:21   Log-likelihood                   -1505.6
Cov. Estimator:                    Clustered                                           
                                               F-statistic:                      59.881
Entities:                                 41   P-value                           0.0000
Avg Obs:                              16.000   Distribution:                  F(11,604)
Min Obs:                              16.000                                           
Max Obs: 

ImportError: cannot import name 'PanelGMM' from 'linearmodels.panel' (/Users/ricardoortiz/opt/anaconda3/envs/tf_m1/lib/python3.9/site-packages/linearmodels/panel/__init__.py)