In [None]:
# Notebook para extrair notícias do GDELT por país e combinar com métricas de impacto de satélites e hubs.

# Seção 1: Configuração e Importação de Bibliotecas

import sys, os, math, time, json, re, datetime as dt
from datetime import datetime, timedelta

import requests
import pandas as pd
import numpy as np

# Imports com instalação preguiçosa apenas se faltarem; evita AttributeErrors parciais
try:
    import pycountry
except ImportError:
    import subprocess, sys as _sys
    subprocess.run([_sys.executable, '-m', 'pip', 'install', 'pycountry'], check=False)
    import pycountry

try:
    import plotly.express as px
    import plotly  # para versão correta
except ImportError:
    import subprocess, sys as _sys
    subprocess.run([_sys.executable, '-m', 'pip', 'install', 'plotly'], check=False)
    import plotly.express as px
    import plotly

# Geopandas é pesado; só instala se realmente necessário
try:
    import geopandas as gpd
except ImportError:
    import subprocess, sys as _sys
    subprocess.run([_sys.executable, '-m', 'pip', 'install', 'geopandas'], check=False)
    try:
        import geopandas as gpd
    except Exception as _e:
        gpd = None
        print('Geopandas indisponível (opcional):', _e)

try:
    import ipywidgets as widgets
except ImportError:
    import subprocess, sys as _sys
    subprocess.run([_sys.executable, '-m', 'pip', 'install', 'ipywidgets'], check=False)
    try:
        import ipywidgets as widgets
    except Exception as _e:
        widgets = None
        print('ipywidgets indisponível:', _e)

# Feather não é crítico; tratamos como opcional
try:
    import feather
except ImportError:
    try:
        import subprocess, sys as _sys
        subprocess.run([_sys.executable, '-m', 'pip', 'install', 'feather-format', 'pyarrow'], check=False)
        import feather
    except Exception as _e:
        feather = None
        print('Feather indisponível (usar parquet):', _e)

# Widen pandas' console rendering so wide frames are printed without truncation.
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 140)

# Report library versions. The plotly version is read from the top-level package, with a
# fallback string, because `plotly.express` does not expose `__version__`.
plotly_version = getattr(plotly, '__version__', 'N/D')
print('Versões: pandas', pd.__version__, 'requests', requests.__version__, 'plotly', plotly_version)

# Cache directory, created (if missing) relative to the current working directory.
DATA_CACHE_DIR = 'cache'
os.makedirs(DATA_CACHE_DIR, exist_ok=True)

Collecting pycountry
  Downloading pycountry-24.6.1-py3-none-any.whl.metadata (12 kB)
Downloading pycountry-24.6.1-py3-none-any.whl (6.3 MB)
   ---------------------------------------- 0.0/6.3 MB ? eta -:--:--
   ---------------------------------------  6.3/6.3 MB 55.3 MB/s eta 0:00:01
   ---------------------------------------  6.3/6.3 MB 55.3 MB/s eta 0:00:01
   ---------------------------------------  6.3/6.3 MB 55.3 MB/s eta 0:00:01
   ---------------------------------------  6.3/6.3 MB 55.3 MB/s eta 0:00:01
   ---------------------------------------  6.3/6.3 MB 55.3 MB/s eta 0:00:01
   ---------------------------------------  6.3/6.3 MB 55.3 MB/s eta 0:00:01
   ---------------------------------------  6.3/6.3 MB 55.3 MB/s eta 0:00:01
   ---------------------------------------  6.3/6.3 MB 55.3 MB/s eta 0:00:01
   ---------------------------------------  6.3/6.3 MB 55.3 MB/s eta 0:00:01
   ---------------------------------------  6.3/6.3 MB 55.3 MB/s eta 0:00:01
   -----------------


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting plotly
  Downloading plotly-6.4.0-py3-none-any.whl.metadata (8.5 kB)
Collecting narwhals>=1.15.1 (from plotly)
  Downloading narwhals-2.11.0-py3-none-any.whl.metadata (11 kB)
Downloading plotly-6.4.0-py3-none-any.whl (9.9 MB)
   ---------------------------------------- 0.0/9.9 MB ? eta -:--:--
   ---------------------------------------  9.7/9.9 MB 46.5 MB/s eta 0:00:01
   ---------------------------------------  9.7/9.9 MB 46.5 MB/s eta 0:00:01
   ---------------------------------------- 9.9/9.9 MB 22.0 MB/s eta 0:00:00
Downloading narwhals-2.11.0-py3-none-any.whl (423 kB)
Installing collected packages: narwhals, plotly

   ---------------------------------------- 0/2 [narwhals]
   ---------------------------------------- 0/2 [narwhals]
   ---------------------------------------- 0/2 [narwhals]
   ---------------------------------------- 0/2 [narwhals]
   ---------------------------------------- 0/2 [narwhals]
   ---------------------------------------- 0/2 [narwhals]
   ---


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting geopandas
  Downloading geopandas-1.0.1-py3-none-any.whl.metadata (2.2 kB)
Collecting pyogrio>=0.7.2 (from geopandas)
  Downloading pyogrio-0.11.1-cp39-cp39-win_amd64.whl.metadata (5.4 kB)
Collecting pyproj>=3.3.0 (from geopandas)
  Downloading pyproj-3.6.1-cp39-cp39-win_amd64.whl.metadata (31 kB)
Collecting shapely>=2.0.0 (from geopandas)
  Downloading shapely-2.0.7-cp39-cp39-win_amd64.whl.metadata (7.1 kB)
Downloading geopandas-1.0.1-py3-none-any.whl (323 kB)
Downloading pyogrio-0.11.1-cp39-cp39-win_amd64.whl (19.2 MB)
   ---------------------------------------- 0.0/19.2 MB ? eta -:--:--
   ---- ----------------------------------- 2.1/19.2 MB 11.8 MB/s eta 0:00:02
   ------ --------------------------------- 3.1/19.2 MB 10.3 MB/s eta 0:00:02
   --------- ------------------------------ 4.7/19.2 MB 7.3 MB/s eta 0:00:02
   ------------- -------------------------- 6.3/19.2 MB 8.2 MB/s eta 0:00:02
   ----------------- ---------------------- 8.4/19.2 MB 8.0 MB/s eta 0:00:02
   --


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting feather-format
  Downloading feather-format-0.4.1.tar.gz (3.2 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting pyarrow>=0.4.0 (from feather-format)
  Downloading pyarrow-21.0.0-cp39-cp39-win_amd64.whl.metadata (3.4 kB)
Downloading pyarrow-21.0.0-cp39-cp39-win_amd64.whl (26.2 MB)
   ---------------------------------------- 0.0/26.2 MB ? eta -:--:--
   ---------------- ----------------------- 10.7/26.2 MB 56.1 MB/s eta 0:00:01
   ------------------------------------ --- 23.9/26.2 MB 60.5 MB/s eta 0:00:01
   ---------------------------------------- 26.2/26.2 MB 46.2 MB/s eta 0:00:00
Building wheels for collected packages: feather-format
  Building wheel for feather-format (pyproject.toml


[notice] A new release of pip is available: 25.1.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


AttributeError: module 'plotly.express' has no attribute '__version__'

In [None]:
# Seção 2: Definição de Parâmetros (Países, Datas, Pesos de Impacto)

# ISO 3166-1 alpha-3 codes of the countries to analyse (adjust as needed).
ISO3_COUNTRIES = ['BRA','USA','DEU','FRA','CHN','RUS','ZAF','IND','GBR','CAN']

# Read the clock once so the window is exactly 7 days wide; two separate calls
# would make DATA_FIM a few microseconds later than DATA_INICIO + 7 days.
# NOTE: datetime.utcnow() returns a naive datetime expressed in UTC and is
# deprecated from Python 3.12; kept here so downstream code keeps receiving
# naive values.
_AGORA_UTC = datetime.utcnow()

DATA_INICIO = _AGORA_UTC - timedelta(days=7)  # start of the collection window
DATA_FIM = _AGORA_UTC                         # end of the collection window

FREQUENCIA = 'D'  # 'D' daily, 'W' weekly

# Weights of the three components of the integrated impact score.
ALPHA = 0.5  # news weight
BETA = 0.3   # satellite weight
GAMMA = 0.2  # hub weight



def set_pesos(alpha:float, beta:float, gamma:float):
    """Return the three weights rescaled so that they sum to 1.

    Args:
        alpha: Weight of the news component.
        beta: Weight of the satellite component.
        gamma: Weight of the hub component.

    Returns:
        Tuple ``(alpha, beta, gamma)``. The inputs are returned unchanged when
        they already sum to 1 (within ``math.isclose`` tolerance); otherwise
        each is divided by the sum. When the sum is exactly zero (e.g. every
        dashboard slider at 0) there is no meaningful rescaling, so equal
        weights of 1/3 are returned instead of raising ZeroDivisionError.
    """
    s = alpha + beta + gamma
    if s == 0:
        # Undefined normalisation: fall back to a neutral, equal weighting.
        terco = 1 / 3
        return terco, terco, terco
    if not math.isclose(s, 1.0):
        alpha, beta, gamma = alpha/s, beta/s, gamma/s
    return alpha, beta, gamma



# Rescale the configured weights once so later cells read a set that sums to 1.
ALPHA, BETA, GAMMA = set_pesos(ALPHA,BETA,GAMMA)

print('Pesos normalizados:', ALPHA, BETA, GAMMA)

In [None]:
# Seção 3: Funções de Consulta à API GDELT (Eventos por País)

# Base URL that the query helpers append their endpoint path to.
GDELT_BASE = "https://api.gdeltproject.org/api/v2/events/"


def montar_parametros(country_iso3:str, dt_inicio:datetime, dt_fim:datetime, table:'str'='events'):
    """Build the query-string parameters for one GDELT request.

    Args:
        country_iso3: Country code interpolated into the ``query`` filter.
        dt_inicio: Start of the requested time window.
        dt_fim: End of the requested time window.
        table: Accepted for interface compatibility; not used by this function.

    Returns:
        Dict with the keys ``query``, ``mode``, ``maxrecords``, ``format``,
        ``startdatetime`` and ``enddatetime``; both timestamps are rendered
        as ``yyyymmddhhmmss`` strings.
    """
    formato_gdelt = '%Y%m%d%H%M%S'
    janela = {
        'startdatetime': dt_inicio.strftime(formato_gdelt),
        'enddatetime': dt_fim.strftime(formato_gdelt),
    }
    # Simplified filter; the original author notes it can be extended (theme, actor, ...).
    filtro = {
        'query': f"sourceCountry:{country_iso3}",
        'mode': 'EventOnly',
        'maxrecords': 250,
        'format': 'JSON',
    }
    return {**filtro, **janela}



def consultar_gdelt(country_iso3:str, dt_inicio:datetime, dt_fim:datetime, retries:int=3, sleep_sec:float=1.5):
    """Query GDELT for one country and time window, retrying on failure.

    Args:
        country_iso3: Country code passed to ``montar_parametros``.
        dt_inicio: Start of the time window.
        dt_fim: End of the time window.
        retries: Maximum number of HTTP attempts.
        sleep_sec: Pause, in seconds, between consecutive attempts.

    Returns:
        DataFrame with one row per item of the response's ``events`` list plus
        a ``country_iso3_query`` column holding ``country_iso3``. An empty
        DataFrame is returned when the list is empty or every attempt fails.
        This is best-effort: errors are printed, never raised.
    """
    params = montar_parametros(country_iso3, dt_inicio, dt_fim)
    # Placeholder endpoint kept from the original author: the exact GDELT v2
    # path and the response schema ('events' key) still need to be confirmed.
    url = GDELT_BASE + 'api'
    ultimo_status = None
    for tentativa in range(retries):
        try:
            r = requests.get(url, params=params, timeout=30)
            if r.status_code == 200:
                eventos = r.json().get('events', [])
                if not eventos:
                    return pd.DataFrame()
                df = pd.json_normalize(eventos)
                df['country_iso3_query'] = country_iso3
                return df
            # Remember the status so a run of non-200 answers is not silent.
            ultimo_status = r.status_code
        except Exception as e:
            # Deliberately broad: network errors, invalid JSON and unexpected
            # payload shapes all count as a failed attempt.
            print(f'Erro tentativa {tentativa+1} país {country_iso3}:', e)
        # Pause only when another attempt follows; sleeping after the last one
        # just delays the caller.
        if tentativa < retries - 1:
            time.sleep(sleep_sec)
    if ultimo_status is not None:
        print(f'Falha ao consultar GDELT para {country_iso3}: último status HTTP {ultimo_status}')
    return pd.DataFrame()



def coletar_intervalos(country_iso3:str, ini:datetime, fim:datetime, step_hours:int=6):
    """Collect events for one country by walking ``[ini, fim)`` in fixed steps.

    Args:
        country_iso3: Country code forwarded to ``consultar_gdelt``.
        ini: Start of the overall window.
        fim: End of the overall window; the last step is clipped to it.
        step_hours: Width of each sub-window in hours; must be positive.

    Returns:
        Concatenation (with a fresh index) of every non-empty sub-window
        result, or an empty DataFrame when nothing was collected or
        ``ini >= fim``.

    Raises:
        ValueError: If ``step_hours`` is not positive. A zero or negative step
            would never advance the cursor and loop forever.
    """
    if step_hours <= 0:
        raise ValueError(f'step_hours deve ser positivo (recebido: {step_hours})')
    passo = timedelta(hours=step_hours)
    frames = []
    atual = ini
    while atual < fim:
        proximo = min(atual + passo, fim)
        # NOTE(review): consecutive windows share their boundary instant;
        # whether the API treats the end as inclusive is not visible here.
        dfp = consultar_gdelt(country_iso3, atual, proximo)
        if not dfp.empty:
            frames.append(dfp)
        atual = proximo
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


In [None]:
# Seção 4: Coleta e Consolidação de Eventos GDELT

def coletar_todos_paises(paises, ini, fim, step_hours=6):
    """Collect events for every country in ``paises`` and stack the results.

    Args:
        paises: Iterable of country codes.
        ini: Start of the collection window.
        fim: End of the collection window.
        step_hours: Sub-window width forwarded to ``coletar_intervalos``.

    Returns:
        One DataFrame holding all non-empty per-country results (index
        reset), or an empty DataFrame when no country returned data.
    """
    coletados = []
    for codigo in paises:
        print('Coletando', codigo)
        parcial = coletar_intervalos(codigo, ini, fim, step_hours=step_hours)
        if parcial.empty:
            continue
        coletados.append(parcial)
    if not coletados:
        return pd.DataFrame()
    return pd.concat(coletados, ignore_index=True)



# Fetch events for every configured country over the configured window.
raw_news_df = coletar_todos_paises(ISO3_COUNTRIES, DATA_INICIO, DATA_FIM)
print('Registros de notícias obtidos:', len(raw_news_df))

# Derive the event timestamp and the country column used by the later cells.
if not raw_news_df.empty:
    if 'timecode' not in raw_news_df.columns:
        # Fallback: without a timestamp column every event is stamped with
        # the start of the collection window.
        raw_news_df['event_dt'] = DATA_INICIO
    else:
        # 'timecode' is a placeholder field name; unparseable values become NaT.
        raw_news_df['event_dt'] = pd.to_datetime(raw_news_df['timecode'], errors='coerce')
    raw_news_df['country_iso3'] = raw_news_df.get('country_iso3_query')


In [None]:
# Seção 5: Limpeza e Normalização dos Dados de Notícias

def limpar_noticias(df:pd.DataFrame):
    """Add numeric news metrics to ``df`` and drop duplicated rows.

    The columns ``goldstein``, ``num_mentions`` and ``avg_tone`` are written
    into ``df`` itself: each is the numeric coercion of ``GoldsteinScale``,
    ``NumMentions`` and ``AvgTone`` respectively, or NaN when the source
    column is missing. ``news_intensity`` is then computed as
    ``goldstein * (1 + avg_tone / 100)`` with missing values treated as 0.

    Args:
        df: Raw events frame; modified in place (new columns are added).

    Returns:
        ``df`` itself when it is empty; otherwise the enriched frame with
        duplicated rows removed.
    """
    if df.empty:
        return df

    pares = (
        ('GoldsteinScale', 'goldstein'),
        ('NumMentions', 'num_mentions'),
        ('AvgTone', 'avg_tone'),
    )
    for origem, destino in pares:
        df[destino] = pd.to_numeric(df[origem], errors='coerce') if origem in df.columns else np.nan

    # Simplified example intensity: Goldstein score scaled by the average tone.
    escala = df['goldstein'].fillna(0)
    fator_tom = 1 + (df['avg_tone'].fillna(0)/100)
    df['news_intensity'] = escala * fator_tom
    return df.drop_duplicates()



# Clean a copy so the raw frame is left untouched by the columns limpar_noticias adds.
news_df = limpar_noticias(raw_news_df.copy())

print('Após limpeza:', news_df.shape)

In [None]:
# Seção 6: Carregamento de Dados de Satélites e Hubs (placeholders)

# Aqui simulamos dados; na integração real fazer leitura de Postgres ou arquivos produzidos por dbt.

def carregar_satellites():
    """Return simulated daily satellite metrics for every configured country.

    Produces one row per (country in ISO3_COUNTRIES, day between DATA_INICIO
    and DATA_FIM) with columns ``country_iso3``, ``date`` and ``sat_value``,
    where ``sat_value`` is a uniform random number scaled by 100.
    Placeholder for a real Postgres/dbt source.
    """
    janela = pd.date_range(DATA_INICIO.date(), DATA_FIM.date(), freq='D')
    registros = [
        {'country_iso3': pais, 'date': dia, 'sat_value': np.random.rand()*100}
        for pais in ISO3_COUNTRIES
        for dia in janela
    ]
    return pd.DataFrame(registros)



def carregar_hubs():
    """Return simulated daily hub metrics for every configured country.

    Produces one row per (country in ISO3_COUNTRIES, day between DATA_INICIO
    and DATA_FIM) with columns ``country_iso3``, ``date`` and ``hub_value``,
    where ``hub_value`` is a uniform random number scaled by 50.
    """
    janela = pd.date_range(DATA_INICIO.date(), DATA_FIM.date(), freq='D')
    registros = [
        {'country_iso3': pais, 'date': dia, 'hub_value': np.random.rand()*50}
        for pais in ISO3_COUNTRIES
        for dia in janela
    ]
    return pd.DataFrame(registros)



# Materialise the simulated satellite and hub metrics used by the impact cells.
sat_df = carregar_satellites()

hub_df = carregar_hubs()

print('Satélites shape', sat_df.shape, 'Hubs shape', hub_df.shape)

In [None]:
# Seção 7: Cálculo da Métrica de Impacto Integrada

def normalizar_coluna(df, col):
    """Add a min-max normalised copy of ``df[col]`` as ``df[col + '_norm']``.

    Args:
        df: Frame to modify in place.
        col: Name of the column to normalise.

    Returns:
        The same ``df`` object. When the column has at most one distinct
        value the new column is filled with 0.0, which avoids dividing by a
        zero range.
    """
    alvo = col+'_norm'
    valores = df[col]
    if valores.nunique() > 1:
        menor, maior = valores.min(), valores.max()
        df[alvo] = (valores-menor)/(maior-menor)
    else:
        df[alvo] = 0.0
    return df



def agregar_noticias(df_news:pd.DataFrame, freq:str='D'):
    """Aggregate news intensity per country and calendar day.

    Args:
        df_news: Frame with ``country_iso3``, ``event_dt`` (datetime) and
            ``news_intensity`` columns. A ``date`` column is added to it in
            place.
        freq: Accepted for interface compatibility; this function always
            groups by calendar day and does not read the value.

    Returns:
        Frame with columns ``country_iso3``, ``date``,
        ``news_intensity_sum`` and ``news_intensity_mean``; an empty frame
        with those columns when ``df_news`` is empty.
    """
    colunas_saida = ['country_iso3','date','news_intensity_sum','news_intensity_mean']
    if df_news.empty:
        return pd.DataFrame(columns=colunas_saida)

    df_news['date'] = df_news['event_dt'].dt.date
    por_pais_dia = df_news.groupby(['country_iso3','date'])
    resumo = por_pais_dia.agg(
        news_intensity_sum=('news_intensity','sum'),
        news_intensity_mean=('news_intensity','mean'),
    )
    return resumo.reset_index()



# Aggregate cleaned news per country and day.
news_agg = agregar_noticias(news_df, FREQUENCIA)



# agregar_noticias yields Python date objects; convert them to pandas datetimes
# so the column can be used as a merge key against the sat/hub frames below.

if not news_agg.empty:

    news_agg['date'] = pd.to_datetime(news_agg['date'])



# Min-max normalise the satellite and hub values. The min and max are taken over
# the whole frame (all countries and days together), not per country or per day.

sat_norm = sat_df.copy()

hub_norm = hub_df.copy()

sat_norm = normalizar_coluna(sat_norm, 'sat_value')

hub_norm = normalizar_coluna(hub_norm, 'hub_value')



# Left-join on the news aggregate: country/day pairs without news are not kept,
# and pairs without satellite or hub data get a metric of 0.

impact_base = news_agg.merge(sat_norm, on=['country_iso3','date'], how='left').merge(hub_norm, on=['country_iso3','date'], how='left')

impact_base['sat_metric'] = impact_base['sat_value_norm'].fillna(0)

impact_base['hub_metric'] = impact_base['hub_value_norm'].fillna(0)

# Normalise the news sum on a copy so only the resulting series is attached here.
impact_base['news_metric'] = normalizar_coluna(impact_base.copy(), 'news_intensity_sum')['news_intensity_sum_norm'] if not impact_base.empty else []



# Weighted combination of the three metrics, then min-max normalised into impact_score.

if not impact_base.empty:

    impact_base['impact_raw'] = ALPHA*impact_base['news_metric'] + BETA*impact_base['sat_metric'] + GAMMA*impact_base['hub_metric']

    impact_base = normalizar_coluna(impact_base, 'impact_raw')

    impact_base.rename(columns={'impact_raw_norm':'impact_score'}, inplace=True)

else:

    # Keep the column present so later cells can rely on the schema.
    impact_base['impact_score'] = []



print('Impact base size:', impact_base.shape)

In [None]:
# Seção 8: Agregação Temporal e por País

def agregacao_final(df:pd.DataFrame, freq:str='D'):
    """Aggregate ``impact_score`` per country and period.

    Args:
        df: Frame with ``country_iso3``, ``date`` and ``impact_score``
            columns. A ``date_period`` column is added to it in place.
        freq: ``'D'`` keeps ``date`` as the period (daily behaviour, values
            untouched). Any other pandas period alias (e.g. ``'W'``) buckets
            the dates into that period and labels each bucket with the
            period's start timestamp.

    Returns:
        Frame with columns ``country_iso3``, ``date_period``,
        ``impact_score_mean`` and ``impact_score_sum``. The empty result
        uses the same column names as the populated one.
    """
    if df.empty:
        return pd.DataFrame(columns=['country_iso3','date_period','impact_score_mean','impact_score_sum'])

    if freq == 'D':
        df['date_period'] = df['date']
    else:
        # Honour the requested frequency instead of silently grouping by day.
        df['date_period'] = pd.to_datetime(df['date']).dt.to_period(freq).dt.start_time

    agg = df.groupby(['country_iso3','date_period']).agg(
        impact_score_mean=('impact_score','mean'),
        impact_score_sum=('impact_score','sum'),
    ).reset_index()
    return agg



# Per-country, per-period impact summary consumed by the ranking, map, charts and export cells.
final_agg = agregacao_final(impact_base, FREQUENCIA)

print('Final aggregation shape:', final_agg.shape)

In [None]:
# Seção 9: Geração de Tabelas Resumo

def ranking_paises(df:pd.DataFrame, top:int=15):
    """Rank countries by their mean ``impact_score_mean``.

    Args:
        df: Frame with ``country_iso3`` and ``impact_score_mean`` columns.
        top: Maximum number of countries to return.

    Returns:
        Frame with columns ``country_iso3`` and ``impact_score_mean``
        sorted from highest to lowest, truncated to ``top`` rows; an empty
        frame with those columns when ``df`` is empty.
    """
    if df.empty:
        return pd.DataFrame(columns=['country_iso3','impact_score_mean'])

    media_por_pais = df.groupby('country_iso3')['impact_score_mean'].mean().reset_index()
    ordenado = media_por_pais.sort_values('impact_score_mean', ascending=False)
    return ordenado.head(top)



# Top-15 ranking, rendered with a colour gradient over the numeric column.
ranking_df = ranking_paises(final_agg, 15)

print('Ranking de países (top 15):')

display(ranking_df.style.background_gradient(cmap='viridis'))

In [None]:
# Seção 10: Visualização Geoespacial (Mapas Coropléticos)

def carregar_mapa_mundo():
    """Load the world country polygons used for the choropleth.

    Returns:
        GeoDataFrame with the ``iso_a3`` column renamed to ``country_iso3``.
        On any failure the error is printed and an empty frame is returned,
        so callers only need to check ``.empty``. When geopandas could not
        be imported (``gpd`` is None) a plain empty pandas DataFrame is
        returned, because no GeoDataFrame can be constructed.
    """
    if gpd is None:
        # The import cell sets gpd to None when geopandas is unavailable; the
        # error path below would otherwise raise AttributeError itself.
        print('Erro carregando mapa mundial:', 'geopandas indisponível')
        return pd.DataFrame()
    try:
        # NOTE(review): `geopandas.datasets` was removed in geopandas 1.0; on
        # such versions this call fails and the except branch is taken —
        # confirm and switch to a locally stored Natural Earth file if needed.
        world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
        # Align the ISO column name with the rest of the notebook.
        return world.rename(columns={'iso_a3':'country_iso3'})
    except Exception as e:
        print('Erro carregando mapa mundial:', e)
        return gpd.GeoDataFrame()



world = carregar_mapa_mundo()

# Draw the map only when both the geometry and the impact data are available.
if not world.empty and not final_agg.empty:

    # Keep the most recent period of each country.
    latest = final_agg.sort_values('date_period').groupby('country_iso3').tail(1)

    # Left join: countries without impact data stay on the map without a score.
    geo_df = world.merge(latest, on='country_iso3', how='left')

    fig_map = px.choropleth(geo_df.to_crs('EPSG:4326'), locations='country_iso3', color='impact_score_mean', hover_name='country_iso3', projection='natural earth', title='Impacto Integrado por País (Último Período)')

    fig_map.show()

else:

    print('Mapa não gerado (dados insuficientes).')

In [None]:
# Seção 11: Visualizações Temporais (Séries / Heatmaps)

def serie_temporal(df:pd.DataFrame, pais:str):
    """Plot the impact time series of one country.

    Args:
        df: Aggregated frame with ``country_iso3``, ``date_period`` and
            ``impact_score_mean`` columns.
        pais: Country code to select.

    Returns:
        The selected rows sorted by ``date_period`` after showing a line
        chart, or None (after printing a notice) when the country has no
        rows.
    """
    do_pais = df.loc[df['country_iso3'] == pais]
    serie = do_pais.sort_values('date_period')
    if serie.empty:
        print('Sem dados para', pais)
        return None

    grafico = px.line(serie, x='date_period', y='impact_score_mean', title=f'Série de Impacto - {pais}')
    grafico.show()
    return serie



def heatmap_paises(df:pd.DataFrame):
    """Show a country x period heatmap of the mean impact score.

    Args:
        df: Aggregated frame with ``country_iso3``, ``date_period`` and
            ``impact_score_mean`` columns. Nothing is drawn when it is empty.

    Returns:
        None. Country/period cells without data are shown as 0.
    """
    if df.empty:
        return

    matriz = df.pivot_table(
        index='country_iso3',
        columns='date_period',
        values='impact_score_mean',
        aggfunc='mean',
    ).fillna(0)
    rotulos = dict(x='Período', y='País', color='Impacto')
    grafico = px.imshow(matriz.values, labels=rotulos, x=matriz.columns, y=matriz.index, aspect='auto', title='Heatmap Impacto por País x Período')
    grafico.show()



# Render the overview heatmap and, as an example, the series of the first configured country.
if not final_agg.empty:

    heatmap_paises(final_agg)

    serie_temporal(final_agg, ISO3_COUNTRIES[0])

In [None]:
# Seção 12: Dashboard Interativo com ipywidgets (Atualizado para fonte real/placeholder)

# Dashboard controls. NOTE: the import cell sets `widgets` to None when ipywidgets
# cannot be imported; in that case the lines below raise AttributeError.

# Country multi-select, pre-selecting the first three configured countries.
pais_widget = widgets.SelectMultiple(options=ISO3_COUNTRIES, value=tuple(ISO3_COUNTRIES[:3]), description='Países')

# One slider per weight, initialised with the current normalised weights.
alpha_widget = widgets.FloatSlider(min=0,max=1,step=0.05,value=ALPHA,description='Alpha Notícias')

beta_widget = widgets.FloatSlider(min=0,max=1,step=0.05,value=BETA,description='Beta Sat')

gamma_widget = widgets.FloatSlider(min=0,max=1,step=0.05,value=GAMMA,description='Gamma Hub')

# Data source toggle; defaults to 'real' only when a non-empty `real_base` already exists.
fonte_widget = widgets.ToggleButtons(options=['placeholder','real'], value='real' if 'real_base' in globals() and not real_base.empty else 'placeholder', description='Fonte')

# Output area that recalcular() clears and redraws into.
out_dashboard = widgets.Output()



def recalcular(_):
    """Recompute and redraw the dashboard from the current widget values.

    Used as an ipywidgets observer, so the single argument (the change
    notification) is ignored. Everything is rendered inside
    ``out_dashboard``, which is cleared first.

    The 'real' source is used when it is selected and a non-empty
    ``real_base`` global exists; otherwise the placeholder ``impact_base``
    is used. In both cases the impact score is recomputed with the slider
    weights, filtered to the selected countries, plotted as one line per
    country and followed by a ranking table.
    """
    with out_dashboard:
        out_dashboard.clear_output()

        try:
            a,b,g = set_pesos(alpha_widget.value, beta_widget.value, gamma_widget.value)
        except ZeroDivisionError:
            # All three sliders at zero: weight normalisation divides by the
            # sum, so report the problem instead of leaving a traceback.
            print('Pesos inválidos: a soma de alpha, beta e gamma deve ser maior que zero.')
            return

        sel = list(pais_widget.value)
        usar_real = fonte_widget.value == 'real' and 'real_base' in globals() and not real_base.empty

        if usar_real:
            base_df = real_base.copy()
            # event_cnt_norm, tone_norm and hub_events_norm are already
            # present; only the weighted combination needs recomputing.
            base_df['impact_raw'] = a*base_df['event_cnt_norm'] + b*base_df['tone_norm'] + g*base_df['hub_events_norm']
            base_df['impact_score'] = safe_norm(base_df['impact_raw'])
            base_df['date_period'] = pd.to_datetime(base_df['date'])
            base_df = base_df[base_df.country_iso3.isin(sel)]
            agg = base_df.groupby(['country_iso3','date_period']).impact_score.mean().reset_index(name='impact_score_mean')
        else:
            if 'impact_base' not in globals() or impact_base.empty:
                print('Sem dados placeholder disponíveis.')
                return
            temp = impact_base.copy()
            temp['impact_raw'] = a*temp['news_metric'] + b*temp['sat_metric'] + g*temp['hub_metric']
            temp = normalizar_coluna(temp, 'impact_raw')
            temp.rename(columns={'impact_raw_norm':'impact_score'}, inplace=True)
            # Copy the filtered slice: agregacao_final writes a column into the
            # frame it receives, which on a slice triggers SettingWithCopyWarning.
            temp = temp[temp.country_iso3.isin(sel)].copy()
            agg = agregacao_final(temp, FREQUENCIA)

        if agg.empty:
            print('Sem dados após filtro.')
            return

        fig = px.line(agg, x='date_period', y='impact_score_mean', color='country_iso3', title=f'Impacto ({fonte_widget.value})')
        fig.show()

        rank = agg.groupby('country_iso3').impact_score_mean.mean().reset_index().sort_values('impact_score_mean', ascending=False)
        display(rank.head(len(sel)))



# Redraw whenever any control's value changes.
for w in [pais_widget, alpha_widget, beta_widget, gamma_widget, fonte_widget]:

    w.observe(recalcular, names='value')



# Layout: country selector on the left, sliders and source toggle stacked on the right.
display(widgets.HBox([pais_widget, widgets.VBox([alpha_widget,beta_widget,gamma_widget,fonte_widget])]))

display(out_dashboard)

# Initial render with the default widget values.
recalcular(None)

In [None]:
# Seção 13: Exportação dos Resultados (CSV / Parquet)

# Suffix shared by every file exported in this run (UTC, second resolution).
timestamp = datetime.utcnow().strftime('%Y%m%d_%H%M%S')


def exportar(df:pd.DataFrame, nome_base:str):
    """Write ``df`` to ``<nome_base>_<timestamp>.csv`` and ``.parquet``.

    Args:
        df: Frame to export. Nothing is written when it is empty.
        nome_base: Path prefix of the output files; the module-level
            ``timestamp`` and the extension are appended to it.

    Returns:
        None. The CSV is always written for a non-empty frame. The parquet
        export is best-effort: a failure is printed, and the final message
        then lists only the files that were actually written.
    """
    if df.empty:
        print('Nada para exportar em', nome_base)
        return

    csv_path = f'{nome_base}_{timestamp}.csv'
    parquet_path = f'{nome_base}_{timestamp}.parquet'

    df.to_csv(csv_path, index=False)
    exportados = [csv_path]

    try:
        df.to_parquet(parquet_path, index=False)
    except Exception as e:
        print('Falha exportar parquet (instale pyarrow):', e)
    else:
        exportados.append(parquet_path)

    print('Exportado:', *exportados)



# Export the per-country, per-period summary; files are created relative to the working directory.
exportar(final_agg, 'impacto_pais_periodo')

In [None]:
# Seção 14: Rotina de Agendamento / Execução Automatizada (Opcional)

def rotina_principal():
    """Recompute the raw impact from ``impact_base`` with the current weights.

    Simplified example of a routine that could be scheduled.

    Returns:
        A copy of ``impact_base`` with ``impact_raw`` recomputed from the
        global ALPHA/BETA/GAMMA weights and an ``impact_raw_norm`` column
        added by ``normalizar_coluna``; None (after printing a notice) when
        ``impact_base`` is empty. The elapsed time and row count are printed.
    """
    t0 = time.time()

    if impact_base.empty:
        print('Sem base para rotina.')
        return

    recalculado = impact_base.copy()
    recalculado['impact_raw'] = (
        ALPHA*recalculado['news_metric']
        + BETA*recalculado['sat_metric']
        + GAMMA*recalculado['hub_metric']
    )
    recalculado = normalizar_coluna(recalculado, 'impact_raw')

    dur = time.time() - t0
    print(f'Rotina concluída em {dur:.2f}s. Linhas processadas: {len(recalculado)}')
    return recalculado


# Run the routine when this code is executed as the main module.
if __name__ == '__main__':
    rotina_principal()



# Fim do notebook.

In [None]:
# Integração Postgres: Conexão e Carregamento de hub_event / sat_event (Nova Seção)

import sqlalchemy as sa

from sqlalchemy.engine import URL



# Connection settings are read from the standard libpq-style environment variables,
# each with a fallback default.
# NOTE(review): the fallbacks embed a user name and password in the notebook; consider
# requiring PGUSER/PGPASSWORD to be set instead of shipping default credentials.
PG_HOST = os.getenv('PGHOST','postgres')

PG_DB   = os.getenv('PGDATABASE','gdelt')

PG_USER = os.getenv('PGUSER','gdelt_user')

PG_PWD  = os.getenv('PGPASSWORD','gdelt_pass')

# Kept as a string, exactly as read from the environment.
PG_PORT = os.getenv('PGPORT','5432')

# Schema that load_table() prefixes to every table name.
PG_SCHEMA = 'silver'



# Build the URL from its components rather than formatting a connection string by hand.
conn_url = URL.create(

    drivername='postgresql+psycopg2',

    username=PG_USER,

    password=PG_PWD,

    host=PG_HOST,

    port=PG_PORT,

    database=PG_DB

)

engine = sa.create_engine(conn_url)



def load_table(table_name:str):
    """Load up to 200000 rows of ``<PG_SCHEMA>.<table_name>`` into a DataFrame.

    Args:
        table_name: Table name, interpolated directly into the SQL text, so
            it must come from trusted code and never from user input.

    Returns:
        The loaded frame, or an empty DataFrame when the query fails (the
        error is printed rather than raised).
    """
    full_name = f'{PG_SCHEMA}.{table_name}'
    consulta = f'SELECT * FROM {full_name} LIMIT 200000'
    try:
        carregado = pd.read_sql(consulta, engine)
        print(f'Carregado {len(carregado)} linhas de {full_name}')
        return carregado
    except Exception as e:
        print('Erro carregando', full_name, e)
        return pd.DataFrame()



hub_event_df = load_table('hub_event')

sat_event_df = load_table('sat_event')



# Parsing de localização / países a partir de sat_event.locations

def extract_country_codes(loc_str):
    """Extract the distinct country codes from a GDELT-style locations string.

    The string is split on ``;`` into entries and each entry on ``#``; the
    third ``#`` field is taken as the country code (layout assumed by the
    original author: ``geonameid#type#countrycode#adm1code#latitude#
    longitude#featureid`` — confirm against the real table).

    Args:
        loc_str: Locations string. Non-strings and blank strings yield [].

    Returns:
        Upper-cased codes made of 2 or 3 ASCII letters, without duplicates,
        in order of first appearance (deterministic across runs, unlike the
        order of a set). Surrounding whitespace around a code is ignored.
    """
    if not isinstance(loc_str, str) or not loc_str.strip():
        return []

    # NOTE(review): GDELT location fields usually carry FIPS 10-4 country
    # codes, not ISO 3166 — confirm before treating 2-letter codes as ISO
    # alpha-2 downstream.
    vistos = {}  # dict used as an insertion-ordered set
    for entrada in loc_str.split(';'):
        segs = entrada.split('#')
        if len(segs) < 3:
            continue
        cc = segs[2].strip().upper()
        if re.fullmatch(r'[A-Z]{2,3}', cc):
            vistos[cc] = None
    return list(vistos)



if 'locations' in sat_event_df.columns:
    # One list of codes per event, then one row per (event, code).
    sat_event_df['country_codes'] = sat_event_df['locations'].apply(extract_country_codes)
    sat_exploded = sat_event_df.explode('country_codes')
else:
    # No location data: give every row a missing code. A scalar broadcasts to
    # any number of rows, whereas assigning [] raises ValueError (length
    # mismatch) as soon as the frame has rows.
    sat_event_df['country_codes'] = None
    sat_exploded = sat_event_df


# Lookup used to map ISO 3166-1 alpha-2 codes to alpha-3 when needed.
iso2_to_iso3 = {c.alpha_2: c.alpha_3 for c in pycountry.countries}

def to_iso3(code):
    """Normalise a country code to a 3-letter upper-case code.

    Args:
        code: Candidate code; anything that is not a string yields None.

    Returns:
        The upper-cased input when it has 3 characters (not validated
        against any country list); the ``iso2_to_iso3`` mapping of a
        2-character code when it is known; None otherwise.
    """
    if not isinstance(code, str):
        return None

    normalizado = code.upper()
    tamanho = len(normalizado)
    if tamanho == 3:
        return normalizado
    if tamanho == 2:
        # Unknown 2-letter codes map to None.
        return iso2_to_iso3.get(normalizado)
    return None



sat_exploded['country_iso3'] = sat_exploded['country_codes'].apply(to_iso3)

# Keep only rows with a resolved country. The explicit copy makes the result an
# independent frame, so the column assignment below does not write into a
# filtered slice (which triggers pandas' SettingWithCopyWarning).
sat_exploded = sat_exploded[~sat_exploded['country_iso3'].isna()].copy()

# Prefer the event date; fall back to the load date when it is not present.
date_col = 'date_event' if 'date_event' in sat_exploded.columns else 'load_date'
sat_exploded['date'] = pd.to_datetime(sat_exploded[date_col]).dt.date



# Métricas agregadas de satélite (exemplo: média tone e polarity, contagem eventos)

# Per country and day: mean tone, mean polarity and number of satellite event rows.
sat_metrics = sat_exploded.groupby(['country_iso3','date']).agg(

    tone_mean=('tone','mean'),

    polarity_mean=('polarity','mean'),

    event_cnt=('event_hk','count')

).reset_index()



# hub_event aggregation: number of distinct events per country and day.

# The join below obtains country and date for each hub event from sat_exploded via event_hk.

if not hub_event_df.empty and 'event_hk' in hub_event_df.columns and 'event_hk' in sat_exploded.columns:

    # Left join: a hub event matching several (country, date) rows is counted once in each.
    hub_with_country = hub_event_df.merge(sat_exploded[['event_hk','country_iso3','date']], on='event_hk', how='left')

    hub_metrics = hub_with_country.groupby(['country_iso3','date']).agg(hub_events=('event_hk','nunique')).reset_index()

else:

    # Empty frame with the expected columns so the merge below still works.
    hub_metrics = pd.DataFrame(columns=['country_iso3','date','hub_events'])



# Combine the real metrics into one base; country/day pairs without hub events get 0.

real_base = sat_metrics.merge(hub_metrics, on=['country_iso3','date'], how='left')

real_base['hub_events'] = real_base['hub_events'].fillna(0)



# Normalizações

def safe_norm(series):
    """Min-max normalise a Series into the [0, 1] range.

    Args:
        series: Numeric pandas Series.

    Returns:
        ``series`` itself when it is empty; a Series of zeros with the same
        index when the minimum equals the maximum (avoids dividing by a
        zero range); otherwise ``(series - min) / (max - min)``.
    """
    if series.empty:
        return series

    menor, maior = series.min(), series.max()
    if menor != maior:
        return (series-menor)/(maior-menor)
    return pd.Series([0]*len(series), index=series.index)



# Min-max normalise each metric over the whole frame; missing tone/polarity means are
# treated as 0 before normalising.
real_base['tone_norm'] = safe_norm(real_base['tone_mean'].fillna(0))

real_base['polarity_norm'] = safe_norm(real_base['polarity_mean'].fillna(0))

real_base['event_cnt_norm'] = safe_norm(real_base['event_cnt'])

real_base['hub_events_norm'] = safe_norm(real_base['hub_events'])



# Example mapping onto the three impact components: event count stands in for the news
# metric, tone for the satellite metric and hub event count for the hub metric.
# polarity_norm is computed above but is not part of this score.

real_base['impact_raw'] = ALPHA*real_base['event_cnt_norm'] + BETA*real_base['tone_norm'] + GAMMA*real_base['hub_events_norm']

real_base['impact_score'] = safe_norm(real_base['impact_raw'])



print('Real base shape:', real_base.shape)



# Preview: countries ranked by their mean impact score over all dates.

real_ranking = real_base.groupby('country_iso3').impact_score.mean().reset_index().sort_values('impact_score', ascending=False)

display(real_ranking.head(15))



# Flag announcing that real data is available. It is not read anywhere in the visible
# code: the dashboard checks for a non-empty `real_base` global instead.

USE_REAL_DATA = True
