In [1]:
# ==============================================================================
# SENTINELA FLUVIAL - MEDGEMMA 4B INFERENCE PIPELINE
# ==============================================================================
# Installing dependencies for 4-bit execution (memory efficient)
!pip install -q transformers

print("✅ All packages installed!")

✅ All packages installed!


In [2]:
import json
import os

from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Authenticate against the Hugging Face Hub.
# The access token is never written in the notebook: it is read from the
# Kaggle secrets store under the label "HF_TOKEN".
user_secrets = UserSecretsClient()
hf_token = user_secrets.get_secret("HF_TOKEN")
login(hf_token)

print("✅ Logged in to Hugging Face!")

✅ Logged in to Hugging Face!


In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Hub identifier of the checkpoint used for every generation in this notebook.
model_id = "google/medgemma-4b-it"

print(f"Loading model and tokenizer: {model_id}...")

# The tokenizer is fetched first; the weights are then loaded in bfloat16 with
# device placement delegated to `device_map="auto"`.
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", dtype=torch.bfloat16)

print("✅ Model loaded successfully!")

Loading model and tokenizer: google/medgemma-4b-it...


tokenizer_config.json:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.69M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/33.4M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/35.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

chat_template.jinja:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

2026-02-23 04:17:59.394742: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1771820279.585104      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1771820279.636607      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1771820280.096490      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771820280.096539      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1771820280.096543      24 computation_placer.cc:177] computation placer alr

model.safetensors.index.json:   0%|          | 0.00/90.6k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.64G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

✅ Model loaded successfully!


In [4]:
# Dataset and output paths
dataset_path = '/kaggle/input/datasets/richardsonsouza/prompts-medgemma-dataset/dataset_prompts_medgemma.jsonl'
# Note: /kaggle/input is read-only, so results are written under /kaggle/working/.
output_path = '/kaggle/working/result.json'

print(f"STARTING BATCH PROCESSING: {dataset_path}\n")

# Generation configuration (forwarded as keyword arguments to model.generate)
gen_config = {
    "max_new_tokens": 1024,
    "temperature": 0.2,
    "do_sample": True,
    "top_p": 0.95
}

# Sampling is enabled above, so seed torch's RNG to make a re-run of this cell
# repeatable.
SEED = 42
torch.manual_seed(SEED)

# Test limit (set to None to process the entire file)
LIMITE_TESTE = 50
contador = 0
results = []


def parse_json_response(text):
    """Parse a model answer that is expected to contain a JSON document.

    A leading Markdown fence (```json or ```) and a trailing ``` fence are
    removed first. If the remaining text is not valid JSON, a second attempt
    is made on the span from the first '{' to the last '}'.

    Args:
        text: Decoded model output.

    Returns:
        The parsed JSON value (whatever `json.loads` returns for the payload).

    Raises:
        ValueError: If no JSON could be parsed. `json.JSONDecodeError` is a
            subclass of `ValueError`, so callers only need to catch the latter.
    """
    payload = text.strip()
    if payload.startswith("```json"):
        payload = payload[7:]
    elif payload.startswith("```"):
        payload = payload[3:]
    if payload.endswith("```"):
        payload = payload[:-3]
    payload = payload.strip()

    try:
        return json.loads(payload)
    except json.JSONDecodeError:
        # Fallback: the model may have wrapped the JSON object in extra prose.
        start_idx = payload.find('{')
        end_idx = payload.rfind('}')
        if start_idx == -1 or end_idx == -1:
            raise ValueError("No JSON delimiters found") from None
        return json.loads(payload[start_idx:end_idx + 1])


# True only when the dataset loop ended normally (end of file or test limit).
loop_completed = False

try:
    with open(dataset_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f, start=1):
            # 1. Safety stop for testing
            if LIMITE_TESTE and contador >= LIMITE_TESTE:
                print(f"Test limit reached ({LIMITE_TESTE} cases). Stopping...")
                break

            # 2. JSON parsing (blank or malformed lines are skipped instead of
            #    aborting the whole batch)
            if not line.strip():
                continue
            try:
                case = json.loads(line)
            except json.JSONDecodeError as parse_error:
                print(f"Skip: Invalid JSON on dataset line {line_number}: {parse_error}")
                continue

            # 3. Metadata extraction (a null "meta" is treated as empty)
            meta = case.get('meta') or {}
            municipio = meta.get('municipality', 'Unknown')
            competencia = meta.get('competence', 'N/A')
            estacao = meta.get('season', 'N/A')

            # 4. Prompt extraction
            prompt_text = case.get('prompt')

            if not prompt_text:
                print(f"Skip: Empty prompt for ID {case.get('id')}")
                continue

            # 5. Model formatting
            chat = [
                { "role": "user", "content": prompt_text },
            ]
            formatted_prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)

            # 6. Inference
            # The templated text already carries the special tokens, so they must
            # not be added a second time when tokenizing. The tensors are moved to
            # the model's own device rather than a hard-coded "cuda".
            inputs = tokenizer(
                formatted_prompt,
                return_tensors="pt",
                add_special_tokens=False,
            ).to(model.device)

            with torch.no_grad():
                outputs = model.generate(**inputs, **gen_config)

            # 7. Decoding (keep only the tokens generated after the prompt)
            generated_tokens = outputs[0][inputs.input_ids.shape[-1]:]
            clean_response = tokenizer.decode(generated_tokens, skip_special_tokens=True)

            # 8. Report display
            print(f"CASE {contador + 1}: {str(municipio).upper()} ({competencia}) | Season: {estacao}")
            print("-" * 80)
            print("MEDGEMMA RESPONSE (JSON):")
            print(clean_response)
            print("=" * 80 + "\n")

            # 9. Result formatting: parsed JSON under "response", or the raw text
            #    under "response_raw" when the answer cannot be parsed.
            record = {"id": case.get('id'), "meta": meta}
            try:
                record["response"] = parse_json_response(clean_response)
            except (ValueError, RecursionError):
                print("Warning: Response is not valid JSON. Storing as raw string.")
                record["response_raw"] = clean_response
            results.append(record)

            # 10. Memory management and increment
            del inputs, outputs
            torch.cuda.empty_cache()
            contador += 1

    loop_completed = True

except FileNotFoundError:
    print("Error: Dataset file not found. Check the path.")
except Exception as e:
    print(f"Unexpected error: {e}")
finally:
    # Persist whatever was generated, even if the run was interrupted or failed
    # part-way, so completed (and expensive) generations are not lost.
    if loop_completed or results:
        try:
            with open(output_path, 'w', encoding='utf-8') as out_f:
                json.dump(results, out_f, indent=4, ensure_ascii=False)
            print(f"✅ Successfully persisted {len(results)} results to {output_path}")
        except OSError as write_error:
            print(f"Error: could not write results to {output_path}: {write_error}")

print("Processing completed.")

STARTING BATCH PROCESSING: /kaggle/input/datasets/richardsonsouza/prompts-medgemma-dataset/dataset_prompts_medgemma.jsonl

CASE 1: MANAUS (2024-10) | Season: Vazante
--------------------------------------------------------------------------------
MEDGEMMA RESPONSE (JSON):
```json
{
  "analise_situacional": "A unidade móvel em Vazante, com foco em aferição de pressão arterial, enfrenta desafios específicos devido à estação seca e ao acesso difícil. A baixa vazão dos rios dificulta a logística e a disponibilidade de insumos. A prioridade é a prevenção e a atenção primária, com a aferição de pressão arterial como procedimento chave. A análise da inteligência computacional é limitada, mas a sazonalidade da estação Vazante exige uma abordagem estratégica. A baixa vazão dos rios pode aumentar o risco de água parada e contaminação de poços, impactando a saúde da população. A aferição de pressão arterial é crucial para identificar hipertensos, que são mais vulneráveis a complicações cardiovasc