In [None]:
import os

import pandas as pd
import requests

In [None]:
# 1. Example URL of a public API or data source
DATA_URL = "https://api.portaldatransparencia.gov.br/api/v1/some-dataset"

# The API key is read from the environment so that no credential is ever
# stored in (or committed with) the notebook. When the variable is unset the
# key is the empty string, which is falsy, so code that tests `if API_KEY`
# will skip authentication instead of sending a placeholder value.
API_KEY = os.environ.get("PORTAL_TRANSPARENCIA_API_KEY", "")

In [None]:
# 2. Function to fetch data
def fetch_data(url, headers=None, timeout=30):
    """Download a JSON payload from ``url`` and load it into a DataFrame.

    Parameters
    ----------
    url : str
        Endpoint requested with HTTP GET.
    headers : dict, optional
        Extra HTTP headers (e.g. authentication) forwarded to ``requests.get``.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up (default 30).
        ``requests`` applies no timeout unless one is given, so without this
        an unresponsive server would hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The decoded JSON as a DataFrame, or an empty DataFrame when the
        request, the JSON decoding or the DataFrame construction fails. In
        that case the error is printed instead of being raised.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()  # Turn HTTP 4xx/5xx responses into exceptions
        return pd.DataFrame(response.json())
    except Exception as e:
        # Best-effort by design: callers test ``.empty`` rather than catching.
        print(f"Error fetching data: {e}")
        return pd.DataFrame()


In [None]:
# 3. Function to validate the data
def validate_data(df):
    """Print a basic data-quality report for ``df``.

    The report covers the row count, the columns that contain missing
    values, the number of fully duplicated rows and summary statistics for
    the numeric columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect. It is not modified.

    Returns
    -------
    None
        The report is written to standard output.
    """
    print("\n===== Data Validation =====")

    # Check dataset size
    print(f"Number of rows: {len(df)}")

    # Check for missing values (only columns that actually have some are shown)
    missing_values = df.isnull().sum()
    print("\nMissing values per column:")
    print(missing_values[missing_values > 0])

    # Check for duplicates
    try:
        duplicates = df.duplicated().sum()
    except TypeError:
        # Nested JSON produces list/dict cells, which are unhashable and make
        # ``duplicated`` raise. Compare their string representations instead.
        duplicates = df.astype(str).duplicated().sum()
    print(f"\nDuplicated rows: {duplicates}")

    # Check consistency of numeric fields. "number" selects every numeric
    # dtype (int32, float32, nullable Int64, ...), not only int64/float64.
    numeric_cols = df.select_dtypes(include="number").columns
    print("\nStatistical summary of numeric fields:")
    if len(numeric_cols) > 0:
        print(df[numeric_cols].describe())
    else:
        # ``describe`` raises ValueError on a frame without columns.
        print("No numeric columns found.")


In [None]:
# 4. Function to check last update
def check_last_update(df, date_column, max_age_days=30):
    """Report the most recent date in ``date_column`` and flag stale data.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataset to inspect. It is not modified: dates are parsed into a
        temporary Series so unparseable values in the caller's frame are
        not overwritten with ``NaT``.
    date_column : str
        Name of the column holding the record dates.
    max_age_days : int, optional
        Age in whole days above which the data is reported as outdated
        (default 30).

    Returns
    -------
    None
        The findings are written to standard output.
    """
    if date_column not in df.columns:
        print("Date column not found for validation.")
        return

    # Values that cannot be parsed become NaT and are ignored by ``max``.
    dates = pd.to_datetime(df[date_column], errors='coerce')
    last_date = dates.max()
    print(f"\nLast recorded date: {last_date}")

    if pd.isna(last_date):
        # Empty column or nothing parseable: an age cannot be computed, and
        # comparing against NaT would silently fall through to "up to date".
        print("WARNING: No valid dates found; data freshness could not be checked.\n")
        return

    # Use the same timezone awareness as the data: subtracting a tz-aware
    # timestamp from a naive "now" (or vice versa) raises TypeError.
    now = pd.Timestamp.now(tz=last_date.tzinfo)
    if (now - last_date).days > max_age_days:
        print("WARNING: Data may be outdated!\n")
    else:
        print("Data is up to date.\n")


In [None]:
# 5. Run everything

# Attach an Authorization header only when an API key has been configured.
# NOTE(review): this API may expect the key in a dedicated header (possibly
# `chave-api-dados`) rather than a Bearer token; confirm against the
# provider's documentation.
if API_KEY:
    headers = {"Authorization": f"Bearer {API_KEY}"}
else:
    headers = None

# Fetch the dataset; fetch_data hands back an empty DataFrame on failure
print("Downloading data...")
data = fetch_data(DATA_URL, headers)

if data.empty:
    print("No data was downloaded.")
else:
    # Print the data-quality report
    validate_data(data)

    # Freshness check: replace "dataAtualizacao" with the actual date column name
    check_last_update(data, date_column="dataAtualizacao")

    # Persist the downloaded data for later use
    data.to_csv("validated_data.csv", index=False)
    print("\nData saved as 'validated_data.csv'.")
