### Understand the Schema

In [None]:
from pathlib import Path

import pandas as pd

# Input locations, relative to the working directory the notebook runs in.
UNIFIED_DATA_CSV = 'data/raw/ethiopia_fi_unified_data.csv'
REFERENCE_CODES_CSV = 'data/raw/reference_codes.csv'

# Main dataset: every record type lives in this single table.
unified_df = pd.read_csv(UNIFIED_DATA_CSV)

# Reference codes: lookup table of valid category values.
ref_codes_df = pd.read_csv(REFERENCE_CODES_CSV)

# First rows of the unified table. All records share columns like
# record_type, pillar, indicator_code, etc.
unified_preview = unified_df.head()
print(unified_preview)

# Row count per record type.
# Author's expectation: ~30 observations, 10 events, 14 impact_links, 3 targets.
record_type_counts = unified_df['record_type'].value_counts()
print(record_type_counts)

# First rows of the reference codes (valid values for categories,
# e.g. indicator codes like ACC_OWNERSHIP).
ref_codes_preview = ref_codes_df.head()
print(ref_codes_preview)

# Schema notes to keep in mind downstream:
#   - Events have a category (e.g., policy) but no pillar (to avoid bias).
#   - Impact_links connect via parent_id.

### Explore the Data

In [None]:
# Counts by type, pillar, source_type, confidence.
# dropna=False matters here: groupby's default (dropna=True) discards every row
# that has a missing value in ANY grouping key, so records with an empty
# pillar, source_type or confidence would silently vanish from the summary.
# (The dropna keyword requires pandas >= 1.1.)
record_counts = unified_df.groupby(
    ['record_type', 'pillar', 'source_type', 'confidence'], dropna=False
).size()
print(record_counts)

# Temporal range (for observations)
observations = unified_df[unified_df['record_type'] == 'observation']
# NOTE(review): min/max compare the column as loaded; if the dates are strings
# this is chronological only for a consistent YYYY-MM-DD format — confirm.
print(observations['observation_date'].min(), observations['observation_date'].max())  # Likely 2011-2024

# Unique indicators and coverage
print(observations['indicator_code'].unique())
# count() tallies non-null observation dates per indicator, so low numbers
# flag sparsely covered indicators.
print(observations.groupby('indicator_code')['observation_date'].count())  # Sparse for some?

# Events and impact_links
events = unified_df[unified_df['record_type'] == 'event']
print(events[['event_date', 'category', 'description']])  # List e.g., Telebirr launch 2021-05
impact_links = unified_df[unified_df['record_type'] == 'impact_link']
print(impact_links[['parent_id', 'pillar', 'related_indicator', 'impact_direction', 'impact_magnitude']])

### Enrich the Dataset

In [None]:
# Add a new observation row: build it as a dict keyed by column name, then
# append it with pd.concat.
# NOTE(review): any key below that is not already a column of unified_df becomes
# a new column after the concat, and columns omitted here (e.g. source_type,
# which the exploration cell groups by) are left as NaN for this row — confirm
# the key set against the schema.
new_observation = {
    'record_type': 'observation',
    'pillar': 'access',
    'indicator': 'Account Ownership Gender Gap',
    'indicator_code': 'ACC_OWNERSHIP_GENDER_GAP',
    'value_numeric': 10,
    'observation_date': '2021-01-01',  # Use YYYY-MM-DD
    'source_name': 'Global Findex',
    'source_url': 'https://www.worldbank.org/en/publication/globalfindex/Data',
    'confidence': 'high',
    'original_text': 'Gender gap in account ownership: 10 percentage points',
    'collected_by': 'Rahel',
    'collection_date': '2026-02-01',
    'notes': 'Useful for analyzing gender drivers in Access forecasting'
}
# Build a new frame instead of mutating unified_df, so re-running this cell
# never stacks duplicate rows onto the loaded data.
enriched_df = pd.concat([unified_df, pd.DataFrame([new_observation])], ignore_index=True)

# Similarly add events and impact_links

# Save to processed. to_csv does not create missing parent directories, so make
# sure the output folder exists first; otherwise a fresh checkout without
# data/processed/ fails with OSError.
enriched_path = Path('data/processed/enriched_ethiopia_fi_unified_data.csv')
enriched_path.parent.mkdir(parents=True, exist_ok=True)
enriched_df.to_csv(enriched_path, index=False)