# Task 1: Data Exploration and Enrichment
This notebook focuses on understanding the starter dataset and enriching it with additional data for the forecasting task.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Make the project's src/ directory importable for the `utils` imports below.
# The relative path is resolved against the kernel's current working
# directory, so this assumes the notebook is run from its own folder.
# The membership check keeps the cell idempotent: re-running it does not
# append duplicate entries to sys.path.
SRC_DIR = os.path.abspath("../src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)
from utils.data_loader import load_data, preprocess_data
from utils.plotting import set_plot_style

# Set plot style
# NOTE(review): set_plot_style comes from utils.plotting and its body is not
# visible here — presumably it applies shared matplotlib/seaborn defaults for
# the figures in this notebook; confirm against src/utils/plotting.py.
set_plot_style()

## 1. Load the starter dataset

In [2]:
# Load the starter dataset and pass it straight through the project's
# preprocessing helper; `df` is the frame the exploration cells below read.
df = preprocess_data(load_data())

# Preview the first rows as the cell's rich output.
df.head()

Unnamed: 0,record_id,parent_id,record_type,category,pillar,indicator,indicator_code,indicator_direction,value_numeric,value_text,...,impact_direction,impact_magnitude,impact_estimate,lag_months,evidence_basis,comparable_country,collected_by,collection_date,original_text,notes
0,REC_0001,,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,22.0,,...,,,,,,Example_Trainee,2025-01-20,,Baseline year,
1,REC_0002,,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,35.0,,...,,,,,,Example_Trainee,2025-01-20,,,
2,REC_0003,,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,46.0,,...,,,,,,Example_Trainee,2025-01-20,,,
3,REC_0004,,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,56.0,,...,,,,,,Example_Trainee,2025-01-20,,Gender disaggregated,
4,REC_0005,,observation,,ACCESS,Account Ownership Rate,ACC_OWNERSHIP,higher_better,36.0,,...,,,,,,Example_Trainee,2025-01-20,,Gender disaggregated,


## 2. Explore the Data
### 2.1 Count records by record_type, pillar, source_type, and confidence

In [3]:
# Categorical columns whose value distribution we want to inspect.
cols_to_count = ['record_type', 'pillar', 'source_type', 'confidence']

for column_name in cols_to_count:
    # dropna=False keeps missing values as their own NaN row in the counts.
    counts = df[column_name].value_counts(dropna=False)
    # One print call: header, counts, then a trailing "\n" item so each
    # section is followed by the same blank lines as separate prints would give.
    print(f"--- {column_name} counts ---", counts, "\n", sep="\n")

--- record_type counts ---
record_type
observation    31
event          11
target          3
impact_link     2
Name: count, dtype: int64


--- pillar counts ---
pillar
ACCESS           18
USAGE            12
NaN              11
GENDER            5
AFFORDABILITY     1
Name: count, dtype: int64


--- source_type counts ---
source_type
operator      15
survey        10
regulator      8
research       5
policy         3
calculated     2
news           2
NaN            2
Name: count, dtype: int64


--- confidence counts ---
confidence
high      42
medium     3
NaN        2
Name: count, dtype: int64




### 2.2 Identify the temporal range of observations

In [4]:
# Keep only observation records; .copy() gives an independent frame so later
# edits to obs_df do not touch df.
is_observation = df['record_type'] == 'observation'
obs_df = df.loc[is_observation].copy()

# Per the original note, preprocess_data has already converted
# observation_date to datetime, so min/max give the earliest and latest dates.
first_observation = obs_df['observation_date'].min()
last_observation = obs_df['observation_date'].max()
print(f"Temporal range of observations: {first_observation} to {last_observation}")

Temporal range of observations: 2014-12-31 00:00:00 to 2025-12-31 00:00:00


### 2.3 List all unique indicators and their coverage (number of dated observations per indicator)

In [5]:
# Filter to observation records once and reuse the result for both steps.
observations = df.loc[df['record_type'] == 'observation']

# Distinct indicator codes, in order of first appearance.
indicators = observations['indicator_code'].unique()
print(f"Unique indicators: {indicators}")

# Coverage per indicator: number of non-null observation dates for each
# indicator code, largest first. Left as the last expression so the notebook
# displays the resulting Series.
(
    observations
    .groupby('indicator_code')['observation_date']
    .count()
    .sort_values(ascending=False)
)

Unique indicators: ['ACC_OWNERSHIP' 'ACC_MM_ACCOUNT' 'ACC_4G_COV' 'ACC_MOBILE_PEN'
 'ACC_FAYDA' 'USG_P2P_COUNT' 'USG_P2P_VALUE' 'USG_ATM_COUNT'
 'USG_ATM_VALUE' 'USG_CROSSOVER' 'USG_TELEBIRR_USERS' 'USG_TELEBIRR_VALUE'
 'USG_MPESA_USERS' 'USG_MPESA_ACTIVE' 'USG_ACTIVE_RATE' 'AFF_DATA_INCOME'
 'GEN_GAP_ACC' 'GEN_MM_SHARE' 'GEN_GAP_MOBILE' 'USG_MM_COUNT']


indicator_code
ACC_OWNERSHIP         6
ACC_FAYDA             3
ACC_MM_ACCOUNT        2
ACC_4G_COV            2
USG_P2P_COUNT         2
GEN_GAP_ACC           2
ACC_MOBILE_PEN        1
AFF_DATA_INCOME       1
GEN_MM_SHARE          1
USG_ACTIVE_RATE       1
USG_ATM_COUNT         1
GEN_GAP_MOBILE        1
USG_ATM_VALUE         1
USG_CROSSOVER         1
USG_MPESA_ACTIVE      1
USG_MM_COUNT          1
USG_MPESA_USERS       1
USG_P2P_VALUE         1
USG_TELEBIRR_USERS    1
USG_TELEBIRR_VALUE    1
Name: observation_date, dtype: int64