# **GLOBAL HEALTH DATA ANALYSIS - Data Retrieval and Sources**

## **Mount Google Drive & Import Libraries**


In [2]:
# Mount Google Drive
#from google.colab import drive
#drive.mount('/content/drive')

**Importing required libraries**

In [None]:
# Importing all required libraries
import pandas as pd
import numpy as np
import requests
import json
from io import StringIO

**Setting Database Link from WHO**

In [3]:
health_url = "https://storage.googleapis.com/covid19-open-data/v3/health.csv"

In [None]:
try:
    response = requests.get(health_url, timeout=10)
    response.raise_for_status()  # Raises HTTPError for bad responses
    health = pd.read_csv(StringIO(response.text))
    print("Data successfully retrieved!")
except Exception as e:
    print("Error retrieving data:", e)

health.head()


Data successfully retrieved!


Unnamed: 0,location_key,life_expectancy,smoking_prevalence,diabetes_prevalence,infant_mortality_rate,adult_male_mortality_rate,adult_female_mortality_rate,pollution_mortality_rate,comorbidity_mortality_rate,hospital_beds_per_1000,nurses_per_1000,physicians_per_1000,health_expenditure_usd,out_of_pocket_health_expenditure_usd
0,AD,,33.5,7.7,2.7,,,,,,4.0128,3.3333,4040.786621,1688.12146
1,AE,77.814,28.9,16.3,6.5,69.555,44.863,54.7,16.8,,5.7271,2.5278,1357.017456,256.034485
2,AF,64.486,,9.2,47.9,237.554,192.532,211.1,29.8,0.5,0.1755,0.2782,67.12265,50.665913
3,AG,76.885,,13.1,5.0,126.917,83.136,29.9,22.6,,4.5171,2.956,673.85968,235.749039
4,AL,78.9,28.7,9.0,7.8,93.315,49.486,68.0,17.0,,3.6495,1.2164,,


In [7]:
# === 2, 
HEALTH_URL = "https://storage.googleapis.com/covid19-open-data/v3/health.csv"

print(f"Downloading health data from: {HEALTH_URL}")
health = pd.read_csv(HEALTH_URL)

print("✓ Raw Data Loaded from remote health.csv")
print(f"  Shape: {health.shape[0]:,} rows × {health.shape[1]} columns")
print(f"  Columns: {list(health.columns)[:10]} ...\n")

# Extract UNIQUE country codes (no subdivisions)
countries = (
    health.loc[~health["location_key"].str.contains("_", na=False), "location_key"]
    .dropna()
    .unique()
)

print(f"✓ Extracted {len(countries)} country codes for API enrichment")
print("  Sample:", ", ".join(countries[:10]), "\n")

Downloading health data from: https://storage.googleapis.com/covid19-open-data/v3/health.csv
✓ Raw Data Loaded from remote health.csv
  Shape: 3,504 rows × 14 columns
  Columns: ['location_key', 'life_expectancy', 'smoking_prevalence', 'diabetes_prevalence', 'infant_mortality_rate', 'adult_male_mortality_rate', 'adult_female_mortality_rate', 'pollution_mortality_rate', 'comorbidity_mortality_rate', 'hospital_beds_per_1000'] ...

✓ Extracted 209 country codes for API enrichment
  Sample: AD, AE, AF, AG, AL, AM, AO, AR, AT, AU 



In [8]:

# === 2. World Bank API setup ===
print("Endpoint: https://api.worldbank.org/v2/")

INDICATORS = {
    'NY.GDP.MKTP.CD': 'GDP (current US$)',
    'NY.GDP.PCAP.CD': 'GDP per capita (current US$)',
    'SL.UEM.TOTL.ZS': 'Unemployment rate (% of labor force)',
    'NY.GDP.MKTP.KD': 'GDP (constant 2015 US$)'
}

TARGET_YEAR = "2022"
GDP_PER_CAPITA_INDICATOR = "NY.GDP.PCAP.CD"

economic_data_list = []

try:
    print("Attempting to fetch REAL data from World Bank API...\n")

    # Try to fetch data for each country
    for country_code in countries:
        try:
            # Fetch GDP per capita indicator (REAL data)
            gdp_url = (
                f"https://api.worldbank.org/v2/country/"
                f"{country_code}/indicators/{GDP_PER_CAPITA_INDICATOR}"
                f"?format=json&date={TARGET_YEAR}"
            )

            print(f"  {country_code}: Connecting to API... ", end="")
            gdp_response = requests.get(gdp_url, timeout=60)

            if gdp_response.status_code == 200:
                gdp_data = gdp_response.json()

                # Extract REAL values from API response
                if gdp_data and len(gdp_data) > 1 and gdp_data[1]:
                    # Get the most recent data point
                    data_point = gdp_data[1][0]
                    gdp_value = data_point.get("value")

                    if gdp_value is not None:  # Only if we got REAL data
                        economic_data_list.append({
                            "location_key": country_code,
                            "gdp_per_capita_usd": float(gdp_value),
                            "data_year": data_point.get("date"),
                            "source": "World Bank API (REAL DATA)",
                        })
                        print(f"✓ Got REAL data: ${float(gdp_value):,.0f}")
                    else:
                        print("✗ No value in response")
                else:
                    print("✗ Empty response")
            else:
                print(f"✗ Status {gdp_response.status_code}")

        except requests.exceptions.Timeout:
            print("⚠ Timeout")
        except Exception as e:
            print(f"✗ Error ({str(e)[:40]})")

except Exception as e:
    print(f"⚠ API Error: {str(e)[:50]}")

# Create DataFrame from fetched data
if economic_data_list:
    economic_data = pd.DataFrame(economic_data_list)
    print(f"\nSuccessfully retrieved {len(economic_data)} countries with REAL World Bank data!")
    print("\n   Sample:")
    print(economic_data[["location_key", "gdp_per_capita_usd", "data_year"]].head())
else:
    print("\n⚠ No REAL data retrieved from API")
    print("   Creating empty DataFrame...")
    economic_data = pd.DataFrame({
        "location_key": countries[:5],
        "gdp_per_capita_usd": [np.nan] * 5,
        "source": "World Bank API (Attempted - No Data Retrieved)",
    })


Endpoint: https://api.worldbank.org/v2/
Attempting to fetch REAL data from World Bank API...

  AD: Connecting to API... ✓ Got REAL data: $42,414
  AE: Connecting to API... ✓ Got REAL data: $49,899
  AF: Connecting to API... ✓ Got REAL data: $357
  AG: Connecting to API... ✓ Got REAL data: $20,105
  AL: Connecting to API... ✓ Got REAL data: $6,846
  AM: Connecting to API... ✓ Got REAL data: $6,572
  AO: Connecting to API... ✗ Status 400
  AR: Connecting to API... ✓ Got REAL data: $13,936
  AT: Connecting to API... ✓ Got REAL data: $52,177
  AU: Connecting to API... ✓ Got REAL data: $64,997
  AW: Connecting to API... ✗ Status 400
  AZ: Connecting to API... ✗ Status 400
  BA: Connecting to API... ✗ Status 400
  BB: Connecting to API... ✗ Status 400
  BD: Connecting to API... ✗ Status 400
  BE: Connecting to API... ✓ Got REAL data: $50,822
  BF: Connecting to API... ✓ Got REAL data: $836
  BG: Connecting to API... ✓ Got REAL data: $14,000
  BH: Connecting to API... ✓ Got REAL data: $30,47

In [None]:
# === 3. MERGE INTO CLEAN DATAFRAME ===

df_clean = health.merge(economic_data, how="left", on="location_key")
print(f"✓ Combined dataset: {df_clean.shape[0]:,} rows, {df_clean.shape[1]} columns\n")


In [None]:
# === 4. SAVE FILES TO GOOGLE DRIVE ===

from google.colab import drive
drive.mount('/content/drive')

SAVE_DIR = "/content/drive/MyDrive/global_health_project/"

health.to_csv(SAVE_DIR + "health.csv", index=False)
economic_data.to_csv(SAVE_DIR + "economic_data.csv", index=False)
df_clean.to_csv(SAVE_DIR + "df_clean.csv", index=False)

print("✓ Saved files to Google Drive successfully!")


ModuleNotFoundError: No module named 'google.colab'