In [None]:
# Mount Google Drive in the Colab runtime; the CSV exports later in this
# notebook are written to paths under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import requests
import xml.etree.ElementTree as ET

# Root endpoint of the WHO GHO Athena API, requested as XML.
url = "https://apps.who.int/gho/athena/api/GHO"
# Without a timeout, requests waits indefinitely if the server never answers.
response = requests.get(url, headers={"Accept": "application/xml"}, timeout=60)
# Fail loudly on an HTTP error instead of handing an error page to the XML parser.
response.raise_for_status()
root = ET.fromstring(response.content)


In [None]:
import requests
import xml.etree.ElementTree as ET
import pandas as pd

# Function to parse elements and extract Label and Display/text pairs
def parse_label_text_pairs(element):
    """Collect (Label, display text) pairs from every descendant of ``element``.

    For each descendant that carries a non-empty ``Label`` attribute, the
    display text is taken from its direct ``Display`` child when one exists,
    otherwise from the element's own stripped text. Descendants without a
    label or without any display text are skipped.

    Args:
        element: An ``xml.etree.ElementTree.Element`` to walk, or ``None``
            (for example the result of ``find()`` on a tag that is absent).

    Returns:
        A list of ``{"Label": ..., "Display_Text": ...}`` dicts in document
        (pre-order) order. Empty when ``element`` is ``None`` or has no
        matching descendants.
    """
    data = []

    # Element.find() returns None for a missing tag; iterating None would
    # raise TypeError, so treat it as "nothing to extract".
    if element is None:
        return data

    for child in element:
        label = child.attrib.get("Label")

        # Prefer an explicit <Display> child; only fall back to the element's
        # own text when no <Display> child is present at all.
        display_element = child.find("Display")
        if display_element is not None:
            display_text = display_element.text
        elif child.text and child.text.strip():
            display_text = child.text.strip()
        else:
            display_text = None

        if label and display_text:
            data.append({"Label": label, "Display_Text": display_text})

        # Recurse so labelled elements nested at any depth are collected too.
        data.extend(parse_label_text_pairs(child))

    return data

# Fetch the GHO metadata XML and turn its Label / display-text pairs into df_measures.
try:
    # A timeout keeps the cell from blocking forever on an unresponsive server.
    response = requests.get(url, headers={"Accept": "application/xml"}, timeout=60)
    if response.status_code != 200:
        print(f"Request failed with status code: {response.status_code}")
    else:
        # Parse the XML content
        root = ET.fromstring(response.content)

        # find() returns None when the tag is absent; passing that on would
        # raise a TypeError that neither except clause below handles.
        metadata_element = root.find("Metadata")
        if metadata_element is None:
            print("No 'Metadata' element found in the response")
        else:
            # Parse all 'Label' and 'Display/text' pairs under 'Metadata'
            metadata_data = parse_label_text_pairs(metadata_element)

            # Convert to DataFrame
            df_measures = pd.DataFrame(metadata_data)


except requests.exceptions.RequestException as e:
    print("Request failed:", e)
except ET.ParseError as e:
    print("Failed to parse XML:", e)


In [None]:
# Show the parsed Label / Display_Text table as the cell output.
df_measures

Unnamed: 0,Label,Display_Text
0,CATEGORY,Category
1,RENDERER_ID,Renderer ID
2,DEFINITION_XML,Definition (XML)
3,IMR_ID,IMR identifier
4,DISPLAY_FR,Français
...,...,...
3147,M2_adult_representative,Most recent survey among adults was representa...
3148,M3_adult_periodic,Survey previous to the most recent survey amon...
3149,M4_youth_recent,Most recent school-based survey among adolesce...
3150,M5_youth_representative,Most recent school-based survey among adolesce...


In [None]:
# Persist the metadata table to Google Drive (the DataFrame index is written as the first column).
metadata_csv_path = '/content/drive/MyDrive/MIT805_Exam/WHO_HealthData_Meta.csv'
df_measures.to_csv(metadata_csv_path)

In [None]:
# Every label from position 7 onward is treated as a measure code to download.
# NOTE(review): the offset 7 presumably skips the non-measure attribute rows at
# the top of df_measures (CATEGORY, RENDERER_ID, ...) — confirm against the table.
available_measures = df_measures['Label'].iloc[7:].tolist()

In [None]:
import requests
from io import StringIO
import pandas as pd

# Columns kept as identifier columns for every row; any other column in a
# measure's CSV is unpivoted into (attribute, value) pairs by pd.melt below.
key_columns = ['GHO', 'PUBLISHSTATE', 'YEAR', 'REGION', 'COUNTRY', 'Display Value', 'Numeric', 'Low', 'High', 'StdErr', 'StdDev', 'Comments']

# List to store each DataFrame
dataframes = []

# One session for the whole loop so the HTTP connection is reused across the
# many per-measure requests instead of being re-established each time.
with requests.Session() as session:
    for measures in available_measures:
        print(f"Processing measure: {measures}")
        # Deliberately not named `url`: that global holds the metadata endpoint
        # used by the earlier fetch cell, and overwriting it breaks re-running it.
        # https matches the scheme used for the metadata request.
        measure_url = "https://apps.who.int/gho/athena/api/GHO/{}.csv".format(measures)

        try:
            # The timeout stops a single stalled download from hanging the run.
            response = session.get(measure_url, headers={"Accept": "application/csv"}, timeout=60)
        except requests.exceptions.RequestException as e:
            # A network error on one measure should not abort the whole run.
            print(f"Request failed for measure: {measures}: {e}")
            continue

        # Check if the response was successful and the body is not empty
        if response.status_code != 200 or not response.text.strip():
            print(f"Failed to retrieve data for measure: {measures} with status code: {response.status_code}")
            continue

        try:
            # Read the CSV text into a DataFrame (StringIO makes it file-like)
            df = pd.read_csv(StringIO(response.text))
        except pd.errors.EmptyDataError:
            print(f"No data to parse for measure: {measures}")
            continue
        except pd.errors.ParserError as e:
            # Malformed CSV for one measure: report it and move on.
            print(f"Could not parse CSV for measure: {measures}: {e}")
            continue

        # Skip frames with no rows or no columns
        if df.shape[0] == 0 or df.shape[1] == 0:
            print(f"Discarded empty DataFrame for measure: {measures}")
            continue

        # Check if the 'YEAR' column is missing, and add a default if necessary
        if 'YEAR' not in df.columns:
            df['YEAR'] = 'Unknown'  # or set it to a specific year if appropriate

        # Reshape to long format: key columns stay, the rest become attribute/value rows
        id_columns = [col for col in key_columns if col in df.columns]
        df = pd.melt(df, id_vars=id_columns, var_name="attribute", value_name="value")

        # Append the reshaped DataFrame to the list
        dataframes.append(df)

# Concatenate all DataFrames in the list into a single DataFrame
if dataframes:
    final_df = pd.concat(dataframes, ignore_index=True)
    # Display the concatenated DataFrame
    print(final_df.head())
else:
    print("No valid DataFrames to concatenate.")


Processing measure: MDG_0000000001
Processing measure: MDG_0000000003
Processing measure: MDG_0000000005
Discarded empty DataFrame for measure: MDG_0000000005
Processing measure: MDG_0000000007
Processing measure: MDG_0000000010
Processing measure: MDG_0000000011
Processing measure: MDG_0000000013
Discarded empty DataFrame for measure: MDG_0000000013
Processing measure: MDG_0000000014
Discarded empty DataFrame for measure: MDG_0000000014
Processing measure: MDG_0000000015
Processing measure: MDG_0000000016
Discarded empty DataFrame for measure: MDG_0000000016
Processing measure: MDG_0000000017
Processing measure: MDG_0000000018
Discarded empty DataFrame for measure: MDG_0000000018
Processing measure: MDG_0000000019
Discarded empty DataFrame for measure: MDG_0000000019
Processing measure: MDG_0000000020
Processing measure: MDG_0000000021
Processing measure: MDG_0000000022
Discarded empty DataFrame for measure: MDG_0000000022
Processing measure: MDG_0000000023
Discarded empty DataFrame f

In [None]:
# Show the concatenated long-format table as the cell output.
final_df

Unnamed: 0,GHO,PUBLISHSTATE,YEAR,REGION,COUNTRY,Display Value,Numeric,Low,High,StdErr,StdDev,Comments,attribute,value
0,MDG_0000000001,PUBLISHED,2000,AFR,,90.84 [89.31-92.62],90.83788,89.30656,92.61595,,,,SEX,BTSX
1,MDG_0000000001,PUBLISHED,2001,AFR,,87.98 [86.49-89.72],87.97622,86.49111,89.72169,,,,SEX,BTSX
2,MDG_0000000001,PUBLISHED,2002,AFR,,85.05 [83.58-86.74],85.04704,83.58373,86.74262,,,,SEX,BTSX
3,MDG_0000000001,PUBLISHED,2003,AFR,,82.02 [80.56-83.68],82.01636,80.56035,83.67830,,,,SEX,BTSX
4,MDG_0000000001,PUBLISHED,2017,AFR,,54.12 [51.03-58.43],54.11607,51.02641,58.42746,,,,SEX,BTSX
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10917363,R_type_variant,PUBLISHED,2022,WPR,SLB,,,,,,,,TOBACCO_NICOTINE_PRODUCT,TOBACCO_SMOKELESS_MOSTSOLD
10917364,R_type_variant,PUBLISHED,2022,WPR,TON,,,,,,,,TOBACCO_NICOTINE_PRODUCT,TOBACCO_SMOKELESS_MOSTSOLD
10917365,R_type_variant,PUBLISHED,2022,WPR,TUV,,,,,,,,TOBACCO_NICOTINE_PRODUCT,TOBACCO_SMOKELESS_MOSTSOLD
10917366,R_type_variant,PUBLISHED,2022,WPR,VUT,,,,,,,,TOBACCO_NICOTINE_PRODUCT,TOBACCO_SMOKELESS_MOSTSOLD


In [None]:
# Persist the combined long-format table to Google Drive (the DataFrame index is written as the first column).
health_data_csv_path = '/content/drive/MyDrive/HealthData.csv'
final_df.to_csv(health_data_csv_path)