In [None]:
# --- SECTION 1: IMPORTS & SETUP ---
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Set the visual style for graphs.
# Called once up front so the "whitegrid" style is already active when the
# figures further down in this cell are created.
sns.set(style="whitegrid")

# --- SECTION 2: DATA LOADING ---
# Note: We use sep=';' for files that use semicolons and sep=',' for standard CSVs.
# All five files are read relative to the current working directory.
print("Loading datasets...")

try:
    # 1. Health Expenditure (Comma separated)
    df_health = pd.read_csv('public-healthcare-spending-share-gdp.csv', sep=',')

    # 2. Obesity Rates (Semicolon separated)
    df_obesity = pd.read_csv('share-of-adults-defined-as-obese.csv', sep=';')

    # 3. Internet Usage (Semicolon separated)
    df_internet = pd.read_csv('share-of-individuals-using-the-internet.csv', sep=';')

    # 4. Urbanization (Semicolon separated)
    df_urban = pd.read_csv('share-of-population-urban.csv', sep=';')

    # 5. GDP Per Capita (Semicolon separated)
    df_gdp = pd.read_csv('gdp-per-capita-worldbank.csv', sep=';')

except FileNotFoundError as e:
    print(f"Error: {e}")
    print("Please make sure all .csv files are in the same folder as this notebook.")
    # Re-raise after printing the hint. Continuing would either fail a few
    # lines later with an unrelated NameError, or (on a re-run in a live
    # kernel) silently analyse DataFrames left over from an earlier run.
    raise

else:
    # Only reached when every read above succeeded.
    print("All datasets loaded successfully!")

# --- SECTION 3: DATA CLEANING ---
# Standardize column names to make them easy to work with.
# Health and urbanization: rename only the value column by its source header.
df_health = df_health.rename(columns={'Domestic general government health expenditure (% of GDP)': 'Health_Expenditure'})
df_urban = df_urban.rename(columns={'Urban population (% of total population)': 'Urban_Rate'})

# Obesity and internet: columns are relabelled by position.
# NOTE(review): this assumes each file has exactly four columns in
# Entity / Code / Year / value order -- confirm against the CSVs.
df_obesity.columns = ['Entity', 'Code', 'Year', 'Obesity_Rate']
df_internet.columns = ['Entity', 'Code', 'Year', 'Internet_Usage']

# GDP file often has extra columns, keep only the first 4.
# .copy() makes df_gdp an independent frame; without it the in-place edits in
# the loop below operate on a slice of the loaded data, which pandas reports
# with SettingWithCopyWarning.
df_gdp = df_gdp.iloc[:, :4].copy()
df_gdp.columns = ['Entity', 'Code', 'Year', 'GDP_Per_Capita']

# Clean 'Year' column (convert to number, remove bad data).
# The frames are modified in place on purpose: rebinding the loop variable
# would not update df_health, df_obesity, ... themselves.
for df in [df_health, df_obesity, df_internet, df_urban, df_gdp]:
    # Values that cannot be parsed as numbers become NaN ...
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')
    # ... and those rows are dropped so the integer cast below cannot fail on NaN.
    df.dropna(subset=['Year'], inplace=True)
    df['Year'] = df['Year'].astype(int)

# --- SECTION 4: MERGING DATA ---
# Start from the obesity table and inner-join the other indicators onto it one
# at a time, so a (country, year) row survives only if ALL variables have it.
join_keys = ['Entity', 'Year']
df_merged = df_obesity[join_keys + ['Obesity_Rate']]

for frame, value_col in [
    (df_health, 'Health_Expenditure'),
    (df_internet, 'Internet_Usage'),
    (df_urban, 'Urban_Rate'),
    (df_gdp, 'GDP_Per_Capita'),
]:
    # Bring along only the join keys plus this table's value column.
    df_merged = df_merged.merge(frame[join_keys + [value_col]],
                                on=join_keys, how='inner')

# Quick sanity check of the combined table.
print(f"Merged Dataset Shape: {df_merged.shape}")
print(df_merged.head())

# --- SECTION 5: CORRELATION MATRIX ---
# Pairwise correlations between the five indicators, pooled over every
# country and year present in the merged table.
indicator_cols = [
    'Obesity_Rate',
    'Health_Expenditure',
    'Internet_Usage',
    'Urban_Rate',
    'GDP_Per_Capita',
]
corr_matrix = df_merged[indicator_cols].corr()

# Annotated heatmap, coefficients shown with two decimals.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix (All Years)')
plt.show()

# --- SECTION 6: HYPOTHESIS TESTING (2016 SNAPSHOT) ---
# We use 2016 because it often has the most complete recent data
df_2016 = df_merged[df_merged['Year'] == 2016]

# One row of three panels. Explicit Axes objects are used so each plot and its
# labels are tied to a specific panel rather than to pyplot's "current axes".
fig, (ax_health, ax_internet, ax_gdp) = plt.subplots(1, 3, figsize=(18, 5))

# Plot 1: Testing H1 (Health Spend vs Obesity)
sns.regplot(data=df_2016, x='Health_Expenditure', y='Obesity_Rate',
            line_kws={'color': 'red'}, ax=ax_health)
ax_health.set_title('H1: Health Spend vs Obesity (2016)')
ax_health.set_xlabel('Health Expenditure (% of GDP)')
ax_health.set_ylabel('Obesity Rate (%)')

# Plot 2: Testing H3 (Internet vs Obesity)
sns.regplot(data=df_2016, x='Internet_Usage', y='Obesity_Rate',
            line_kws={'color': 'green'}, ax=ax_internet)
ax_internet.set_title('H3: Internet Usage vs Obesity (2016)')
ax_internet.set_xlabel('Internet Usage (%)')
# Same y label as panel 1; otherwise the raw column name "Obesity_Rate" is shown.
ax_internet.set_ylabel('Obesity Rate (%)')

# Plot 3: Testing H2 (Urbanization & GDP)
# Urban_Rate drives both marker size and colour.
sns.scatterplot(data=df_2016, x='GDP_Per_Capita', y='Obesity_Rate',
                size='Urban_Rate', sizes=(20, 200), hue='Urban_Rate',
                palette='viridis', ax=ax_gdp)
ax_gdp.set_title('H2: GDP vs Obesity (Size = Urbanization)')
ax_gdp.set_xlabel('GDP Per Capita')
# Same y label as panel 1; otherwise the raw column name "Obesity_Rate" is shown.
ax_gdp.set_ylabel('Obesity Rate (%)')

plt.tight_layout()
plt.show()