# Task A: Correlation Between Population and GDP (2001–2020)
Pre-process the data, compute each country's mean population and mean GDP over 2001–2020, and evaluate the Pearson correlation coefficient between the two means.

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

In [None]:
# Load and prepare data from GitHub
gdp_url = "https://raw.githubusercontent.com/spike2025-art/eportfolio-uoe/main/Unit04%20Global_GDP.csv"
pop_url = "https://raw.githubusercontent.com/spike2025-art/eportfolio-uoe/main/Unit04%20Global_Population.csv"

gdp_df = pd.read_csv(gdp_url)
pop_df = pd.read_csv(pop_url)
year_range = [str(year) for year in range(2001, 2021)]


def _clean_year_columns(df, years):
    """Coerce the year columns of *df* to numeric and impute gaps in place.

    Entries that cannot be parsed as numbers become NaN, and each NaN is then
    replaced by the mean of the remaining values in the same row. A row with
    no numeric value in any of the year columns stays NaN.

    Args:
        df: Table containing one column per entry of *years*; those columns
            are overwritten with the cleaned values.
        years: Labels of the year columns to clean.

    Returns:
        The per-row mean across *years*, computed after imputation.

    Raises:
        KeyError: If any label in *years* is not a column of *df*.
    """
    values = df[years].apply(pd.to_numeric, errors='coerce')
    row_means = values.mean(axis=1)
    # Fill column by column with the index-aligned row means: this needs one
    # vectorised call per year column instead of one Python callback per row.
    df[years] = values.apply(lambda col: col.fillna(row_means))
    # Average the stored (filled) values so the result reflects exactly what
    # is now in the table.
    return df[years].mean(axis=1)


# Compute means
gdp_df['Mean_GDP_per_capita'] = _clean_year_columns(gdp_df, year_range)
pop_df['Mean_Population'] = _clean_year_columns(pop_df, year_range)

In [None]:
# Pair each country's average population with its average GDP figure;
# only countries present in both tables survive the (inner) join.
population_means = pop_df[['Country Name', 'Mean_Population']]
gdp_means = gdp_df[['Country Name', 'Mean_GDP_per_capita']]
merged = population_means.merge(gdp_means, on='Country Name')

# Treat infinities as missing, then discard every incomplete row so the
# correlation is computed on finite pairs only.
merged = merged.replace([np.inf, -np.inf], np.nan)
merged = merged.dropna()

# pearsonr returns (coefficient, p-value); only the coefficient is reported.
corr = pearsonr(merged['Mean_Population'], merged['Mean_GDP_per_capita'])[0]
print(f'Pearson Correlation Coefficient: {corr:.4f}')

In [None]:
# Scatter plot of the per-country averages, one point per merged row.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(merged['Mean_Population'], merged['Mean_GDP_per_capita'], alpha=0.6)
ax.set_xlabel('Mean Population (2001–2020)')
ax.set_ylabel('Mean GDP per Capita (2001–2020)')
ax.set_title('Task A: Population vs GDP Correlation')
ax.grid(True)
fig.tight_layout()
plt.show()