# Task B: Linear Regression on Population and GDP (2001–2020)
This notebook uses linear regression to model the relationship between each country's average population and its average GDP per capita over 2001–2020.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from scipy.stats import pearsonr

In [None]:
# Load the GDP and population tables.
gdp_df = pd.read_csv('Unit04 Global_GDP.csv')
pop_df = pd.read_csv('Unit04 Global_Population.csv')
# Year columns are addressed by their string labels, '2001' through '2020'.
year_range = [str(year) for year in range(2001, 2021)]


def _clean_year_columns(df, years):
    """Coerce the *years* columns of *df* to numeric and fill gaps in place.

    Entries that cannot be parsed as numbers become NaN. Each NaN is then
    replaced by the mean of the remaining values in the same row; a row
    with no valid value at all stays NaN.
    """
    values = df[years].apply(pd.to_numeric, errors='coerce')
    # Row means are computed once (NaNs skipped) and reused for every column;
    # Series.fillna aligns them to each column by row index.
    row_means = values.mean(axis=1)
    df[years] = values.apply(lambda column: column.fillna(row_means))


# Both tables share the same layout, so they get identical cleaning.
for _frame in (gdp_df, pop_df):
    _clean_year_columns(_frame, year_range)

# Per-row averages over the whole 2001-2020 window.
gdp_df['Mean_GDP_per_capita'] = gdp_df[year_range].mean(axis=1)
pop_df['Mean_Population'] = pop_df[year_range].mean(axis=1)

In [None]:
# Pair each country's mean population with its mean GDP per capita; only
# countries present in both tables survive the (inner) join.
population_means = pop_df[['Country Name', 'Mean_Population']]
gdp_means = gdp_df[['Country Name', 'Mean_GDP_per_capita']]
merged = population_means.merge(gdp_means, on='Country Name')

# Infinite values are treated as missing, then incomplete rows are dropped.
merged = merged.replace([np.inf, -np.inf], np.nan).dropna()

# Single-feature design matrix (kept 2-D) and the regression target.
X = merged[['Mean_Population']]
y = merged['Mean_GDP_per_capita']

model = LinearRegression()
model.fit(X, y)

slope, intercept = model.coef_[0], model.intercept_
# R² is evaluated on the same data the model was fitted to.
r2 = model.score(X, y)

print(
    f'Regression Coefficient: {slope:.4f}',
    f'Intercept: {intercept:.4f}',
    f'R² Score: {r2:.4f}',
    sep='\n',
)

In [None]:
# Draw the observations and the fitted line on a single set of axes.
fig, ax = plt.subplots(figsize=(10, 6))

ax.scatter(X, y, alpha=0.6, label='Data')

# Model predictions at the observed X values trace the regression line.
fitted = model.predict(X)
ax.plot(X, fitted, color='red', label='Regression Line')

ax.set_xlabel('Mean Population (2001–2020)')
ax.set_ylabel('Mean GDP per Capita (2001–2020)')
ax.set_title('Task B: Linear Regression - Population vs GDP')
ax.legend()
ax.grid(True)

fig.tight_layout()
plt.show()