# Data Preprocessing

Stan Bakker (15840530), Thijn Poland (14947854), Oliver Martin en Sebastian Rociu (15617084)

Groepsnummer: Groep C2 (#8)

The following code preprocesses the datasets into the files used for the data story.

In [1]:
import pandas as pd
import numpy  as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Global plotting defaults: seaborn's "whitegrid" theme and an 11 pt base font.
sns.set_style(style="whitegrid")
plt.rcParams["font.size"] = 11

# Input datasets; the paths are resolved relative to the working directory.
INTERNET_CSV = "global_internet_adoption_monthly_2010_2025.csv"
ECON_CSV = "economic_indicators_dataset_2010_2023.csv"
HAPPY_CSV = "World Happiness Report 2005-2021.csv"

# Read the three raw tables (internet, economy, happiness) in that order.
internet_df, econ_df, happiness_df = (
    pd.read_csv(csv_path) for csv_path in (INTERNET_CSV, ECON_CSV, HAPPY_CSV)
)

# Harmonise country naming so the three datasets share one "Country" key:
# alternative spellings are mapped to a canonical name and the country column
# itself is renamed to "Country". The frames are modified in place.
country_fix = {"USA": "United States"}
for df in [internet_df, econ_df, happiness_df]:
    # The country column may be named differently per source, so take the
    # first header that contains "Country". Passing a default to next() lets
    # us report a missing column clearly instead of leaking a StopIteration.
    col = next((c for c in df.columns if "Country" in c), None)
    if col is None:
        raise KeyError(
            "no column containing 'Country' found; "
            f"available columns: {list(df.columns)}"
        )
    df[col] = df[col].replace(country_fix)
    df.rename(columns={col: "Country"}, inplace=True)

# Parse the "Date" column of both time-series frames and derive a "Year"
# column from it. Unparseable dates are coerced to NaT rather than raising.
for df in (internet_df, econ_df):
    df["Date"] = pd.to_datetime(arg=df["Date"], errors="coerce")
    df["Year"] = df["Date"].dt.year

# Mean internet penetration per country over the years 2010-2021 (inclusive).
internet_avg_10_21 = (
    internet_df[internet_df["Year"].between(2010, 2021)]
    .groupby("Country", as_index=False)["Internet_Penetration (%)"]
    .mean()
    .rename(columns={"Internet_Penetration (%)": "Internet (%)"})
)

# Mean GDP growth rate per country over the same 2010-2021 window.
gdp_avg_10_21 = (
    econ_df[econ_df["Year"].between(2010, 2021)]
    .groupby("Country", as_index=False)["GDP Growth Rate (%)"]
    .mean()
    .rename(columns={"GDP Growth Rate (%)": "GDP growth (%)"})
)

# Inner join: only countries present in both averages are kept.
combined_internet_gdp = pd.merge(
    internet_avg_10_21, gdp_avg_10_21, how="inner", on="Country"
)

# Per-year means over all rows of each dataset.
internet_yearly = (
    internet_df.groupby("Year")[["Internet_Penetration (%)"]].mean().reset_index()
)
unemp_yearly = (
    econ_df.groupby("Year")[["Unemployment Rate (%)"]].mean().reset_index()
)

# Inner join on "Year": only years present in both tables survive.
merged_yearly = pd.merge(internet_yearly, unemp_yearly, on="Year")

# Mean internet penetration per (country, year).
internet_by_cty_yr = internet_df.groupby(
    ["Country", "Year"], as_index=False
)["Internet_Penetration (%)"].mean()

# Multiply social support by 100 (to a 0-100 scale). The column is overwritten
# in happiness_df itself, so everything using it afterwards sees scaled values.
happiness_df["Social support"] = happiness_df["Social support"] * 100

# Mean social support and mean generosity per (country, year).
support_by_cty_yr = happiness_df.groupby(
    ["Country", "Year"], as_index=False
)["Social support"].mean()
generosity_by_cty_yr = happiness_df.groupby(
    ["Country", "Year"], as_index=False
)["Generosity"].mean()

# Pair internet penetration with each happiness indicator; the inner joins
# keep only (country, year) combinations found in both tables.
internet_support = pd.merge(
    internet_by_cty_yr, support_by_cty_yr, on=["Country", "Year"]
)
internet_generos = pd.merge(
    internet_by_cty_yr, generosity_by_cty_yr, on=["Country", "Year"]
)

# Wide table (one row per country, one column per year) restricted to the
# countries that have a value for both 2010 and 2023. "Growth (%)" is the 2023
# value minus the 2010 value, i.e. an absolute difference between the two
# yearly means rather than a relative growth rate.
growth_2010_2023 = (
    internet_by_cty_yr
    .pivot(index="Country", columns="Year", values="Internet_Penetration (%)")
    .dropna(subset=[2010, 2023])
    .assign(**{"Growth (%)": lambda wide: wide[2023] - wide[2010]})
)

# Two-column frame: "Country" and "Growth (%)".
growth_df = growth_2010_2023.loc[:, ["Growth (%)"]].reset_index()

# Mean unemployment rate per country over the years 2010-2023 (inclusive).
unemp_cty_mean = (
    econ_df[econ_df["Year"].between(2010, 2023)]
    .groupby("Country")[["Unemployment Rate (%)"]]
    .mean()
    .reset_index()
)

# Attach each country's 2010 -> 2023 internet growth (inner join on "Country").
growth_unemp = pd.merge(unemp_cty_mean, growth_df, on="Country")

# Registry of the cleaned source frames and the derived tables, keyed by name.
DATA = {
    "internet_df": internet_df,
    "econ_df": econ_df,
    "happiness_df": happiness_df,
    "combined_internet_gdp": combined_internet_gdp,
    "merged_yearly": merged_yearly,
    "internet_support": internet_support,
    "internet_generos": internet_generos,
    "growth_unemp": growth_unemp,
}