<a href="https://colab.research.google.com/github/safiyenarman/DSA210-Project/blob/main/DataCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cleaning SDG Scores (2019–2022)

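- The cleaned data frame keeps only the `country`, `year` and `sdg_index_score` columns (renamed to `Country`, `Year` and `SDG Score`), restricted to the years 2019–2022.
- It is sorted by year, then by ascending score within each year.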

In [7]:
import pandas as pd

# Load the raw SDG index table.
df = pd.read_csv("sdg_scores.csv")

# Source column -> display label. The key order also fixes the column order
# of the filtered frame.
sdg_column_labels = {
    "country": "Country",
    "year": "Year",
    "sdg_index_score": "SDG Score",
}
sdg_filtered = df[list(sdg_column_labels)].rename(columns=sdg_column_labels)

# Keep 2019-2022 (both ends inclusive), order rows by year and then by
# ascending score, and renumber the index from 0.
sdg_years = sdg_filtered["Year"]
in_study_window = (sdg_years >= 2019) & (sdg_years <= 2022)
sdg_cleaned = (
    sdg_filtered[in_study_window]
    .sort_values(by=["Year", "SDG Score"], ascending=[True, True])
    .reset_index(drop=True)
)

In [8]:
# Write the cleaned SDG table to disk without the positional index column.
sdg_cleaned.to_csv("cleanedSdgData.csv", index=False)

# In Google Colab, additionally trigger a browser download of the file.
# `google.colab` only exists inside Colab, so outside of it the download is
# skipped instead of failing the cell; the CSV has already been written above.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of cleanedSdgData.csv")
else:
    files.download("cleanedSdgData.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
# Preview the cleaned SDG table: the first five rows, the five rows centred on
# the midpoint, and the last five rows, with a ". . ." marker between excerpts.
for banner_line in ("", "Sorted and Cleaned SDG Scores: ", ""):
    print(banner_line)

middle_index = len(sdg_cleaned) // 2

# Positional row ranges for the three excerpts (head, middle, tail).
preview_ranges = (
    slice(None, 5),
    slice(middle_index - 2, middle_index + 3),
    slice(-5, None),
)

for position, row_range in enumerate(preview_ranges):
    # Every excerpt after the first is preceded by the separator.
    if position > 0:
        print("\n. . .\n")
    display(sdg_cleaned.iloc[row_range])


Sorted and Cleaned SDG Scores: 



Unnamed: 0,Country,Year,SDG Score
0,South Sudan,2019,36.8
1,Central African Republic,2019,38.5
2,Chad,2019,42.1
3,Afghanistan,2019,46.4
4,Niger,2019,46.9



. . .



Unnamed: 0,Country,Year,SDG Score
358,Sweden,2020,86.3
359,Finland,2020,86.4
360,South Sudan,2021,37.0
361,Central African Republic,2021,38.8
362,Chad,2021,43.7



. . .



Unnamed: 0,Country,Year,SDG Score
715,Austria,2022,82.3
716,Germany,2022,83.4
717,Denmark,2022,85.7
718,Sweden,2022,86.0
719,Finland,2022,86.8


# Cleaning Happiness Scores (2019–2022)

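- The cleaned data frame keeps only the `Country`, `Happiness score` and `Year` columns, and only countries that appear in all four years (2019–2022).
- It is sorted by year, then by ascending score within each year.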

In [11]:
# Source file for each survey year. The number of entries also defines how many
# distinct years a country must appear in to survive the coverage filter below.
paths = {
    2019: "happiness_rank_2019.csv",
    2020: "happiness_rank_2020.csv",
    2021: "happiness_rank_2021.csv",
    2022: "happiness_rank_2022.csv"
}

# Per-year mapping from source column names to the common schema
# ("Country", "Happiness score"). Years without an entry use `default_renames`.
# The 2022 file is expected to already use the target names, so nothing is
# renamed for it.
default_renames = {
    "Country name": "Country",
    "Ladder score": "Happiness score",
}
renames_by_year = {
    2019: {"Country or region": "Country", "Score": "Happiness score"},
    2022: {},
}

dfs = []

for year, path in paths.items():
    df = pd.read_csv(path).rename(columns=renames_by_year.get(year, default_renames))

    # Scores may arrive as text with a decimal comma (the 2022 file was handled
    # that way originally). Normalise any non-numeric score column to a decimal
    # point, then coerce to float. Already-numeric columns pass through
    # untouched, and unparseable entries become NaN instead of raising.
    scores = df["Happiness score"]
    if not pd.api.types.is_numeric_dtype(scores):
        scores = scores.astype(str).str.replace(",", ".", regex=False)
    df["Happiness score"] = pd.to_numeric(scores, errors="coerce")

    df["Year"] = year
    df = df[["Country", "Happiness score", "Year"]]
    dfs.append(df)

happiness_all = pd.concat(dfs, ignore_index=True)

# Remove "*" characters and surrounding whitespace from country names, then
# drop placeholder rows whose name is "xx", "na" or empty (case-insensitive).
happiness_all["Country"] = happiness_all["Country"].str.replace("*", "", regex=False).str.strip()
happiness_all = happiness_all[~happiness_all["Country"].str.lower().isin(["xx", "na", ""])]

# Scores are already numeric here, so the sort compares floats. Rounding is
# applied after sorting so row order is decided by the unrounded values.
happiness_all_sorted = happiness_all.sort_values(by=["Year", "Happiness score"], ascending=[True, True]).reset_index(drop=True)
happiness_all_sorted["Happiness score"] = happiness_all_sorted["Happiness score"].round(2)

# Keep only countries present in every loaded year.
# NOTE(review): names are matched verbatim, so a country spelled differently
# between yearly files is counted as separate countries and dropped here.
required_years = len(paths)
country_year_counts = happiness_all_sorted.groupby("Country")["Year"].nunique()
countries_in_all_years = country_year_counts[country_year_counts == required_years].index
happiness_all_sorted = happiness_all_sorted[happiness_all_sorted["Country"].isin(countries_in_all_years)].reset_index(drop=True)

happiness_all_sorted.to_csv("cleanedHappinessData.csv", index=False)

# In Google Colab, additionally trigger a browser download of the file.
# `google.colab` only exists inside Colab, so outside of it the download is
# skipped instead of failing the cell; the CSV has already been written above.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; skipping browser download of cleanedHappinessData.csv")
else:
    files.download("cleanedHappinessData.csv")

# Preview: first five rows, five rows around the midpoint, last five rows,
# each rendered with scores formatted to two decimals.
score_format = {"Happiness score": "{:.2f}"}

print("")
print("Sorted and Cleaned Happiness Scores: ")
print("")
display(happiness_all_sorted.head(5).style.format(score_format))

middle_index = len(happiness_all_sorted) // 2
print("\n. . .\n")
display(happiness_all_sorted.iloc[middle_index - 2: middle_index + 3].style.format(score_format))
print("\n. . .\n")
display(happiness_all_sorted.tail(5).style.format(score_format))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


Sorted and Cleaned Happiness Scores: 



Unnamed: 0,Country,Happiness score,Year
0,Afghanistan,3.2,2019
1,Tanzania,3.23,2019
2,Rwanda,3.33,2019
3,Yemen,3.38,2019
4,Malawi,3.41,2019



. . .



Unnamed: 0,Country,Happiness score,Year
276,Denmark,7.65,2020
277,Finland,7.81,2020
278,Afghanistan,2.52,2021
279,Zimbabwe,3.14,2021
280,Rwanda,3.42,2021



. . .



Unnamed: 0,Country,Happiness score,Year
551,Netherlands,7.42,2022
552,Switzerland,7.51,2022
553,Iceland,7.56,2022
554,Denmark,7.64,2022
555,Finland,7.82,2022
