# Homework 5 (line plots and distributions)

For this homework assignment you are required to use Python (pandas, matplotlib, and seaborn) to do the exercises. Please write your own code and provide your code along with your responses to the questions. 

In [None]:
from collections.abc import Generator, Iterator
from datetime import datetime

import matplotlib.pyplot as plt
from matplotlib.axes import Axes
import seaborn as sns
import pandas as pd

## Part 1 (time charts)

For this part we will plot population change in some of the world's most populous countries. To get started, read in the CSV file provided on Canvas. 
* Population data source:  https://data.worldbank.org/indicator/SP.POP.TOTL 



In [None]:
# Load the population table; the first CSV column becomes the row index.
df = pd.read_csv(
    "country_populations.csv",
    index_col=0,
)
df.head()  # preview the first rows as the cell output

### Problem 1 (2 points)

Create two line plots on the same figure. 
* On the first chart, plot the populations of India, China, and the United States from 1960 to 2023.
* On the second chart, plot the populations of the United States, Indonesia, Pakistan, Nigeria, Brazil, and Bangladesh from 1960 to 2023. 

Do some additional formatting to the line charts:
* Use different colors for each country (the color for the United States should be the same in both)
* Label lines with country names directly, using the country color as the text color
* Add horizontal gridlines 
* Remove unnecessary axis lines and tick marks
* Add chart and figure titles, and label units appropriately
   *  Scales should not be the same in both charts (second should be "zoomed in" to see populations of smaller countries)
* Make sure all text is horizontal 

**Hint**: it might be helpful to transpose the dataframe so that years are rows and countries are columns

In [None]:
# Countries for the first chart of Problem 1.
countries1 = [
    "United States",
    "China",
    "India",
]
# Countries for the second ("zoomed in") chart of Problem 1.
countries2 = [
    "United States",
    "Indonesia",
    "Pakistan",
    "Nigeria",
    "Brazil",
    "Bangladesh",
]

# Your answer here

def _line_plot(
    axes: Axes,
    data: pd.DataFrame,
    title: str | None = None,
    countries: list[str] | None = None,
) -> None:
    """Draw one directly-labelled line per column of ``data`` on ``axes``.

    Args:
        axes: Matplotlib axes to draw on.
        data: One column per country and one row per year (callers pass the
            transposed country-by-year selection).
        title: Optional title for the axes; no title is set when ``None``.
        countries: Optional subset of columns to emphasise. When given, only
            these columns get a color and a text label; every other column is
            drawn in gray and left unlabelled. When omitted (or empty), every
            column is colored and labelled.

    The "United States" column is always drawn in ``tab:orange`` so that it
    has the same color on every chart.
    """
    # The first two highlight colors are swapped relative to the default
    # palette; both orders are kept so existing charts keep their colors.
    palette = (
        ("tab:green", "tab:blue", "tab:red", "tab:pink", "tab:cyan")
        if countries
        else ("tab:blue", "tab:green", "tab:red", "tab:pink", "tab:cyan")
    )
    colors_used = 0

    for country in data.columns:
        emphasised = not countries or country in countries
        if country == "United States":
            color = "tab:orange"
        elif emphasised:
            # Wrap around instead of failing when there are more countries
            # than palette entries.
            color = palette[colors_used % len(palette)]
            colors_used += 1
        else:
            color = "tab:gray"

        series = data[country]
        series.plot(ax=axes, label=country, c=color)

        if emphasised:
            # Anchor the label at the end of the line that was just drawn,
            # whatever the last year is and however the x-axis is encoded.
            line_end_x = axes.lines[-1].get_xdata()[-1]
            axes.text(line_end_x, series.iloc[-1], country, color=color)

    if title is not None:
        axes.set_title(title)
    axes.grid(axis="y")
    axes.spines[["left", "right", "top", "bottom"]].set_visible(False)
    # Tick labels stay; only the tick marks themselves are removed.
    axes.tick_params(axis="both", length=0)


def line_plots(
    df: pd.DataFrame,
    info_iter: Iterator[list[str]],
    super_title: str,
    *,
    rows: int = 1,
    subtitles: Generator[str] | None = None,
    selection: list[str] | None = None,
) -> None:
    """Show a figure with one or more side-by-side line charts.

    Args:
        df: Table indexed by country with one column per year label.
        info_iter: Iterable yielding, for each panel in turn, the row labels
            (countries) followed by the column labels (years) to select.
        super_title: Figure-level title.
        rows: Number of panels. NOTE: despite the name, panels are laid out
            side by side in a single row (``plt.subplots(1, rows)``); the
            name is kept for backward compatibility.
        subtitles: Iterable of per-panel titles. Required when ``rows > 1``;
            optional for a single panel.
        selection: Forwarded to ``_line_plot`` as the columns to emphasise.

    Raises:
        ValueError: If ``rows`` is smaller than 1, if several panels are
            requested without subtitles, or if ``info_iter`` runs out before
            every panel received its two selectors.
    """
    if rows < 1:
        raise ValueError("At least one panel is required.")
    if rows > 1 and not subtitles:
        raise ValueError("Subtitles were not declared.")

    # Accept any iterable (list, tuple, generator), not only true iterators.
    selectors = iter(info_iter)
    titles = iter(subtitles) if subtitles is not None else iter(())

    # squeeze=False always returns a 2-D array of axes, so the one-panel and
    # multi-panel cases share the same loop.
    fig, ax = plt.subplots(1, rows, squeeze=False)
    fig.suptitle(super_title)

    for axes in ax[0]:
        try:
            row_labels = next(selectors)
            column_labels = next(selectors)
        except StopIteration:
            raise ValueError(
                "info_iter must yield a country list and a year list for every panel."
            ) from None
        _line_plot(
            axes,
            df.loc[row_labels, column_labels].T,
            next(titles, None),
            selection,
        )

    plt.show()


# line_plots consumes two entries per panel: the countries, then the years.
info_iter: Iterator[list[str]] = iter(
    [
        countries1,
        [str(year) for year in range(1960, 2024)],
        countries2,
        [str(year) for year in range(1960, 2024)],
    ]
)
# One subtitle per panel, in panel order.
subtitles = iter(("Population (in Billions)", "Population (in Millions)"))
line_plots(
    df,
    info_iter,
    "Population in Large Countries is shrinking (1960-2023)",
    rows=2,
    subtitles=subtitles,
)

### Problem 2a (1 point)

Create a new dataframe containing the **yearly percent change** in population for all countries
* Adjust the values in the dataframe so that they represent percents (multiply by 100 if necessary)
* Reference:  https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pct_change.html 

In [None]:
# Your answer here
# pct_change works down the rows, so transpose to put the years on the rows,
# compute the year-over-year change per country, then transpose back.
# Multiply by 100 so the values are percents rather than fractions.
pct_change_df = df.T.pct_change(fill_method=None).T * 100

### Problem 2b (1.5 points)

Plot the percent change in population from **1970** to 2023 of the following 9 countries on the same graph:
* The United States, China, India, Indonesia, Pakistan, Nigeria, Brazil, Bangladesh, and Mexico

Follow the same principles for designing the line chart as in problem 1, except:
* Only use color for the United States, China, and India (use gray for all other countries)
* Only label the lines for the United States, China, and India 


In [None]:
countries = [
    "United States",
    "China",
    "India",
    "Indonesia",
    "Pakistan",
    "Nigeria",
    "Brazil",
    "Bangladesh",
    "Mexico",
]

# Your answer here
# Year labels 1970 through 2023, as strings.
years = [f"{year}" for year in range(1970, 2024)]

# line_plots pulls the country list and then the year list from this iterator.
info_iter2: Iterator[list[str]] = iter([countries, years])

line_plots(
    pct_change_df,
    info_iter2,
    "Population growth is slowing in larger countries (1970-2023)",
    selection=countries1,
)

### Problem 2c (1.5 points)

Redesign the previous figure using ***small multiples***. Create a 3x3 grid of line plots, using the same y-axis range across all plots. 
* Label each plot with the country name
* Remove unnecessary axis lines and tick marks as before 

In [None]:
# Nine countries arranged as three rows of three, matching the 3x3 grid.
countries = [
    ["United States", "China", "India"],
    ["Indonesia", "Pakistan", "Nigeria"],
    ["Brazil", "Bangladesh", "Mexico"],
]

# Your answer here
def grid_plot(
    df: pd.DataFrame,
    countries: list[str] | list[list[str]],
    years: list[str],
    title: str,
) -> None:
    """Show small-multiple line charts, one per country, on a common y-range.

    Args:
        df: Table indexed by country name with one column per year label.
        countries: Either a flat list of country names (laid out three per
            row) or a nested list whose inner lists are the rows of the grid.
        years: Column labels to plot, in the order they should appear.
        title: Figure-level title.

    Raises:
        ValueError: If ``countries`` contains no country names.
    """
    if all(isinstance(entry, str) for entry in countries):
        # Flat list: keep the three-per-row layout.
        layout = [list(countries[i : i + 3]) for i in range(0, len(countries), 3)]
    else:
        layout = [[row] if isinstance(row, str) else list(row) for row in countries]

    names = [name for row in layout for name in row]
    if not names:
        raise ValueError("No countries were given.")

    n_rows = len(layout)
    n_cols = max(len(row) for row in layout)

    # Common y-range for every panel: always include zero, but extend below
    # it when any value is negative so that declines are not clipped.
    window = df.loc[names, years]
    y_low = min(0, window.min().min())
    y_high = window.max().max()

    # squeeze=False keeps the axes array 2-D for any grid shape.
    fig, ax = plt.subplots(n_rows, n_cols, squeeze=False)
    fig.suptitle(title)

    for axes_row, row_names in zip(ax, layout):
        for column, axes in enumerate(axes_row):
            if column >= len(row_names):
                # Ragged last row: hide panels that have no country.
                axes.set_visible(False)
                continue
            country = row_names[column]
            axes.plot(df.loc[country, years], label=country)
            axes.set_title(country)
            # Label every sixth year to keep the x-axis readable.
            axes.set_xticks(years[::6])
            axes.set_ylim(y_low, y_high)
            axes.grid(axis="y")
            axes.spines[["left", "right", "top", "bottom"]].set_visible(False)
            # Tick labels stay; only the tick marks themselves are removed.
            axes.tick_params(axis="both", length=0)

    plt.show()


# Flatten the 3x3 layout: grid_plot looks up one country name per panel.
# The title states the actual number of countries and that the values are
# yearly changes rather than raw populations.
grid_plot(
    pct_change_df,
    [name for row in countries for name in row],
    years,
    "Yearly population change in 9 countries (1970-2023)",
)

## Part 2 (Histograms)

For this part we will get another look at the Delhi, India climate data set using histograms and density plots. 
* To get started, load the climate data provided on Canvas:

In [None]:
# Load the Delhi daily climate observations.
df_temps = pd.read_csv(filepath_or_buffer="DailyDelhiClimateTrain.csv")
df_temps.head()  # preview the first rows as the cell output

### Problem 3 (2 points)

Create 3 histograms on the same figure to represent temperature counts across all available years. 
* Use bin sizes of 1 degree C, 2 degrees C, and 5 degrees C, respectively
* Include the kernel density for reference with each histogram 
* Label the units appropriately on each chart 
* Title each chart with the bin size

Reference:  https://seaborn.pydata.org/generated/seaborn.histplot.html 

In [None]:
# Your answer here
def hist_plot(df: pd.DataFrame):
    """Show three side-by-side histograms of ``df`` with bin widths 1, 2 and 5.

    Every panel is drawn with ``kde=True`` and gets the same axis labels; the
    title of each panel states its bin width.
    """
    _, panels = plt.subplots(1, 3)

    # Pair each panel with its bin width.
    for panel, width in zip(panels, (1, 2, 5)):
        sns.histplot(df, ax=panel, binwidth=width, kde=True)
        panel.set_title(f"Bin size: {width}")
        panel.set_ylabel("Number of Days")
        panel.set_xlabel("Average Temperature (Celsius)")
        # Recolor the first line on the axes; with kde=True this should be
        # the density curve.
        panel.lines[0].set_color("black")

    plt.show()
    
# Plot the mean-temperature column (transposing a Series returns the Series itself).
hist_plot(df_temps["meantemp"])

### Problem 4 (2 points)

Create a single chart with the kernel densities for temperature plotted for each year (2013, 2014, 2015, and 2016).
* Label units appropriately 
* Include a legend to show which curve belongs to which year 
* What do you notice about the change in shape of the density curves?

Reference:  https://seaborn.pydata.org/generated/seaborn.kdeplot.html 

In [None]:
# Your answer here
def kde_plot(data: dict[int | str, pd.DataFrame], title: str) -> None:
    """Overlay one kernel density curve per entry of ``data`` on a single chart.

    Args:
        data: Maps a legend label (the caller passes years) to a frame that
            has a ``"meantemp"`` column.
        title: Title of the chart.
    """
    palette = ("tab:orange", "tab:blue", "tab:green", "tab:red")
    _, ax = plt.subplots(1, 1)

    for position, (year, frame) in enumerate(data.items()):
        # Wrap around the palette instead of failing on a fifth group.
        color = palette[position % len(palette)]
        sns.kdeplot(frame["meantemp"], ax=ax, color=color, label=year)

    ax.set_xlabel("Average Temperature (Celsius)")
    ax.legend()
    ax.set_title(title)

    plt.show()

# Map every observation date to its mean temperature. Only the date part of
# the string (before any whitespace) is parsed.
date_dict_all = {
    datetime.strptime(date.split()[0], "%Y-%m-%d"): temp
    for date, temp in zip(df_temps.date.values, df_temps.meantemp.values)
}

# Bucket the readings by calendar year; years other than 2013-2016 are dropped.
unformated_years = {2013: {}, 2014: {}, 2015: {}, 2016: {}}
for day, temp in date_dict_all.items():
    if day.year in unformated_years:
        unformated_years[day.year][day] = temp

# One single-column frame per year, indexed by date, as kde_plot expects.
formated_years = {
    year: pd.DataFrame.from_dict(readings, orient="index", columns=["meantemp"])
    for year, readings in unformated_years.items()
}

kde_plot(formated_years, "Propensity of temperatures across the years")