In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Read the pre-filtered dataset from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="filtered_data.csv")

# Report the (rows, columns) dimensions, then show the frame as the cell output
print(df.shape)
df

In [None]:
# ===========================================================
# Compute each year's temperature
# ===========================================================

# For each station and for each day, compute the midpoint temperature by
# averaging the min and max temperatures.
#
# * Boolean indexing (instead of DataFrame.where) removes the non-temperature
#   rows outright rather than blanking them to NaN, so the date columns keep
#   their original dtype.
# * Readings equal to -9999 are excluded: that is the same missing-data
#   sentinel the precipitation step filters out, and leaving it in would
#   drag every mean far below any real temperature.
# * numeric_only=True keeps the string "element" column out of the mean;
#   pandas >= 2.0 raises a TypeError on it otherwise.
#
# NOTE(review): a station-day that has only a TMIN or only a TMAX reading
# still yields a "midpoint" equal to that single value — confirm whether such
# days should be dropped instead.
mid_temps = (
    df[df["element"].isin(["TMIN", "TMAX"]) & (df["value"] != -9999)]
    .groupby(by=["station_id", "year", "month", "day"])
    .mean(numeric_only=True)
    .reset_index()
)

# For each station and for each year, compute the average temperature across that year
avg_yearly_temps_per_station = (
    mid_temps.groupby(by=["station_id", "year"])["value"].mean().reset_index()
)

# For each year, compute the average temperature across stations. This is
# built from the per-station yearly means (not from the raw daily midpoints)
# so that every station carries equal weight regardless of how many days it
# reported, matching how the precipitation average is computed.
avg_yearly_temps = (
    avg_yearly_temps_per_station.groupby(by=["year"])["value"].mean().reset_index()
)

# All temperatures are in tenths of degree Celsius, so divide by 10 to get
# actual Celsius temperatures
avg_yearly_temps["value"] /= 10

In [None]:
# Re-key the yearly temperature table by integer year; squeeze() then collapses
# the one remaining column so the result is no longer a two-column table
avg_yearly_temps = (
    avg_yearly_temps
    .astype({"year": int})
    .set_index("year")
    .squeeze()
)
print(avg_yearly_temps.shape)
avg_yearly_temps


In [None]:
# ==================================
# Compute each year's precipitation
# ==================================

# Extract the precipitation rows, discarding any row that has a missing field.
# Boolean indexing (instead of DataFrame.where) keeps the original column
# dtypes because no rows are blanked to NaN first.
prcp = df[df["element"] == "PRCP"].dropna()

# Drop readings that carry the -9999 missing-data sentinel
prcp = prcp[prcp["value"] != -9999]

# For each station and for each year, compute the average precipitation
# reading across that year. "value" is selected before averaging so that the
# string columns are never passed to mean(), which raises a TypeError on
# pandas >= 2.0.
# NOTE(review): unlike the temperature values, these readings are not
# rescaled — confirm the units of the PRCP values.
avg_yearly_prcp_per_station = (
    prcp.groupby(by=["station_id", "year"])["value"].mean().reset_index()
)

# For each year, compute the average precipitation across stations
avg_yearly_prcp = (
    avg_yearly_prcp_per_station.groupby(by=["year"])["value"].mean().reset_index()
)


In [None]:
# Re-key the yearly precipitation table by integer year; squeeze() then collapses
# the one remaining column so the result is no longer a two-column table
avg_yearly_prcp = (
    avg_yearly_prcp
    .astype({"year": int})
    .set_index("year")
    .squeeze()
)
print(avg_yearly_prcp.shape)
avg_yearly_prcp

In [None]:
# Place the yearly temperature and precipitation data side by side, aligned on
# their index (the year); the inner join keeps only years present in both.
# NOTE(review): both inputs appear to be Series named "value", so the result
# would carry two identically named columns — confirm and consider renaming.
temp_prcp = pd.concat(
    [avg_yearly_temps, avg_yearly_prcp],
    axis="columns",
    join="inner",
)
temp_prcp

Things to analyze

* We can do a two-sample t-test between the year-over-year differences in temperature and the year-over-year differences in precipitation. That way we can see whether, when the difference in temperature increases, the difference in precipitation also increases by more than chance alone would explain. --> Do this for every climate indicator, not just precipitation
* Question: do we really need to use the differences between values, or can we just use the values themselves?
* Look into ozone depletion and CFCs getting banned in the 1970s?