In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import tqdm

In [2]:
# Load data
df = pd.read_csv("filtered_data.csv")
print(df.shape)
df

(18322712, 7)


Unnamed: 0.1,Unnamed: 0,station_id,year,month,element,day,value
0,0,AGM00060490,1957,1,TMAX,1,178
1,1,AGM00060490,1957,1,TMAX,2,150
2,2,AGM00060490,1957,1,TMAX,3,161
3,3,AGM00060490,1957,1,TMAX,4,172
4,4,AGM00060490,1957,1,TMAX,5,172
...,...,...,...,...,...,...,...
18322707,15215,ZA000067753,1990,11,PRCP,26,0
18322708,15216,ZA000067753,1990,11,PRCP,27,0
18322709,15217,ZA000067753,1990,11,PRCP,28,0
18322710,15218,ZA000067753,1990,11,PRCP,29,0


In [3]:
# ===========================================================
# Compute each year's temperature
# ===========================================================

# For each station and for each day, compute the midpoint temperature by
# averaging the min and max temperatures
mid_temps = df.where(np.logical_or(df["element"] == "TMIN",
                            df["element"] == "TMAX")).groupby(by=["station_id", "year", "month", "day"]).mean().reset_index()

# For each station and for each year, compute the average temperature across that year
avg_yearly_temps_per_station = mid_temps.groupby(by=["station_id", "year"]).mean()["value"].reset_index()

# For each year, compute the average temperature across stations
avg_yearly_temps = mid_temps.groupby(by=["year"]).mean()["value"].reset_index()

# All temperatures are in tenths of degree Celsius, so divide by 10 to get
# actual Celsius temperatures
avg_yearly_temps["value"] /= 10

In [4]:
avg_yearly_temps["year"] = avg_yearly_temps["year"].astype(int)
avg_yearly_temps = avg_yearly_temps.set_index("year").squeeze()
print(avg_yearly_temps.shape)
avg_yearly_temps.rename("temp", inplace=True)
avg_yearly_temps


(145,)


year
1877     8.820109
1878     7.758888
1879     3.143174
1880     3.977254
1881     4.746990
          ...    
2017    10.677159
2018    10.307024
2019    10.116904
2020    10.444746
2021     2.146452
Name: temp, Length: 145, dtype: float64

In [5]:
# ===========================================================
# Compute each year's precipitation
# ===========================================================

def compute_yearly_metrics(metrics):
    metrics_data = []
    for metric in tqdm.tqdm(metrics):
        # Extract the data
        data = df.where(df['element'] == metric).dropna()

        # Delete missing data, and unnecessary columns
        data = data[data["value"] != -9999]

        # For each station and for each year, compute the average temperature across that year
        avg_yearly_per_station = data.groupby(by=["station_id", "year"]).mean()["value"].reset_index()

        # For each year, compute the average temperature across stations
        avg_yearly = avg_yearly_per_station.groupby(by=["year"]).mean()["value"].reset_index()

        avg_yearly["year"] = avg_yearly["year"].astype(int)
        avg_yearly = avg_yearly.set_index("year").squeeze()
        avg_yearly.rename(metric, inplace=True)

        metrics_data.append(avg_yearly)

    return metrics_data
    


In [9]:
# Join the temperature and metric data (on the index a.k.a. the year column)
metrics_list = ["PRCP", "SNOW"]
data = [avg_yearly_temps] + compute_yearly_metrics(metrics_list)
data = pd.concat(data, axis=1, join="inner", ignore_index=False)
data

(172,)
(152,)


Unnamed: 0_level_0,temp,PRCP,SNOW
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1877,8.820109,15.316494,4.471920
1878,7.758888,21.908112,3.014515
1879,3.143174,18.989336,7.251486
1880,3.977254,14.674011,4.565574
1881,4.746990,16.090796,2.852521
...,...,...,...
2017,10.677159,36.918720,5.729140
2018,10.307024,40.338285,5.829240
2019,10.116904,36.929213,7.169056
2020,10.444746,37.480173,5.522127


In [7]:
# ===========================================================
# Compute % differences in temperature and precipitation
# year-to-year
# ===========================================================
changes = np.log(1 + data.pct_change()) * 100
changes.drop(labels=changes.index[0], inplace=True) # drop the first row because it's just NaN values
changes

Unnamed: 0_level_0,temp,PRCP,SNOW
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1878,-12.819520,35.792670,-39.437895
1879,-90.360585,-14.297940,87.776747
1880,23.535852,-25.779960,-46.266223
1881,17.691917,9.216944,-47.034102
1882,7.122093,7.373779,22.847724
...,...,...,...
2017,-2.344578,-5.956306,21.271004
2018,-3.528117,8.858228,1.732118
2019,-1.861789,-8.829810,20.688725
2020,3.189134,1.480917,-26.101090


In [11]:
# Run a two-sample t-test to test between whether there's a difference in means of the percent changes for temperature and precipitation
for metric in metrics_list:
    print(metric, ":", stats.ttest_ind(changes["temp"], changes[metric], equal_var=False, alternative="two-sided"))

PRCP : Ttest_indResult(statistic=-0.7788559211868984, pvalue=0.4368743456054145)
SNOW : Ttest_indResult(statistic=-0.406283392995795, pvalue=0.6849498152073565)


Things to analyze

* We can do a two-sample t-test between yearly difference in temperature and yearly difference in precipitation. That way we can see that if difference in temperature increases, whether or not difference in precipitation increases by chance. --> Do this for every climate indicator, not just precipitation
* Question: do we really need to use the differences between values, or can we just use the values themselves?
* Look into ozone depletion and CFCs getting banned in 1970s?