In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import tqdm
from statsmodels.stats.descriptivestats import sign_test

In [9]:
# Load data
df = pd.read_csv("filtered_data.csv")

# Test run on a sample of the data
#df = df.sample(frac=0.01)

print(df.shape)
df

(18322712, 7)


Unnamed: 0.1,Unnamed: 0,station_id,year,month,element,day,value
0,0,AGM00060490,1957,1,TMAX,1,178
1,1,AGM00060490,1957,1,TMAX,2,150
2,2,AGM00060490,1957,1,TMAX,3,161
3,3,AGM00060490,1957,1,TMAX,4,172
4,4,AGM00060490,1957,1,TMAX,5,172
...,...,...,...,...,...,...,...
18322707,15215,ZA000067753,1990,11,PRCP,26,0
18322708,15216,ZA000067753,1990,11,PRCP,27,0
18322709,15217,ZA000067753,1990,11,PRCP,28,0
18322710,15218,ZA000067753,1990,11,PRCP,29,0


In [10]:
# ===========================================================
# Compute each station's annual mean temperature
# ===========================================================

# For each station and for each day, compute the midpoint temperature by
# averaging the min and max temperatures
mid_temps = df.where(np.logical_or(df["element"] == "TMIN",
                            df["element"] == "TMAX")).groupby(by=["station_id", "year", "month", "day"]).mean().reset_index()

# For each station and for each year, compute the average temperature across that year
temps = mid_temps.groupby(by=["station_id", "year"]).mean()["value"].reset_index()

# All temperatures are in tenths of degree Celsius, so divide by 10 to get
# actual Celsius temperatures
temps["value"] /= 10

# Convert year to int type
temps["year"] = temps["year"].astype(int)

temps = temps.set_index(["station_id", "year"])
temps.rename(columns={"value": "temp"}, inplace=True)
print(temps.shape)
temps

(11486, 1)


Unnamed: 0_level_0,Unnamed: 1_level_0,temp
station_id,year,Unnamed: 2_level_1
AGM00060490,1957,17.622492
AGM00060490,1958,18.110000
AGM00060490,1959,18.911058
AGM00060490,1960,19.716121
AGM00060490,1961,20.243947
...,...,...
VMM00048808,2017,20.512021
VMM00048808,2018,21.082609
VMM00048808,2019,23.860252
VMM00048808,2020,20.951423


In [11]:
# ===========================================================
# Compute each year's metrics (e.g. precipitation, snowfall)
# ===========================================================

def compute_yearly_metrics(metrics):
    metrics_data = []
    for metric in tqdm.tqdm(metrics):
        # Extract the data
        data = df.where(df['element'] == metric).dropna()
        
        # Delete missing data, and unnecessary columns
        data = data[data["value"] != -9999]

        # For each station and for each year, compute the average metric across that year
        result = data.groupby(by=["station_id", "year"]).mean()["value"].reset_index()

        # Convert year to int type
        result["year"] = result["year"].astype(int)
        
        result = result.set_index(["station_id", "year"])
        result.rename(columns={"value": metric}, inplace=True)
        
        metrics_data.append(result)

    return metrics_data

# Join the temperature and metric data (on the index a.k.a. the year column)
metrics_list = ["PRCP", "SNOW"]
data = [temps] + compute_yearly_metrics(metrics_list)
print([x.shape for x in data])
data = pd.concat(data, axis=1, join="inner", ignore_index=False)

100%|██████████| 2/2 [00:26<00:00, 13.03s/it]


[(11486, 1), (26080, 1), (9600, 1)]


In [12]:
print(data.shape)
data

(5321, 3)


Unnamed: 0_level_0,Unnamed: 1_level_0,temp,PRCP,SNOW
station_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA001012046,1941,12.177250,58.684066,0.139726
CA001012046,1942,10.615616,38.252055,1.723288
CA001012046,1943,9.303836,38.821918,2.309589
CA001012046,1944,10.156967,41.245902,0.139344
CA001012046,1945,10.031096,62.969863,0.904110
...,...,...,...,...
USW00064776,2015,7.254670,20.580822,0.000000
USW00064776,2016,8.224862,18.169399,0.000000
USW00064776,2017,7.915753,21.128767,0.000000
USW00064776,2018,7.595994,21.000000,0.000000


In [13]:
# Remove year as an index and make it a regular column
data.reset_index(level="year", inplace=True)
data

Unnamed: 0_level_0,year,temp,PRCP,SNOW
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA001012046,1941,12.177250,58.684066,0.139726
CA001012046,1942,10.615616,38.252055,1.723288
CA001012046,1943,9.303836,38.821918,2.309589
CA001012046,1944,10.156967,41.245902,0.139344
CA001012046,1945,10.031096,62.969863,0.904110
...,...,...,...,...
USW00064776,2015,7.254670,20.580822,0.000000
USW00064776,2016,8.224862,18.169399,0.000000
USW00064776,2017,7.915753,21.128767,0.000000
USW00064776,2018,7.595994,21.000000,0.000000


In [60]:
year_column = data["year"]
all_years = year_column.unique()
all_years.sort()
all_adj_years = np.lib.stride_tricks.sliding_window_view(all_years, 2)

# Keep track of the number of pairs of years where temperature AND precipitation both had a statistically significant increase
num_sig_prcp = 0

# Keep track of the number of pairs of years where temperature AND snowfall both had a statistically significant increase
num_sig_snow = 0

# Significance level
alpha = 0.05

for (year1, year2) in tqdm.tqdm(all_adj_years):
    # Get station data for both years
    data_year1 = data[year_column == year1]
    data_year2 = data[year_column == year2]
    
    # Get specific columns
    temp_year1 = data_year1["temp"]
    temp_year2 = data_year2["temp"]
    prcp_year1 = data_year1["PRCP"]
    prcp_year2 = data_year2["PRCP"]
    snow_year1 = data_year1["SNOW"]
    snow_year2 = data_year2["SNOW"]

    # Inner join on station id to find observations common between both years, and
    # then compute the differences
    temp_year_diff = pd.DataFrame(temp_year1).join(temp_year2, how="inner",
                                                     lsuffix="_{}".format(year1),
                                                     rsuffix="_{}".format(year2)).diff(axis=1).iloc[:, 1]
    prcp_year_diff = pd.DataFrame(prcp_year1).join(prcp_year2, how="inner",
                                                     lsuffix="_{}".format(year1),
                                                     rsuffix="_{}".format(year2)).diff(axis=1).iloc[:, 1]    
    snow_year_diff = pd.DataFrame(snow_year1).join(snow_year2, how="inner",
                                                     lsuffix="_{}".format(year1),
                                                     rsuffix="_{}".format(year2)).diff(axis=1).iloc[:, 1]
    
    # Count the number of differences that are positive
    #
    # These three values will serve as our test statistics for a sign test on temperature,
    # a sign test on precipitation, and a sign test on snowfall, respectively
    test_stat_temp = np.count_nonzero(temp_year_diff > 0)
    test_stat_prcp = np.count_nonzero(prcp_year_diff > 0)
    test_stat_snow = np.count_nonzero(snow_year_diff > 0)

    # Convert each test statistic into a p-value using the binomial test. In this case, each
    # test statistic is defined as the number of successes (a.k.a. the number of stations
    # for which temperature increased, precipitation increased, and snowfall increased,
    # respectively) out of all stations
    p_value_temp = stats.binomtest(k=test_stat_temp, n=temp_year_diff.size, alternative='two-sided').pvalue
    p_value_prcp = stats.binomtest(k=test_stat_prcp, n=prcp_year_diff.size, alternative='two-sided').pvalue
    p_value_snow = stats.binomtest(k=test_stat_snow, n=snow_year_diff.size, alternative='two-sided').pvalue

    # If the p-values for temperature and precipitation are BOTH significant, then it means
    # that between year1 and year2, temperature and precipitation BOTH had a statistically
    # significant increase
    if p_value_temp < alpha and p_value_prcp < alpha:
        num_sig_prcp += 1
    
    # If the p-values for temperature and snowfall are BOTH significant, then it means
    # that between year1 and year2, temperature and snowfall BOTH had a statistically
    # significant increase
    if p_value_temp < alpha and p_value_snow < alpha:
        num_sig_snow += 1

100%|██████████| 144/144 [00:01<00:00, 87.04it/s]


In [None]:

print(f'Significant Proportions of 1 Year Differences - Precipitation = {num_sig_prcp/len(all_adj_years_10)}')
print(f'Significant Proportions of 1 Year Differences - Snowfall = {num_sig_snow/len(all_adj_years_10)}')

In [64]:
num_sig_prcp

19

In [65]:
num_sig_snow

22

In [66]:
len(all_adj_years)

144

In [70]:
all_adj_years_5 = np.lib.stride_tricks.sliding_window_view(all_years, 6)
all_adj_years_5 = [[win[0], win[-1]]for win in all_adj_years_5]

# Keep track of the number of pairs of years where temperature AND precipitation both had a statistically significant increase
num_sig_prcp = 0

# Keep track of the number of pairs of years where temperature AND snowfall both had a statistically significant increase
num_sig_snow = 0

# Significance level
alpha = 0.05

for (year1, year2) in tqdm.tqdm(all_adj_years_5):
    # Get station data for both years
    data_year1 = data[year_column == year1]
    data_year2 = data[year_column == year2]
    
    # Get specific columns
    temp_year1 = data_year1["temp"]
    temp_year2 = data_year2["temp"]
    prcp_year1 = data_year1["PRCP"]
    prcp_year2 = data_year2["PRCP"]
    snow_year1 = data_year1["SNOW"]
    snow_year2 = data_year2["SNOW"]

    # Inner join on station id to find observations common between both years, and
    # then compute the differences
    temp_year_diff = pd.DataFrame(temp_year1).join(temp_year2, how="inner",
                                                     lsuffix="_{}".format(year1),
                                                     rsuffix="_{}".format(year2)).diff(axis=1).iloc[:, 1]
    prcp_year_diff = pd.DataFrame(prcp_year1).join(prcp_year2, how="inner",
                                                     lsuffix="_{}".format(year1),
                                                     rsuffix="_{}".format(year2)).diff(axis=1).iloc[:, 1]    
    snow_year_diff = pd.DataFrame(snow_year1).join(snow_year2, how="inner",
                                                     lsuffix="_{}".format(year1),
                                                     rsuffix="_{}".format(year2)).diff(axis=1).iloc[:, 1]
    
    # Count the number of differences that are positive
    #
    # These three values will serve as our test statistics for a sign test on temperature,
    # a sign test on precipitation, and a sign test on snowfall, respectively
    test_stat_temp = np.count_nonzero(temp_year_diff > 0)
    test_stat_prcp = np.count_nonzero(prcp_year_diff > 0)
    test_stat_snow = np.count_nonzero(snow_year_diff > 0)

    # Convert each test statistic into a p-value using the binomial test. In this case, each
    # test statistic is defined as the number of successes (a.k.a. the number of stations
    # for which temperature increased, precipitation increased, and snowfall increased,
    # respectively) out of all stations
    p_value_temp = stats.binomtest(k=test_stat_temp, n=temp_year_diff.size, alternative='two-sided').pvalue
    p_value_prcp = stats.binomtest(k=test_stat_prcp, n=prcp_year_diff.size, alternative='two-sided').pvalue
    p_value_snow = stats.binomtest(k=test_stat_snow, n=snow_year_diff.size, alternative='two-sided').pvalue

    # If the p-values for temperature and precipitation are BOTH significant, then it means
    # that between year1 and year2, temperature and precipitation BOTH had a statistically
    # significant increase
    if p_value_temp < alpha and p_value_prcp < alpha:
        num_sig_prcp += 1
    
    # If the p-values for temperature and snowfall are BOTH significant, then it means
    # that between year1 and year2, temperature and snowfall BOTH had a statistically
    # significant increase
    if p_value_temp < alpha and p_value_snow < alpha:
        num_sig_snow += 1

100%|██████████| 140/140 [00:01<00:00, 84.22it/s]


In [71]:
print(num_sig_prcp, num_sig_snow, len(all_adj_years_5))
print(f'Significant Proportions of 5 Year Differences - Precipitation = {num_sig_prcp/len(all_adj_years_10)}')
print(f'Significant Proportions of 5 Year Differences - Snowfall = {num_sig_snow/len(all_adj_years_10)}')

9 16 140
Significant Proportions of 5 Year Differences - Precipitation = 0.06666666666666667
Significant Proportions of 5 Year Differences - Snowfall = 0.11851851851851852


In [56]:
all_adj_years_10 = np.lib.stride_tricks.sliding_window_view(all_years, 11)
all_adj_years_10 = [[win[0], win[-1]]for win in all_adj_years_10]

# Keep track of the number of pairs of years where temperature AND precipitation both had a statistically significant increase
num_sig_prcp = 0

# Keep track of the number of pairs of years where temperature AND snowfall both had a statistically significant increase
num_sig_snow = 0

# Significance level
alpha = 0.05

for (year1, year2) in tqdm.tqdm(all_adj_years_10):
    # Get station data for both years
    data_year1 = data[year_column == year1]
    data_year2 = data[year_column == year2]
    
    # Get specific columns
    temp_year1 = data_year1["temp"]
    temp_year2 = data_year2["temp"]
    prcp_year1 = data_year1["PRCP"]
    prcp_year2 = data_year2["PRCP"]
    snow_year1 = data_year1["SNOW"]
    snow_year2 = data_year2["SNOW"]

    # Inner join on station id to find observations common between both years, and
    # then compute the differences
    temp_year_diff = pd.DataFrame(temp_year1).join(temp_year2, how="inner",
                                                     lsuffix="_{}".format(year1),
                                                     rsuffix="_{}".format(year2)).diff(axis=1).iloc[:, 1]
    prcp_year_diff = pd.DataFrame(prcp_year1).join(prcp_year2, how="inner",
                                                     lsuffix="_{}".format(year1),
                                                     rsuffix="_{}".format(year2)).diff(axis=1).iloc[:, 1]    
    snow_year_diff = pd.DataFrame(snow_year1).join(snow_year2, how="inner",
                                                     lsuffix="_{}".format(year1),
                                                     rsuffix="_{}".format(year2)).diff(axis=1).iloc[:, 1]
    
    # Count the number of differences that are positive
    #
    # These three values will serve as our test statistics for a sign test on temperature,
    # a sign test on precipitation, and a sign test on snowfall, respectively
    test_stat_temp = np.count_nonzero(temp_year_diff > 0)
    test_stat_prcp = np.count_nonzero(prcp_year_diff > 0)
    test_stat_snow = np.count_nonzero(snow_year_diff > 0)

    # Convert each test statistic into a p-value using the binomial test. In this case, each
    # test statistic is defined as the number of successes (a.k.a. the number of stations
    # for which temperature increased, precipitation increased, and snowfall increased,
    # respectively) out of all stations

    p_value_temp = stats.binomtest(k=test_stat_temp, n=temp_year_diff.size, alternative='two-sided').pvalue
    p_value_prcp = stats.binomtest(k=test_stat_prcp, n=prcp_year_diff.size, alternative='two-sided').pvalue
    p_value_snow = stats.binomtest(k=test_stat_snow, n=snow_year_diff.size, alternative='two-sided').pvalue

    # If the p-values for temperature and precipitation are BOTH significant, then it means
    # that between year1 and year2, temperature and precipitation BOTH had a statistically
    # significant increase
    if p_value_temp < alpha and p_value_prcp < alpha:
        num_sig_prcp += 1
    
    # If the p-values for temperature and snowfall are BOTH significant, then it means
    # that between year1 and year2, temperature and snowfall BOTH had a statistically
    # significant increase
    if p_value_temp < alpha and p_value_snow < alpha:
        num_sig_snow += 1

100%|██████████| 135/135 [00:01<00:00, 93.58it/s]


In [68]:
print(num_sig_prcp, num_sig_snow, len(all_adj_years_10))
print(f'Significant Proportions of 10 Year Differences - Precipitation = {num_sig_prcp/len(all_adj_years_10)}')
print(f'Significant Proportions of 10 Year Differences - Snowfall = {num_sig_snow/len(all_adj_years_10)}')

19 22 135
Significant Proportions of 10 Year Differences - Precipitation = 0.14074074074074075
Significant Proportions of 10 Year Differences - Snowfall = 0.16296296296296298
